# Multimodal Integration

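Combine documents and audio in a unified pipeline: a PDF and a WAV recording are split into chunks, tagged with their modality, stored in a single Chroma collection, and searched through the same query interface.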

## Setup

In [1]:
from pathlib import Path
from vertector_data_ingestion import (
    UniversalConverter,
    LocalMpsConfig,
    HybridChunker,
    ChromaAdapter,
    create_audio_transcriber,
    AudioConfig,
    WhisperModelSize,
    setup_logging,
)

from vertector_data_ingestion.models.config import ChunkingConfig

# Initialize the package's logging at INFO level so the pipeline's progress
# messages appear in the cell outputs below.
setup_logging(log_level="INFO")

[32m2026-01-02 21:26:19[0m | [1mINFO    [0m | [36mvertector_data_ingestion.monitoring.logger[0m:[36msetup_logging[0m:[36m51[0m - [1mLogging initialized at INFO level[0m


## Multimodal Pipeline

In [2]:
class MultimodalPipeline:
    """Ingest documents and audio into one vector store and search across both.

    Every stored chunk is tagged with ``metadata["modality"]`` (``"document"``
    or ``"audio"``) and ``metadata["source"]`` (the file name), so a search
    hit can be attributed to the file and modality it came from.
    """

    # The same model id is used for the chunking tokenizer and for the
    # vector store's embedding model.
    EMBEDDING_MODEL = "Qwen/Qwen3-Embedding-0.6B"

    def __init__(self):
        """Build the converter, chunker, audio transcriber and vector store."""
        self.converter = UniversalConverter(LocalMpsConfig())
        self.chunk_config = ChunkingConfig(
            tokenizer=self.EMBEDDING_MODEL,
            max_tokens=512,
        )
        self.chunker = HybridChunker(config=self.chunk_config)
        self.audio_transcriber = create_audio_transcriber(
            AudioConfig(model_size=WhisperModelSize.BASE)
        )
        self.vector_store = ChromaAdapter(
            collection_name="multimodal",
            embedding_model=self.EMBEDDING_MODEL,
        )
        # Token-counting tokenizer for audio segments; loaded on first use by
        # _get_tokenizer() and then reused for every later audio file.
        self._tokenizer = None

    def _get_tokenizer(self):
        """Return the tokenizer named in ``chunk_config``, loading it only once."""
        if self._tokenizer is None:
            # Imported lazily so the dependency is only needed for audio input.
            from transformers import AutoTokenizer

            self._tokenizer = AutoTokenizer.from_pretrained(
                self.chunk_config.tokenizer
            )
        return self._tokenizer

    def process_document(self, path: Path) -> int:
        """Convert, chunk and store one document.

        Args:
            path: Location of the document to ingest.

        Returns:
            The number of chunks added to the vector store.
        """
        print(f"Processing: {path.name}")
        doc = self.converter.convert(path)
        doc_chunks = self.chunker.chunk_document(doc).chunks

        for chunk in doc_chunks:
            chunk.metadata["modality"] = "document"
            chunk.metadata["source"] = path.name

        # Do not hand an empty batch to the vector store.
        if doc_chunks:
            self.vector_store.add_chunks(doc_chunks)
        print(f"  Added {len(doc_chunks)} chunks")
        return len(doc_chunks)

    def process_audio(self, path: Path) -> int:
        """Transcribe one audio file and store each transcript segment as a chunk.

        Segments whose text is empty or whitespace-only are skipped. The
        remaining chunks keep the index of their original segment in
        ``chunk_id`` and ``chunk_index``.

        Args:
            path: Location of the audio file to ingest.

        Returns:
            The number of audio segments added to the vector store.
        """
        print(f"Processing: {path.name}")
        result = self.audio_transcriber.transcribe(path)

        from vertector_data_ingestion.models.chunk import DocumentChunk

        # Cached on the instance: reloading the tokenizer per file is wasted work.
        tokenizer = self._get_tokenizer()

        chunks = []
        for i, segment in enumerate(result.segments):
            text = segment.text
            if not text or not text.strip():
                # Nothing to embed for a blank segment.
                continue

            # Count tokens in the segment text (no special tokens added).
            tokens = tokenizer.encode(text, add_special_tokens=False)

            chunks.append(
                DocumentChunk(
                    chunk_id=f"{path.stem}_{i}",
                    text=text,
                    token_count=len(tokens),
                    source_path=path,
                    chunk_index=i,
                    metadata={
                        "modality": "audio",
                        "source": path.name,
                        "start_time": segment.start,
                        "end_time": segment.end,
                        "duration": segment.end - segment.start,
                    },
                )
            )

        # Do not hand an empty batch to the vector store.
        if chunks:
            self.vector_store.add_chunks(chunks)
        print(f"  Added {len(chunks)} audio segments")
        return len(chunks)

    def search(self, query: str, top_k: int = 5):
        """Search the shared vector store across all ingested modalities.

        Args:
            query: Free-text query.
            top_k: Maximum number of results requested from the store.

        Returns:
            Whatever ``vector_store.search`` returns, unchanged.
        """
        return self.vector_store.search(query, top_k=top_k)

# Build a single pipeline instance; the ingestion and search cells below
# all operate on this object and its vector store.
pipeline = MultimodalPipeline()
print("✓ Pipeline ready")

[32m2026-01-02 21:26:31[0m | [1mINFO    [0m | [36mvertector_data_ingestion.core.hardware_detector[0m:[36mdetect[0m:[36m50[0m - [1mDetected Apple Silicon with MPS support[0m
[32m2026-01-02 21:26:31[0m | [1mINFO    [0m | [36mvertector_data_ingestion.core.hardware_detector[0m:[36mdetect[0m:[36m50[0m - [1mDetected Apple Silicon with MPS support[0m
[32m2026-01-02 21:26:31[0m | [1mINFO    [0m | [36mvertector_data_ingestion.core.pipeline_router[0m:[36m__init__[0m:[36m55[0m - [1mHardware detected: mps[0m
[32m2026-01-02 21:26:31[0m | [1mINFO    [0m | [36mvertector_data_ingestion.core.universal_converter[0m:[36m__init__[0m:[36m44[0m - [1mInitialized UniversalConverter on mps[0m
[32m2026-01-02 21:26:31[0m | [1mINFO    [0m | [36mvertector_data_ingestion.core.universal_converter[0m:[36m_ensure_models_available[0m:[36m67[0m - [1mChecking model availability...[0m
[32m2026-01-02 21:26:31[0m | [1mINFO    [0m | [36mvertector_data_ingestion

✓ Pipeline ready


## Process Documents and Audio

In [3]:
# Ingest one document and one audio file into the shared pipeline.
# A missing input is reported rather than skipped silently, so an empty
# vector store (and empty search results further down) is easy to explain.

# Process documents
doc_path = Path("../test_documents/2112.13734v2.pdf")
if doc_path.exists():
    pipeline.process_document(doc_path)
else:
    print(f"Skipping missing document: {doc_path}")

# Process audio
audio_path = Path("../test_documents/harvard.wav")
if audio_path.exists():
    pipeline.process_audio(audio_path)
else:
    print(f"Skipping missing audio file: {audio_path}")

Processing: 2112.13734v2.pdf
Consider using the pymupdf_layout package for a greatly improved page layout analysis.


[32m2026-01-02 21:27:35[0m | [1mINFO    [0m | [36mvertector_data_ingestion.core.pipeline_router[0m:[36mdetermine_pipeline[0m:[36m105[0m - [1mUsing Classic pipeline (default) for 2112.13734v2.pdf[0m
[32m2026-01-02 21:27:35[0m | [1mINFO    [0m | [36mvertector_data_ingestion.core.universal_converter[0m:[36m_convert_with_retry[0m:[36m175[0m - [1mConverting 2112.13734v2.pdf with classic pipeline[0m
2026-01-02 21:27:35,939 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2026-01-02 21:27:36,012 - INFO - Going to convert document batch...
2026-01-02 21:27:36,013 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 870e160bad93d15722a8ae8d62725e09
2026-01-02 21:27:36,027 - INFO - Loading plugin 'docling_defaults'
2026-01-02 21:27:36,029 - INFO - Registered picture descriptions: ['vlm', 'api']
2026-01-02 21:27:36,042 - INFO - Loading plugin 'docling_defaults'
2026-01-02 21:27:36,048 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', '

  Added 22 chunks
Processing: harvard.wav


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[32m2026-01-02 21:28:30[0m | [1mINFO    [0m | [36mvertector_data_ingestion.audio.whisper_transcriber[0m:[36mtranscribe[0m:[36m193[0m - [1mTranscription complete in 3.65s: 216 chars, 6 segments[0m
[32m2026-01-02 21:28:31[0m | [1mINFO    [0m | [36mvertector_data_ingestion.vector.chroma_adapter[0m:[36madd_chunks[0m:[36m75[0m - [1mAdding 6 chunks to ChromaDB (batch_size=16)[0m
[32m2026-01-02 21:28:31[0m | [1mINFO    [0m | [36mvertector_data_ingestion.vector.chroma_adapter[0m:[36madd_chunks[0m:[36m147[0m - [1mSuccessfully added 6 chunks in 1 batches[0m


  Added 6 audio segments


## Cross-Modal Search

In [5]:
# Query worded after a sentence spoken in the audio clip; the recorded output
# below shows the top hits coming from harvard.wav.
results = pipeline.search("How does the salt pickle taste?", top_k=3)

for rank, hit in enumerate(results, start=1):
    meta = hit['metadata']
    # Fall back to 'unknown' when a chunk carries no modality/source tag.
    label = meta.get('modality', 'unknown')
    origin = meta.get('source', 'unknown')

    print(f"\nResult {rank} [{label.upper()}]:")
    print(f"  Source: {origin}")
    # Show only the first 100 characters of the chunk text.
    print(f"  Text: {hit['text'][:100]}...")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Result 1 [AUDIO]:
  Source: harvard.wav
  Text: A salt pickle tastes fine with ham....

Result 2 [AUDIO]:
  Source: harvard.wav
  Text: A cold dip restores health and zest....

Result 3 [AUDIO]:
  Source: harvard.wav
  Text: It takes heat to bring out the odor....


In [6]:
# Topical query matching the PDF's subject; the recorded output below shows
# the top hits coming from the document rather than the audio clip.
results = pipeline.search("medical imaging", top_k=3)

for rank, hit in enumerate(results, start=1):
    meta = hit['metadata']
    # Fall back to 'unknown' when a chunk carries no modality/source tag.
    label = meta.get('modality', 'unknown')
    origin = meta.get('source', 'unknown')

    print(f"\nResult {rank} [{label.upper()}]:")
    print(f"  Source: {origin}")
    # Show only the first 100 characters of the chunk text.
    print(f"  Text: {hit['text'][:100]}...")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Result 1 [DOCUMENT]:
  Source: 2112.13734v2.pdf
  Text: Learning models that generalize under different distribution shifts in medical imaging has been a lo...

Result 2 [DOCUMENT]:
  Source: 2112.13734v2.pdf
  Text: We aim to classify 4 chest X-ray pathologies, namely Cardiomegaly, Consolidation, Edema, and Effusio...

Result 3 [DOCUMENT]:
  Source: 2112.13734v2.pdf
  Text: - [1] A. Bustos, A. Pertusa, J. M. Salinas, and M. Iglesia-Vayá. Padchest: A large chest x-ray image...


## Summary

Demonstrated:
- Unified multimodal pipeline
- Document and audio processing
- Cross-modal search

See documentation for more examples.