# Multimodal Integration

Combine documents and audio in a unified pipeline: both are split into chunks, stored in a single vector collection, and searched together.

## Setup

In [None]:
from pathlib import Path
from vertector_data_ingestion import (
    UniversalConverter,
    LocalMpsConfig,
    HybridChunker,
    ChromaAdapter,
    create_audio_transcriber,
    AudioConfig,
    WhisperModelSize,
    setup_logging,
)

from vertector_data_ingestion.models.config import ChunkingConfig

# Configure the package's logging at INFO level before any pipeline objects are built.
setup_logging(log_level="INFO")

## Multimodal Pipeline

In [None]:
class MultimodalPipeline:
    """Ingest documents and audio into one vector store and search across both.

    Documents are converted and chunked; audio is transcribed and every
    transcript segment becomes one chunk. Each chunk is tagged with
    ``modality`` and ``source`` metadata before it is added to the shared
    ``"multimodal"`` collection, so search results can report where they
    came from.
    """

    def __init__(self):
        """Build the converter, chunker, audio transcriber and vector store."""
        self.converter = UniversalConverter(LocalMpsConfig())
        self.chunk_config = ChunkingConfig(
            tokenizer="Qwen/Qwen3-Embedding-0.6B",
            max_tokens=512,
        )
        self.chunker = HybridChunker(config=self.chunk_config)
        self.audio_transcriber = create_audio_transcriber(
            AudioConfig(model_size=WhisperModelSize.BASE)
        )
        self.vector_store = ChromaAdapter(
            collection_name="multimodal",
            embedding_model="Qwen/Qwen3-Embedding-0.6B"
        )
        # Tokenizer used only for counting tokens in audio segments. It is
        # loaded on first use so document-only runs never pay for it, and
        # cached so repeated audio files do not reload it.
        self._tokenizer = None

    def _get_tokenizer(self):
        """Return the token-counting tokenizer, loading it on first use only."""
        if self._tokenizer is None:
            # Local import keeps `transformers` off the import path until
            # audio ingestion actually needs it.
            from transformers import AutoTokenizer

            self._tokenizer = AutoTokenizer.from_pretrained(
                self.chunk_config.tokenizer
            )
        return self._tokenizer

    def process_document(self, path: Path) -> int:
        """Convert, chunk and store one document.

        Args:
            path: Location of the document to ingest.

        Returns:
            The number of chunks produced for the document.
        """
        print(f"Processing: {path.name}")
        doc = self.converter.convert(path)
        doc_chunks = self.chunker.chunk_document(doc).chunks

        # Tag every chunk so search results can tell documents from audio.
        for chunk in doc_chunks:
            chunk.metadata["modality"] = "document"
            chunk.metadata["source"] = path.name

        # Nothing to store for an empty document; skip the store call
        # rather than handing the backend an empty batch.
        if doc_chunks:
            self.vector_store.add_chunks(doc_chunks)
        print(f"  Added {len(doc_chunks)} chunks")
        return len(doc_chunks)

    def process_audio(self, path: Path) -> int:
        """Transcribe one audio file and store each transcript segment as a chunk.

        Each chunk records the segment's start, end and duration (as reported
        by the transcriber) in its metadata, alongside ``modality`` and
        ``source``.

        Args:
            path: Location of the audio file to ingest.

        Returns:
            The number of transcript segments produced for the file.
        """
        print(f"Processing: {path.name}")
        result = self.audio_transcriber.transcribe(path)

        # Local import: only audio ingestion builds chunks by hand.
        from vertector_data_ingestion.models.chunk import DocumentChunk

        tokenizer = self._get_tokenizer()

        chunks = []
        for i, segment in enumerate(result.segments):
            # Count tokens without special tokens so the figure reflects
            # the segment text alone.
            tokens = tokenizer.encode(segment.text, add_special_tokens=False)

            chunk = DocumentChunk(
                chunk_id=f"{path.stem}_{i}",
                text=segment.text,
                token_count=len(tokens),
                source_path=path,
                chunk_index=i,
                metadata={
                    "modality": "audio",
                    "source": path.name,
                    "start_time": segment.start,
                    "end_time": segment.end,
                    "duration": segment.end - segment.start,
                }
            )
            chunks.append(chunk)

        # A file that yields no segments has nothing to store; skip the
        # store call rather than handing the backend an empty batch.
        if chunks:
            self.vector_store.add_chunks(chunks)
        print(f"  Added {len(chunks)} audio segments")
        return len(chunks)

    def search(self, query: str, top_k: int = 5):
        """Search the shared collection across both modalities.

        Args:
            query: Free-text search query.
            top_k: Maximum number of results to request from the store.

        Returns:
            Whatever the vector store's ``search`` returns for the query.
        """
        return self.vector_store.search(query, top_k=top_k)

# Build the pipeline once; the cells below reuse this instance.
pipeline = MultimodalPipeline()
# The status line previously printed a mis-decoded byte sequence; this is
# the intended check mark.
print("✓ Pipeline ready")

## Process Documents and Audio

In [None]:
# Sample inputs: one PDF document and one WAV recording.
doc_path = Path("../test_documents/2112.13734v2.pdf")
audio_path = Path("../test_documents/harvard.wav")

# Ingest each sample with its matching handler, document first; a file
# that is not present on disk is skipped.
for media_path, ingest in (
    (doc_path, pipeline.process_document),
    (audio_path, pipeline.process_audio),
):
    if media_path.exists():
        ingest(media_path)

## Cross-Modal Search

In [None]:
# Run a query and print the top three hits from the shared collection.
results = pipeline.search("How does the salt pickle taste?", top_k=3)

for rank, hit in enumerate(results, start=1):
    meta = hit['metadata']
    # Fall back to 'unknown' when a hit lacks the tags added at ingest time.
    modality = meta.get('modality', 'unknown')
    source = meta.get('source', 'unknown')
    preview = hit['text'][:100]

    print(f"\nResult {rank} [{modality.upper()}]:")
    print(f"  Source: {source}")
    print(f"  Text: {preview}...")

In [None]:
# Run a second query and print the top three hits from the shared collection.
results = pipeline.search("medical imaging", top_k=3)

for rank, hit in enumerate(results, start=1):
    meta = hit['metadata']
    # Fall back to 'unknown' when a hit lacks the tags added at ingest time.
    modality = meta.get('modality', 'unknown')
    source = meta.get('source', 'unknown')
    preview = hit['text'][:100]

    print(f"\nResult {rank} [{modality.upper()}]:")
    print(f"  Source: {source}")
    print(f"  Text: {preview}...")

## Summary

Demonstrated:
- Unified multimodal pipeline
- Document and audio processing
- Cross-modal search

See the project documentation for more examples.