# RAG Pipeline with Vertector

This notebook builds a complete retrieval-augmented generation (RAG) pipeline, including:
- Document chunking
- Vector store integration
- Semantic search
- Batch ingestion

## Setup

In [None]:
from pathlib import Path
from vertector_data_ingestion import (
    UniversalConverter,
    LocalMpsConfig,
    HybridChunker,
    ChromaAdapter,
    ExportFormat,
    setup_logging,
)
from vertector_data_ingestion.models.config import ChunkingConfig

# Configure the library's logging at INFO level before running the pipeline cells.
setup_logging(log_level="INFO")

## Basic RAG Pipeline

In [None]:
# End-to-end demo on a single PDF: convert -> chunk -> store -> search.
# The same model id (Qwen3-Embedding-0.6B: smaller, faster) is used both as the
# chunker's tokenizer and as the vector store's embedding model.
EMBEDDING_MODEL = "Qwen/Qwen3-Embedding-0.6B"

chunk_config = ChunkingConfig(tokenizer=EMBEDDING_MODEL, max_tokens=512)
converter = UniversalConverter(LocalMpsConfig())
doc_path = Path("../test_documents/arxiv_sample.pdf")

if not doc_path.exists():
    # Nothing to process; report the path that was looked up.
    print(f"File not found: {doc_path}")
else:
    # Step 1: turn the PDF into a document object.
    print("Step 1: Converting Document")
    doc = converter.convert(doc_path)
    print(f"✓ Converted: {doc.metadata.num_pages} pages")

    # Step 2: split the document using the custom chunking config.
    print("\nStep 2: Creating Chunks")
    print(f"Using tokenizer: {chunk_config.tokenizer}")
    chunker = HybridChunker(config=chunk_config)
    chunks = chunker.chunk_document(doc)
    print(f"✓ Created: {chunks.total_chunks} chunks")

    # Step 3: write the chunks to the vector store, using the embedding model
    # that matches the tokenizer above.
    print("\nStep 3: Storing in Vector DB")
    vector_store = ChromaAdapter(
        collection_name="rag_pipeline",
        embedding_model=EMBEDDING_MODEL,
    )
    vector_store.add_chunks(chunks.chunks, batch_size=4)
    print(f"✓ Stored: {len(chunks.chunks)} chunks")

    # Step 4: run one query and show the three best matches.
    print("\nStep 4: Semantic Search")
    results = vector_store.search(
        "Who is the main author of this paper?",
        top_k=3,
    )
    for i, result in enumerate(results, start=1):
        print(f"\nResult {i}:")
        print(f"  Score: {result['score']:.3f}")
        print(f"  Text: {result['text'][:100]}...")

## Batch Document Ingestion

In [None]:
# Batch ingestion: convert up to five PDFs from documents_dir in one call,
# chunk each resulting document, tag every chunk with its source file name,
# and store all chunks in the "rag_pipeline" collection.
documents_dir = Path("../test_documents/")

if documents_dir.exists():
    # Path.glob() yields entries in arbitrary, filesystem-dependent order;
    # sorting makes the (at most five) selected PDFs the same on every run.
    pdf_files = sorted(documents_dir.glob("*.pdf"))[:5]

    if pdf_files:
        print(f"Ingesting {len(pdf_files)} documents...\n")

        # Configure with Qwen3-Embedding-0.6B (default)
        chunk_config = ChunkingConfig(
            tokenizer="Qwen/Qwen3-Embedding-0.6B",
            max_tokens=512,
        )

        converter = UniversalConverter()
        chunker = HybridChunker(config=chunk_config)
        vector_store = ChromaAdapter(
            collection_name="rag_pipeline",
            embedding_model="Qwen/Qwen3-Embedding-0.6B"
        )

        # Convert all documents
        docs = converter.convert(pdf_files, parallel=True)

        # Chunk and store all
        all_chunks = []
        for doc in docs:
            chunks = chunker.chunk_document(doc)
            # Record which file each chunk came from so search hits can be
            # traced back to their source document.
            for chunk in chunks.chunks:
                chunk.metadata["source_file"] = doc.metadata.source_path.name
            all_chunks.extend(chunks.chunks)

        vector_store.add_chunks(all_chunks, batch_size=4)
        print(f"\n✓ Ingested {len(all_chunks)} chunks from {len(docs)} documents")
    else:
        print("No PDF files found")
else:
    # Report the path that was actually checked (the previous message named a
    # 'documents/' directory that this cell never looks at).
    print(f"Directory not found: {documents_dir}")

## Advanced Search

In [None]:
# Open the "rag_pipeline" collection with the same embedding model used above,
# then print the single best-matching chunk for each short topical query.
vector_store = ChromaAdapter(
    collection_name="rag_pipeline",
    embedding_model="Qwen/Qwen3-Embedding-0.6B",
)

queries = ["methodology", "main findings", "limitations"]

for query in queries:
    results = vector_store.search(query, top_k=1)
    if not results:
        # No hit for this query: print nothing and move on.
        continue
    print(f"Q: {query}")
    print(f"A: {results[0]['text'][:150]}...\n")

## Summary

This notebook demonstrated:
- Complete RAG pipeline
- Batch document ingestion with unified `convert()`
- Vector search

Next: `04_multimodal_integration.ipynb`