# RAG Document Processing Pipeline

This notebook processes educational documents in batch from a local folder and uploads them to ChromaDB for use by the backend system.

## Features
- Processes PDF, DOCX, DOC, TXT, and Markdown (`.md`, `.markdown`) files
- Automatic metadata extraction from filenames
- Document chunking with overlap
- Embedding generation using sentence transformers
- Batch upload to ChromaDB with progress tracking
- Error handling and logging

## Usage
1. Place your documents in the `./documents/` folder
2. Configure the settings below
3. Run all cells to process and upload documents
4. Verify results using the testing section

## 1. Setup Environment and Imports

In [None]:
# Import standard libraries
import sys
import os
import json
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Any, Optional
import numpy as np
from tqdm import tqdm
import logging

# Configure the root logger so INFO-level (and more severe) records are shown
logging.basicConfig(level=logging.INFO)

# Prepend <working directory>/src to the module search path.
# NOTE(review): the imports below use the `src.` package prefix, which
# presumably resolves because the working directory itself is importable
# when the notebook runs -- confirm this sys.path entry is still needed.
current_dir = Path.cwd()
src_path = current_dir / "src" 
sys.path.insert(0, str(src_path))

print(f"📂 Current directory: {current_dir}")
print(f"📁 Source path added: {src_path}")

# Import RAG pipeline components: first try the package-level re-exports,
# then fall back to importing each name from its own submodule.
try:
    from src import DocumentProcessor, DocumentChunk, VectorStore, EmbeddingService, get_settings
    print("✅ All RAG components imported successfully!")
except ImportError as e:
    print(f"⚠️ Import error: {e}")
    print("📝 Trying individual imports...")
    
    # Try individual imports as fallback
    try:
        from src.document_processor import DocumentProcessor, DocumentChunk
        from src.vector_store import VectorStore  
        from src.embeddings import EmbeddingService
        from src.config import get_settings
        print("✅ Individual imports successful!")
    except ImportError as e2:
        # Both strategies failed: print a hint and re-raise so the remaining
        # cells do not run without the pipeline components.
        print(f"❌ Import failed: {e2}")
        print("💡 Make sure you're running this notebook from the rag-pipeline directory")
        raise

# Short reminder of what each imported component is used for in this notebook
print("\n📦 Available components:")
print("  - DocumentProcessor: Handles document chunking and metadata extraction") 
print("  - DocumentChunk: Container for processed document segments")
print("  - VectorStore: Manages ChromaDB storage and similarity search")
print("  - EmbeddingService: Generates embeddings using sentence transformers")
print("  - get_settings: Loads configuration from .env file")
print("\n🚀 Ready to process documents!")

## 2. Configuration and Settings

In [None]:
# Load configuration via get_settings(); if that fails for any reason, fall
# back to the hard-coded defaults below so the notebook can still run.
try:
    settings = get_settings()
    print("✅ Configuration loaded successfully!")
except Exception as e:
    print(f"⚠️ Warning: Could not load .env file. Using default settings. Error: {e}")

    # Fallback configuration
    class Settings:
        """Default settings used when get_settings() raises."""

        def __init__(self):
            # The API key is read from the environment; empty string when unset
            self.openai_api_key = os.getenv("OPENAI_API_KEY", "")
            self.vector_db_path = "../backend/chroma_db"
            self.collection_name = "school_knowledge"
            self.local_embedding_model = "all-MiniLM-L6-v2"
            self.use_openai_embeddings = False
            self.chunk_size = 1000
            self.chunk_overlap = 200
            self.min_chunk_size = 100
            self.batch_size = 32
            self.documents_dir = "./documents"

    settings = Settings()

# Display configuration (the API key itself is never printed, only whether it is set)
print("\n📋 Current Configuration:")
print(f"🗂️ Documents directory: {settings.documents_dir}")
print(f"🗃️ Vector DB path: {settings.vector_db_path}")
print(f"📚 Collection name: {settings.collection_name}")
print(f"🤖 Embedding model: {settings.local_embedding_model}")
print(f"🔢 Chunk size: {settings.chunk_size}")
print(f"📊 Batch size: {settings.batch_size}")
print(f"🔑 OpenAI API key configured: {'Yes' if settings.openai_api_key else 'No'}")

# Create the documents directory if it doesn't exist. parents=True also
# creates missing intermediate directories, so a nested documents_dir
# (e.g. "data/raw/documents") no longer fails with FileNotFoundError.
documents_path = Path(settings.documents_dir)
documents_path.mkdir(parents=True, exist_ok=True)
print(f"📁 Documents directory ready: {documents_path.absolute()}")

## 3. Initialize Components

In [None]:
# Document processor: chunking parameters come straight from the settings
print("🔄 Initializing document processor...")
doc_processor = DocumentProcessor(
    chunk_size=settings.chunk_size,
    chunk_overlap=settings.chunk_overlap,
    min_chunk_size=settings.min_chunk_size,
)
print("✅ Document processor initialized")

# Embedding service: the API key is optional, so a settings object without
# that attribute simply passes None
print("🔄 Initializing embedding service...")
embedding_service = EmbeddingService(
    openai_api_key=getattr(settings, 'openai_api_key', None),
    model_name=settings.local_embedding_model,
)
print("✅ Embedding service initialized")

# Vector store (the ChromaDB collection shared with the backend). Any failure
# while connecting or counting is reported and then re-raised, because the
# rest of the notebook cannot work without the store.
print("🔄 Connecting to ChromaDB...")
try:
    vector_store = VectorStore(
        collection_name=settings.collection_name,
        persist_directory=settings.vector_db_path,
    )
    print("✅ Connected to ChromaDB successfully")
    print(f"📊 Current document count: {vector_store.count_documents()}")
except Exception as e:
    print(f"❌ Error connecting to ChromaDB: {e}")
    raise

## 4. Document Discovery and Loading

In [None]:
def discover_documents(documents_dir: str) -> List[Path]:
    """Discover all supported document files directly inside a directory.

    A file is supported when its suffix, compared case-insensitively, is one
    of: .pdf, .docx, .doc, .txt, .md, .markdown. Sub-directories are not
    searched, and directory entries are never returned even if their name
    ends in a supported extension.

    Args:
        documents_dir: Path of the directory to scan.

    Returns:
        The matching file paths in sorted order. The list is empty when the
        directory does not exist or contains no supported files.
    """
    supported_extensions = {'.pdf', '.docx', '.doc', '.txt', '.md', '.markdown'}
    documents_path = Path(documents_dir)

    # iterdir() raises on a missing directory; treat that as "nothing found"
    if not documents_path.is_dir():
        return []

    # Group matches by normalised extension so the progress lines can be
    # printed once per extension, in a stable (alphabetical) order.
    files_by_extension: Dict[str, List[Path]] = {}
    for entry in documents_path.iterdir():
        ext = entry.suffix.lower()
        if ext in supported_extensions and entry.is_file():
            files_by_extension.setdefault(ext, []).append(entry)

    for ext in sorted(files_by_extension):
        print(f"📄 Found {len(files_by_extension[ext])} {ext} files")

    return sorted(
        path for paths in files_by_extension.values() for path in paths
    )

# Scan the configured documents folder and list what will be processed
print("🔍 Discovering documents...")
document_files = discover_documents(settings.documents_dir)

if document_files:
    print(f"✅ Found {len(document_files)} documents to process:")
    for doc in document_files:
        # File size in mebibytes, shown with two decimals
        size_mb = doc.stat().st_size / 1024 ** 2
        print(f"  📄 {doc.name} ({size_mb:.2f} MB)")
else:
    print(f"⚠️ No documents found in {settings.documents_dir}")
    print("📝 Supported formats: PDF, DOCX, DOC, TXT, MD")
    print("💡 Please add some documents to the documents folder and re-run this cell")

## 5. Document Processing and Chunking

In [None]:
def process_document_with_metadata(file_path: Path) -> List[DocumentChunk]:
    """Chunk one document, attaching metadata derived from its filename.

    The metadata handed to the processor contains the processing timestamp,
    the file path, and whatever `doc_processor.extract_metadata_from_filename`
    returns for the file name (filename-derived keys win on a name clash).

    Args:
        file_path: Path of the document to process.

    Returns:
        The chunks produced by `doc_processor.process_file`, or an empty list
        if any step fails. Failures are printed, not raised, so one bad file
        does not abort a batch run.
    """
    try:
        # Metadata extraction is inside the try block as well: a filename the
        # extractor cannot handle is reported like any other per-file failure.
        filename_metadata = doc_processor.extract_metadata_from_filename(file_path.name)

        source_metadata = {
            'processed_at': datetime.now().isoformat(),
            'file_path': str(file_path),
            **filename_metadata,
        }

        chunks = doc_processor.process_file(file_path, source_metadata)
        print(f"  ✅ {file_path.name}: {len(chunks)} chunks")
        return chunks
    except Exception as e:
        print(f"  ❌ {file_path.name}: Error - {str(e)}")
        return []

# Process all documents.
# Reset the chunk list on every run of this cell: if the documents folder is
# empty on a re-run, chunks left over from an earlier run must not linger in
# the notebook namespace and be embedded/uploaded again by the cells below.
all_chunks = []

if document_files:
    print("⚙️ Processing documents...")

    for doc_file in tqdm(document_files, desc="Processing documents"):
        # A failed document contributes an empty list, so the loop keeps going
        chunks = process_document_with_metadata(doc_file)
        all_chunks.extend(chunks)

    print("\n✅ Processing complete!")
    print(f"📊 Total documents processed: {len(document_files)}")
    print(f"📊 Total chunks created: {len(all_chunks)}")

    if all_chunks:
        avg_chunk_size = sum(len(chunk.content) for chunk in all_chunks) / len(all_chunks)
        print(f"📊 Average chunk size: {avg_chunk_size:.0f} characters")

        # Show sample metadata
        print("\n📋 Sample metadata from first chunk:")
        sample_metadata = all_chunks[0].metadata
        for key, value in sample_metadata.items():
            print(f"  {key}: {value}")
else:
    print("⏭️ Skipping processing - no documents found")

## 6. Embedding Generation

In [None]:
async def generate_embeddings_for_chunks(chunks: List[DocumentChunk]) -> List[np.ndarray]:
    """Embed the text content of every chunk.

    Delegates to `embedding_service.embed_documents`, passing the configured
    batch size and the `use_openai_embeddings` setting (treated as False when
    the settings object does not define it).

    Args:
        chunks: Document chunks whose `content` should be embedded.

    Returns:
        Whatever `embed_documents` returns for the chunk texts, or an empty
        list when `chunks` is empty.
    """
    if not chunks:
        return []

    chunk_texts = [chunk.content for chunk in chunks]
    print(f"🧠 Generating embeddings for {len(chunk_texts)} chunks...")

    # Older/fallback settings objects may lack this flag; default to local embeddings
    prefer_openai = getattr(settings, 'use_openai_embeddings', False)

    vectors = await embedding_service.embed_documents(
        chunk_texts,
        use_openai=prefer_openai,
        batch_size=settings.batch_size,
    )

    print(f"✅ Generated {len(vectors)} embeddings")
    return vectors

# Generate embeddings if we have chunks
if 'all_chunks' in locals() and all_chunks:
    print("🔄 Starting embedding generation...")
    embeddings = await generate_embeddings_for_chunks(all_chunks)
    
    if embeddings:
        embedding_dim = len(embeddings[0]) if embeddings else 0
        print(f"📊 Embedding dimension: {embedding_dim}")
        print(f"📊 Total embeddings: {len(embeddings)}")
    else:
        print("❌ No embeddings generated")
else:
    print("⏭️ Skipping embedding generation - no chunks available")
    embeddings = []

## 7. Batch Upload to ChromaDB

In [None]:
def upload_to_chromadb(chunks: List[DocumentChunk], embeddings: List[np.ndarray]):
    """Upload chunks and their embeddings to the vector store in batches.

    Each batch of `settings.batch_size` items is passed to
    `vector_store.add_documents`. A batch that raises is reported and
    skipped; the remaining batches are still attempted. The final summary
    distinguishes a clean upload from one with failed batches.

    Args:
        chunks: Chunks providing `content`, `metadata` and `chunk_id`.
        embeddings: One embedding per chunk, in the same order.

    Returns:
        None. Progress and results are printed.
    """
    # Use explicit length checks rather than truthiness: `embeddings` may be
    # a NumPy array, whose truth value is ambiguous and raises ValueError.
    if chunks is None or embeddings is None or len(chunks) == 0 or len(embeddings) == 0:
        print("⚠️ No data to upload")
        return

    if len(chunks) != len(embeddings):
        print(f"❌ Mismatch: {len(chunks)} chunks vs {len(embeddings)} embeddings")
        return

    print(f"📤 Uploading {len(chunks)} chunks to ChromaDB...")

    # Prepare parallel lists for the vector store
    documents = [chunk.content for chunk in chunks]
    metadatas = [chunk.metadata for chunk in chunks]
    ids = [chunk.chunk_id for chunk in chunks]

    batch_size = settings.batch_size
    total_uploaded = 0
    failed_batches = []

    for start in tqdm(range(0, len(chunks), batch_size), desc="Uploading batches"):
        # Slicing clamps at the end of the sequence, so the last batch may be short
        stop = start + batch_size
        batch_documents = documents[start:stop]

        try:
            vector_store.add_documents(
                documents=batch_documents,
                metadatas=metadatas[start:stop],
                embeddings=embeddings[start:stop],
                ids=ids[start:stop],
            )
        except Exception as e:
            # Best effort: report the failed batch and carry on with the rest
            batch_number = start // batch_size + 1
            print(f"❌ Error uploading batch {batch_number}: {e}")
            failed_batches.append(batch_number)
        else:
            total_uploaded += len(batch_documents)

    if failed_batches:
        # Do not claim success when some batches never reached the store
        print(
            f"⚠️ Upload finished with errors: {total_uploaded}/{len(chunks)} chunks uploaded, "
            f"{len(failed_batches)} batch(es) failed"
        )
    else:
        print(f"✅ Upload complete! {total_uploaded} chunks uploaded")
    print(f"📊 New total document count: {vector_store.count_documents()}")

# Upload only when both the chunks and their embeddings exist and are non-empty
if not ('all_chunks' in locals() and 'embeddings' in locals() and all_chunks and embeddings):
    print("⏭️ Skipping upload - no data available")
else:
    print("🚀 Starting upload to ChromaDB...")
    upload_to_chromadb(all_chunks, embeddings)

## 8. Verification and Testing

In [None]:
def test_search_functionality():
    """Run a few sample similarity searches and print the top hit for each.

    Prints the store's document count first and stops if it is zero. For
    every sample query, prints the number of results and the distance,
    subject, grade, filename and a 100-character preview of the first one.
    Any exception raised while handling a query is printed and the next
    query is tried.
    """
    print("🔍 Testing search functionality...")

    total_docs = vector_store.count_documents()
    print(f"📊 Total documents in ChromaDB: {total_docs}")

    if total_docs == 0:
        print("⚠️ No documents to search")
        return

    sample_queries = ("mathematics", "science", "history", "algebra", "biology")

    for query in sample_queries:
        print(f"\n🔍 Testing query: '{query}'")
        try:
            results = vector_store.similarity_search(query, n_results=3)

            # Results are nested per query; index 0 is this (only) query
            hits = results['documents'] and results['documents'][0]
            if not hits:
                print("  ❌ No results found")
                continue

            print(f"  ✅ Found {len(hits)} results")

            # Pull out the best match, tolerating empty metadata/distance lists
            top_doc = hits[0]
            hit_metadatas = results['metadatas'][0]
            top_metadata = hit_metadatas[0] if hit_metadatas else {}
            hit_distances = results['distances'][0]
            top_distance = hit_distances[0] if hit_distances else 0

            print(f"  📄 Top result (distance: {top_distance:.3f}):")
            print(f"    Subject: {top_metadata.get('subject', 'Unknown')}")
            print(f"    Grade: {top_metadata.get('grade', 'Unknown')}")
            print(f"    Filename: {top_metadata.get('filename', 'Unknown')}")
            print(f"    Preview: {top_doc[:100]}...")
        except Exception as e:
            print(f"  ❌ Search error: {e}")

def get_knowledge_base_stats():
    """Print summary statistics for the metadata stored in the vector store.

    Fetches the stored entries with an empty metadata filter and prints the
    distinct `subject`, `grade` and `filename` values found, plus the total
    number of metadata entries. Any exception is printed rather than raised.
    """
    print("📊 Knowledge Base Statistics:")

    try:
        all_docs = vector_store.get_documents_by_metadata({})

        metadatas = all_docs['metadatas']
        if not metadatas:
            print("  No metadata available")
            return

        def distinct_values(key):
            # Entries that lack the key are simply skipped
            return {entry[key] for entry in metadatas if key in entry}

        subjects = distinct_values('subject')
        grades = distinct_values('grade')
        files = distinct_values('filename')

        print(f"  📚 Unique subjects: {len(subjects)} - {sorted(subjects)}")
        print(f"  🎓 Unique grades: {len(grades)} - {sorted(grades)}")
        print(f"  📄 Unique files: {len(files)}")
        print(f"  📊 Total chunks: {len(metadatas)}")

    except Exception as e:
        print(f"  ❌ Error getting stats: {e}")

# Verification: print knowledge-base statistics, a separator line, then the sample searches
print("🧪 Running verification tests...")
get_knowledge_base_stats()
print(f"\n{'=' * 50}")
test_search_functionality()

## 9. Utility Functions

Additional utility functions for managing the knowledge base.

In [None]:
def reset_knowledge_base():
    """Clear the whole collection after an interactive confirmation.

    Prompts the user and proceeds only when the reply is exactly 'CONFIRM'.
    On confirmation calls `vector_store.reset_collection()` and prints the
    new document count; errors during the reset are printed, not raised.
    """
    answer = input("⚠️ This will delete ALL documents from ChromaDB. Type 'CONFIRM' to proceed: ")

    # Anything other than the exact confirmation word aborts
    if answer != "CONFIRM":
        print("❌ Reset cancelled")
        return

    try:
        vector_store.reset_collection()
        print("✅ Knowledge base reset successfully")
        print(f"📊 New document count: {vector_store.count_documents()}")
    except Exception as e:
        print(f"❌ Error resetting knowledge base: {e}")

def export_processing_report():
    """Write a JSON processing report to the working directory and return it.

    The report contains a timestamp, the main configuration values, and the
    sizes of the notebook variables `document_files`, `all_chunks` and
    `embeddings` (0 for any that is not defined), plus the current document
    count of the vector store.

    Returns:
        The report as a dict, identical to what was written to
        `processing_report_<YYYYmmdd_HHMMSS>.json`.
    """
    # The notebook's variables live in the module (global) namespace. Inside
    # a function, locals() only sees the function's own variables, so looking
    # them up there would always report 0.
    notebook_vars = globals()

    def count_of(name):
        # Length of a notebook-level collection, or 0 when it was never created
        return len(notebook_vars[name]) if name in notebook_vars else 0

    # Single timestamp so the report body and its filename agree
    now = datetime.now()

    report = {
        "timestamp": now.isoformat(),
        "configuration": {
            "documents_dir": settings.documents_dir,
            "vector_db_path": settings.vector_db_path,
            "collection_name": settings.collection_name,
            "chunk_size": settings.chunk_size,
            "batch_size": settings.batch_size
        },
        "processing_results": {
            "documents_found": count_of('document_files'),
            "chunks_created": count_of('all_chunks'),
            "embeddings_generated": count_of('embeddings'),
            "final_db_count": vector_store.count_documents()
        }
    }

    report_file = f"processing_report_{now.strftime('%Y%m%d_%H%M%S')}.json"

    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2)

    print(f"📊 Processing report exported to: {report_file}")
    return report

# Optional, destructive utility -- uncomment to wipe the collection (CAUTION!)
# reset_knowledge_base()

# Export the processing report (runs by default) and echo its headline numbers
report = export_processing_report()
summary = report['processing_results']
print("\n📋 Processing Summary:")
print(f"  Documents processed: {summary['documents_found']}")
print(f"  Chunks created: {summary['chunks_created']}")
print(f"  Total documents in DB: {summary['final_db_count']}")