# RAG Document Processing Pipeline

This notebook processes educational documents in batch from a local folder and uploads them to ChromaDB for use by the backend system.

## Features
- Processes PDF, DOCX, DOC, TXT, and Markdown files
- Automatic metadata extraction from filenames
- Document chunking with overlap
- Embedding generation using sentence transformers
- Batch upload to ChromaDB with progress tracking
- Error handling and logging

## Usage
1. Place your documents in the `./documents/` folder
2. Configure the settings below
3. Run all cells to process and upload documents
4. Verify results using the testing section

## 1. Setup Environment and Dependencies

**Important:** This notebook requires several Python packages. If you get import errors, run the installation cell below first.

In [6]:
# Install required dependencies
# Run this cell if you get ModuleNotFoundError or import errors

import subprocess
import sys
import os

def install_package(package):
    """Install ``package`` with pip for the interpreter running this kernel.

    Returns True when pip exits successfully and False when pip exits with a
    non-zero status. A one-line status message is printed in both cases.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    try:
        subprocess.check_call(pip_command)
    except subprocess.CalledProcessError as pip_error:
        print(f"❌ Failed to install {package}: {pip_error}")
        return False
    else:
        print(f"✅ Successfully installed {package}")
        return True

def check_package(package_name, import_name=None):
    """Tell whether a package can be imported in the current environment.

    When ``import_name`` is not given, it is derived from ``package_name`` by
    replacing dashes with underscores. Returns True if the import succeeds and
    False if it raises ImportError.
    """
    module_name = package_name.replace("-", "_") if import_name is None else import_name
    try:
        __import__(module_name)
    except ImportError:
        return False
    return True

# Required packages as (pip distribution name, importable module name) pairs.
# The two names differ for several entries (e.g. python-docx provides "docx").
required_packages = [
    ("chromadb", "chromadb"),
    ("sentence-transformers", "sentence_transformers"),
    ("openai", "openai"),
    ("PyPDF2", "PyPDF2"),
    ("python-docx", "docx"),
    ("tqdm", "tqdm"),
    ("numpy", "numpy"),
    ("pydantic-settings", "pydantic_settings"),
    ("pydantic", "pydantic"),
    ("python-dotenv", "dotenv"),
    ("tiktoken", "tiktoken"),
    ("transformers", "transformers"),
    ("torch", "torch")
]

print("🔧 Installing required packages...")
print("This may take several minutes for first-time installation.")
print("-" * 60)

installed_count = 0
failed_count = 0

# Install only what cannot already be imported; count successes and failures.
for package_name, import_name in required_packages:
    if check_package(package_name, import_name):
        print(f"✅ {package_name} already installed")
        installed_count += 1
    else:
        print(f"📦 Installing {package_name}...")
        if install_package(package_name):
            installed_count += 1
        else:
            failed_count += 1

print("-" * 60)
print("📊 Installation Summary:")
print(f"   ✅ Installed/Available: {installed_count}")
print(f"   ❌ Failed: {failed_count}")

if failed_count == 0:
    print("✅ All dependencies are ready!")
else:
    print("⚠️  Some packages failed to install. You may encounter import errors.")

print("💡 You can now run the rest of the notebook cells.")

# pip may have just added modules while this kernel was already running.
# The import system caches its finders, so refresh them before the
# verification below; otherwise freshly installed packages can be reported
# as missing until the kernel is restarted.
import importlib
importlib.invalidate_caches()

# Verify critical imports
print("\n🔍 Verifying critical imports...")
critical_imports = [
    ("chromadb", "ChromaDB for vector storage"),
    ("sentence_transformers", "Sentence transformers for embeddings"),
    ("PyPDF2", "PDF processing"),
    ("numpy", "Numerical operations")
]

all_critical_ok = True
for import_name, description in critical_imports:
    try:
        __import__(import_name)
        print(f"✅ {import_name}: OK")
    except ImportError:
        print(f"❌ {import_name}: MISSING - {description}")
        all_critical_ok = False

if all_critical_ok:
    print("🎉 All critical dependencies verified!")
else:
    print("⚠️  Some critical dependencies are missing. Please check the errors above.")

🔧 Installing required packages...
This may take a few minutes for first-time installation.
--------------------------------------------------
📦 Installing chromadb...
Collecting chromadb
  Using cached chromadb-1.1.1-cp39-abi3-macosx_11_0_arm64.whl.metadata (7.2 kB)
Collecting build>=1.0.3 (from chromadb)
  Using cached build-1.3.0-py3-none-any.whl.metadata (5.6 kB)
Collecting pydantic>=1.9 (from chromadb)
  Using cached pydantic-2.12.0-py3-none-any.whl.metadata (83 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Using cached posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Using cached onnxruntime-1.23.1-cp313-cp313-macosx_13_0_arm64.whl.metadata (5.0 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb)
  Using cached opentelemetry_api-1.37.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Using cached opentelemetry_exporter_otlp_proto_grpc-1.37.0-py3-none-any.whl.m


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


✅ Successfully installed chromadb
📦 Installing sentence-transformers...
Collecting sentence-transformers
  Using cached sentence_transformers-5.1.1-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Using cached transformers-4.57.0-py3-none-any.whl.metadata (41 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Using cached torch-2.8.0-cp313-none-macosx_11_0_arm64.whl.metadata (30 kB)
Collecting scikit-learn (from sentence-transformers)
  Using cached scikit_learn-1.7.2-cp313-cp313-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting sentence-transformers
  Using cached sentence_transformers-5.1.1-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Using cached transformers-4.57.0-py3-none-any.whl.metadata (41 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Using cached torch-2.8.0-cp313-none-macosx_11_0_arm64.whl.metadata (30 kB)
Collecting scikit-learn (from sent


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


Collecting openai
  Using cached openai-2.2.0-py3-none-any.whl.metadata (29 kB)
Using cached openai-2.2.0-py3-none-any.whl (998 kB)
Installing collected packages: openai
Successfully installed openai-2.2.0
✅ Successfully installed openai
✅ PyPDF2 already installed
📦 Installing python-docx...
Successfully installed openai-2.2.0
✅ Successfully installed openai
✅ PyPDF2 already installed
📦 Installing python-docx...



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


Collecting python-docx
  Using cached python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Using cached python_docx-1.2.0-py3-none-any.whl (252 kB)
Installing collected packages: python-docx
Successfully installed python-docx-1.2.0



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


✅ Successfully installed python-docx
✅ tqdm already installed
✅ numpy already installed
📦 Installing pydantic-settings...
Collecting pydantic-settings
  Using cached pydantic_settings-2.11.0-py3-none-any.whl.metadata (3.4 kB)
Using cached pydantic_settings-2.11.0-py3-none-any.whl (48 kB)
Installing collected packages: pydantic-settings
Successfully installed pydantic-settings-2.11.0
✅ Successfully installed pydantic-settings
--------------------------------------------------
✅ Dependency installation complete!
💡 You can now run the rest of the notebook cells.
Collecting pydantic-settings
  Using cached pydantic_settings-2.11.0-py3-none-any.whl.metadata (3.4 kB)
Using cached pydantic_settings-2.11.0-py3-none-any.whl (48 kB)
Installing collected packages: pydantic-settings
Successfully installed pydantic-settings-2.11.0
✅ Successfully installed pydantic-settings
--------------------------------------------------
✅ Dependency installation complete!
💡 You can now run the rest of the notebo


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
# Special handling for ChromaDB and related dependencies
# Run this cell if you still get import errors after the main installation

# Imported here as well so the cell also works on a fresh kernel where the
# main installation cell has not been executed.
import subprocess
import sys

print("🔧 Installing ChromaDB and related dependencies...")

# ChromaDB often needs specific versions and additional dependencies
chromadb_packages = [
    "chromadb>=0.4.0",
    "hnswlib",
    "sentence-transformers>=2.2.0",
    "transformers>=4.30.0",
    "torch",
    "numpy<2.0.0",  # ChromaDB may have issues with numpy 2.0+
    "pydantic>=2.0.0,<3.0.0",
    "tiktoken",
    "openai>=1.0.0"
]

print("Installing ChromaDB ecosystem packages...")
for package in chromadb_packages:
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        print(f"✅ {package}")
    except subprocess.CalledProcessError as e:
        # Best effort: report the failure and continue with the next requirement.
        print(f"⚠️  {package}: {e}")

# Smoke test: import ChromaDB and create a client
print("\n🧪 Testing ChromaDB installation...")
chromadb_ok = False
try:
    import chromadb
    client = chromadb.Client()
    chromadb_ok = True
    print("✅ ChromaDB is working correctly!")
except Exception as e:
    print(f"❌ ChromaDB test failed: {e}")
    print("💡 You may need to restart your kernel and try again")

# Report success only when the smoke test actually passed.
if chromadb_ok:
    print("✅ ChromaDB setup complete!")
else:
    print("⚠️ ChromaDB setup finished with errors - see the messages above")

In [None]:
# Import standard libraries
import sys
import os
import json
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Any, Optional

# Optional helpers: record their availability in HAS_* flags instead of failing
try:
    import numpy as np
except ImportError:
    HAS_NUMPY = False
    print("⚠️ NumPy not available - some features may be limited")
else:
    HAS_NUMPY = True

try:
    from tqdm import tqdm
except ImportError:
    HAS_TQDM = False
    print("⚠️ tqdm not available - progress bars will be disabled")

    def tqdm(iterable, *args, **kwargs):
        """No-op stand-in for tqdm: returns the iterable untouched."""
        return iterable
else:
    HAS_TQDM = True

import logging

# Configure logging
logging.basicConfig(level=logging.INFO)

# Put the local "src" directory at the front of the import path.
current_dir = Path.cwd()
src_path = current_dir / "src"
src_entry = str(src_path)
# Remove copies left by earlier runs of this cell so that re-running it does
# not keep growing sys.path; the entry still ends up first, as before.
sys.path[:] = [entry for entry in sys.path if entry != src_entry]
sys.path.insert(0, src_entry)

print(f"📂 Current directory: {current_dir}")
print(f"📁 Source path added: {src_path}")

# Probe the third-party packages the pipeline cannot work without
print("🔄 Importing RAG pipeline components...")

# pip names of the critical packages that failed to import
missing_deps = []

try:
    import chromadb
except ImportError as import_error:
    print(f"❌ ChromaDB import failed: {import_error}")
    missing_deps.append("chromadb")
else:
    print("✅ ChromaDB imported successfully")

try:
    import sentence_transformers
except ImportError as import_error:
    print(f"❌ Sentence Transformers import failed: {import_error}")
    missing_deps.append("sentence-transformers")
else:
    print("✅ Sentence Transformers imported successfully")

try:
    import PyPDF2
except ImportError as import_error:
    print(f"❌ PyPDF2 import failed: {import_error}")
    missing_deps.append("PyPDF2")
else:
    print("✅ PyPDF2 imported successfully")

if not missing_deps:
    print("✅ All critical dependencies are available")
else:
    print(f"\n❌ Missing critical dependencies: {', '.join(missing_deps)}")
    print("💡 Please run the dependency installation cells above")
    print("💡 You may need to restart your kernel after installation")

    # Deliberately not raising: the user may still install what is missing
    print("\n⚠️ Attempting to continue with limited functionality...")

# Import the project's own pipeline components.
# First attempt: pull everything from the `src` package in one statement.
try:
    from src import DocumentProcessor, DocumentChunk, VectorStore, EmbeddingService, get_settings
    print("✅ All RAG components imported successfully!")
except ImportError as e:
    print(f"⚠️ Bundle import error: {e}")
    print("📝 Trying individual imports...")
    
    # Fallback: import each component from its own module so that one broken
    # module does not prevent the others from loading.
    # components_loaded maps a component name to whether its import succeeded;
    # it only exists when this fallback path runs.
    components_loaded = {}
    
    try:
        from src.document_processor import DocumentProcessor, DocumentChunk
        components_loaded['DocumentProcessor'] = True
        print("✅ DocumentProcessor imported")
    except ImportError as e2:
        print(f"❌ DocumentProcessor import failed: {e2}")
        components_loaded['DocumentProcessor'] = False
    
    try:
        from src.vector_store import VectorStore
        components_loaded['VectorStore'] = True
        print("✅ VectorStore imported")
    except ImportError as e3:
        print(f"❌ VectorStore import failed: {e3}")
        components_loaded['VectorStore'] = False
    
    try:
        from src.embeddings import EmbeddingService
        components_loaded['EmbeddingService'] = True
        print("✅ EmbeddingService imported")
    except ImportError as e4:
        print(f"❌ EmbeddingService import failed: {e4}")
        components_loaded['EmbeddingService'] = False
    
    try:
        from src.config import get_settings
        components_loaded['get_settings'] = True
        print("✅ get_settings imported")
    except ImportError as e5:
        print(f"❌ get_settings import failed: {e5}")
        components_loaded['get_settings'] = False
    
    # Summarize the fallback: the values are booleans, so sum() counts successes.
    # Names whose import failed stay undefined in the notebook namespace.
    loaded_count = sum(components_loaded.values())
    if loaded_count == len(components_loaded):
        print("✅ All individual components loaded successfully!")
    else:
        print(f"⚠️ Only {loaded_count}/{len(components_loaded)} components loaded")
        print("💡 Make sure you're running this notebook from the rag-pipeline directory")
        print("💡 Also ensure you have run the dependency installation cells above")

print("\n📦 Available components (if loaded successfully):")
for component_line in (
    "  - DocumentProcessor: Handles document chunking and metadata extraction",
    "  - DocumentChunk: Container for processed document segments",
    "  - VectorStore: Manages ChromaDB storage and similarity search",
    "  - EmbeddingService: Generates embeddings using sentence transformers",
    "  - get_settings: Loads configuration from .env file",
):
    print(component_line)

# Last pass over the key packages: module name -> what it is needed for
print("\n🔍 Final dependency verification:")
final_check = {
    "chromadb": "Vector database storage",
    "sentence_transformers": "Text embeddings",
    "PyPDF2": "PDF file processing",
    "openai": "OpenAI API (optional)",
    "numpy": "Numerical operations"
}

all_good = True
for package, description in final_check.items():
    try:
        __import__(package)
    except ImportError:
        all_good = False
        print(f"❌ {package}: Missing - {description}")
    else:
        print(f"✅ {package}: Available")

if not all_good:
    print("\n⚠️ Some dependencies are missing.")
    print("💡 Try running cells 2-3 to install missing packages")
    print("💡 You may need to restart your kernel after installation")
else:
    print("\n🚀 All systems ready! You can proceed to the next cells.")

📂 Current directory: /Users/zsolt/Downloads/edu_platform/rag-pipeline
📁 Source path added: /Users/zsolt/Downloads/edu_platform/rag-pipeline/src
🔄 Importing RAG pipeline components...
❌ ChromaDB import failed: No module named 'chromadb'
💡 Please run the dependency installation cell above


ModuleNotFoundError: No module named 'chromadb'

## 2. Configuration and Settings

In [None]:
# Load configuration through the project's get_settings() helper.
# Any failure (including get_settings not having been imported) switches to the
# hard-coded defaults below so the rest of the notebook can still run.
try:
    settings = get_settings()
    print("✅ Configuration loaded successfully!")
except Exception as e:
    print(f"⚠️ Warning: Could not load .env file. Using default settings. Error: {e}")
    
    # Fallback configuration
    class Settings:
        """Default settings used when the project configuration cannot be loaded."""

        def __init__(self):
            self.openai_api_key = os.getenv("OPENAI_API_KEY", "")
            self.vector_db_path = "../backend/chroma_db"
            self.collection_name = "school_knowledge"
            self.local_embedding_model = "all-MiniLM-L6-v2"
            self.use_openai_embeddings = False
            self.chunk_size = 1000
            self.chunk_overlap = 200
            self.min_chunk_size = 100
            self.batch_size = 32
            self.documents_dir = "./documents"
    
    settings = Settings()

# Display configuration
# NOTE(review): assumes the object returned by get_settings() exposes the same
# attribute names as the fallback Settings class - confirm against src.config.
print("\n📋 Current Configuration:")
print(f"🗂️ Documents directory: {settings.documents_dir}")
print(f"🗃️ Vector DB path: {settings.vector_db_path}")
print(f"📚 Collection name: {settings.collection_name}")
print(f"🤖 Embedding model: {settings.local_embedding_model}")
print(f"🔢 Chunk size: {settings.chunk_size}")
print(f"📊 Batch size: {settings.batch_size}")
# Only reveal whether a key is set, never the key itself.
print(f"🔑 OpenAI API key configured: {'Yes' if settings.openai_api_key else 'No'}")

# Create documents directory if it doesn't exist.
# parents=True lets a nested documents_dir (e.g. "data/raw/documents") be
# created even when its parent folders are missing.
documents_path = Path(settings.documents_dir)
documents_path.mkdir(parents=True, exist_ok=True)
print(f"📁 Documents directory ready: {documents_path.absolute()}")

## 3. Initialize Components

In [None]:
# Document processor, configured with the chunking parameters from settings
print("🔄 Initializing document processor with chapter-based splitting...")
doc_processor = DocumentProcessor(
    chunk_size=settings.chunk_size,
    chunk_overlap=settings.chunk_overlap,
    min_chunk_size=settings.min_chunk_size
)
print("✅ Document processor initialized with chapter detection")

# Embedding service; a failure is reported but does not stop the cell
print("🔄 Initializing embedding service...")
try:
    embedding_service = EmbeddingService(
        openai_api_key=getattr(settings, 'openai_api_key', None),
        model_name=settings.local_embedding_model
    )
except Exception as init_error:
    print(f"⚠️ Embedding service initialization warning: {init_error}")
    print("💡 Some embedding features may be limited")
else:
    print("✅ Embedding service initialized")

# Embedding function handed to ChromaDB (None when it cannot be built)
print("🔄 Creating embedding function for ChromaDB...")
try:
    from chromadb.utils import embedding_functions

    # Sentence-transformers embedding function using the configured local model
    sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name=settings.local_embedding_model
    )
except Exception as ef_error:
    print(f"⚠️ ChromaDB embedding function creation warning: {ef_error}")
    sentence_transformer_ef = None
else:
    print("✅ ChromaDB embedding function created")

# Vector store on the configured ChromaDB path, created WITH the embedding function
print("🔄 Connecting to ChromaDB...")
try:
    vector_store = VectorStore(
        collection_name=settings.collection_name,
        persist_directory=settings.vector_db_path,
        embedding_function=sentence_transformer_ef  # None if creation failed above
    )
    print("✅ Connected to ChromaDB successfully")
    print(f"📊 Current document count: {vector_store.count_documents()}")
except Exception as store_error:
    # Reported only; the cell intentionally does not re-raise
    print(f"❌ Error connecting to ChromaDB: {store_error}")
    print("💡 This may be normal if this is your first run")
    print("💡 The vector store will be created when you upload documents")

## 4. Document Discovery and Loading

In [None]:
def discover_documents(documents_dir: str) -> List[Path]:
    """Discover all supported document files directly inside a directory.

    Args:
        documents_dir: Directory to scan. Sub-directories are not searched.

    Returns:
        Sorted list of paths to regular files whose extension, compared
        case-insensitively, is one of the supported ones. Returns an empty
        list when the directory does not exist.

    A "Found N <ext> files" line is printed per extension that had matches,
    in alphabetical order of the extension.
    """
    supported_extensions = {'.pdf', '.docx', '.doc', '.txt', '.md', '.markdown'}
    documents_path = Path(documents_dir)

    # A missing directory simply yields no documents, as before.
    if not documents_path.is_dir():
        return []

    documents = []
    found_per_extension = {}
    for candidate in documents_path.iterdir():
        # Lower-casing the suffix lets "REPORT.PDF" or "Notes.Docx" match too.
        extension = candidate.suffix.lower()
        # is_file() keeps out directories that merely have a matching name.
        if extension in supported_extensions and candidate.is_file():
            documents.append(candidate)
            found_per_extension[extension] = found_per_extension.get(extension, 0) + 1

    # Sorted so the report does not depend on set/directory iteration order.
    for extension in sorted(found_per_extension):
        print(f"📄 Found {found_per_extension[extension]} {extension} files")

    return sorted(documents)

# Scan the configured documents folder and list what was found
print("🔍 Discovering documents...")
document_files = discover_documents(settings.documents_dir)

if document_files:
    print(f"✅ Found {len(document_files)} documents to process:")
    for doc in document_files:
        # File size converted from bytes to megabytes for display
        size_mb = doc.stat().st_size / (1024 * 1024)
        print(f"  📄 {doc.name} ({size_mb:.2f} MB)")
else:
    print(f"⚠️ No documents found in {settings.documents_dir}")
    print("📝 Supported formats: PDF, DOCX, DOC, TXT, MD")
    print("💡 Please add some documents to the documents folder and re-run this cell")

## 5. Document Processing and Chunking

### 📖 Enhanced Chapter-Based Processing

This notebook now includes **chapter-based document splitting** that:
- **Detects Hungarian textbook chapters** automatically
- **Preserves educational coherence** by keeping complete concepts together
- **Enhances metadata** with chapter titles, topics, and educational context
- **Improves AI responses** in both ai-chat and ai-tutor

The processing will automatically detect if your documents have chapter structure and use the enhanced splitting method.

In [None]:
from typing import Set
import time

def process_document_with_metadata(file_path: Path) -> List[DocumentChunk]:
    """Turn one document file into chunks tagged with filename-derived metadata.

    The text is read once through ``doc_processor``, size statistics are added
    to the metadata passed to the chunker, and a progress report is printed
    along the way. Every failure is reported on stdout and results in ``[]``
    rather than an exception.
    """
    started_at = time.time()

    try:
        print(f"\n🔄 Processing: {file_path.name}")
        print(f"📦 File size: {file_path.stat().st_size / (1024 * 1024):.1f} MB")

        # Metadata derived from the filename
        print("📋 Extracting metadata from filename...")
        filename_metadata = doc_processor.extract_metadata_from_filename(file_path.name)
        print(f" Filename metadata: {filename_metadata}")

        # Source info first, then the filename-derived fields
        source_metadata = {
            'processed_at': datetime.now().isoformat(),
            'file_path': str(file_path),
            **filename_metadata
        }

        # Read the text a single time; it is reused for statistics and chunking
        print(f"📖 Reading document: {file_path.name}...")
        extraction_started = time.time()
        try:
            full_text = doc_processor._extract_text(file_path)
            print(f"⏱️ Text extraction took: {time.time() - extraction_started:.1f} seconds")
        except Exception as extract_error:
            print(f"❌ Text extraction failed: {extract_error}")
            return []

        if not full_text or not full_text.strip():
            print(f"⚠️ WARNING: No text extracted from {file_path.name}")
            return []

        # Whole-document statistics
        total_chars = len(full_text)
        total_words = len(full_text.split())
        total_lines = full_text.count('\n') + 1
        estimated_pages = total_words / 250  # rule of thumb: ~250 words per page
        text_size_kb = total_chars / 1024

        print(f"📊 Document analysis: {text_size_kb:.1f} KB of text, {total_words:,} words")

        source_metadata.update(
            original_char_count=total_chars,
            original_word_count=total_words,
            original_line_count=total_lines,
            estimated_pages=round(estimated_pages, 1),
            text_size_kb=round(text_size_kb, 2),
        )

        # Chunk the text already in memory instead of re-reading the file
        print(f"✂️ Chunking document...")
        chunking_started = time.time()
        try:
            chunks = doc_processor.process_text(full_text, source_metadata)
            print(f"⏱️ Chunking took: {time.time() - chunking_started:.1f} seconds")
        except Exception as chunk_error:
            print(f"❌ Chunking failed: {chunk_error}")
            return []

        if not chunks:
            print(f"❌ No chunks created for: {file_path.name}")
            print("   Check if the document has sufficient readable text content")
        else:
            # Per-chunk size statistics
            chunk_lengths = [len(chunk.content) for chunk in chunks]
            chunk_chars = sum(chunk_lengths)
            chunk_words = sum(len(chunk.content.split()) for chunk in chunks)
            avg_chunk_size = chunk_chars / len(chunks)

            print(f"✅ CHUNKING RESULTS:")
            print(f"   🔢 Chunks created: {len(chunks)}")
            print(f"   📏 Average chunk size: {avg_chunk_size:.0f} chars")
            print(f"   📏 Chunk size range: {min(chunk_lengths)} - {max(chunk_lengths)} chars")

            # Spot-check the first chunk to confirm metadata was attached
            print(f"🔍 Metadata verification:")
            sample_chunk = chunks[0]
            if sample_chunk and sample_chunk.metadata:
                sample_meta = sample_chunk.metadata
                print(f"   📚 Subject: {sample_meta.get('subject', 'MISSING')}")
                print(f"   🎓 Grade: {sample_meta.get('grade', 'MISSING')}")
                print(f"   📄 Filename: {sample_meta.get('filename', 'MISSING')}")
                print(f"   🗂️ Total metadata fields: {len(sample_meta)}")
            else:
                print(f"   ⚠️ WARNING: No metadata found in chunks!")

            # Share of the source text present in the chunks
            # (full_text is known to be non-empty at this point)
            char_retention = (chunk_chars / total_chars) * 100
            word_retention = (chunk_words / total_words) * 100 if total_words > 0 else 0
            print(f"   📈 Text retention: {char_retention:.1f}% chars, {word_retention:.1f}% words")

            elapsed = time.time() - started_at
            print(f"✅ Successfully processed: {file_path.name} in {elapsed:.1f} seconds")

        # Drop the local reference to the full text before returning
        del full_text

        return chunks

    except Exception as e:
        print(f"❌ ERROR processing {file_path.name}: {str(e)}")
        print(f"   Error type: {type(e).__name__}")
        import traceback
        traceback.print_exc()
        return []

# Check if we have documents to process
print("🔍 Checking available documents...")
if 'document_files' not in locals() or not document_files:
    print("❌ No document_files variable found. Please run the document discovery cell first.")
else:
    print(f"✅ Found {len(document_files)} documents to process")

# Process every discovered document, collecting chunks and per-document stats
if 'document_files' in locals() and document_files:
    print("⚙️ Processing ALL documents with optimized performance...")
    print("=" * 80)
    
    all_chunks = []          # chunks from every document, in processing order
    processing_summary = []  # one stats dict per document that produced chunks
    overall_start = time.time()
    
    # Sort files by size - process smaller files first for quick feedback
    sorted_files = sorted(document_files, key=lambda f: f.stat().st_size)
    print(f"📋 Processing order (by size):")
    for i, doc_file in enumerate(sorted_files):
        file_size_mb = doc_file.stat().st_size / (1024 * 1024)
        print(f"  {i+1}. {doc_file.name} ({file_size_mb:.1f} MB)")
    
    for i, doc_file in enumerate(sorted_files, 1):
        file_size_mb = doc_file.stat().st_size / (1024 * 1024)
        print(f"\n📄 DOCUMENT {i}/{len(sorted_files)} - {doc_file.name} ({file_size_mb:.1f} MB)")
        print(f"🕐 Started at: {datetime.now().strftime('%H:%M:%S')}")
        
        doc_start = time.time()
        # Reset per document so the slow-document check below never reads an
        # undefined value or the previous document's timing.
        doc_time = 0.0
        
        try:
            chunks = process_document_with_metadata(doc_file)
            doc_time = time.time() - doc_start
            
            if chunks:
                all_chunks.extend(chunks)
                
                # VERIFY METADATA PROPAGATION
                first_chunk = chunks[0]
                print(f"📋 Metadata check: Subject='{first_chunk.metadata.get('subject')}', Grade='{first_chunk.metadata.get('grade')}', File='{first_chunk.metadata.get('filename')}'")
                
                # Store individual document stats for summary; document-level
                # figures are read back from the first chunk's metadata
                doc_stats = {
                    'filename': doc_file.name,
                    'file_size_mb': file_size_mb,
                    'processing_time': doc_time,
                    'original_words': chunks[0].metadata.get('original_word_count', 0),
                    'original_chars': chunks[0].metadata.get('original_char_count', 0),
                    'original_lines': chunks[0].metadata.get('original_line_count', 0),
                    'estimated_pages': chunks[0].metadata.get('estimated_pages', 0),
                    'chunks_created': len(chunks),
                    'processed_words': sum(len(chunk.content.split()) for chunk in chunks),
                    'subject': chunks[0].metadata.get('subject', 'Unknown'),
                    'grade': chunks[0].metadata.get('grade', 'Unknown')
                }
                processing_summary.append(doc_stats)
                
                print(f"⏱️ Document completed in {doc_time:.1f}s - Running total: {len(all_chunks):,} chunks")
            else:
                print(f"⚠️ No chunks generated from {doc_file.name}")
                
        except Exception as doc_error:
            # Record how long the failed attempt took before reporting it
            doc_time = time.time() - doc_start
            print(f"❌ Failed to process {doc_file.name}: {doc_error}")
            
        print("-" * 60)  # Separator between documents
        
        # Warning only (the loop continues): a single document taking more than
        # 5 minutes suggests something is wrong
        if doc_time > 300:  # 5 minutes
            print(f"⚠️ WARNING: Document processing took {doc_time:.1f}s (>5min). This may indicate an issue.")
    
    total_time = time.time() - overall_start
    
    print("=" * 80)
    print("📊 DOCUMENT STATISTICS SUMMARY")
    print("=" * 80)
    
    # Show detailed individual document statistics
    if processing_summary:
        print(f"{'Document':<35} {'Size(MB)':<8} {'Time(s)':<7} {'Chunks':<7} {'Subject':<12}")
        print("-" * 85)
        
        for stats in processing_summary:
            # Shorten long filenames so the table columns stay aligned
            filename_short = stats['filename'][:32] + "..." if len(stats['filename']) > 35 else stats['filename']
            
            print(f"{filename_short:<35} "
                  f"{stats['file_size_mb']:<8.1f} "
                  f"{stats['processing_time']:<7.1f} "
                  f"{stats['chunks_created']:<7} "
                  f"{stats['subject']:<12}")
    
    print("=" * 80)
    print(f"📊 COMPLETE PROCESSING SUMMARY:")
    # Count only documents that produced chunks; failed or empty documents
    # used to be included in this figure.
    print(f"   📚 Documents processed: {len(processing_summary)} of {len(document_files)}")
    print(f"   🔢 Total chunks created: {len(all_chunks):,}")
    print(f"   ⏱️ Total processing time: {total_time:.1f} seconds")
    if len(all_chunks) > 0 and total_time > 0:
        print(f"   📈 Average speed: {len(all_chunks)/total_time:.1f} chunks/second")
    
    if processing_summary:
        total_words = sum(s['original_words'] for s in processing_summary)
        total_pages = sum(s['estimated_pages'] for s in processing_summary)
        unique_subjects = set(s['subject'] for s in processing_summary)
        unique_grades = set(s['grade'] for s in processing_summary)
        
        print(f"   🔤 Total words across all docs: {total_words:,}")
        print(f"   📖 Total estimated pages: {total_pages:.1f}")
        print(f"   📚 Unique subjects: {len(unique_subjects)} - {sorted(unique_subjects)}")
        print(f"   🎓 Unique grades: {len(unique_grades)} - {sorted(unique_grades)}")
    
    # FINAL METADATA VERIFICATION
    if all_chunks:
        print(f"\n🔍 FINAL METADATA VERIFICATION:")
        subjects_found = set()
        grades_found = set()
        files_found = set()

        for chunk in all_chunks:  # Check all chunks
            if chunk.metadata:
                if chunk.metadata.get('subject'):
                    subjects_found.add(chunk.metadata.get('subject'))
                if chunk.metadata.get('grade'):
                    grades_found.add(chunk.metadata.get('grade'))
                if chunk.metadata.get('filename'):
                    files_found.add(chunk.metadata.get('filename'))
        
        print(f"   📚 Subjects in chunks: {sorted(subjects_found)}")
        print(f"   🎓 Grades in chunks: {sorted(grades_found)}")
        print(f"   📄 Files in chunks: {len(files_found)} files")
        
        if not subjects_found:
            print(f"   ⚠️ WARNING: No subjects found in chunk metadata!")
        if not grades_found:
            print(f"   ⚠️ WARNING: No grades found in chunk metadata!")
    
    print("=" * 80)
    print(f"\n🎯 Ready to generate embeddings for {len(all_chunks):,} chunks from ALL documents...")

else:
    print("⏭️ No documents found to process")
    print("💡 Place PDF, DOCX, TXT, or MD files in the './documents/' folder")
    
    # Set empty variables so later cells can still test them
    all_chunks = []
    processing_summary = []

## 6. Embedding Generation

In [None]:
async def generate_embeddings_for_chunks(chunks: List[DocumentChunk]) -> List[np.ndarray]:
    """Embed the text content of every chunk with the shared embedding service.

    Args:
        chunks: Document chunks; the ``content`` attribute of each is embedded.

    Returns:
        The result of ``embedding_service.embed_documents`` for the chunk
        texts, or an empty list when ``chunks`` is empty or anything inside
        the embedding step raises (the error is printed, not re-raised).
    """
    if not chunks:
        print("⚠️ No chunks provided for embedding generation")
        return []

    # Only the raw text of each chunk is handed to the embedding service
    texts = [chunk.content for chunk in chunks]
    text_count = len(texts)

    print(f"🧠 Generating embeddings for {text_count} chunks...")
    print(f"📊 Estimated processing time: ~{text_count * 0.05:.1f} seconds")

    # OpenAI embeddings are opt-in: a missing setting counts as "off"
    use_openai = getattr(settings, 'use_openai_embeddings', False)

    try:
        embeddings = await embedding_service.embed_documents(
            texts,
            use_openai=use_openai,
            batch_size=settings.batch_size,
        )

        print(f"✅ Successfully generated {len(embeddings)} embeddings")

        if embeddings:
            embedding_dim = len(embeddings[0])
            # Rough estimate that assumes 4 bytes per embedding value
            estimated_mb = len(embeddings) * embedding_dim * 4 / 1024 / 1024
            print(f"📏 Embedding dimension: {embedding_dim}")
            print(f"💾 Memory usage: ~{estimated_mb:.1f} MB")

        return embeddings

    except Exception as e:
        # Best effort: report the failure and let the caller see "no embeddings"
        print(f"❌ Error generating embeddings: {e}")
        return []

# Generate embeddings for ALL processed chunks
if 'all_chunks' in locals() and all_chunks:
    print("🔄 Starting embedding generation for ALL documents...")
    embeddings = await generate_embeddings_for_chunks(all_chunks)
    
    if embeddings and len(embeddings) == len(all_chunks):
        print(f"✅ Embedding generation successful!")
        print(f"📊 Ready to upload {len(embeddings)} embeddings to ChromaDB")
    elif embeddings:
        print(f"⚠️ Partial success: {len(embeddings)} embeddings for {len(all_chunks)} chunks")
    else:
        print("❌ No embeddings generated")
        
else:
    print("⏭️ Skipping embedding generation - no chunks available")
    print("💡 Make sure to run the document processing cell first")
    embeddings = []

## 7. Batch Upload to ChromaDB

In [None]:
def _collect_metadata_values(metadatas):
    """Gather the distinct non-empty subject, grade and filename values.

    Args:
        metadatas: Iterable of metadata dicts; falsy entries (``None``, ``{}``)
            are skipped.

    Returns:
        Tuple ``(subjects, grades, filenames)`` of sets.
    """
    subjects, grades, filenames = set(), set(), set()
    buckets = (('subject', subjects), ('grade', grades), ('filename', filenames))

    for metadata in metadatas:
        if not metadata:
            continue
        for key, bucket in buckets:
            value = metadata.get(key)
            if value:
                bucket.add(value)

    return subjects, grades, filenames


def upload_to_chromadb(chunks: List[DocumentChunk], embeddings: List[np.ndarray]):
    """Upload chunks and their embeddings to ChromaDB in batches.

    Prints the metadata found before the upload, uploads in batches of
    ``settings.batch_size`` (a failing batch is reported and skipped), then
    prints before/after document counts and a metadata check of the stored data.

    Args:
        chunks: Chunks to store; ``content``, ``metadata`` and ``chunk_id`` of
            each chunk are uploaded.
        embeddings: One embedding per chunk, in the same order as ``chunks``.

    Returns:
        True only when every batch was stored. False when there is nothing to
        upload, when the two inputs differ in length, or when at least one
        batch failed (so a partial upload is not reported as a success).
    """
    if not chunks or not embeddings:
        print("⚠️ No data to upload")
        return False

    if len(chunks) != len(embeddings):
        print(f"❌ Mismatch: {len(chunks)} chunks vs {len(embeddings)} embeddings")
        return False

    print(f"📤 Uploading {len(chunks)} chunks to ChromaDB...")
    print(f"🎯 Target database: {settings.vector_db_path}")
    print(f"📚 Collection: {settings.collection_name}")

    # Show initial database state
    initial_count = vector_store.count_documents()
    print(f"📊 Initial document count: {initial_count}")

    # VERIFY METADATA BEFORE UPLOAD
    print(f"\n🔍 PRE-UPLOAD METADATA VERIFICATION:")
    subjects_to_upload, grades_to_upload, files_to_upload = _collect_metadata_values(
        chunk.metadata for chunk in chunks
    )

    print(f"   📚 Subjects to upload: {sorted(subjects_to_upload)}")
    print(f"   🎓 Grades to upload: {sorted(grades_to_upload)}")
    print(f"   📄 Files to upload: {len(files_to_upload)} files")

    if not subjects_to_upload:
        print(f"   ⚠️ WARNING: No subjects found in chunk metadata before upload!")
        print(f"   🔍 Sample chunk metadata: {chunks[0].metadata if chunks[0].metadata else 'EMPTY'}")

    # Prepare data for ChromaDB
    documents = [chunk.content for chunk in chunks]
    metadatas = [chunk.metadata for chunk in chunks]
    ids = [chunk.chunk_id for chunk in chunks]

    # VERIFY METADATA STRUCTURE
    print(f"\n🔍 METADATA STRUCTURE VERIFICATION:")
    sample_metadata = metadatas[0] if metadatas else {}
    print(f"   📋 Sample metadata keys: {list(sample_metadata.keys()) if sample_metadata else 'NO KEYS'}")
    print(f"   📚 Sample subject: {sample_metadata.get('subject', 'MISSING') if sample_metadata else 'NO METADATA'}")
    print(f"   🎓 Sample grade: {sample_metadata.get('grade', 'MISSING') if sample_metadata else 'NO METADATA'}")
    print(f"   📄 Sample filename: {sample_metadata.get('filename', 'MISSING') if sample_metadata else 'NO METADATA'}")

    # Upload in batches with progress tracking
    batch_size = settings.batch_size
    total_uploaded = 0
    failed_batches = 0

    print(f"\n🔄 Uploading in batches of {batch_size}...")

    for i in tqdm(range(0, len(chunks), batch_size), desc="Uploading batches"):
        end_idx = min(i + batch_size, len(chunks))

        batch_documents = documents[i:end_idx]
        batch_metadatas = metadatas[i:end_idx]
        batch_embeddings = embeddings[i:end_idx]
        batch_ids = ids[i:end_idx]

        try:
            vector_store.add_documents(
                documents=batch_documents,
                metadatas=batch_metadatas,
                embeddings=batch_embeddings,
                ids=batch_ids
            )
            total_uploaded += len(batch_documents)

            # Progress update every 10 batches
            if (i // batch_size + 1) % 10 == 0:
                print(f"  📈 Progress: {total_uploaded}/{len(chunks)} chunks uploaded")

        except Exception as e:
            # Keep going so one bad batch does not abort the whole upload;
            # the failure is counted and reflected in the return value.
            print(f"❌ Error uploading batch {i//batch_size + 1}: {e}")
            failed_batches += 1
            continue

    # Final results
    final_count = vector_store.count_documents()

    if failed_batches:
        print(f"\n⚠️ Upload finished with {failed_batches} failed batch(es)!")
    else:
        print(f"\n✅ Upload complete!")
    print(f"📊 Results:")
    print(f"  • Successfully uploaded: {total_uploaded} chunks")
    print(f"  • Failed batches: {failed_batches}")
    print(f"  • Database before: {initial_count} documents")
    print(f"  • Database after: {final_count} documents")
    print(f"  • Net increase: {final_count - initial_count} documents")

    # POST-UPLOAD VERIFICATION - Sample more strategically
    print(f"\n🔍 POST-UPLOAD METADATA VERIFICATION:")
    try:
        # Get a larger sample to ensure we get documents from all subjects
        sample_docs = vector_store.get_documents_by_metadata({}, limit=100)

        if sample_docs and sample_docs.get('metadatas'):
            print(f"   🔍 Checking {len(sample_docs.get('metadatas', []))} documents for metadata diversity...")

            uploaded_subjects, uploaded_grades, uploaded_files = _collect_metadata_values(
                sample_docs['metadatas']
            )

            print(f"   📚 Subjects in database: {sorted(uploaded_subjects)}")
            print(f"   🎓 Grades in database: {sorted(uploaded_grades)}")
            print(f"   📄 Files in database: {len(uploaded_files)} files")

            # Also check with specific subject queries to be sure
            print(f"   📋 Verifying each expected subject exists:")
            for expected_subject in sorted(subjects_to_upload):
                subject_docs = vector_store.get_documents_by_metadata({'subject': expected_subject}, limit=1)
                if subject_docs and subject_docs.get('metadatas') and len(subject_docs['metadatas']) > 0:
                    print(f"      ✅ {expected_subject}: Found")
                else:
                    print(f"      ❌ {expected_subject}: Missing")

            # The 100-document sample may not contain every subject, so a
            # difference here is only a hint; the per-subject check above decides.
            if uploaded_subjects == subjects_to_upload:
                print(f"   ✅ Subject metadata preserved correctly!")
            else:
                print(f"   ⚠️ Subject metadata sampling issue detected!")
                print(f"      Expected: {sorted(subjects_to_upload)}")
                print(f"      Found in sample: {sorted(uploaded_subjects)}")
                print(f"   💡 This may be due to document clustering - checking individual subjects above")
        else:
            print(f"   ❌ Could not retrieve documents for verification")

    except Exception as e:
        print(f"   ❌ Error during post-upload verification: {e}")

    # Success means every chunk was stored, not merely "at least one batch worked"
    return failed_batches == 0 and total_uploaded == len(chunks)

# Push everything produced by the processing and embedding cells to ChromaDB.
# Both `all_chunks` and `embeddings` must exist and be non-empty; otherwise
# explain which prerequisite is missing instead of uploading.
if 'all_chunks' not in locals() or 'embeddings' not in locals() or not all_chunks or not embeddings:
    print("⏭️ Skipping upload - missing chunks or embeddings")
    print("💡 Make sure to run both the processing and embedding generation cells first")

    if 'all_chunks' not in locals() or not all_chunks:
        print("  ❌ No chunks available (run document processing cell)")
    if 'embeddings' not in locals() or not embeddings:
        print("  ❌ No embeddings available (run embedding generation cell)")

else:
    print("🚀 Starting complete upload to ChromaDB...")
    print("=" * 60)

    success = upload_to_chromadb(all_chunks, embeddings)

    # Closing separator is printed for both outcomes
    print("=" * 60)
    if success:
        print("🎉 UPLOAD SUCCESSFUL!")
        print("✅ All documents have been processed and uploaded to ChromaDB")
        print("🎯 Your knowledge base is now ready for use")
        print("💡 You can now test the search functionality or use the backend API")
    else:
        print("❌ Upload failed or incomplete")
        print("💡 Check the error messages above and try again")

In [None]:
# Quick verification of all subjects in the database
print("🔍 MANUAL SUBJECT VERIFICATION:")
print("=" * 50)

# Subjects this knowledge base is expected to contain (hard-coded for this dataset)
expected_subjects = ['foldrajz', 'irodalom', 'matematika', 'nyelvtan', 'tortenelem']

for subject in expected_subjects:
    try:
        # Existence probe only: limit=5 keeps it cheap, so the count printed
        # below is capped at 5 and is not the real number of documents.
        subject_docs = vector_store.get_documents_by_metadata({'subject': subject}, limit=5)
        if subject_docs and subject_docs.get('metadatas') and len(subject_docs['metadatas']) > 0:
            count = len(subject_docs['metadatas'])
            sample_filename = subject_docs['metadatas'][0].get('filename', 'Unknown')
            print(f"✅ {subject}: Found {count} documents (sample file: {sample_filename})")
        else:
            print(f"❌ {subject}: No documents found")
    except Exception as e:
        print(f"❌ {subject}: Error checking - {e}")

print("\n" + "=" * 50)

# Also check total counts by getting all documents and counting subjects
print("📊 COMPREHENSIVE DATABASE ANALYSIS:")
try:
    # No `limit`: fetch the metadata of every stored chunk so the distribution
    # stays complete as the collection grows (a fixed limit would silently
    # truncate the analysis).
    all_sample_docs = vector_store.collection.get(include=["metadatas"])

    if all_sample_docs and all_sample_docs.get('metadatas'):
        all_subjects = set()
        all_files = set()
        subject_counts = {}

        for metadata in all_sample_docs['metadatas']:
            if metadata and metadata.get('subject'):
                subject = metadata.get('subject')
                all_subjects.add(subject)
                subject_counts[subject] = subject_counts.get(subject, 0) + 1

            if metadata and metadata.get('filename'):
                all_files.add(metadata.get('filename'))

        print(f"📚 All subjects in database: {sorted(all_subjects)}")
        print(f"📄 All files in database: {len(all_files)} files")
        print(f"📊 Subject distribution:")
        for subject, count in sorted(subject_counts.items()):
            print(f"   • {subject}: {count:,} chunks")

        total_chunks_with_subjects = sum(subject_counts.values())
        print(f"📈 Total chunks with subject metadata: {total_chunks_with_subjects:,} / {len(all_sample_docs['metadatas']):,}")

    else:
        print("❌ Could not retrieve documents for analysis")

except Exception as e:
    print(f"❌ Error during comprehensive analysis: {e}")

print("=" * 50)

## 8. Utility Functions

Additional utility functions for managing the knowledge base.

In [None]:
def reset_knowledge_base():
    """Interactively wipe the ChromaDB collection. Use with caution!

    Asks for confirmation on stdin; any answer other than the exact text
    ``CONFIRM`` cancels. Errors raised while resetting or re-counting are
    printed rather than propagated. Always returns ``None``.
    """
    answer = input("⚠️ This will delete ALL documents from ChromaDB. Type 'CONFIRM' to proceed: ")

    # Guard clause: bail out unless the user typed the confirmation verbatim
    if answer != "CONFIRM":
        print("❌ Reset cancelled")
        return

    try:
        vector_store.reset_collection()
        print("✅ Knowledge base reset successfully")
        remaining = vector_store.count_documents()
        print(f"📊 New document count: {remaining}")
    except Exception as e:
        print(f"❌ Error resetting knowledge base: {e}")

def export_processing_report():
    """Write a JSON report of the current configuration and pipeline results.

    The report is saved in the current working directory as
    ``processing_report_<YYYYmmdd_HHMMSS>.json``.

    Returns:
        The report dict, with the keys ``timestamp``, ``configuration`` and
        ``processing_results``.
    """

    def _count_global(name):
        # The pipeline variables live at notebook (module) level, so they must
        # be looked up in globals(); locals() inside this function never
        # contains them and would always report 0.
        value = globals().get(name)
        return len(value) if value is not None else 0

    # One timestamp for both the report body and the file name
    now = datetime.now()

    report = {
        "timestamp": now.isoformat(),
        "configuration": {
            "documents_dir": settings.documents_dir,
            "vector_db_path": settings.vector_db_path,
            "collection_name": settings.collection_name,
            "chunk_size": settings.chunk_size,
            "batch_size": settings.batch_size
        },
        "processing_results": {
            "documents_found": _count_global('document_files'),
            "chunks_created": _count_global('all_chunks'),
            "embeddings_generated": _count_global('embeddings'),
            "final_db_count": vector_store.count_documents()
        }
    }

    report_file = f"processing_report_{now.strftime('%Y%m%d_%H%M%S')}.json"

    with open(report_file, 'w', encoding='utf-8') as f:
        # default=str keeps values that are not JSON-native (e.g. path objects)
        # from aborting the export
        json.dump(report, f, indent=2, default=str)

    print(f"📊 Processing report exported to: {report_file}")
    return report

# Utility calls: the destructive reset stays commented out (uncomment to use it);
# the report export below runs with the cell.

# Reset knowledge base (CAUTION!)
#reset_knowledge_base()

# Export processing report and echo its headline numbers
report = export_processing_report()
print(
    f"\n📋 Processing Summary:",
    f"  Documents processed: {report['processing_results']['documents_found']}",
    f"  Chunks created: {report['processing_results']['chunks_created']}",
    f"  Total documents in DB: {report['processing_results']['final_db_count']}",
    sep="\n",
)

In [None]:
# Quick diagnostic - let's check what's actually in the database by subject
print("🔍 Checking database contents by subject...")

# Get a larger sample to see all subjects
collection = vector_store.collection
sample = collection.get(limit=1000)  # Get more documents

if sample and 'metadatas' in sample:
    subject_counts = {}
    file_counts = {}
    
    for metadata in sample['metadatas']:
        subject = metadata.get('subject', 'unknown')
        filename = metadata.get('filename', 'unknown')
        
        subject_counts[subject] = subject_counts.get(subject, 0) + 1
        file_counts[filename] = file_counts.get(filename, 0) + 1
    
    print(f"\n📊 Subject distribution in database:")
    for subject, count in sorted(subject_counts.items()):
        print(f"  📚 {subject}: {count} chunks")
    
    print(f"\n📄 File distribution in database:")
    for filename, count in sorted(file_counts.items()):
        print(f"  📄 {filename}: {count} chunks")
        
    print(f"\n📈 Total chunks in database: {sum(subject_counts.values())}")
    
    # Test retrieval with OpenAI embeddings
    print(f"\n🔍 Testing Hungarian query with OpenAI embeddings...")
    try:
        # Test with the embedding service using OpenAI
        embedding_service = EmbeddingService(
            openai_api_key=settings.openai_api_key,
            model_name=settings.local_embedding_model
        )
        
        # Test query embedding using OpenAI
        test_query = "mi a matematika?"
        print(f"🔄 Generating OpenAI embedding for query: '{test_query}'")
        
        # Use async embedding with OpenAI
        import asyncio
        
        async def test_openai_embedding():
            query_embedding = await embedding_service.embed_text(test_query, use_openai=True)
            return query_embedding
        
        # Run the async function
        query_embedding = await test_openai_embedding()
        print(f"✅ OpenAI query embedding generated: shape {query_embedding.shape}")
        
        # Search directly in vector store
        results = collection.query(
            query_embeddings=[query_embedding.tolist()],
            n_results=3,
            include=['metadatas', 'documents', 'distances']
        )
        
        if results and results['documents']:
            print(f"🔍 Found {len(results['documents'][0])} results for 'mi a matematika?' using OpenAI embeddings:")
            for i, (doc, metadata, distance) in enumerate(zip(
                results['documents'][0], 
                results['metadatas'][0], 
                results['distances'][0]
            )):
                subject = metadata.get('subject', 'unknown')
                filename = metadata.get('filename', 'unknown')
                print(f"  {i+1}. {subject} ({filename}) - distance: {distance:.3f}")
                print(f"     📝 {doc[:80]}...")
        
    except Exception as e:
        print(f"❌ Error during OpenAI embedding retrieval test: {e}")
        print(f"💡 Make sure your OpenAI API key is configured correctly")
        
        # Fallback to local embeddings if OpenAI fails
        print(f"\n🔄 Falling back to local embeddings...")
        try:
            query_embedding = embedding_service._embed_with_local_model(test_query)
            print(f"✅ Local query embedding generated: shape {query_embedding.shape}")
            
            # Search with local embedding
            results = collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=3,
                include=['metadatas', 'documents', 'distances']
            )
            
            if results and results['documents']:
                print(f"🔍 Found {len(results['documents'][0])} results using local embeddings:")
                for i, (doc, metadata, distance) in enumerate(zip(
                    results['documents'][0], 
                    results['metadatas'][0], 
                    results['distances'][0]
                )):
                    subject = metadata.get('subject', 'unknown')
                    filename = metadata.get('filename', 'unknown')
                    print(f"  {i+1}. {subject} ({filename}) - distance: {distance:.3f}")
                    print(f"     📝 {doc[:80]}...")
        except Exception as e2:
            print(f"❌ Local embedding fallback also failed: {e2}")
        
else:
    print("⚠️ No metadata found in database")