In [None]:
%%time
import os
import shutil

# Configuration
DJANGO_PATH = '/content/drive/MyDrive/Google AI Studio/django-docs-5.2-en'
INTERMEDIATE_PATH = '/content/drive/MyDrive/Google AI Studio/all_html_txt'

# Define supported file extensions
SUPPORTED_EXTENSIONS = {'.xml', '.txt', '.json', '.py', '.md', '.html', '.htm'}

def get_unique_filename(destination_path, filename):
    """Return a file name that is not yet taken inside *destination_path*.

    *filename* itself is returned when no entry with that name exists.
    Otherwise ``_1``, ``_2``, ... is inserted before the extension and the
    first candidate that does not exist on disk is returned.
    """
    if not os.path.exists(os.path.join(destination_path, filename)):
        return filename

    stem, suffix = os.path.splitext(filename)
    attempt = 1
    candidate = f"{stem}_{attempt}{suffix}"
    # Keep bumping the numeric suffix until the candidate name is free
    while os.path.exists(os.path.join(destination_path, candidate)):
        attempt += 1
        candidate = f"{stem}_{attempt}{suffix}"
    return candidate

print("=== STEP 1: COLLECTING FILES ===")

# Fail fast when the source tree is missing. os.walk() yields nothing for a
# non-existent path, so without this guard a wrong DJANGO_PATH would delete the
# previous intermediate directory and then report a "successful" empty run.
if not os.path.isdir(DJANGO_PATH):
    raise FileNotFoundError(f"Source directory not found: {DJANGO_PATH}")

# Start from an empty intermediate directory so stale files never leak into a new run
if os.path.exists(INTERMEDIATE_PATH):
    print(f"Cleaning intermediate directory: {INTERMEDIATE_PATH}")
    shutil.rmtree(INTERMEDIATE_PATH)

os.makedirs(INTERMEDIATE_PATH, exist_ok=True)

print(f"Collecting files with extensions {SUPPORTED_EXTENSIONS}")
print(f"From: {DJANGO_PATH}")
print(f"To: {INTERMEDIATE_PATH}")

collected_files = 0
files_by_extension = {}  # lower-cased extension -> number of files copied

# Flatten the whole source tree into one directory, keeping only supported types
for root, _, files in os.walk(DJANGO_PATH):
    for file in files:
        _, ext = os.path.splitext(file)
        ext_lower = ext.lower()

        if ext_lower in SUPPORTED_EXTENSIONS:
            source_file_path = os.path.join(root, file)
            # Same-named files from different sub-directories get a numeric suffix
            unique_filename = get_unique_filename(INTERMEDIATE_PATH, file)
            destination_file_path = os.path.join(INTERMEDIATE_PATH, unique_filename)

            try:
                shutil.copy2(source_file_path, destination_file_path)
                collected_files += 1
                files_by_extension[ext_lower] = files_by_extension.get(ext_lower, 0) + 1

                if collected_files % 100 == 0:  # Progress indicator
                    print(f"Collected {collected_files} files...")

            except Exception as e:
                # Best effort: report the failure and keep collecting the rest
                print(f"Error collecting {source_file_path}: {e}")

print(f"\nCollection complete! Collected {collected_files} files")
print("Files by extension:")
for ext, count in sorted(files_by_extension.items()):
    print(f"  {ext}: {count} files")

print(f"\nFiles are ready in: {INTERMEDIATE_PATH}")
print("Ready for processing in next cell!")

In [None]:
%%time
import os
import shutil
import json
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET

# Configuration
INTERMEDIATE_PATH = '/content/drive/MyDrive/Google AI Studio/all_html_txt'
FINAL_PATH = '/content/drive/MyDrive/Google AI Studio/all_txt'

# Helper: ensure unique filenames in any dest folder
def get_unique_filename(destination_path, filename):
    """Return *filename*, or a ``name_N.ext`` variant, that does not exist in *destination_path*."""
    def taken(candidate):
        return os.path.exists(os.path.join(destination_path, candidate))

    if not taken(filename):
        return filename

    stem, suffix = os.path.splitext(filename)
    number = 1
    # Probe name_1, name_2, ... and stop at the first unused one
    while taken(f"{stem}_{number}{suffix}"):
        number += 1
    return f"{stem}_{number}{suffix}"

# Content processors
def process_html_content(content):
    """Strip markup from an HTML string and return its visible text, lower-cased.

    ``<script>`` and ``<style>`` elements are removed before text extraction.
    Each line is trimmed, split on double spaces, and the non-empty fragments
    are joined with newlines.
    """
    soup = BeautifulSoup(content, 'html.parser')
    for element in soup.find_all(["script", "style"]):
        element.decompose()

    fragments = []
    for raw_line in soup.get_text().splitlines():
        for piece in raw_line.strip().split("  "):
            piece = piece.strip()
            if piece:
                fragments.append(piece)
    return '\n'.join(fragments).lower()

def process_xml_content(content):
    """Return the text of every element in an XML string, lower-cased.

    Element text and tail text are collected in document order, trimmed, and
    joined with newlines; empty fragments are dropped. If *content* is not
    well-formed XML, the raw content is returned lower-cased instead.
    """
    try:
        root = ET.fromstring(content)
    except ET.ParseError:
        return content.lower()

    # itertext() walks text and tails depth-first, in document order
    trimmed = (fragment.strip() for fragment in root.itertext())
    return '\n'.join(fragment for fragment in trimmed if fragment).lower()

def process_json_content(content):
    """Flatten a JSON document into newline-separated, lower-cased text.

    Dictionary keys and all leaf values are emitted in traversal order;
    non-string leaves (numbers, booleans, null) are converted with ``str``.
    If *content* is not valid JSON, the raw content is returned lower-cased.
    """
    def walk(node):
        # Depth-first traversal yielding keys before their values
        if isinstance(node, dict):
            for key, value in node.items():
                yield str(key)
                yield from walk(value)
        elif isinstance(node, list):
            for element in node:
                yield from walk(element)
        else:
            yield node if isinstance(node, str) else str(node)

    try:
        parsed = json.loads(content)
    except json.JSONDecodeError:
        return content.lower()
    return '\n'.join(walk(parsed)).lower()

def process_plain_text_content(content):
    """Lower-case plain text (.txt, .py, .md); nothing else is changed."""
    lowered = content.lower()
    return lowered

# STEP 2: PROCESSING TO TEXT
print("=== STEP 2: PROCESSING TO TEXT ===")

if not os.path.exists(INTERMEDIATE_PATH):
    raise FileNotFoundError(f"Intermediate directory not found: {INTERMEDIATE_PATH}")

# Start from an empty output directory; the log line says whether one existed
if os.path.exists(FINAL_PATH):
    print(f"Cleaning final directory: {FINAL_PATH}")
    shutil.rmtree(FINAL_PATH)
else:
    print(f"Creating final directory: {FINAL_PATH}")
os.makedirs(FINAL_PATH, exist_ok=True)

print(f"Processing files from: {INTERMEDIATE_PATH}")
print(f"Saving processed text to: {FINAL_PATH}")

# Lower-cased extension -> (converter, label used in the per-type summary)
converters = {
    '.html': (process_html_content, 'HTML'),
    '.htm': (process_html_content, 'HTML'),
    '.xml': (process_xml_content, 'XML'),
    '.json': (process_json_content, 'JSON'),
    '.txt': (process_plain_text_content, 'TXT'),
    '.py': (process_plain_text_content, 'PY'),
    '.md': (process_plain_text_content, 'MD'),
}

processed_files = 0
processing_errors = 0
files_by_type = {}

for entry in os.listdir(INTERMEDIATE_PATH):
    source_path = os.path.join(INTERMEDIATE_PATH, entry)
    if not os.path.isfile(source_path):
        continue

    stem, suffix = os.path.splitext(entry)
    # Several inputs can share a stem (e.g. a.html and a.md), so pick a free .txt name
    target_name = get_unique_filename(FINAL_PATH, f"{stem}.txt")
    target_path = os.path.join(FINAL_PATH, target_name)

    try:
        # The file is read before the extension is checked, so an undecodable
        # file is counted as an error whatever its extension is
        with open(source_path, 'r', encoding='utf-8') as infile:
            raw = infile.read()

        handler = converters.get(suffix.lower())
        if handler is None:
            continue
        convert, label = handler
        converted = convert(raw)

        with open(target_path, 'w', encoding='utf-8') as outfile:
            outfile.write(converted)

        processed_files += 1
        files_by_type[label] = files_by_type.get(label, 0) + 1
        if processed_files % 50 == 0:
            print(f"Processed {processed_files} files...")
    except UnicodeDecodeError:
        print(f"Encoding error (skipping binary file): {entry}")
        processing_errors += 1
    except Exception as err:
        print(f"Processing error for {entry}: {err}")
        processing_errors += 1

print("\nProcessing complete!")
print(f"Total files processed: {processed_files}")
print(f"Processing errors: {processing_errors}")
print("Files processed by type:")
for label, total in sorted(files_by_type.items()):
    print(f"  {label}: {total} files")

print("\n=== PIPELINE COMPLETE ===")
print(f"RAG-ready text files available in: {FINAL_PATH}")
print(f"Intermediate files are still in: {INTERMEDIATE_PATH}")
print("You can manually delete the intermediate folder if you want to save space.")


In [None]:
!pip install uv



In [None]:
!uv pip install python-dotenv tqdm langchain langchain-huggingface langchain-community chromadb tensorflow-hub tensorflow-text pypdf2 pymupdf


[2mUsing Python 3.11.13 environment at: /usr[0m
[2mAudited [1m10 packages[0m [2min 65ms[0m[0m


In [None]:
%%time

"""
create_database.py - TPU-only, fully parallel with progress bars

Features:
  • Use TPU (via TensorFlow Hub) for embeddings
  • Load, split, embed, and ingest .txt and .pdf files in parallel
  • Progress bars at each step using tqdm
  • Support for multiple PDF extraction methods
"""
import os
from pathlib import Path
import json
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

from dotenv import load_dotenv
load_dotenv()

import torch
from tqdm.auto import tqdm
from langchain.schema import Document
from langchain.document_loaders import TextLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import TensorflowHubEmbeddings
from langchain_community.vectorstores import Chroma

# Alternative PDF loaders for better extraction
try:
    import fitz  # PyMuPDF
    PYMUPDF_AVAILABLE = True
except ImportError:
    PYMUPDF_AVAILABLE = False

try:
    import PyPDF2
    PYPDF2_AVAILABLE = True
except ImportError:
    PYPDF2_AVAILABLE = False

# Paths
DATA_DIR = Path("/content/drive/MyDrive/Google AI Studio/all_txt")
CHROMA_PATH = Path("/content/drive/MyDrive/Google AI Studio/chroma_db")
CHROMA_COLLECTION_NAME = "rag_collection"

# Embedding via TPU
EMBEDDING_MODEL_URL = "https://tfhub.dev/google/universal-sentence-encoder-large/5"

def load_pdf_with_pymupdf(file_path):
    """Extract one Document per non-empty page of a PDF using PyMuPDF (fitz).

    Args:
        file_path: path of the PDF to read.

    Returns:
        A list of Document objects whose metadata holds the source path, the
        1-based page number and the total page count. On failure the error is
        printed and the pages extracted so far (possibly none) are returned.
    """
    docs = []
    try:
        # The context manager closes the document even when a page fails to
        # load or extract, so the file handle is never leaked.
        with fitz.open(file_path) as pdf_document:
            total_pages = len(pdf_document)
            for page_num in range(total_pages):
                text = pdf_document.load_page(page_num).get_text()
                if text.strip():  # Only add non-empty pages
                    docs.append(Document(
                        page_content=text,
                        metadata={
                            "source": str(file_path),
                            "page": page_num + 1,
                            "total_pages": total_pages
                        }
                    ))
    except Exception as e:
        print(f"❌ PyMuPDF failed for {file_path}: {e}")
    return docs

def load_pdf_with_pypdf2(file_path):
    """Extract one Document per non-empty page of a PDF using PyPDF2.

    Any error is printed and the pages collected up to that point (possibly
    none) are returned.
    """
    collected = []
    try:
        with open(file_path, 'rb') as stream:
            reader = PyPDF2.PdfReader(stream)
            for number, page in enumerate(reader.pages, start=1):
                content = page.extract_text()
                if not content.strip():
                    continue  # skip pages without extractable text
                collected.append(Document(
                    page_content=content,
                    metadata={
                        "source": str(file_path),
                        "page": number,
                        "total_pages": len(reader.pages)
                    }
                ))
    except Exception as err:
        print(f"❌ PyPDF2 failed for {file_path}: {err}")
    return collected

def load_pdf_with_langchain(file_path):
    """Load a PDF through LangChain's PyPDFLoader; print the error and return [] on failure."""
    try:
        return PyPDFLoader(str(file_path)).load()
    except Exception as err:
        print(f"❌ LangChain PyPDFLoader failed for {file_path}: {err}")
        return []

def load_single_pdf(file_path):
    """Load one PDF, trying each available extractor until one yields pages.

    Order: PyMuPDF, then PyPDF2 (each only when its import succeeded), and
    finally LangChain's PyPDFLoader, whose result is returned even if empty.
    """
    preferred = (
        (PYMUPDF_AVAILABLE, load_pdf_with_pymupdf),
        (PYPDF2_AVAILABLE, load_pdf_with_pypdf2),
    )
    for available, loader in preferred:
        if not available:
            continue
        extracted = loader(file_path)
        if extracted:
            return extracted

    # Last resort: nothing else was available or produced any text
    return load_pdf_with_langchain(file_path)

def load_single_txt(file_path):
    """Load a single text file as a list of LangChain documents.

    The encoding is pinned to UTF-8 because the preprocessing step writes the
    corpus as UTF-8; relying on the platform default would mis-decode or fail
    on hosts whose locale encoding differs.

    Returns:
        The documents produced by TextLoader, or an empty list if loading
        fails (the error is printed).
    """
    try:
        return TextLoader(str(file_path), encoding="utf-8").load()
    except Exception as e:
        print(f"❌ Failed to load {file_path}: {e}")
        return []

def load_documents_parallel(file_paths, file_type):
    """Load many files concurrently and return all resulting documents.

    Args:
        file_paths: paths of the files to load.
        file_type: 'pdf' selects the PDF loader; any other value selects the
            plain-text loader.

    Returns:
        A flat list of the documents from every file that loaded. Files that
        fail are reported on stdout and skipped. An empty *file_paths* yields
        an empty list.
    """
    docs = []
    if not file_paths:
        # ThreadPoolExecutor rejects max_workers=0, so there is nothing to do
        return docs

    load_func = load_single_pdf if file_type == 'pdf' else load_single_txt
    # os.cpu_count() may return None when the count cannot be determined
    max_workers = min(len(file_paths), os.cpu_count() or 1)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(load_func, path): path for path in file_paths}

        for future in tqdm(as_completed(futures), total=len(futures),
                          desc=f"Loading {file_type.upper()} files", unit="file"):
            try:
                result = future.result()
                if result:
                    docs.extend(result)
            except Exception as e:
                file_path = futures[future]
                print(f"❌ Failed to load {file_path}: {e}")

    return docs

def build_db():
    """Build the Chroma vector store from every .txt and .pdf under DATA_DIR.

    Steps: load the files, split them into 512-character chunks (50 overlap),
    embed and ingest the chunks into a fresh Chroma collection at CHROMA_PATH,
    then write an ``info.json`` summary next to the database.

    Returns None. Prints a message and returns early, leaving any existing
    database untouched, when no files are found, nothing could be loaded, or
    splitting produced no chunks.
    """
    # 1. Load documents
    print("📂 Loading .txt and .pdf files...")

    # Find all files
    txt_files = list(DATA_DIR.rglob("*.txt"))
    pdf_files = list(DATA_DIR.rglob("*.pdf"))

    print(f"Found {len(txt_files)} .txt files and {len(pdf_files)} .pdf files")

    if not txt_files and not pdf_files:
        print("❌ No documents found.")
        return

    # Load documents in parallel
    all_docs = []

    if txt_files:
        txt_docs = load_documents_parallel(txt_files, 'txt')
        all_docs.extend(txt_docs)
        print(f"✅ Loaded {len(txt_docs)} text documents")

    if pdf_files:
        pdf_docs = load_documents_parallel(pdf_files, 'pdf')
        all_docs.extend(pdf_docs)
        print(f"✅ Loaded {len(pdf_docs)} PDF pages")

    if not all_docs:
        print("❌ No documents could be loaded.")
        return

    print(f"📄 Total documents loaded: {len(all_docs)}")

    # 2. Split documents in parallel
    print("✂️  Splitting documents into chunks...")
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=512,
        chunk_overlap=50,
        separators=["\n\n", "\n", " ", ""]  # Better for PDF content
    )

    chunks = []
    with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
        futures = {executor.submit(splitter.split_documents, [d]): d for d in all_docs}
        # Results arrive in completion order, so chunk order is not the file order
        for future in tqdm(as_completed(futures), total=len(futures),
                          desc="Splitting docs", unit="doc"):
            try:
                chunks.extend(future.result())
            except Exception as e:
                print(f"❌ Failed to split document: {e}")

    print(f"📄 Generated {len(chunks)} chunks from {len(all_docs)} documents.")

    if not chunks:
        # Stop before the old database is deleted: with nothing to ingest the
        # executor below would be created with max_workers=0 and raise.
        print("❌ No chunks were generated; existing database left untouched.")
        return

    # 3. Prepare Chroma directory
    if CHROMA_PATH.exists():
        import shutil
        shutil.rmtree(CHROMA_PATH)
    CHROMA_PATH.mkdir(parents=True, exist_ok=True)

    # 4. Persist with parallel ingestion
    print("🌐 Ingesting chunks to Chroma with TPU embeddings in parallel...")
    embed_fn = TensorflowHubEmbeddings(model_url=EMBEDDING_MODEL_URL)
    db = Chroma(
        collection_name=CHROMA_COLLECTION_NAME,
        persist_directory=str(CHROMA_PATH),
        embedding_function=embed_fn
    )

    # Process in batches to avoid memory issues
    batch_size = 50
    batches = [chunks[i:i + batch_size] for i in range(0, len(chunks), batch_size)]

    # os.cpu_count() may return None when the count cannot be determined
    ingest_workers = min(len(batches), os.cpu_count() or 1)

    # NOTE(review): add_documents is called concurrently on one Chroma object;
    # confirm the installed Chroma/embedding versions are safe for that.
    with ThreadPoolExecutor(max_workers=ingest_workers) as executor:
        futures = {executor.submit(db.add_documents, documents=batch): idx
                  for idx, batch in enumerate(batches)}

        for future in tqdm(as_completed(futures), total=len(futures),
                          desc="Adding batches", unit="batch"):
            try:
                future.result()
            except Exception as e:
                batch_idx = futures[future]
                print(f"❌ Batch {batch_idx} failed: {e}")

    db.persist()

    # 5. Save metadata
    info = {
        'created_at': time.strftime('%Y-%m-%d %H:%M:%S'),
        'model_url': EMBEDDING_MODEL_URL,
        'total_documents': len(all_docs),
        'txt_files': len(txt_files),
        'pdf_files': len(pdf_files),
        'total_chunks': len(chunks),
        'pdf_extraction_methods': {
            'pymupdf_available': PYMUPDF_AVAILABLE,
            'pypdf2_available': PYPDF2_AVAILABLE
        }
    }

    with open(CHROMA_PATH / 'info.json', 'w') as f:
        json.dump(info, f, indent=2)

    print("✅ Database built successfully.")
    print("📊 Summary:")
    print(f"   • Total documents: {len(all_docs)}")
    print(f"   • Text files: {len(txt_files)}")
    print(f"   • PDF files: {len(pdf_files)}")
    print(f"   • Total chunks: {len(chunks)}")

if __name__ == '__main__':
    build_db()

📂 Loading .txt and .pdf files...
Found 1777 .txt files and 0 .pdf files


Loading TXT files:   0%|          | 0/1777 [00:00<?, ?file/s]

✅ Loaded 1777 text documents
📄 Total documents loaded: 1777
✂️  Splitting documents into chunks...


Splitting docs:   0%|          | 0/1777 [00:00<?, ?doc/s]

📄 Generated 51838 chunks from 1777 documents.
🌐 Ingesting chunks to Chroma with TPU embeddings in parallel...




Adding batches:   0%|          | 0/1037 [00:00<?, ?batch/s]



✅ Database built successfully.
📊 Summary:
   • Total documents: 1777
   • Text files: 1777
   • PDF files: 0
   • Total chunks: 51838
CPU times: user 1h 13min 31s, sys: 59min 10s, total: 2h 12min 41s
Wall time: 38min 19s


In [None]:
!pip install python-dotenv tqdm langchain langchain-huggingface langchain-community chromadb tensorflow-hub tensorflow-text google-generativeai


In [None]:
#!/usr/bin/env python3

"""
qa_with_google_ai.py - Interactive QA with ChromaDB retrieval and Google AI

Features:
  • Retrieves relevant documents from ChromaDB using TPU embeddings
  • Uses Google's Gemini API to generate clear, contextual answers
  • Interactive question-answering interface
  • Updated to use Universal Sentence Encoder v5
"""
import os
from pathlib import Path
from dotenv import load_dotenv
load_dotenv()

import time
from tqdm.auto import tqdm
import google.generativeai as genai

# Updated import for current LangChain version
from langchain_community.embeddings import TensorflowHubEmbeddings
from langchain_community.vectorstores import Chroma

# API configuration: the key is read only from the environment (or from a
# .env file picked up by load_dotenv above). Never hard-code a key in source.
# NOTE: any key that was previously committed in this file must be treated as
# compromised and revoked in the Google AI Studio / Cloud console.
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
if not GOOGLE_API_KEY:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Export it or add it to a .env file before running this cell."
    )
genai.configure(api_key=GOOGLE_API_KEY)

# Paths (ensure database is already built)
# NOTE(review): the database-creation cell in this notebook persists to
# '.../Google AI Studio/chroma_db', while this cell reads '.../chroma_db_cloude'.
# Confirm this is the intended database; info.json is looked up in this folder too.
CHROMA_PATH = Path('/content/drive/MyDrive/Google AI Studio/chroma_db_cloude')
# Collection name passed to Chroma when opening the store
COLLECTION_NAME = 'rag_collection'

# IMPORTANT: This must match the model used when creating the database
EMBEDDING_MODEL_URL = 'https://tfhub.dev/google/universal-sentence-encoder-large/5'

# Number of chunks the retriever returns per query (passed as search_kwargs "k")
TOP_K = 5  # Retrieve more documents for better context

# Google AI Model Configuration - Using FREE TIER
MODEL_NAME = "gemini-1.5-flash"  # FREE: 15 req/min, 1M tokens/min, 1,500 req/day
# Alternative free options:
# "gemini-1.5-pro" - FREE: 2 req/min, 32k tokens/min, 50 req/day (higher quality)
# "gemini-1.5-flash-8b" - FREE: 15 req/min, 1M tokens/min, 1,500 req/day (fastest)

model = genai.GenerativeModel(MODEL_NAME)

# Generation configuration optimized for free tier
generation_config = genai.types.GenerationConfig(
    temperature=0.1,  # Lower for more factual responses
    top_p=0.8,
    top_k=40,
    max_output_tokens=1024,  # Reduced to save tokens on free tier
)

def initialize_retriever():
    """Open the persisted Chroma collection and return a retriever over it.

    The retriever returns the TOP_K most similar chunks per query, using
    TensorFlow Hub embeddings loaded from EMBEDDING_MODEL_URL.

    Raises:
        ValueError: if the collection contains no documents.
        Exception: any other failure is re-raised after a diagnostic message
            (and, for some messages, a hint) has been printed.
    """
    print("🔧 Initializing ChromaDB retriever...")

    try:
        embedder = TensorflowHubEmbeddings(model_url=EMBEDDING_MODEL_URL)
        print(f"✅ Embeddings initialized with model: {EMBEDDING_MODEL_URL}")

        store = Chroma(
            embedding_function=embedder,
            persist_directory=str(CHROMA_PATH),
            collection_name=COLLECTION_NAME,
        )

        # Count entries in the underlying collection to confirm there is data
        doc_count = store._collection.count()
        print(f"📊 Database contains {doc_count} documents")
        if doc_count == 0:
            raise ValueError("Database is empty. Please run the database creation script first.")

        top_k_retriever = store.as_retriever(search_kwargs={"k": TOP_K})
        print("✅ Retriever initialized successfully!")
        return top_k_retriever

    except Exception as err:
        detail = str(err)
        print(f"❌ Failed to initialize retriever: {err}")
        # Offer a hint for the two failure modes recognisable from the message
        if "No such file or directory" in detail:
            print("💡 Hint: Make sure you've run the database creation script first")
        elif "model" in detail.lower():
            print("💡 Hint: Ensure the embedding model URL matches the one used to create the database")
        raise

def create_prompt(query, documents):
    """Build the grounded-answer prompt sent to the generative model.

    Each document contributes a numbered section holding its ``source``
    metadata (``Unknown`` when absent) and the first 800 characters of its
    content; the sections are followed by the question and the answering
    instructions.
    """
    sections = []
    for number, doc in enumerate(documents, start=1):
        origin = doc.metadata.get('source', 'Unknown')
        excerpt = doc.page_content[:800]
        sections.append(f"Document {number} (Source: {origin}):\n{excerpt}")
    context = "\n\n".join(sections)

    return f"""You are a helpful assistant that answers questions based on the provided context documents.

Context Documents:
{context}

Question: {query}

Instructions:
1. Answer the question based ONLY on the information provided in the context documents
2. If the answer is not available in the context, clearly state that the information is not available
3. Be specific and cite relevant details from the documents
4. Provide a clear, well-structured answer
5. If multiple documents contain relevant information, synthesize them coherently
6. When possible, mention which document(s) the information comes from

Answer:"""

def get_ai_response(prompt):
    """Send *prompt* to the configured Gemini model and return the answer text.

    Never raises: failures are converted into a user-facing message.

    Returns:
        The generated text, or a warning/error string when no content was
        produced, a quota or rate limit was hit, the response was blocked by
        safety settings, or another error occurred.
    """
    import re

    try:
        # Brief pause between calls. This alone does NOT enforce the free-tier
        # cap of 15 requests/min (that would need about 4 s between calls).
        time.sleep(0.1)

        response = model.generate_content(
            prompt,
            generation_config=generation_config
        )

        if response.parts:
            return response.text
        else:
            return "⚠️ No response generated. The content might have been filtered."

    except Exception as e:
        error_msg = str(e)
        lowered = error_msg.lower()
        # Match "rate" only as a standalone word: a plain substring test also
        # fires on "generate"/"generation" and mislabels unrelated errors.
        if "quota" in lowered or re.search(r"(?<![a-z])rate(?![a-z])", lowered):
            return "⚠️ Rate limit reached. Please wait a moment and try again. Free tier allows 15 requests per minute."
        elif "safety" in lowered:
            return "⚠️ Response was filtered due to safety settings. Try rephrasing your question."
        return f"❌ Error generating response: {error_msg}"

def format_sources(documents):
    """Return one ``Source N: <source>`` line per document, newline-joined.

    The ``source`` metadata falls back to ``Unknown``; a `` (Page P)`` suffix
    is added when the metadata has a ``page`` key.
    """
    lines = []
    for position, doc in enumerate(documents, start=1):
        meta = doc.metadata
        page_note = f" (Page {meta['page']})" if 'page' in meta else ""
        lines.append(f"Source {position}: {meta.get('source', 'Unknown')}{page_note}")
    return "\n".join(lines)

def main():
    """Run the interactive question-answering loop.

    Initializes the retriever, then repeatedly reads a question, retrieves the
    TOP_K most relevant chunks, asks the generative model for an answer, and
    prints the answer with its sources (and optional content previews).

    The loop ends on 'exit', 'quit' or 'q', on Ctrl-C, or when input is closed
    (EOF). Other errors are reported and the loop continues.
    """
    print("🤖 Interactive QA with ChromaDB Retrieval + Google AI (FREE TIER)")
    print("=" * 60)
    print("📊 Configuration:")
    print("   • Embedding Model: Universal Sentence Encoder Large v5")
    print(f"   • Google AI Model: {MODEL_NAME}")
    print("   • Free Tier Limits: 15 requests/min, 1,500 requests/day")
    print("=" * 60)

    # Initialize retriever
    try:
        retriever = initialize_retriever()
    except Exception as e:
        print(f"❌ Failed to initialize system: {e}")
        return

    print(f"🔍 Retrieving top {TOP_K} relevant documents per query")
    print("\nType your question and press Enter. Type 'exit' to quit.")
    print("💡 Tip: Be specific in your questions for better results!")
    print("=" * 60)

    while True:
        try:
            # Get user query
            query = input("\n🔍 Your Question: ").strip()

            if query.lower() in ('exit', 'quit', 'q'):
                print("👋 Goodbye!")
                break

            if not query:
                continue

            print("\n⏳ Searching documents...")

            # Retrieve relevant documents. invoke() is the supported retriever
            # entry point; get_relevant_documents() is deprecated in LangChain.
            documents = retriever.invoke(query)

            if not documents:
                print("❌ No relevant documents found in the database.")
                print("💡 Try rephrasing your question or using different keywords.")
                continue

            print(f"✅ Found {len(documents)} relevant documents")
            print("🤖 Generating answer with Google AI...")

            # Create prompt and get AI response
            prompt = create_prompt(query, documents)
            ai_response = get_ai_response(prompt)

            # Display results
            print("\n" + "="*60)
            print("🎯 ANSWER:")
            print("="*60)
            print(ai_response)

            print("\n" + "="*60)
            print("📖 SOURCES:")
            print("="*60)
            print(format_sources(documents))

            # Optional: Show document previews
            show_docs = input("\n📄 Show document previews? (y/n): ").strip().lower()
            if show_docs == 'y':
                print("\n" + "="*60)
                print("📋 DOCUMENT PREVIEWS:")
                print("="*60)
                for i, doc in enumerate(documents, 1):
                    print(f"\n--- Document {i} Preview ---")
                    preview = doc.page_content[:300] + "..." if len(doc.page_content) > 300 else doc.page_content
                    print(preview)
                    print("-" * 30)

        except (KeyboardInterrupt, EOFError):
            # EOFError means stdin was closed; treating it like any other error
            # would make input() fail again immediately and loop forever.
            print("\n👋 Goodbye!")
            break
        except Exception as e:
            print(f"❌ Error: {str(e)}")
            print("Please try again with a different question.")

def test_system():
    """Smoke-test retrieval and generation with a few generic queries.

    Queries are tried in order until one retrieves documents; for that query a
    prompt is built from the first two documents and a preview of the model's
    response is printed.

    Returns:
        True if some query retrieved at least one document, False otherwise
        (including when any step raises).
    """
    print("🧪 Testing the QA system...")

    try:
        retriever = initialize_retriever()
        test_queries = [
            "What is this document about?",
            "Tell me about the main topics covered",
            "What are the key points mentioned?"
        ]

        # Bound up front so the return value never depends on the loop running
        documents = []
        for query in test_queries:
            print(f"Testing query: {query}")
            # invoke() replaces the deprecated get_relevant_documents()
            documents = retriever.invoke(query)

            if documents:
                print(f"✅ Retrieved {len(documents)} documents")
                prompt = create_prompt(query, documents[:2])  # Test with 2 docs
                response = get_ai_response(prompt)
                print(f"AI Response preview: {response[:150]}...")
                break
            else:
                print("❌ No documents found for this query")

        return len(documents) > 0

    except Exception as e:
        print(f"❌ System test failed: {e}")
        return False

def check_embedding_compatibility():
    """Compare EMBEDDING_MODEL_URL with the model recorded in the database's info.json.

    Returns:
        True when the recorded ``model_url`` equals EMBEDDING_MODEL_URL, False
        when it differs, and None when CHROMA_PATH has no ``info.json``.
    """
    print("🔍 Checking embedding model compatibility...")

    info_file = CHROMA_PATH / 'info.json'
    if not info_file.exists():
        print("⚠️  No database info found. Cannot verify compatibility.")
        return None

    import json
    with open(info_file, 'r') as handle:
        recorded_model = json.load(handle).get('model_url', 'Unknown')

    print(f"Database model: {recorded_model}")
    print(f"Current model:  {EMBEDDING_MODEL_URL}")

    if recorded_model == EMBEDDING_MODEL_URL:
        print("✅ Embedding models match!")
        return True

    print("⚠️  WARNING: Embedding model mismatch detected!")
    print("This will cause poor retrieval performance.")
    print("Please update the EMBEDDING_MODEL_URL to match your database.")
    return False

if __name__ == '__main__':
    # Check compatibility first
    check_embedding_compatibility()

    # Uncomment the next line to run a quick test first
    # test_system()

    main()

🔍 Checking embedding model compatibility...
⚠️  No database info found. Cannot verify compatibility.
🤖 Interactive QA with ChromaDB Retrieval + Google AI (FREE TIER)
📊 Configuration:
   • Embedding Model: Universal Sentence Encoder Large v5
   • Google AI Model: gemini-1.5-flash
   • Free Tier Limits: 15 requests/min, 1,500 requests/day
🔧 Initializing ChromaDB retriever...
✅ Embeddings initialized with model: https://tfhub.dev/google/universal-sentence-encoder-large/5


  db = Chroma(


📊 Database contains 1750 documents
✅ Retriever initialized successfully!
🔍 Retrieving top 5 relevant documents per query

Type your question and press Enter. Type 'exit' to quit.
💡 Tip: Be specific in your questions for better results!

🔍 Your Question: django

⏳ Searching documents...


  documents = retriever.get_relevant_documents(query)


✅ Found 5 relevant documents
🤖 Generating answer with Google AI...

🎯 ANSWER:
Based on the provided text, Django is a web framework designed for rapid development of database-driven web applications.  It was developed in a fast-paced newsroom environment, making common web development tasks easier (Document 1).  The documentation includes sections on getting started for both beginners and experienced developers (Document 4), an overview of how Django works (Document 1, Document 3), and details on using the Django source code repository (Document 2).  Django includes an object-relational mapper (ORM) allowing database layout descriptions in Python code (Document 3).  A release version 1.0.2 is mentioned (Document 5).  For production environments, using official packaged releases is recommended (Document 2).


📖 SOURCES:
Source 1: /content/drive/MyDrive/Google AI Studio/all_txt/overview_2.txt
Source 2: /content/drive/MyDrive/Google AI Studio/all_txt/git_1.txt
Source 3: /content/drive/MyD