In [264]:
# Install dependencies
%pip install langchain langchain-google-genai chromadb python-dotenv unstructured sentence-transformers langchain-community --quiet



[0mNote: you may need to restart the kernel to use updated packages.


In [265]:
# Set environment variable to handle tokenizers parallelism
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Explicitly disable parallelism


In [266]:
# Setup environment and load API key

import os
from dotenv import load_dotenv

# Load environment variables from .env
load_dotenv()

# Get API key and set it in the environment
api_key = os.getenv("GEMINI_API_KEY")
if not api_key:
    raise RuntimeError("Set GEMINI_API_KEY in your .env file")

# Set the API key for Google Generative AI
os.environ["GOOGLE_API_KEY"] = api_key


In [267]:
# Imports
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_google_genai import ChatGoogleGenerativeAI  # generation


In [268]:
# Set up paths with absolute references
import os

# Get absolute paths
current_dir = os.path.dirname(os.path.abspath("__file__"))
DATA_PATH = os.path.join(current_dir, "alice_in_wonderland.md")
PERSIST_DIR = os.path.join(current_dir, "chroma_rag_db")  # New database directory
COLLECTION = "alice_wonderland"

# Ensure the persist directory exists with proper permissions
os.makedirs(PERSIST_DIR, exist_ok=True)
os.chmod(PERSIST_DIR, 0o755)  # More secure permissions (rwxr-xr-x)

# Load and preprocess the document
loader = UnstructuredMarkdownLoader(DATA_PATH, show_progress=True)
docs = loader.load()
print(f"Loaded {len(docs)} documents")

# Configure text splitter for better chunks
CHUNK_SIZE = 1000  # Larger chunks to maintain more context
CHUNK_OVERLAP = 200  # Larger overlap to prevent losing context at boundaries

# Use RecursiveCharacterTextSplitter with better separators
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],  # More granular splitting
    keep_separator=True,  # Keep the separators to maintain readability
    strip_whitespace=True,  # Clean up whitespace
    add_start_index=True,  # Add position info to metadata
)

Loaded 1 documents


In [269]:
# Split into chunks and filter out boilerplate
def is_meaningful_chunk(text: str) -> bool:
    # Skip headers, licensing info, and other boilerplate
    skip_patterns = [
        "Project Gutenberg",
        "THE MILLENNIUM FULCRUM EDITION",
        "Contents",
        "*      *      *",
        "trademark",
        "license",
        "copyright"
    ]
    return not any(pattern.lower() in text.lower() for pattern in skip_patterns)

# Split and filter chunks
chunks = splitter.split_documents(docs)
filtered_chunks = [
    chunk for chunk in chunks 
    if is_meaningful_chunk(chunk.page_content) and len(chunk.page_content.strip()) > 50  # Skip very short chunks
]

print(f"Split into {len(chunks)} chunks, {len(filtered_chunks)} after filtering")

Split into 217 chunks, 185 after filtering


In [270]:
# Create embeddings & persist vector DB

# Initialize ChromaDB with explicit settings
import chromadb
from chromadb.config import Settings

# Initialize ChromaDB client with explicit settings
chroma_settings = Settings(
    persist_directory=PERSIST_DIR,
    is_persistent=True,
    anonymized_telemetry=False
)

# Create a new client instance
chroma_client = chromadb.Client(chroma_settings)

# Get or create collection - this is safer than deleting/recreating
try:
    collection = chroma_client.get_collection(name=COLLECTION)
    print(f"Using existing collection: {COLLECTION}")
except:
    collection = chroma_client.create_collection(name=COLLECTION)
    print(f"Created new collection: {COLLECTION}")

# Initialize the sentence-transformers embeddings
model_name = "sentence-transformers/all-MiniLM-L6-v2"  # Lightweight, fast model
model_kwargs = {
    'device': 'cpu'  # Use CPU for better compatibility
}
encode_kwargs = {
    'normalize_embeddings': True,  # Normalize for better similarity matching
    'batch_size': 32  # Process in smaller batches for memory efficiency
}

embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

# Create vector DB with filtered chunks
# Note: Using normalized embeddings which automatically uses cosine similarity
vectordb = Chroma.from_documents(
    documents=filtered_chunks,
    embedding=embeddings,
    persist_directory=PERSIST_DIR,
    collection_name=COLLECTION,
    client=chroma_client  # Use our explicitly configured client
)
print(f"Database created and saved to disk with {len(filtered_chunks)} chunks")





Using existing collection: alice_wonderland
Database created and saved to disk with 185 chunks


In [271]:
# Reload persisted DB with consistent settings
vectordb = Chroma(
    persist_directory=PERSIST_DIR,
    embedding_function=embeddings,
    collection_name=COLLECTION,  # Use same collection name
    client_settings=chroma_settings  # Use same settings
)

from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

def answer_query(query: str, k: int = 3):
    print("=== RETRIEVAL STEP ===")
    print(f"Query: {query}\n")
    
    # Use MMR for better diversity and relevance
    retriever = vectordb.as_retriever(
        search_type="mmr",
        search_kwargs={
            "k": k,
            "fetch_k": k * 3,
            "lambda_mult": 0.7  # Balance between relevance (1.0) and diversity (0.0)
        }
    )
    
    # Get initial documents
    docs = retriever.get_relevant_documents(query)
    
    # Filter for unique content and those with start_index
    seen_content = set()
    filtered_docs = []
    
    for doc in docs:
        # Get start_index from metadata if it exists
        start_index = doc.metadata.get('start_index', None)
        if start_index is None:
            continue
            
        # Normalize content for comparison (remove extra whitespace)
        content = ' '.join(doc.page_content.split())
        
        # Skip if we've seen this content before
        if content in seen_content:
            continue
            
        seen_content.add(content)
        filtered_docs.append((doc, start_index))
    
    # Sort by start_index and take top 3
    filtered_docs.sort(key=lambda x: x[1])
    final_docs = [doc for doc, _ in filtered_docs[:3]]
    
    # Show retrieved chunks
    print(f"Retrieved {len(final_docs)} unique chunks:")
    for i, doc in enumerate(final_docs):
        print(f"\n--- Chunk {i+1} ---")
        print("Content:")
        print(doc.page_content)
        print("\nMetadata:", doc.metadata)
        print("-" * 80)
    
    if not final_docs:
        print("\nNo relevant chunks found.")
        return

    print("\n=== GENERATION STEP ===")
    
    # Optimized prompt for flash model with narrative context
    template = """You are helping answer questions about Alice in Wonderland. Use only the provided context to answer.
If you can't find the answer in the context, say "Based on the provided context, I cannot answer this question."

Context (in story order):
{context}

Question: {question}

Instructions:
1. Use only information from the context
2. Be specific and quote relevant parts
3. Follow the story's sequence when describing events
4. If information is incomplete, say so

Answer:"""
    
    # Format prompt with better context joining and position info
    contexts = []
    for doc in final_docs:
        # Add position context to help with narrative flow
        start_idx = doc.metadata.get('start_index', 0)
        context = f"[Story position {start_idx}]:\n{doc.page_content}"
        contexts.append(context)
    formatted_context = "\n\n---\n\n".join(contexts)
    
    prompt = PromptTemplate(input_variables=["context", "question"], template=template)
    formatted = prompt.format(context=formatted_context, question=query)

    # Use Gemini flash model with narrative-optimized settings
    chat = ChatGoogleGenerativeAI(
        model="models/gemma-3n-e2b-it",
        temperature=0.2,  # Slightly higher for better narrative flow
        top_p=0.85,      # More focused token selection
        top_k=30,        # More focused selection
        max_output_tokens=512  # Limit length for more concise answers
    )
    response = chat.invoke(formatted)

    print("\n=== FINAL ANSWER ===")
    print(response.content)


# Test queries focusing on specific events/characters
queries = [
    "What happens at the tea party?",
    "How does Alice meet the Mad Hatter?",
    "What does the Queen of Hearts say?",
    "Describe the Cheshire Cat's appearance"
]

for query in queries:
    print(f"\nQuery: {query}")
    print("=" * 80)
    answer_query(query)
    print("\n" + "=" * 80)



Query: What happens at the tea party?
=== RETRIEVAL STEP ===
Query: What happens at the tea party?

Retrieved 3 unique chunks:

--- Chunk 1 ---
Content:
CHAPTER VII. A Mad Tea-Party

There was a table set out under a tree in front of the house, and the March Hare and the Hatter were having tea at it: a Dormouse was sitting between them, fast asleep, and the other two were using it as a cushion, resting their elbows on it, and talking over its head. “Very uncomfortable for the Dormouse,” thought Alice; “only, as it’s asleep, I suppose it doesn’t mind.”

Metadata: {'start_index': 72167, 'source': '/Users/vaishnavipullakhandam/Desktop/github/Staying Relevant/RAG/FromDocument/alice_in_wonderland.md'}
--------------------------------------------------------------------------------

--- Chunk 2 ---
Content:
“At any rate I’ll never go there again!” said Alice as she picked her way through the wood. “It’s the stupidest tea-party I ever was at in all my life!”

Just as she said this, she notic