In [114]:
# Install dependencies
%pip install langchain langchain-google-genai chromadb python-dotenv unstructured --quiet



I0000 00:00:1755628425.720722 4712835 fork_posix.cc:71] Other threads are currently calling into gRPC, skipping fork() handlers


[0mNote: you may need to restart the kernel to use updated packages.


In [115]:
# Setup environment and load API key

import os
from dotenv import load_dotenv

# Load environment variables from .env
load_dotenv()

# Get API key and set it in the environment
api_key = os.getenv("GEMINI_API_KEY")
if not api_key:
    raise RuntimeError("Set GEMINI_API_KEY in your .env file")

# Set the API key for Google Generative AI
os.environ["GOOGLE_API_KEY"] = api_key


In [116]:
# Imports

from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain.vectorstores import Chroma
from langchain.prompts import PromptTemplate


In [117]:
DATA_PATH = "alice_in_wonderland.md"  # Updated file extension to .md
PERSIST_DIR = "chroma_db"  # Using a single consistent directory name
COLLECTION = "docs"

# Load and preprocess the document
loader = UnstructuredMarkdownLoader(DATA_PATH, show_progress=True)
docs = loader.load()
print(f"Loaded {len(docs)} documents")

# Configure text splitter for better chunks
CHUNK_SIZE = 1000  # Larger chunks to maintain more context
CHUNK_OVERLAP = 200  # Larger overlap to prevent losing context at boundaries

# Use RecursiveCharacterTextSplitter with better separators
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],  # More granular splitting
    keep_separator=True,  # Keep the separators to maintain readability
    strip_whitespace=True,  # Clean up whitespace
    add_start_index=True,  # Add position info to metadata
)

Loaded 1 documents


In [118]:
# Split into chunks and filter out boilerplate
def is_meaningful_chunk(text: str) -> bool:
    # Skip headers, licensing info, and other boilerplate
    skip_patterns = [
        "Project Gutenberg",
        "THE MILLENNIUM FULCRUM EDITION",
        "Contents",
        "*      *      *",
        "trademark",
        "license",
        "copyright"
    ]
    return not any(pattern.lower() in text.lower() for pattern in skip_patterns)

# Split and filter chunks
chunks = splitter.split_documents(docs)
filtered_chunks = [
    chunk for chunk in chunks 
    if is_meaningful_chunk(chunk.page_content) and len(chunk.page_content.strip()) > 50  # Skip very short chunks
]

print(f"Split into {len(chunks)} chunks, {len(filtered_chunks)} after filtering")

Split into 217 chunks, 185 after filtering


In [119]:
# Create embeddings & persist vector DB

# Gemini embeddings with retry on failure
from tenacity import retry, stop_after_attempt, wait_exponential

@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
def get_embeddings():
    return GoogleGenerativeAIEmbeddings(
        model="models/embedding-001",
        task_type="retrieval_query"  # Specify task type for better embeddings
    )

embeddings = get_embeddings()

# Create vector DB with filtered chunks
vectordb = Chroma.from_documents(
    documents=filtered_chunks,  # Use filtered chunks
    embedding=embeddings,
    persist_directory=PERSIST_DIR,
    collection_name=COLLECTION
)
print(f"Database created and saved to disk with {len(filtered_chunks)} chunks")





Database created and saved to disk with 185 chunks


In [120]:
# Reload persisted DB
vectordb = Chroma(
    persist_directory=PERSIST_DIR,
    embedding_function=embeddings,
    collection_name="docs"
)

from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

def answer_query(query: str, k: int = 3):
    print("=== RETRIEVAL STEP ===")
    print(f"Query: {query}\n")
    
    # Use MMR retrieval with higher fetch_k for better candidate selection
    retriever = vectordb.as_retriever(
        search_type="mmr",
        search_kwargs={
            "k": k * 2,  # Fetch more initially for filtering
            "fetch_k": k * 4,
            "lambda_mult": 0.7
        }
    )
    
    # Get initial documents
    docs = retriever.get_relevant_documents(query)
    
    # Filter for unique content and those with start_index
    seen_content = set()
    filtered_docs = []
    
    for doc in docs:
        # Get start_index from metadata if it exists
        start_index = doc.metadata.get('start_index', None)
        if start_index is None:
            continue
            
        # Normalize content for comparison (remove extra whitespace)
        content = ' '.join(doc.page_content.split())
        
        # Skip if we've seen this content before
        if content in seen_content:
            continue
            
        seen_content.add(content)
        filtered_docs.append((doc, start_index))
    
    # Sort by start_index and take top 3
    filtered_docs.sort(key=lambda x: x[1])
    final_docs = [doc for doc, _ in filtered_docs[:3]]
    
    # Show retrieved chunks
    print(f"Retrieved {len(final_docs)} unique chunks:")
    for i, doc in enumerate(final_docs):
        print(f"\n--- Chunk {i+1} ---")
        print("Content:")
        print(doc.page_content)
        print("\nMetadata:", doc.metadata)
        print("-" * 80)
    
    if not final_docs:
        print("\nNo relevant chunks found.")
        return

    print("\n=== GENERATION STEP ===")
    
    # Optimized prompt for flash model with narrative context
    template = """You are helping answer questions about Alice in Wonderland. Use only the provided context to answer.
If you can't find the answer in the context, say "Based on the provided context, I cannot answer this question."

Context (in story order):
{context}

Question: {question}

Instructions:
1. Use only information from the context
2. Be specific and quote relevant parts
3. Follow the story's sequence when describing events
4. If information is incomplete, say so

Answer:"""
    
    # Format prompt with better context joining and position info
    contexts = []
    for doc in final_docs:
        # Add position context to help with narrative flow
        start_idx = doc.metadata.get('start_index', 0)
        context = f"[Story position {start_idx}]:\n{doc.page_content}"
        contexts.append(context)
    formatted_context = "\n\n---\n\n".join(contexts)
    
    prompt = PromptTemplate(input_variables=["context", "question"], template=template)
    formatted = prompt.format(context=formatted_context, question=query)

    # Use Gemini flash model with narrative-optimized settings
    chat = ChatGoogleGenerativeAI(
        model="models/gemini-1.5-flash",
        temperature=0.2,  # Slightly higher for better narrative flow
        top_p=0.85,      # More focused token selection
        top_k=30,        # More focused selection
        max_output_tokens=512  # Limit length for more concise answers
    )
    response = chat.invoke(formatted)

    print("\n=== FINAL ANSWER ===")
    print(response.content)


# Test queries focusing on specific events/characters
queries = [
    "What happens at the tea party?",
    "How does Alice meet the Mad Hatter?",
    "What does the Queen of Hearts say?",
    "Describe the Cheshire Cat's appearance"
]

for query in queries:
    print(f"\nQuery: {query}")
    print("=" * 80)
    answer_query(query)
    print("\n" + "=" * 80)



Query: What happens at the tea party?
=== RETRIEVAL STEP ===
Query: What happens at the tea party?

Retrieved 0 unique chunks:

No relevant chunks found.


Query: How does Alice meet the Mad Hatter?
=== RETRIEVAL STEP ===
Query: How does Alice meet the Mad Hatter?

Retrieved 3 unique chunks:

--- Chunk 1 ---
Content:
“Oh, you’re sure to do that,” said the Cat, “if you only walk long enough.”

Alice felt that this could not be denied, so she tried another question. “What sort of people live about here?”

“In that direction,” the Cat said, waving its right paw round, “lives a Hatter: and in that direction,” waving the other paw, “lives a March Hare. Visit either you like: they’re both mad.”

“But I don’t want to go among mad people,” Alice remarked.

“Oh, you can’t help that,” said the Cat: “we’re all mad here. I’m mad. You’re mad.”

“How do you know I’m mad?” said Alice.

“You must be,” said the Cat, “or you wouldn’t have come here.”

Alice didn’t think that proved it at all; however, 