Here's the diagram for the process : [rag_image](mermaid_rag.png)
Link to the Streamlit application : [rag_application](https://alice-in-wonderland-rag.streamlit.app/)

In [1]:
# STEP 1: Install dependencies

%pip install langchain langchain-community langchain-huggingface langchain-google-genai --quiet
%pip install chromadb sentence-transformers FlagEmbedding huggingface_hub --quiet



Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
# STEP 2: Imports
import os, re, json
from pathlib import Path

# LangChain components
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate

# Reranker
from FlagEmbedding import FlagReranker


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# STEP 3: Authentication (Hugging Face + Gemini key)
from huggingface_hub import login
from dotenv import load_dotenv
load_dotenv()

# Hugging Face login
HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
login(HF_TOKEN)

# Gemini API key
GOOGLE_API_KEY = os.getenv("GEMINI_API_KEY")
assert GOOGLE_API_KEY, "Gemini API Key not found!"


In [4]:
# (NEW) Instructor client for OpenRouter — add right after STEP 3
import os
import instructor
from openai import OpenAI

# Safe re-init: only creates if not already present
try:
    or_client
except NameError:
    OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
    assert OPENROUTER_API_KEY, "OpenRouter API Key not found! Set OPENROUTER_API_KEY in your .env"
    or_client = instructor.from_openai(
        OpenAI(base_url="https://openrouter.ai/api/v1", api_key=OPENROUTER_API_KEY),
        mode=instructor.Mode.JSON  # enforce JSON schema + auto-retries
    )

# #printing
print("✅ Instructor/OpenRouter client ready.")


✅ Instructor/OpenRouter client ready.


In [5]:
# STEP 4: Load and chunk the book (token-aware) + batch summarization via OpenRouter (gpt-oss-20b)
import os, json, re, requests
from pathlib import Path
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pydantic import BaseModel, Field

# --- Pydantic models for consistent metadata ---
class ChunkMetadata(BaseModel):
    source: str
    chapter_number: str | None
    chapter_title: str | None
    start_index: int | None
    chunk_number: int
    chunk_summary: str

class ChunkDoc(BaseModel):
    content: str
    metadata: ChunkMetadata

# --- Summarization pipeline (OpenRouter: openai/gpt-oss-20b) ---
class OpenRouterSummarizer:
    def __init__(self, model="openai/gpt-oss-20b", api_key=None):
        self.model = model
        self.api_key = api_key or os.getenv("OPENROUTER_API_KEY")
        assert self.api_key, "OpenRouter API key not found. Set OPENROUTER_API_KEY in your .env."

    # Keep the same call signature you used before
    def __call__(self, texts, max_length=40, min_length=10, do_sample=False, batch_size=8):
        url = "https://openrouter.ai/api/v1/chat/completions"
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        results = []
        for t in texts:
            prompt = (
                "Summarize the passage in 2-3 sentences as per the content, capture the main point of the content and make sure you're not missing out on the quotes from the content."
                f"Passage:\n{t}\n\nSummary:"
            )
            payload = {
                "model": self.model,
                "messages": [{"role": "user", "content": prompt}],
                "temperature": 0.2,
            }
            r = requests.post(url, headers=headers, json=payload, timeout=120)
            r.raise_for_status()
            content = r.json()["choices"][0]["message"]["content"].strip()
            results.append({"summary_text": content})
        return results

# Initialize summarizer (variable name unchanged)
summarizer = OpenRouterSummarizer(model="openai/gpt-oss-20b")

# --- Function: parse chapters (captures both same-line and next-line titles) ---
def parse_chapters(text):
    """
    Matches:
      Contents lines:  'CHAPTER I. Down the Rabbit-Hole'
      Chapter headers: 'CHAPTER I.\nDown the Rabbit-Hole'
    """
    pattern = re.compile(r"CHAPTER\s+([IVXLCDM]+)\.\s*(?:([^\n]+)|\n([^\n]+))")
    chapters = []
    for m in pattern.finditer(text):
        title = (m.group(2) or m.group(3) or "").strip()
        chapters.append({
            "start": m.start(),
            "chapter_number": f"Chapter {m.group(1)}",
            "chapter_title": title
        })
    chapters.sort(key=lambda x: x["start"])
    return chapters

# --- Function: split and summarize (token-aware, reliable start_index) ---
def load_and_chunk_markdown(md_path, chunk_size=600, chunk_overlap=100, batch_size=10):
    with open(md_path, "r", encoding="utf-8") as f:
        full_text = f.read()

    chapters = parse_chapters(full_text)

    # Token-aware splitter with start indices:
    # NOTE: pass add_start_index=True on the splitter (NOT on create_documents)
    try:
        splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            add_start_index=True
        )
    except TypeError:
        # Fallback for older LangChain versions: construct directly
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            add_start_index=True
        )

    # create_documents returns documents with metadata['start_index']
    doc_objs = splitter.create_documents([full_text])

    # Prepare raw chunk strings for summarizer call
    chunks = [d.page_content for d in doc_objs]

    # 🔹 Summarize chunks (OpenRouter; sequential for clarity)
    summaries = summarizer(
        chunks,
        max_length=40,
        min_length=10,
        do_sample=False,
        batch_size=batch_size
    )

    docs: list[ChunkDoc] = []
    for i, (d, summary_obj) in enumerate(zip(doc_objs, summaries)):
        start_index = d.metadata.get("start_index", None)
        chunk = d.page_content
        summary = summary_obj["summary_text"]

        # Find chapter for this chunk by position (binary search style)
        chapter_number, chapter_title = None, None
        if start_index is not None and chapters:
            lo, hi, idx = 0, len(chapters)-1, -1
            while lo <= hi:
                mid = (lo + hi) // 2
                if chapters[mid]["start"] <= start_index:
                    idx = mid
                    lo = mid + 1
                else:
                    hi = mid - 1
            if idx >= 0:
                chapter_number = chapters[idx]["chapter_number"]
                chapter_title = chapters[idx]["chapter_title"]

        # Create ChunkDoc with validated metadata
        meta = ChunkMetadata(
            source=md_path,
            chapter_number=chapter_number,     # e.g., "Chapter I"
            chapter_title=chapter_title,       # e.g., "Down the Rabbit-Hole"
            start_index=start_index,
            chunk_number=i,
            chunk_summary=summary
        )
        docs.append(ChunkDoc(content=chunk, metadata=meta))

    return docs

# Load and chunk with batching
print("⚡ Summarizing chunks with OpenRouter gpt-oss-20b (token-aware splitting)...")
docs = load_and_chunk_markdown("alice_in_wonderland.md", batch_size=8)

# #printing
print("### Example 3 Chunks ###")
for d in docs[:3]:
    print(d.model_dump_json(indent=2))  # ✅ Pydantic pretty JSON


⚡ Summarizing chunks with OpenRouter gpt-oss-20b (token-aware splitting)...


### Example 3 Chunks ###
{
  "content": "**_ START OF THE PROJECT GUTENBERG EBOOK ALICE'S ADVENTURES IN WONDERLAND _**\n\nContents\n\nCHAPTER I. Down the Rabbit-Hole\nCHAPTER II. The Pool of Tears\nCHAPTER III. A Caucus-Race and a Long Tale\nCHAPTER IV. The Rabbit Sends in a Little Bill\nCHAPTER V. Advice from a Caterpillar\nCHAPTER VI. Pig and Pepper\nCHAPTER VII. A Mad Tea-Party\nCHAPTER VIII. The Queen’s Croquet-Ground\nCHAPTER IX. The Mock Turtle’s Story\nCHAPTER X. The Lobster Quadrille\nCHAPTER XI. Who Stole the Tarts?\nCHAPTER XII. Alice’s Evidence\n\nCHAPTER I.\nDown the Rabbit-Hole\n\nAlice was beginning to get very tired of sitting by her sister on the\nbank, and of having nothing to do: once or twice she had peeped into\nthe book her sister was reading, but it had no pictures or\nconversations in it, “and what is the use of a book,” thought Alice\n“without pictures or conversations?”\n\nSo she was considering in her own mind (as well as she could, for the\nhot day made her f

In [6]:
"""from_texts() takes each chunk from texts=[d["content"] for d in docs].
For each chunk, it calls embeddings.embed_text(text) under the hood.
embed_text converts the chunk into a high-dimensional vector (embedding).
These vectors are then stored in Chroma along with your metadata. """

'from_texts() takes each chunk from texts=[d["content"] for d in docs].\nFor each chunk, it calls embeddings.embed_text(text) under the hood.\nembed_text converts the chunk into a high-dimensional vector (embedding).\nThese vectors are then stored in Chroma along with your metadata. '

In [7]:
# STEP 5: Build embeddings + vectorstore (persistent Chroma; cosine)
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")

vectordb = Chroma.from_texts(
    texts=[d.content for d in docs],                       # Pydantic -> attribute
    embedding=embeddings,
    metadatas=[d.metadata.model_dump() for d in docs],     # Pydantic -> dict
    persist_directory="./chroma_store",                    # persist to disk
    collection_name="alice",
    collection_metadata={"hnsw:space": "cosine"}           # cosine similarity
)
vectordb.persist()

# #printing
print("### Vector DB Info ###")
print("Persist dir:", "./chroma_store")
print("Total vectors stored:", vectordb._collection.count())


### Vector DB Info ###
Persist dir: ./chroma_store
Total vectors stored: 616


  vectordb.persist()


In [8]:
# STEP 6: Multi-query expansions using Gemini
from pydantic import BaseModel, Field, field_validator
from typing import List
import json

class Expansions(BaseModel):
    items: List[str] = Field(..., min_items=1, description="List of paraphrased questions")
    
    @field_validator('items')
    def validate_items(cls, v):
        if not all(isinstance(item, str) and len(item.strip()) > 0 for item in v):
            raise ValueError("All items must be non-empty strings")
        return [item.strip() for item in v]

def expand_queries(llm, question, n=4):
    """
    Uses Gemini to generate diverse paraphrases of the input question.
    Returns a list of exactly n paraphrased questions, validated by Pydantic.
    """
    prompt = (
        f"Generate exactly {n} diverse paraphrases of the question below.\n"
        "Your response should be ONLY a valid JSON object with this exact format:\n"
        '{"items": ["paraphrase1", "paraphrase2", ...]}\n\n'
        f"Question: {question}\n\n"
        "Remember: Return ONLY the JSON object, no other text."
    )
    
    response = llm.invoke(prompt)
    try:
        # Try to parse the response as JSON
        json_str = response.content.strip()
        # Find JSON object if it's embedded in other text
        start = json_str.find('{')
        end = json_str.rfind('}') + 1
        if start >= 0 and end > start:
            json_str = json_str[start:end]
        result = json.loads(json_str)
        
        # Validate with Pydantic and ensure exactly n items
        items = result.get('items', [])
        # If we have too few items, pad with the original question
        while len(items) < n:
            items.append(question)
        # If we have too many items, take the first n
        items = items[:n]
        
        # Final validation with Pydantic
        expansions = Expansions(items=items)
        return expansions.items
        
    except (json.JSONDecodeError, KeyError, IndexError, AttributeError) as e:
        # If JSON parsing fails, create a list with the original question repeated
        items = [question] * n
        expansions = Expansions(items=items)
        return expansions.items

# keep your Gemini client as-is (used for answering later)
from langchain_google_genai import ChatGoogleGenerativeAI
llm_gemini = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0.3,
    google_api_key=GOOGLE_API_KEY
)

user_question = "Why did Alice follow the White Rabbit and what happened immediately after she fell down the rabbit hole?"
expansions = expand_queries(llm_gemini, user_question)

# #printing
print("### Multi-Query Expansions ###")
print(f"Original: {user_question}")
print("\nExpansions:")
print(json.dumps({"items": expansions}, indent=2))


### Multi-Query Expansions ###
Original: Why did Alice follow the White Rabbit and what happened immediately after she fell down the rabbit hole?

Expansions:
{
  "items": [
    "What motivated Alice to pursue the White Rabbit, and what events transpired right after her descent into the rabbit hole?",
    "Could you explain Alice's reason for chasing the White Rabbit, and describe the immediate aftermath of her fall down the rabbit hole?",
    "What prompted Alice to go after the White Rabbit, and what occurred the moment she plunged into the rabbit hole?",
    "Detail Alice's rationale for trailing the White Rabbit, and recount the initial occurrences following her tumble into the rabbit hole."
  ]
}


In [9]:
# STEP 7: Retrieve chunks (cosine similarity) for original + expansions
from pydantic import BaseModel, Field
from typing import List, Optional, Tuple

class RetrievedMetadata(BaseModel):
    source: str
    chapter_number: Optional[str] = None
    chapter_title: Optional[str] = None
    position: Optional[int] = Field(None, alias="start_index")
    chunk_number: Optional[int] = None
    chunk_summary: Optional[str] = None

class RetrievedChunk(BaseModel):
    question: str
    score: float
    content: str
    metadata: RetrievedMetadata

class RetrievalResults(BaseModel):
    results: List[RetrievedChunk]

def retrieve_candidates(vectordb, queries, per_query_k=5):
    results = []
    seen = set()
    for q in queries:
        hits = vectordb.similarity_search_with_score(q, k=per_query_k)
        for doc, score in hits:
            key = (doc.metadata.get("start_index"), doc.metadata.get("chunk_number"))
            if key not in seen:
                seen.add(key)
                # Create a structured chunk with Pydantic
                chunk = RetrievedChunk(
                    question=q,
                    score=float(score),  # Ensure score is float
                    content=doc.page_content,
                    metadata=RetrievedMetadata(**doc.metadata)
                )
                results.append(chunk)
    return RetrievalResults(results=results)

queries = [user_question] + expansions
candidates = retrieve_candidates(vectordb, queries, per_query_k=5)

# Printing in structured JSON format
print("### Retrieved Candidates ###")
print(candidates.model_dump_json(indent=2))


### Retrieved Candidates ###
{
  "results": [
    {
      "question": "Why did Alice follow the White Rabbit and what happened immediately after she fell down the rabbit hole?",
      "score": 0.2551690936088562,
      "content": "So she was considering in her own mind (as well as she could, for the\nhot day made her feel very sleepy and stupid), whether the pleasure of\nmaking a daisy-chain would be worth the trouble of getting up and\npicking the daisies, when suddenly a White Rabbit with pink eyes ran\nclose by her.\n\nThere was nothing so _very_ remarkable in that; nor did Alice think it\nso _very_ much out of the way to hear the Rabbit say to itself, “Oh\ndear! Oh dear! I shall be late!” (when she thought it over afterwards,\nit occurred to her that she ought to have wondered at this, but at the\ntime it all seemed quite natural); but when the Rabbit actually _took a\nwatch out of its waistcoat-pocket_, and looked at it, and then hurried\non, Alice started to her feet, for it flas

In [10]:
# STEP 8: Rerank candidates with cross-encoder (dedupe; take top 5)
from pydantic import BaseModel, Field
from typing import List, Optional

class RerankedMetadata(BaseModel):
    source: str
    chapter_number: Optional[str] = None
    chapter_title: Optional[str] = None
    position: Optional[int] = Field(None, alias="start_index")
    chunk_number: Optional[int] = None
    chunk_summary: Optional[str] = None

class RerankedDocument(BaseModel):
    content: str
    metadata: RerankedMetadata
    rerank_score: float
    
class RerankedResults(BaseModel):
    results: List[RerankedDocument]
    original_question: str
    model_name: str = "BAAI/bge-reranker-base"

reranker = FlagReranker("BAAI/bge-reranker-base", use_fp16=True)

def rerank_candidates(question, candidates, top_n=5):
    # Create pairs for reranking
    pairs = [[question, doc.content] for doc in candidates.results]
    scores = reranker.compute_score(pairs)
    reranked = sorted(zip(candidates.results, scores), key=lambda x: x[1], reverse=True)
    
    # Deduplicate by start_index
    seen = set()
    top_docs = []
    for doc, rerank_score in reranked:
        sid = doc.metadata.position  # Using the position field we defined
        if sid not in seen:
            seen.add(sid)
            # Create structured document
            reranked_doc = RerankedDocument(
                content=doc.content,
                metadata=RerankedMetadata(**doc.metadata.model_dump()),
                rerank_score=float(rerank_score)
            )
            top_docs.append(reranked_doc)
        if len(top_docs) == top_n:
            break
            
    return RerankedResults(
        results=top_docs,
        original_question=question
    )

top_docs = rerank_candidates(user_question, candidates, top_n=5)

# Printing in structured JSON format
print("### Top Reranked Docs ###")
print(top_docs.model_dump_json(indent=2))


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


### Top Reranked Docs ###
{
  "results": [
    {
      "content": "So she was considering in her own mind (as well as she could, for the\nhot day made her feel very sleepy and stupid), whether the pleasure of\nmaking a daisy-chain would be worth the trouble of getting up and\npicking the daisies, when suddenly a White Rabbit with pink eyes ran\nclose by her.\n\nThere was nothing so _very_ remarkable in that; nor did Alice think it\nso _very_ much out of the way to hear the Rabbit say to itself, “Oh\ndear! Oh dear! I shall be late!” (when she thought it over afterwards,\nit occurred to her that she ought to have wondered at this, but at the\ntime it all seemed quite natural); but when the Rabbit actually _took a\nwatch out of its waistcoat-pocket_, and looked at it, and then hurried\non, Alice started to her feet, for it flashed across her mind that she\nhad never before seen a rabbit with either a waistcoat-pocket, or a\nwatch to take out of it, and burning with curiosity, she ran acro

In [11]:
# STEP 9: Build context and answer with Gemini (final result with structured output)
from pydantic import BaseModel, Field
from typing import List, Optional

# Pydantic models for structured output
class Citation(BaseModel):
    source: str
    chapter_number: Optional[str] = None
    chapter_title: Optional[str] = None
    position: Optional[int] = Field(None, alias="start_index")
    chunk_number: Optional[int] = None

class GeminiResponse(BaseModel):
    answer: str = Field(..., description="The answer from Gemini")
    citations: List[Citation] = Field(default_factory=list, description="Citations from the context")
    context_headers: List[str] = Field(default_factory=list, description="Headers from the context")

def build_context(docs):
    parts = []
    for doc in docs.results:  # Using the structured results from Step 8
        m = doc.metadata
        header = f"[Source: {m.source} | Chapter {m.chapter_number} {m.chapter_title} | Position={m.position} | Chunk {m.chunk_number}]"
        parts.append(header + "\n" + doc.content)
    return "\n\n".join(parts)

context = build_context(top_docs)

# Get headers for citations
headers = []
citations = []
for doc in top_docs.results:  # Using the structured results from Step 8
    m = doc.metadata
    headers.append(f"[Source: {m.source} | Chapter {m.chapter_number} {m.chapter_title} | Position={m.position} | Chunk {m.chunk_number}]")
    citations.append(Citation(
        source=m.source,
        chapter_number=m.chapter_number,
        chapter_title=m.chapter_title,
        start_index=m.position,  # Using position as start_index
        chunk_number=m.chunk_number
    ))

prompt = PromptTemplate.from_template("""
You are a helpful assistant. 
Answer the user question using ONLY the provided context.
Read the chunk summary carefully and if it matches with the question then check the chunk content and answer the question.
Expand the answer into at least 2–3 sentences and don't use quotes from the content unless the question is asking for the quotes.

Question: {question}
Context:
{context}
""")

answer = llm_gemini.invoke(prompt.format(question=user_question, context=context))

# Create structured response
response = GeminiResponse(
    answer=answer.content,
    citations=citations,
    context_headers=headers
)

# Printing in structured JSON format
print("### FINAL ANSWER (Structured) ###")
print(response.model_dump_json(indent=2))


### FINAL ANSWER (Structured) ###
{
  "answer": "Alice followed the White Rabbit because she was overcome with curiosity. She was astonished to see a rabbit take a watch from its waistcoat-pocket, an sight she had never encountered before. Immediately after she went down the rabbit hole, it turned into a very deep well, and she found herself falling slowly. During her descent, she observed that the sides of the well were filled with various objects like cupboards, bookshelves, maps, and pictures.",
  "citations": [
    {
      "source": "alice_in_wonderland.md",
      "chapter_number": null,
      "chapter_title": null,
      "position": null,
      "chunk_number": 1
    },
    {
      "source": "alice_in_wonderland.md",
      "chapter_number": null,
      "chapter_title": null,
      "position": null,
      "chunk_number": 0
    }
  ],
  "context_headers": [
    "[Source: alice_in_wonderland.md | Chapter None None | Position=None | Chunk 1]",
    "[Source: alice_in_wonderland.md | Cha