Here's the diagram for the process : [rag_image](mermaid_rag.png)

In [2]:
# STEP 1: Install dependencies

%pip install langchain langchain-community langchain-huggingface langchain-google-genai --quiet
%pip install chromadb sentence-transformers FlagEmbedding huggingface_hub --quiet



Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [3]:
# STEP 2: Imports
import os, re, json
from pathlib import Path

# LangChain components
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate

# Reranker
from FlagEmbedding import FlagReranker


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# STEP 3: Authentication (Hugging Face + Gemini key)
from huggingface_hub import login
from dotenv import load_dotenv
load_dotenv()

# Hugging Face login
HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
login(HF_TOKEN)

# Gemini API key
GOOGLE_API_KEY = os.getenv("GEMINI_API_KEY")
assert GOOGLE_API_KEY, "Gemini API Key not found!"


In [7]:
# STEP 4: Load and chunk the book (with batching for summarization)
from transformers import pipeline
import json, re
from pathlib import Path

# Summarization pipeline (DistilBART CNN)
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

# Function: parse chapters
def parse_chapters(text):
    headers = list(re.finditer(r"CHAPTER\s+([IVXLCDM]+)(.*)", text))
    return [(h.start(), h.group(0)) for h in headers]

# Function: split and summarize (with batching)
def load_and_chunk_markdown(md_path, chunk_size=800, chunk_overlap=200, batch_size=8):
    with open(md_path, "r", encoding="utf-8") as f:
        full_text = f.read()
    
    chapters = parse_chapters(full_text)

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", "!", "?", " ", ""]
    )
    
    chunks = splitter.split_text(full_text)
    docs = []

    # 🔹 Summarize chunks in batches
    summaries = summarizer(
        chunks,
        max_length=40,
        min_length=10,
        do_sample=False,
        batch_size=batch_size   # batching here
    )

    for i, (chunk, summary_obj) in enumerate(zip(chunks, summaries)):
        summary = summary_obj["summary_text"]

        # Find chapter for this chunk
        start_index = full_text.find(chunk)
        chapter_num, chapter_title = None, None
        for pos, header in reversed(chapters):
            if pos <= start_index:
                chapter_num = len([c for c in chapters if c[0] <= pos])
                chapter_title = header
                break

        docs.append({
            "content": chunk,
            "metadata": {
                "source": md_path,
                "chapter_number": chapter_num,
                "chapter_title": chapter_title,
                "start_index": start_index,
                "chunk_number": i,
                "chunk_summary": summary
            }
        })
    return docs

# Load and chunk with batching
print("⚡ Summarizing chunks with batching (this may take ~2–3 minutes on CPU)...")
docs = load_and_chunk_markdown("alice_in_wonderland.md", batch_size=8)

# printing
print("### Example 3 Chunks ###")
for d in docs[:3]:
    print(json.dumps(d, indent=2))


Device set to use mps:0


⚡ Summarizing chunks with batching (this may take ~2–3 minutes on CPU)...
### Example 3 Chunks ###
{
  "content": "The Project Gutenberg eBook of Alice's Adventures in Wonderland\n\nThis ebook is for the use of anyone anywhere in the United States and\nmost other parts of the world at no cost and with almost no restrictions\nwhatsoever. You may copy it, give it away or re-use it under the terms\nof the Project Gutenberg License included with this ebook or online\nat www.gutenberg.org. If you are not located in the United States,\nyou will have to check the laws of the country where you are located\nbefore using this eBook.\n\nTitle: Alice's Adventures in Wonderland\n\nAuthor: Lewis Carroll\n\nRelease date: June 27, 2008 [eBook #11]\nMost recently updated: March 30, 2021\n\nLanguage: English\n\nCredits: Arthur DiBianca and David Widger\n\n**_ START OF THE PROJECT GUTENBERG EBOOK ALICE'S ADVENTURES IN WONDERLAND _**",
  "metadata": {
    "source": "alice_in_wonderland.md",
    "chapter_n

In [None]:
"""from_texts() takes each chunk from texts=[d["content"] for d in docs].
For each chunk, it calls embeddings.embed_text(text) under the hood.
embed_text converts the chunk into a high-dimensional vector (embedding).
These vectors are then stored in Chroma along with your metadata. """

In [13]:
# STEP 5: Build embeddings + vectorstore (in memory Chroma)
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")

# Build vectorstore
vectordb = Chroma.from_texts(
    texts=[d["content"] for d in docs],
    embedding=embeddings,
    metadatas=[d["metadata"] for d in docs],
    collection_name="alice",
    collection_metadata={"hnsw:space": "cosine"}
)

#printing
print("### Vector DB Info ###")
print("Total vectors stored:", vectordb._collection.count())


### Vector DB Info ###
Total vectors stored: 251


In [32]:
# STEP 6: Multi-query expansions (use Gemini)
# Generate 4 alternative phrasings of the user question

def expand_queries(llm, question, n=4):
    prompt = f"""
    Generate {n} different phrasings of the following user question:
    "{question}"
    Provide only the variations, one per line.
    """
    out = llm.invoke(prompt)
    expansions = list(set(out.content.strip().split("\n")))
    return [q.strip("- ").strip() for q in expansions if q.strip()]

llm_gemini = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0.3,
    google_api_key=GOOGLE_API_KEY
)

user_question = "Why did Alice follow the White Rabbit and what happened immediately after she fell down the rabbit hole?"
expansions = expand_queries(llm_gemini, user_question)

#printing
print("### Multi-Query Expansions ###")
print("Original:", user_question)
for i, e in enumerate(expansions, 1):
    print(f"Q{i}: {e}")


### Multi-Query Expansions ###
Original: Why did Alice follow the White Rabbit and what happened immediately after she fell down the rabbit hole?
Q1: 3. Detail Alice's impetus for going after the White Rabbit, and outline the initial occurrences directly after her fall into its burrow.
Q2: 2. Could you explain Alice's reasons for chasing the White Rabbit, and then describe what transpired the moment she plunged down the hole?
Q3: 4. What prompted Alice to follow the White Rabbit, and what happened right after she dropped into the subterranean passage?
Q4: 1. What was Alice's motivation for pursuing the White Rabbit, and what events immediately followed her descent into the rabbit hole?


In [33]:
# STEP 7: Retrieve chunks (cosine similarity) for original + expansions

def retrieve_candidates(vectordb, queries, per_query_k=5):
    results = []
    seen = set()
    for q in queries:
        hits = vectordb.similarity_search_with_score(q, k=per_query_k)
        for doc, score in hits:
            key = (doc.metadata["start_index"], doc.metadata["chunk_number"])
            if key not in seen:
                seen.add(key)
                results.append((doc, score, q))
    return results

queries = [user_question] + expansions
candidates = retrieve_candidates(vectordb, queries, per_query_k=5)

#printing
print("### Retrieved Candidates ###")
for doc, score, q in candidates[:10]:
    meta = doc.metadata
    print(f"Query: {q}")
    print(f"Score: {score:.4f}")
    print(f"Meta: {meta}")
    print("---")


### Retrieved Candidates ###
Query: Why did Alice follow the White Rabbit and what happened immediately after she fell down the rabbit hole?
Score: 0.2055
Meta: {'start_index': 2678, 'chapter_number': 13, 'chunk_summary': ' The rabbit-hole went straight on like a tunnel for some way, and then dipped suddenly down . In another moment down went Alice after it, never once considering how she was to get', 'source': 'alice_in_wonderland.md', 'chunk_number': 4, 'chapter_title': 'CHAPTER I.'}
---
Query: Why did Alice follow the White Rabbit and what happened immediately after she fell down the rabbit hole?
Score: 0.2926
Meta: {'chapter_title': 'CHAPTER I.', 'chunk_number': 11, 'source': 'alice_in_wonderland.md', 'start_index': 6364, 'chunk_summary': ' Alice was not a bit hurt, and she jumped up on to her feet in a moment . Before her was another long passage, and the White Rabbit was still in sight, hurrying', 'chapter_number': 13}
---
Query: Why did Alice follow the White Rabbit and what hap

In [34]:
# STEP 8: Rerank candidates with cross-encoder
reranker = FlagReranker("BAAI/bge-reranker-base", use_fp16=True)

def rerank_candidates(question, candidates, top_n=5):
    pairs = [[question, doc.page_content] for doc, _, _ in candidates]
    scores = reranker.compute_score(pairs)
    reranked = sorted(zip(candidates, scores), key=lambda x: x[1], reverse=True)
    
    # Deduplicate by start_index
    seen = set()
    top_docs = []
    for (doc, score, q), rerank_score in reranked:
        if doc.metadata["start_index"] not in seen:
            seen.add(doc.metadata["start_index"])
            top_docs.append((doc, rerank_score))
        if len(top_docs) == top_n:
            break
    return top_docs

top_docs = rerank_candidates(user_question, candidates, top_n=5)

#printing
print("### Top Reranked Docs ###")
for d, s in top_docs:
    print(f"RERANK={s:.4f} | {d.metadata['chapter_title']} | {d.metadata['chunk_summary'][:80]}...")


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


### Top Reranked Docs ###
RERANK=3.5273 | CHAPTER I. |  The rabbit-hole went straight on like a tunnel for some way, and then dipped su...
RERANK=2.4629 | CHAPTER I. |  Alice was beginning to get tired of sitting by her sister on the bank . She was...
RERANK=2.1152 | CHAPTER I. |  Alice had never before seen a rabbit with either a waistcoat-pocket, or a watch...
RERANK=1.7168 | CHAPTER II. |  Alice felt so desperate that she was ready to ask help of any one; so, when the...
RERANK=-0.2273 | CHAPTER I. |  Alice was not a bit hurt, and she jumped up on to her feet in a moment . Before...


In [35]:
# STEP 9: Build context and answer with Gemini

def build_context(docs):
    parts = []
    for d, _ in docs:
        m = d.metadata
        header = f"[Source: {m['source']} | Chapter {m['chapter_number']} {m['chapter_title']} | Position={m['start_index']} | Chunk {m['chunk_number']}]"
        parts.append(header + "\n" + d.page_content)
    return "\n\n".join(parts)

context = build_context(top_docs)

prompt = PromptTemplate.from_template("""
You are a helpful assistant. 
Answer the user question using ONLY the provided context. Read the chunk summary carefully and if it matches with the question then check the chunk content and answer the question. Exapnd the answer into atleast 2-3 sentences.
Cite the source and chunk content at the end.                     

Question: {question}
Context:
{context}
""")

answer = llm_gemini.invoke(prompt.format(question=user_question, context=context))

#printing
print("### FINAL ANSWER ###")
print(answer.content)


### FINAL ANSWER ###
Alice followed the White Rabbit primarily out of curiosity. She was astonished when she saw the Rabbit take a watch out of its waistcoat-pocket and then hurry away, as she had never before seen a rabbit with such human-like characteristics. Immediately after falling down the rabbit hole, Alice found herself falling down a very deep well, and upon landing, she quickly got to her feet and saw another long passage with the White Rabbit still in sight, hurrying down it.

Source: alice_in_wonderland.md | Chapter 13 CHAPTER I. | Position=1937 | Chunk 3, alice_in_wonderland.md | Chapter 13 CHAPTER I. | Position=2678 | Chunk 4, alice_in_wonderland.md | Chapter 13 CHAPTER I. | Position=6364 | Chunk 11
