Here's the diagram for the process : [rag_image](mermaid_rag.png)
Link to the Streamlit application : [rag_application](https://alice-in-wonderland-rag.streamlit.app/)

In [21]:
# STEP 1: Install dependencies

%pip install langchain langchain-community langchain-huggingface langchain-google-genai --quiet
%pip install chromadb sentence-transformers FlagEmbedding huggingface_hub --quiet



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [22]:
# STEP 2: Imports
import os, re, json
from pathlib import Path

# LangChain components
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate

# Reranker
from FlagEmbedding import FlagReranker


In [23]:
# STEP 3: Authentication (Hugging Face + Gemini key)
from huggingface_hub import login
from dotenv import load_dotenv
load_dotenv()

# Hugging Face login
HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
login(HF_TOKEN)

# Gemini API key
GOOGLE_API_KEY = os.getenv("GEMINI_API_KEY")
assert GOOGLE_API_KEY, "Gemini API Key not found!"


In [24]:
# STEP 4: Load and chunk the book (token-aware) + batch summarization via OpenRouter (gpt-oss-20b)
import os, json, re, requests
from pathlib import Path
from langchain.text_splitter import RecursiveCharacterTextSplitter

# --- Summarization pipeline (OpenRouter: openai/gpt-oss-20b) ---
class OpenRouterSummarizer:
    def __init__(self, model="openai/gpt-oss-20b", api_key=None):
        self.model = model
        self.api_key = api_key or os.getenv("OPENROUTER_API_KEY")
        assert self.api_key, "OpenRouter API key not found. Set OPENROUTER_API_KEY in your .env."

    # Keep the same call signature you used before
    def __call__(self, texts, max_length=40, min_length=10, do_sample=False, batch_size=8):
        url = "https://openrouter.ai/api/v1/chat/completions"
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        results = []
        for t in texts:
            prompt = (
                "Summarize the passage in 2-3 sentences as per the content, capture the main point of the content and make sure you're not missing out on the quotes from the content."
                f"Passage:\n{t}\n\nSummary:"
            )
            payload = {
                "model": self.model,
                "messages": [{"role": "user", "content": prompt}],
                "temperature": 0.2,
            }
            r = requests.post(url, headers=headers, json=payload, timeout=120)
            r.raise_for_status()
            content = r.json()["choices"][0]["message"]["content"].strip()
            results.append({"summary_text": content})
        return results

# Initialize summarizer (variable name unchanged)
summarizer = OpenRouterSummarizer(model="openai/gpt-oss-20b")

# --- Function: parse chapters (captures both same-line and next-line titles) ---
def parse_chapters(text):
    """
    Matches:
      Contents lines:  'CHAPTER I. Down the Rabbit-Hole'
      Chapter headers: 'CHAPTER I.\nDown the Rabbit-Hole'
    """
    pattern = re.compile(r"CHAPTER\s+([IVXLCDM]+)\.\s*(?:([^\n]+)|\n([^\n]+))")
    chapters = []
    for m in pattern.finditer(text):
        title = (m.group(2) or m.group(3) or "").strip()
        chapters.append({
            "start": m.start(),
            "chapter_number": f"Chapter {m.group(1)}",
            "chapter_title": title
        })
    chapters.sort(key=lambda x: x["start"])
    return chapters

# --- Function: split and summarize (token-aware, reliable start_index) ---
def load_and_chunk_markdown(md_path, chunk_size=600, chunk_overlap=100, batch_size=10):
    with open(md_path, "r", encoding="utf-8") as f:
        full_text = f.read()

    chapters = parse_chapters(full_text)

    # Token-aware splitter with start indices:
    # NOTE: pass add_start_index=True on the splitter (NOT on create_documents)
    try:
        splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            add_start_index=True
        )
    except TypeError:
        # Fallback for older LangChain versions: construct directly
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            add_start_index=True
        )

    # create_documents returns documents with metadata['start_index']
    doc_objs = splitter.create_documents([full_text])

    # Prepare raw chunk strings for summarizer call
    chunks = [d.page_content for d in doc_objs]

    # 🔹 Summarize chunks (OpenRouter; sequential for clarity)
    summaries = summarizer(
        chunks,
        max_length=40,
        min_length=10,
        do_sample=False,
        batch_size=batch_size
    )

    docs = []
    for i, (d, summary_obj) in enumerate(zip(doc_objs, summaries)):
        start_index = d.metadata.get("start_index", None)
        chunk = d.page_content
        summary = summary_obj["summary_text"]

        # Find chapter for this chunk by position (binary search style)
        chapter_number, chapter_title = None, None
        if start_index is not None and chapters:
            lo, hi, idx = 0, len(chapters)-1, -1
            while lo <= hi:
                mid = (lo + hi) // 2
                if chapters[mid]["start"] <= start_index:
                    idx = mid
                    lo = mid + 1
                else:
                    hi = mid - 1
            if idx >= 0:
                chapter_number = chapters[idx]["chapter_number"]
                chapter_title = chapters[idx]["chapter_title"]

        docs.append({
            "content": chunk,
            "metadata": {
                "source": md_path,
                "chapter_number": chapter_number,     # e.g., "Chapter I"
                "chapter_title": chapter_title,       # e.g., "Down the Rabbit-Hole"
                "start_index": start_index,
                "chunk_number": i,
                "chunk_summary": summary
            }
        })

    return docs

# Load and chunk with batching
print("⚡ Summarizing chunks with OpenRouter gpt-oss-20b (token-aware splitting)...")
docs = load_and_chunk_markdown("alice_in_wonderland.md", batch_size=8)

# #printing
print("### Example 3 Chunks ###")
for d in docs[:3]:
    print(json.dumps(d, indent=2))


⚡ Summarizing chunks with OpenRouter gpt-oss-20b (token-aware splitting)...
### Example 3 Chunks ###
{
  "content": "**_ START OF THE PROJECT GUTENBERG EBOOK ALICE'S ADVENTURES IN WONDERLAND _**\n\nContents\n\nCHAPTER I. Down the Rabbit-Hole\nCHAPTER II. The Pool of Tears\nCHAPTER III. A Caucus-Race and a Long Tale\nCHAPTER IV. The Rabbit Sends in a Little Bill\nCHAPTER V. Advice from a Caterpillar\nCHAPTER VI. Pig and Pepper\nCHAPTER VII. A Mad Tea-Party\nCHAPTER VIII. The Queen\u2019s Croquet-Ground\nCHAPTER IX. The Mock Turtle\u2019s Story\nCHAPTER X. The Lobster Quadrille\nCHAPTER XI. Who Stole the Tarts?\nCHAPTER XII. Alice\u2019s Evidence\n\nCHAPTER I.\nDown the Rabbit-Hole\n\nAlice was beginning to get very tired of sitting by her sister on the\nbank, and of having nothing to do: once or twice she had peeped into\nthe book her sister was reading, but it had no pictures or\nconversations in it, \u201cand what is the use of a book,\u201d thought Alice\n\u201cwithout pictures or co

In [25]:
"""from_texts() takes each chunk from texts=[d["content"] for d in docs].
For each chunk, it calls embeddings.embed_text(text) under the hood.
embed_text converts the chunk into a high-dimensional vector (embedding).
These vectors are then stored in Chroma along with your metadata. """

'from_texts() takes each chunk from texts=[d["content"] for d in docs].\nFor each chunk, it calls embeddings.embed_text(text) under the hood.\nembed_text converts the chunk into a high-dimensional vector (embedding).\nThese vectors are then stored in Chroma along with your metadata. '

In [26]:
# STEP 5: Build embeddings + vectorstore (persistent Chroma; cosine)
from langchain_huggingface import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")

vectordb = Chroma.from_texts(
    texts=[d["content"] for d in docs],
    embedding=embeddings,
    metadatas=[d["metadata"] for d in docs],
    persist_directory="./chroma_store",              
    collection_name="alice",
    collection_metadata={"hnsw:space": "cosine"}     
)
vectordb.persist()  # save to disk

# #printing
print("### Vector DB Info ###")
print("Persist dir:", "./chroma_store")
print("Total vectors stored:", vectordb._collection.count())



### Vector DB Info ###
Persist dir: ./chroma_store
Total vectors stored: 272


In [35]:
# STEP 6: Multi-query expansions (use Gemini)
# Generate 4 alternative phrasings of the user question

def expand_queries(llm, question, n=4):
    prompt = f"""
    Generate {n} different phrasings of the following user question:
    "{question}"
    Provide only the variations, one per line.
    """
    out = llm.invoke(prompt)
    expansions = list(set(out.content.strip().split("\n")))
    return [q.strip("- ").strip() for q in expansions if q.strip()]

llm_gemini = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0.3,
    google_api_key=GOOGLE_API_KEY
)

user_question = "What are the contents in the book?"
expansions = expand_queries(llm_gemini, user_question)

#printing
print("### Multi-Query Expansions ###")
print("Original:", user_question)
for i, e in enumerate(expansions, 1):
    print(f"Q{i}: {e}")


### Multi-Query Expansions ###
Original: What are the contents in the book?
Q1: Could you tell me what's inside the book?
Q2: What topics are covered in the book?
Q3: What is the book about?
Q4: What does the book contain?


In [36]:
# STEP 7: Retrieve chunks (cosine similarity) for original + expansions

def retrieve_candidates(vectordb, queries, per_query_k=5):
    results = []
    seen = set()
    for q in queries:
        hits = vectordb.similarity_search_with_score(q, k=per_query_k)
        for doc, score in hits:
            key = (doc.metadata["start_index"], doc.metadata["chunk_number"])
            if key not in seen:
                seen.add(key)
                results.append((doc, score, q))
    return results

queries = [user_question] + expansions
candidates = retrieve_candidates(vectordb, queries, per_query_k=10)

# #printing
# #printing
print("### Retrieved Candidates ###")

def _get(m, k, default=""):
    return m.get(k, default)

for idx, (doc, score, q) in enumerate(candidates[:20], 1):
    m = doc.metadata or {}
    meta_line = (
        f"source={_get(m,'source')} | "
        f"chapter_number={_get(m,'chapter_number','N/A')} | "
        f"chapter_title={_get(m,'chapter_title','')} | "
        f"position={_get(m,'start_index','-')} | "
        f"chunk_number={_get(m,'chunk_number','-')} | "
        f"chunk_summary={_get(m,'chunk_summary','')}"
    )
    print(f"Query: {q}")
    print(f"Score (cosine ANN): {score:.4f}")
    print("Meta:", meta_line)
    print("---")



### Retrieved Candidates ###
Query: What are the contents in the book?
Score (cosine ANN): 0.3952
Meta: source=alice_in_wonderland.md | chapter_number=Chapter XII | chapter_title=Alice’s Evidence | position=134839 | chunk_number=84 | chunk_summary=The King interrogates Alice about the business, insisting it is “important—unimportant—unimportant—important—” while Alice simply replies “Nothing” and “Nothing whatever,” prompting the White Rabbit to correct the King that it is “Un_important, your Majesty means, of course,” and the jury records either “important” or “unimportant.”  Suddenly the King cites “Rule Forty‑two. All persons more than a mile high to leave the court,” to which Alice protests, “I’m not a mile high,” the King retorts, “You are,” and the Queen adds, “Nearly two miles high,” while Alice counters, “Well, I shan’t go, at any rate,” and insists the rule should be “Number One.”  The King, unsettled, orders the jury to “Consider your verdict,” the White Rabbit promises more 

In [37]:
# STEP 8: Rerank candidates with cross-encoder (dedupe; take top 5)
from FlagEmbedding import FlagReranker

reranker = FlagReranker("BAAI/bge-reranker-base", use_fp16=True)

def rerank_candidates(question, candidates, top_n=10):
    pairs = [[question, doc.page_content] for doc, _, _ in candidates]
    scores = reranker.compute_score(pairs)
    reranked = sorted(zip(candidates, scores), key=lambda x: x[1], reverse=True)
    
    # Deduplicate by start_index
    seen = set()
    top_docs = []
    for (doc, score, q), rerank_score in reranked:
        if doc.metadata["start_index"] not in seen:
            seen.add(doc.metadata["start_index"])
            top_docs.append((doc, rerank_score))
        if len(top_docs) == top_n:
            break
    return top_docs

top_docs = rerank_candidates(user_question, candidates, top_n=5)

# #printing
# #printing
print("### Top Reranked Docs ###")
for d, s in top_docs:
    m = d.metadata or {}
    chapter_number = m.get('chapter_number', 'Unknown')
    chapter_title  = m.get('chapter_title', 'Unknown')
    chunk_summary  = (m.get('chunk_summary') or '')[:100]
    print(f"RERANK={s:.4f} | {chapter_number} {chapter_title} | {chunk_summary}...")



You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


### Top Reranked Docs ###
RERANK=-2.0566 | Unknown Unknown | Alice, bored while sitting beside her sister, remarks that “without pictures or conversations” a boo...
RERANK=-8.3906 | Unknown Unknown | The Duck asks, “Found _what_?” and the Mouse replies, “Found _it_,” prompting the Duck to wonder, “w...
RERANK=-8.5391 | Chapter XII Alice’s Evidence | The King interrogates Alice about the business, insisting it is “important—unimportant—unimportant—i...
RERANK=-8.5391 | Chapter XII Alice’s Evidence | The King questions Alice about the business, to which she replies “Nothing,” and he insists, prompti...
RERANK=-8.8906 | Chapter V Advice from a Caterpillar | Alice wonders, “One side of _what?_ The other side of _what?_” and the Caterpillar replies, “Of the ...


In [38]:
# STEP 9: Build context and answer with Gemini (final result)

def build_context(docs):
    parts = []
    for d, _ in docs:
        m = getattr(d, "metadata", {}) or {}
        source         = m.get("source", "unknown")
        chapter_number = m.get("chapter_number", "Unknown")
        chapter_title  = m.get("chapter_title", "Unknown")
        start_index    = m.get("start_index", "?")
        chunk_number   = m.get("chunk_number", "?")
        header = f"[Source: {source} | Chapter {chapter_number} {chapter_title} | Position={start_index} | Chunk {chunk_number}]"
        parts.append(header + "\n" + d.page_content)
    return "\n\n".join(parts)

context = build_context(top_docs)

prompt = PromptTemplate.from_template("""
You are a helpful assistant. 
Answer the user question using ONLY the provided context.
Read the chunk summary carefully and if it matches with the question then check the chunk content and answer the question.
Expand the answer into at least 2–3 sentences and don't use quotes from the content unless the question is asking for the quotes.


Question: {question}
Context:
{context}
""")

answer = llm_gemini.invoke(prompt.format(question=user_question, context=context))

# #printing
print("### FINAL ANSWER ###")
print(answer.content)

# #printing

print("### Context Headers Preview ###")
for d, _ in top_docs:
    m = d.metadata or {}
    print(f"[Source: {m.get('source','unknown')} | Chapter {m.get('chapter_number','Unknown')} {m.get('chapter_title','Unknown')} | Position={m.get('start_index','?')} | Chunk {m.get('chunk_number','?')}]") 


### FINAL ANSWER ###
The book contains twelve chapters, each detailing a different adventure. These chapters include "Down the Rabbit-Hole," "The Pool of Tears," "A Caucus-Race and a Long Tale," "The Rabbit Sends in a Little Bill," "Advice from a Caterpillar," "Pig and Pepper," "A Mad Tea-Party," "The Queen’s Croquet-Ground," "The Mock Turtle’s Story," "The Lobster Quadrille," "Who Stole the Tarts?," and "Alice’s Evidence." These titles provide a clear outline of the narrative journey within the book.
### Context Headers Preview ###
[Source: alice_in_wonderland.md | Chapter Unknown Unknown | Position=1 | Chunk 0]
[Source: alice_in_wonderland.md | Chapter Unknown Unknown | Position=-1 | Chunk 15]
[Source: alice_in_wonderland.md | Chapter Chapter XII Alice’s Evidence | Position=134839 | Chunk 84]
[Source: alice_in_wonderland.md | Chapter Chapter XII Alice’s Evidence | Position=133976 | Chunk 83]
[Source: alice_in_wonderland.md | Chapter Chapter V Advice from a Caterpillar | Position=5255