In [380]:
%pip install -q --upgrade langchain langchain-community langchain-google-genai langchain-text-splitters langchain-huggingface


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [381]:
%pip install -q --upgrade chromadb sentence-transformers FlagEmbedding huggingface_hub


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [382]:
import os, re, json, shutil
from pathlib import Path
from typing import List, Dict, Tuple
from pprint import pprint


In [383]:


from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate

from FlagEmbedding import FlagReranker  # local free reranker


In [384]:
import os
from dotenv import load_dotenv

# Load environment variables from .env
load_dotenv()



# Set up Hugging Face token
import huggingface_hub

# Get token from environment variable
HF_TOKEN = os.environ.get("HUGGINGFACE_TOKEN")
if not HF_TOKEN:
    raise ValueError("Please set the HUGGINGFACE_TOKEN environment variable")

# Login to Hugging Face
huggingface_hub.login(token=HF_TOKEN)


In [385]:
DATA_PATH = "alice_in_wonderland.md"
PERSIST_DIR = None  # Use in-memory database instead
GOOGLE_API_KEY = os.environ.get("GEMINI_API_KEY")


In [386]:
assert Path(DATA_PATH).exists(), "alice_in_wonderland.md not found at DATA_PATH"
assert GOOGLE_API_KEY, "Set GOOGLE_API_KEY in your environment"


In [387]:
# === Models ===
EMBED_MODEL_NAME = "BAAI/bge-base-en-v1.5"     # strong retriever (768d)
RERANK_MODEL_NAME = "BAAI/bge-reranker-base"   # local cross-encoder reranker
GEN_MODEL_NAME = "gemini-1.5-flash"            # your LLM


In [388]:
# === Chunking ===
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 500

print("[SETUP] Using:")
print(f" - Embeddings: {EMBED_MODEL_NAME}")
print(f" - Reranker:   {RERANK_MODEL_NAME}")
print(f" - LLM:        {GEN_MODEL_NAME}")
print(f" - Data file:  {DATA_PATH}")
print(f" - Chroma dir: {PERSIST_DIR}")


[SETUP] Using:
 - Embeddings: BAAI/bge-base-en-v1.5
 - Reranker:   BAAI/bge-reranker-base
 - LLM:        gemini-1.5-flash
 - Data file:  alice_in_wonderland.md
 - Chroma dir: None


In [389]:
def parse_chapters(full_text: str) -> List[Dict]:
    """
    Parses Gutenberg-like CHAPTER headers. Matches e.g.:
      CHAPTER I. Down the Rabbit-Hole
      CHAPTER II The Pool of Tears
    """
    chapters = []
    pattern = r"(?m)^(CHAPTER\s+([IVXLCDM]+)\.?\s*(.*))\s*$"
    for m in re.finditer(pattern, full_text):
        chapters.append({
            "start": m.start(),
            "end": None,
            "chapter_roman": (m.group(2) or "").strip(),
            "chapter_title": (m.group(3) or "").strip(),
            "header": m.group(1).strip()
        })
    for i in range(len(chapters)-1):
        chapters[i]["end"] = chapters[i+1]["start"]
    if chapters:
        chapters[-1]["end"] = len(full_text)
    return chapters

def chapter_for_pos(chapters: List[Dict], pos: int) -> Dict:
    if not chapters:
        return {}
    for ch in chapters:
        if ch["start"] <= pos < ch["end"]:
            return ch
    return chapters[-1]

def generate_chunk_summary(chunk_text: str, llm) -> str:
    """Generate a brief summary of the chunk content."""
    prompt = f"""
    Generate a very brief (1-2 sentences) summary of the following text chunk from Alice in Wonderland.
    Focus on the key events, characters, or dialogue.
    
    Text chunk:
    {chunk_text}
    
    Summary:"""
    
    response = llm.invoke(prompt)
    return response.content if hasattr(response, "content") else str(response)

def load_and_chunk_markdown(md_path: str, llm) -> List:
    text = Path(md_path).read_text(encoding="utf-8")
    chapters = parse_chapters(text)
    print(f"[INGEST] Parsed {len(chapters)} chapter headers")

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        add_start_index=True,
        separators=["\n\n", "\n", " ", ""]
    )
    docs = splitter.create_documents([text], metadatas=[{"source": os.path.basename(md_path)}])

    total_chunks = len(docs)
    print(f"[INGEST] Generating summaries for {total_chunks} chunks...")
    for i, d in enumerate(docs, 1):
        s = d.metadata.get("start_index", 0)
        ch = chapter_for_pos(chapters, s)
        
        # Generate summary for the chunk
        chunk_summary = generate_chunk_summary(d.page_content, llm)
        
        # Show progress every 10%
        if i % max(1, total_chunks // 10) == 0:
            print(f"Progress: {i}/{total_chunks} chunks processed ({(i/total_chunks*100):.1f}%)")
        
        d.metadata.update({
            "file": os.path.basename(md_path),
            "start_index": s,
            "chapter_number": ch.get("chapter_roman"),
            "chapter_title": ch.get("chapter_title"),
            "chapter_header": ch.get("header"),
            "chunk_summary": chunk_summary
        })
    print(f"[INGEST] Created {len(docs)} chunks with chapter-aware metadata and summaries")
    return docs

# Documents will be created in build_vectorstore()


In [390]:
def build_vectorstore():
    # Initialize LLM first
    llm = ChatGoogleGenerativeAI(
        model=GEN_MODEL_NAME,
        temperature=0.2,
        convert_system_message_to_human=True,
        google_api_key=GOOGLE_API_KEY,
    )
    
    # Create documents with summaries
    print("[INGEST] Creating documents with chunk summaries...")
    docs = load_and_chunk_markdown(DATA_PATH, llm)
    
    # Initialize embeddings
    embeddings = HuggingFaceEmbeddings(
        model_name=EMBED_MODEL_NAME,
        encode_kwargs={"normalize_embeddings": True},  # cosine-ready
        model_kwargs={"token": HF_TOKEN}  # Pass token to the model
    )

    print("[EMBED] Building Chroma collection with cosine space...")
    try:
        # Create vectorstore in memory
        vectordb = Chroma.from_documents(
            documents=docs,
            embedding=embeddings,
            collection_name="alice",
            collection_metadata={"hnsw:space": "cosine"}
        )
        return embeddings, vectordb, llm
    except Exception as e:
        print(f"Error building vectorstore: {str(e)}")
        raise

# Vectorstore will be built in the main execution cell


In [391]:
def log_embeddings_info(embeddings, vectordb):
    print("\n[🔹 EMBEDDINGS INFO]")
    print(f"Embedding model: {EMBED_MODEL_NAME}")
    dim = None
    try:
        dim = embeddings.client.get_sentence_embedding_dimension()
    except Exception:
        try:
            sample = embeddings.embed_query("test")
            dim = len(sample)
        except Exception:
            dim = "unknown"
    print(f"Embedding dimension: {dim}")
    print(f"Chroma collection: {vectordb._collection.name}")
    print(f"Chroma vectors stored: {vectordb._collection.count()}")
    print(f"Persist directory: {PERSIST_DIR}")

def log_multiquery_expansions(question: str, expansions: List[str]):
    print("\n[🔹 MULTI-QUERY EXPANSIONS]")
    print(f"Original question: {question}")
    for i, q in enumerate(expansions, 1):
        print(f"  Q{i}: {q}")

def log_retrieved_candidates(candidates: List[Dict]):
    # Group candidates by their matched query
    query_groups = {}
    for c in candidates:
        q = c.get("matched_query")
        if q not in query_groups:
            query_groups[q] = []
        query_groups[q].append(c)
    
    print("\n[🔹 RETRIEVED CANDIDATES BY QUERY]")
    
    # First print original query results
    original_query = list(query_groups.keys())[0]  # First query is the original
    print("\nORIGINAL QUESTION:")
    print(f"'{original_query}'")
    print("\nRETRIEVED CHUNKS:")
    for i, c in enumerate(query_groups[original_query], 1):
        d = c["doc"]
        md = d.metadata
        ann = c.get("ann_score")
        print(f"\nChunk {i} (Similarity Score: {ann:.4f})")
        print(f"Location: Chapter {md.get('chapter_number')}: {md.get('chapter_title')}")
        print(f"Summary: {md.get('chunk_summary', 'No summary available')}")
        print(f"Content: {d.page_content[:200]}...")
        print("\nMetadata:")
        for k, v in md.items():
            if k not in ['chunk_summary', 'source']:
                print(f"- {k}: {v}")
    
    # Then print expansion query results
    for query in list(query_groups.keys())[1:]:
        print("\n" + "="*80)
        print(f"\nGENERATED QUESTION:")
        print(f"'{query}'")
        print("\nRETRIEVED CHUNKS:")
        for i, c in enumerate(query_groups[query], 1):
            d = c["doc"]
            md = d.metadata
            ann = c.get("ann_score")
            print(f"\nChunk {i} (Similarity Score: {ann:.4f})")
            print(f"Location: Chapter {md.get('chapter_number')}: {md.get('chapter_title')}")
            print(f"Summary: {md.get('chunk_summary', 'No summary available')}")
            print(f"Content: {d.page_content[:200]}...")
            print("\nMetadata:")
            for k, v in md.items():
                if k not in ['chunk_summary', 'source']:
                    print(f"- {k}: {v}")

def log_reranked(top_docs: List, top_scores: List, enriched_all: List[Dict]):
    print("\n[🔹 RERANKED RESULTS (cross-encoder)]")
    for i, (d, s) in enumerate(zip(top_docs, top_scores), 1):
        md = d.metadata
        # find the candidate to also print its pre-rerank ANN score
        ann = None
        for item in enriched_all:
            if item["doc"] is d and "ann_score" in item:
                ann = item["ann_score"]
                break
        preview = d.page_content[:140].replace("\n", " ") + "..."
        print(f"{i:>2}. RERANK={s:.4f} | ANN={ann:.4f} | {md.get('file')} | Ch.{md.get('chapter_number')} | start={md.get('start_index')}")
        print(f"    Title: {md.get('chapter_title')}")
        print(f"    Preview: {preview}\n")

def log_final_answer(question: str, answer: str, docs: List):
    print("\n[🔹 FINAL ANSWER]")
    print(f"Question:\n{question}\n")
    print("Answer:\n" + answer + "\n")
    print("Sources used:")
    for d in docs:
        md = d.metadata
        print(f" - {md.get('file')} | Chapter {md.get('chapter_number')}: {md.get('chapter_title')} | Position: {md.get('start_index')}")


In [392]:
# LLM will be initialized in build_vectorstore()

def _extract_json_array(text: str) -> str:
    m = re.search(r"\[.*\]", text, re.S)
    return m.group(0) if m else "[]"

def expand_queries(llm, question: str, n: int = 4) -> List[str]:
    prompt = f"""
You are helping expand a user's query about the novel "Alice's Adventures in Wonderland".
Generate {n} diverse, *meaningfully different* paraphrases that could retrieve relevant passages.
Return ONLY a JSON array of strings.

Question: {question}
"""
    resp = llm.invoke(prompt)
    out = resp.content if hasattr(resp, "content") else str(resp)
    try:
        expansions = json.loads(_extract_json_array(out))
    except Exception:
        expansions = [ln.strip("-• ").strip() for ln in out.splitlines() if ln.strip()]
    # remove empties and exact duplicates of original
    expansions = [e for e in expansions if e and e.strip() and e.strip() != question.strip()]
    return expansions

def retrieve_candidates(vectordb: Chroma, queries: List[str], per_query_k: int = 10) -> List[Dict]:
    """
    For each query, run ANN retrieval with scores.
    Returns list of dicts: {"doc": Document, "ann_score": float, "matched_query": str}
    Maintains separate results for each query while avoiding exact duplicates.
    """
    all_results = []
    seen_keys = set()  # Track unique documents
    
    # First add results from original query
    original_query = queries[0]
    results = vectordb.similarity_search_with_relevance_scores(original_query, k=per_query_k)
    for doc, score in results:
        key = (doc.metadata.get("file"), doc.metadata.get("start_index"))
        if key not in seen_keys:
            seen_keys.add(key)
            all_results.append({
                "doc": doc,
                "ann_score": float(score),
                "matched_query": original_query
            })
    
    # Then add unique results from expansion queries
    for q in queries[1:]:  # Skip the original query
        results = vectordb.similarity_search_with_relevance_scores(q, k=per_query_k)
        for doc, score in results:
            key = (doc.metadata.get("file"), doc.metadata.get("start_index"))
            if key not in seen_keys:
                seen_keys.add(key)
                all_results.append({
                    "doc": doc,
                    "ann_score": float(score),
                    "matched_query": q
                })
    
    # Sort all results by score while maintaining query grouping
    all_results.sort(key=lambda x: (x["matched_query"] != original_query, -x["ann_score"]))
    return all_results


In [393]:
reranker = FlagReranker(RERANK_MODEL_NAME, use_fp16=True)

def rerank_candidates(question: str, candidates: List[Dict], top_n: int = 6) -> Tuple[List, List[float], List[Dict]]:
    if not candidates:
        return [], [], []
    docs = [c["doc"] for c in candidates]
    pairs = [[question, d.page_content] for d in docs]
    scores = reranker.compute_score(pairs)  # list of floats, higher is better
    enriched = []
    for c, s in zip(candidates, scores):
        e = dict(c)
        e["rerank_score"] = float(s)
        enriched.append(e)
    enriched.sort(key=lambda x: x["rerank_score"], reverse=True)
    top = enriched[:top_n]
    top_docs = [t["doc"] for t in top]
    top_scores = [t["rerank_score"] for t in top]
    return top_docs, top_scores, enriched


In [394]:
RAG_PROMPT = PromptTemplate.from_template("""
You are a careful literary assistant answering questions about *Alice's Adventures in Wonderland*.
Use ONLY the provided context. If the context is insufficient, say so briefly and ask for a clarification.

Return:
1) A precise answer.
2) (Optional) 1–3 brief quotes (≤ 2 sentences each).
3) Sources as: [file]:[chapter]:[start]

Context:
{context}

Question: {question}
""")

def join_with_citations(docs: List) -> str:
    parts = []
    for d in docs:
        md = d.metadata
        header = f"[Source: {md.get('file')} | Chapter {md.get('chapter_number')}: {md.get('chapter_title')} | Position: {md.get('start_index')}]"
        parts.append(header + "\n" + d.page_content.strip())
    return "\n\n---\n\n".join(parts)


In [395]:
# Build the vectorstore and get components
embeddings, vectordb, llm = build_vectorstore()

def answer_query(question: str, per_query_k: int = 10, expansions_n: int = 4, final_k: int = 6):
    print("\n=========================")
    print(f"🔍 Processing question: {question}")
    print("=========================")

    # 0) Embeddings/DB summary
    log_embeddings_info(embeddings, vectordb)

    # 1) Multi-Query expansions
    expansions = expand_queries(llm, question, n=expansions_n)
    log_multiquery_expansions(question, expansions)
    queries = [question] + expansions

    # 2) Retrieval across (original + expansions)
    candidates = retrieve_candidates(vectordb, queries, per_query_k=per_query_k)
    log_retrieved_candidates(candidates)

    # 3) Rerank with local cross-encoder
    top_docs, top_scores, enriched_all = rerank_candidates(question, candidates, top_n=final_k)
    log_reranked(top_docs, top_scores, enriched_all)

    # 4) Build context & ask LLM
    context = join_with_citations(top_docs)
    prompt = RAG_PROMPT.format(context=context, question=question)
    resp = llm.invoke(prompt)
    answer_text = resp.content if hasattr(resp, "content") else str(resp)

    # 5) Log final result
    log_final_answer(question, answer_text, top_docs)

    return {
        "answer": answer_text,
        "retrieved_count": len(candidates),
        "used_count": len(top_docs),
        "sources": [
            {
                "file": d.metadata.get("file"),
                "chapter_number": d.metadata.get("chapter_number"),
                "chapter_title": d.metadata.get("chapter_title"),
                "start_index": d.metadata.get("start_index"),
                "chunk_summary": d.metadata.get("chunk_summary", "No summary available")
            }
            for d in top_docs
        ],
        "expansions": expansions
    }


[INGEST] Creating documents with chunk summaries...
[INGEST] Parsed 24 chapter headers
[INGEST] Generating summaries for 258 chunks...


Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-1.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 15
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 36
}
].
Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing

KeyboardInterrupt: 

In [None]:
res = answer_query(
    "Why did Alice follow the White Rabbit and what happened immediately after?",
    per_query_k=10,    # candidates per (sub)query
    expansions_n=4,    # number of multi-query expansions
    final_k=6          # final top-N after rerank sent to LLM
)

# 'res' also contains expansions, counts, and structured sources if you need them programmatically



🔍 Processing question: Why did Alice follow the White Rabbit and what happened immediately after?

[🔹 EMBEDDINGS INFO]
Embedding model: BAAI/bge-base-en-v1.5
Embedding dimension: 768
Chroma collection: alice
Chroma vectors stored: 723
Persist directory: None





[🔹 MULTI-QUERY EXPANSIONS]
Original question: Why did Alice follow the White Rabbit and what happened immediately after?
  Q1: What compelled Alice to chase the White Rabbit, and what was her experience falling down the rabbit hole?
  Q2: Describe Alice's pursuit of the White Rabbit and the beginning of her journey into Wonderland.
  Q3: Why did Alice go down the rabbit hole, and what did she encounter during her descent?
  Q4: What made Alice follow the White Rabbit, and what was the immediate environment like after she entered the hole?

[🔹 RETRIEVED CANDIDATES BY QUERY (pre-rerank)]

Results for query: Why did Alice follow the White Rabbit and what happened immediately after?
--------------------------------------------------------------------------------
 1. ANN=0.7410 | Ch.XII: Alice’s Evidence
    Preview: “That _proves_ his guilt,” said the Queen.  “It proves nothing of the sort!” said Alice. “Why, you don’t even know what they’re about!”  “Re...

 2. ANN=0.7232 | Ch.I: Down th

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.



[🔹 RERANKED RESULTS (cross-encoder)]
 1. RERANK=0.1173 | ANN=0.7167 | alice_in_wonderland.md | Ch.X | start=117915
    Title: The Lobster Quadrille
    Preview: “No, no! The adventures first,” said the Gryphon in an impatient tone: “explanations take such a dreadful time.”  So Alice began telling the...

 2. RERANK=-0.9883 | ANN=0.7204 | alice_in_wonderland.md | Ch.II | start=14431
    Title: The Pool of Tears
    Preview: After a time she heard a little pattering of feet in the distance, and she hastily dried her eyes to see what was coming. It was the White R...

 3. RERANK=-1.2969 | ANN=0.7031 | alice_in_wonderland.md | Ch.VIII | start=86935
    Title: The Queen’s Croquet-Ground
    Preview: First came ten soldiers carrying clubs; these were all shaped like the three gardeners, oblong and flat, with their hands and feet at the co...

 4. RERANK=-1.5840 | ANN=0.7422 | alice_in_wonderland.md | Ch.I | start=1309
    Title: Down the Rabbit-Hole
    Preview: CHAPTER I. Down the Rabbit-H




[🔹 FINAL ANSWER]
Question:
Why did Alice follow the White Rabbit and what happened immediately after?

Answer:
Alice followed the White Rabbit impulsively after seeing it run close by and then hurry down a passage, feeling there was "not a moment to be lost." Immediately after she went down the rabbit-hole, she found herself falling down a very deep well because the hole dipped suddenly.

* "when suddenly a White Rabbit with pink eyes ran close by her."
* "There was not a moment to be lost: away went Alice like the wind, and was just in time to hear it say, as it turned a corner, “Oh my ears and whiskers, how late it’s getting!”"
* "The rabbit-hole went straight on like a tunnel for some way, and then dipped suddenly down, so suddenly that Alice had not a moment to think about stopping herself before she found herself falling down a very deep well."

[alice_in_wonderland.md]:Chapter I: Down the Rabbit-Hole:1309
[alice_in_wonderland.md]:Chapter I: Down the Rabbit-Hole:6364
[alice_in_wo