In [6]:
%pip install langchain langchain-google-genai sentence-transformers chromadb PyPDF2 pandas tqdm --quiet


[0mNote: you may need to restart the kernel to use updated packages.


In [7]:
import os
import json
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import PyPDF2

In [8]:
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.schema import HumanMessage
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

In [10]:
# Input file names (relative paths, so they resolve against the working directory).
BOOK_PDF, QUERIES_JSON = "book.pdf", "queries.json"

# Destination path for the submission CSV, under the current user's Downloads folder.
DOWNLOADS = Path.home().joinpath("Downloads")
OUTPUT_CSV = DOWNLOADS.joinpath("submission.csv")

In [None]:
# Read the raw query payload from disk.
with open(QUERIES_JSON, "r", encoding="utf-8") as f:
    queries = json.load(f)

# Normalize to a list of {query_id, question} dicts.
# A dict with a "queries" key is unwrapped; any other dict is treated as a
# flat {query_id: question} mapping; a non-dict payload passes through as-is.
if isinstance(queries, dict) and "queries" in queries:
    queries = queries["queries"]
elif isinstance(queries, dict):
    queries = [{"query_id": k, "question": v} for k, v in queries.items()]

print(f"Loaded {len(queries)} queries")

Loaded 50 queries


In [14]:
# Step 1: Preview up to the first five queries as a sanity check
for idx in range(min(5, len(queries))):
    query = queries[idx]
    print(f"Query ID: {query['query_id']} - Question: {query['question']}")


Query ID: 1 - Question: What is the scientific method in psychology?
Query ID: 2 - Question: What are the basic parts of a neuron?
Query ID: 3 - Question: What are the stages of sleep?
Query ID: 4 - Question: What is operant conditioning?
Query ID: 5 - Question: What is problem-solving in psychology?


In [15]:
# Step 2: Extract PDF text

def extract_text_from_pdf(path):
    """Return the text of every page in the PDF at ``path``.

    Args:
        path: Location of the PDF; it is converted with ``str()`` before
            being handed to ``PyPDF2.PdfReader``.

    Returns:
        A list with one dict per page, in document order, shaped as
        ``{"page": <1-based page number>, "text": <extracted text>}``.
        When the extractor yields a falsy value for a page (``None`` or an
        empty string), ``"text"`` is the empty string.
    """
    pdf = PyPDF2.PdfReader(str(path))
    return [
        {"page": number, "text": sheet.extract_text() or ""}
        for number, sheet in enumerate(pdf.pages, start=1)
    ]

# Read the whole book once; `pages` holds one {"page", "text"} dict per PDF page.
pages = extract_text_from_pdf(path=BOOK_PDF)
print(f"Extracted {len(pages)} pages")


Extracted 753 pages


In [18]:
# Step 3: Chunk text

# Character-based splitter: chunks of at most 1500 characters with a
# 300-character overlap, trying the listed separators in the given order.
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", " "],
    chunk_overlap=300,
    chunk_size=1500,
)

# Split each page on its own, so every chunk carries the page number it came from.
docs = [
    {"text": piece, "page": entry["page"]}
    for entry in pages
    for piece in splitter.split_text(entry["text"])
]

print(f"Created {len(docs)} chunks")

Created 2201 chunks


In [19]:
# Step 4: Embeddings + Chroma
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

texts = [d["text"] for d in docs]
metadatas = [{"page": d["page"]} for d in docs]

# Deterministic ids keep this cell idempotent. The store is persisted to disk,
# so without explicit ids every re-run would append a second copy of each chunk
# and similarity search would start returning duplicates.
# NOTE(review): chunks left over from an earlier run (random ids, or a larger
# chunk count) are not removed -- delete the "chroma_store" directory after
# changing the book or the chunking parameters.
chunk_ids = [f"chunk-{i}" for i in range(len(texts))]

vectorstore = Chroma.from_texts(
    texts=texts,
    embedding=embedding_model,
    metadatas=metadatas,
    ids=chunk_ids,
    persist_directory="chroma_store"
)
# _collection is a private attribute of the LangChain wrapper; it is used here
# only to report how many documents the store holds.
print("Chroma store created with", vectorstore._collection.count(), "documents")


  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


Chroma store created with 2201 documents


In [23]:
# Step 5: LLM (Gemma-3n-e2b-it)
import os
from dotenv import load_dotenv

# Pull variables from a local .env file into the process environment.
load_dotenv()

# The key is read from GEMINI_API_KEY and re-exported as GOOGLE_API_KEY
# before the chat model is constructed.
api_key = os.getenv("GEMINI_API_KEY")
if api_key:
    os.environ["GOOGLE_API_KEY"] = api_key
else:
    raise RuntimeError("Set GEMINI_API_KEY in your .env file")

# temperature=0 is passed to the model wrapper.
llm = ChatGoogleGenerativeAI(temperature=0, model="models/gemma-3n-e2b-it")


In [24]:
# Step 6: Prompt template
#
# Instruction text used to build the prompt for each query. The two
# placeholders, {context} and {question}, are substituted with str.format,
# so any literal brace added to this text must be doubled ({{ or }}).

PROMPT_TEMPLATE = """
You are an academic assistant. Using ONLY the provided context snippets (each has a page number),
answer the user query precisely, concisely, and with references.

If the answer is not present, reply "Answer not found in the provided book."

Return answer in 2 parts:
1. The answer text
2. A References section with page numbers mentioned

Context:
{context}

User Question:
{question}
"""

In [25]:
# Step 7: Retrieval + Generation
#
# For every query: retrieve the five most similar chunks, build the prompt,
# call the LLM, and record the answer together with the pages it was given.
# NOTE(review): an earlier run hit the free-tier "input tokens per minute"
# quota (HTTP 429) partway through; consider pacing the requests if it recurs.
results = []
for q in tqdm(queries, desc="Processing queries"):
    qid = q.get("query_id")
    question = q.get("question")

    # Retrieve top-5 chunks
    retrieved = vectorstore.similarity_search(question, k=5)

    # Each snippet is prefixed with its page number so the model can cite it.
    context_text = "\n\n".join(
        f"[Page {doc.metadata['page']}]\n{doc.page_content}" for doc in retrieved
    )

    # Build prompt
    prompt = PROMPT_TEMPLATE.format(context=context_text, question=question)

    # Call LLM. `invoke` replaces calling the model object directly, which is
    # deprecated and emitted a warning on every run.
    resp = llm.invoke([HumanMessage(content=prompt)])
    answer_text = resp.content.strip()

    # Build references JSON.
    # A dedicated name is used here: rebinding the notebook-level `pages`
    # would clobber the extracted PDF pages and break a re-run of the chunking
    # step. Sorting makes the page order deterministic.
    cited_pages = sorted({doc.metadata["page"] for doc in retrieved})
    references = {"sections": [], "pages": [str(page_no) for page_no in cited_pages]}

    results.append({
        "query_id": qid,
        "context": context_text,
        "answer": answer_text,
        "references": json.dumps(references)
    })


  resp = llm([HumanMessage(content=prompt)])
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Processing queries:  14%|█▍        | 7/50 [00:24<02:25,  3.38s/it]Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_input_token_count"
  quota_id: "GenerateContentInputTokensPerModelPerMinute-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemma-3-2b"
  }
  quota_dimensions {
    key: "location"
    value:

KeyboardInterrupt: 