In [14]:
# Install the Gemini client library used below (imported as google.generativeai).
# NOTE(review): the version is not pinned — consider pinning it for reproducible runs.
%pip install google-generativeai


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [15]:
from pathlib import Path
import json
import pandas as pd
import torch

# LangChain
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.pdf import PDFPlumberLoader
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# Google AI
import google.generativeai as genai
import os

# ---------------- paths ----------------
# Input files (textbook PDF and the query list) and the CSV written for submission.
PDF_PATH = Path("../Sources/book.pdf")
QUERIES_PATH = Path("../Sources/queries.json")
SUBMISSION_CSV = Path("./submission.csv")

# Location of the persisted Chroma store and the collection name inside it.
PERSIST_DIR = Path("./chroma_db")
COLLECTION_NAME = "psychology_textbook"

# ---------------- RAG params ----------------
# Splitter settings, measured in characters (800 chars is roughly 200 tokens).
CHUNK_SIZE, CHUNK_OVERLAP = 800, 100

# Retrieval depth: fetch CANDIDATE_K chunks, keep at most FINAL_K of them.
CANDIDATE_K, FINAL_K = 30, 10

# Sentence-transformers model used for embeddings.
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

print("✅ Config loaded")


✅ Config loaded


In [None]:
# Configure the Gemini client from the GEMINI_API_KEY environment variable.
# Export the variable before starting the notebook. The key is deliberately
# never assigned here: overwriting os.environ would discard a key that was
# already exported, and a pasted key would be saved with the notebook file.
if not os.environ.get("GEMINI_API_KEY", "").strip():
    raise RuntimeError(
        "GEMINI_API_KEY is not set. Export it in the environment before "
        "starting the notebook."
    )

genai.configure(api_key=os.environ["GEMINI_API_KEY"])
gemini_model = genai.GenerativeModel("gemini-2.5-flash")

print("✅ Gemini 2.5 Flash ready")


✅ Gemini 2.5 Flash ready


In [24]:
# Load the PDF into a list of documents.
loader = PDFPlumberLoader(str(PDF_PATH))
docs = loader.load()

# Guarantee a "page" entry on every loaded document before splitting, so the
# chunks derived from it carry one as well.
# NOTE(review): this fallback numbers documents from 1 — confirm that matches
# the indexing the loader itself uses for "page".
for position, doc in enumerate(docs, start=1):
    if doc.metadata.get("page") is None:
        doc.metadata["page"] = position  # fallback: sequential page assignment

# Split into overlapping chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP
)
chunks = splitter.split_documents(docs)

# Verify the claim printed below instead of papering over it: a chunk without
# a page must not be given the source file path as its "page", because that
# value is later reported as a page reference.
chunks_without_page = [c for c in chunks if c.metadata.get("page") is None]
assert not chunks_without_page, (
    f"{len(chunks_without_page)} chunks have no page number"
)

print(f"✅ Loaded {len(docs)} pages → {len(chunks)} chunks (all with page numbers)")


✅ Loaded 753 pages → 3387 chunks (all with page numbers)


In [26]:
# Embedding function built from the configured sentence-transformers model
embedding_fn = HuggingFaceEmbeddings(model_name=EMBED_MODEL)

# Chroma vector store backed by the on-disk directory PERSIST_DIR
vectordb = Chroma(
    persist_directory=str(PERSIST_DIR),
    collection_name=COLLECTION_NAME,
    embedding_function=embedding_fn,
)

# Index the chunks only when the collection holds nothing yet;
# otherwise reuse what is already stored.
if vectordb._collection.count() > 0:
    print("✅ Using existing Chroma DB")
else:
    vectordb.add_documents(chunks)
    print("✅ Chunks added to Chroma DB")


✅ Using existing Chroma DB


In [25]:
def prepare_context(question: str, k=CANDIDATE_K, final_k=FINAL_K):
    """Retrieve chunks for ``question`` and assemble the prompt context.

    Args:
        question: The query text sent to the retriever.
        k: Number of candidate chunks requested from the vector store.
        final_k: Maximum number of chunks kept; values <= 0 keep none.

    Returns:
        A tuple ``(selected, context, pages)``:
        ``selected`` is the list of kept documents in retrieval order,
        ``context`` is their text joined with a ``---`` separator, and
        ``pages`` is the ``"page"`` metadata of each kept document
        (same order, duplicates preserved).
    """
    retriever = vectordb.as_retriever(search_kwargs={"k": k})
    candidates = retriever.invoke(question)   # new API (avoids deprecation warning)

    # Keep only candidates that carry a page number, preserving rank order.
    with_page = [doc for doc in candidates if doc.metadata.get("page") is not None]

    # Cap with a slice rather than an equality check inside the loop, so the
    # limit also holds when final_k is 0 or negative.
    selected = with_page[:max(final_k, 0)]

    context = "\n\n---\n\n".join(doc.page_content for doc in selected)
    pages = [doc.metadata["page"] for doc in selected]

    return selected, context, pages


In [41]:
# Prompt template with two placeholders, {context} and {question}.
# It asks for a concise, paraphrased, academic-style answer drawn only from the
# supplied context, with page citations of the form [p. <number>] placed at the end.
prompt = PromptTemplate.from_template(
"""Answer the following question in an academic style using only the provided context. Don't give lengthy explanations. Keep it concise and to the point as per the context.
Paraphrase instead of copying verbatim. Cite textbook pages at the end of the answer like [p. <number>]. Don't cite the pages in the middle of the answer. You don't have to consider the entire context. Just answer the question based on relevant information from the context. 

Context:
{context}

Question:
{question}

Answer:
"""
)


In [42]:
def answer_question(question: str) -> dict:
    """Answer ``question`` with Gemini, grounded in retrieved textbook chunks.

    Args:
        question: The question to answer.

    Returns:
        A dict with three keys:
        ``"answer"`` (the stripped model text, or ``""`` when the model
        returned no text), ``"context"`` (the joined chunk text from
        ``prepare_context``), and ``"references"`` (``{"pages": [...]}``).
    """
    docs, context, pages = prepare_context(question)

    # The prompt asks for page citations, so the model must be shown which page
    # each chunk comes from. Only the prompt gets the labels; the "context"
    # value returned below stays the plain joined text.
    cited_context = "\n\n---\n\n".join(
        f"[p. {doc.metadata['page']}]\n{doc.page_content}" for doc in docs
    )
    prompt_str = prompt.format(question=question, context=cited_context)

    # Call Gemini
    response = gemini_model.generate_content(prompt_str)
    try:
        text = response.text if response else ""
    except ValueError:
        # response.text raises when the reply holds no text part
        # (for example a blocked or empty candidate); treat it as no answer.
        text = ""
    answer = text.strip() if text else ""

    return {
        "answer": answer,
        "context": context,
        "references": {"pages": pages}
    }

# Quick test
test_out = answer_question("What is the scientific method in psychology?")
print(test_out["answer"])


The scientific method in psychology is an empirical process used to advance scientific knowledge by studying the mind and behavior. This method relies on observation and experimentation, rather than solely on logical arguments or previous authorities. It operates as a circular process where ideas, in the form of theories and hypotheses, are tested against real-world empirical observations, which then lead to the development of further ideas. Both deductive and inductive reasoning are integral to this scientific process [p. 1, 8, 40].
