**Installing Dependencies**


In [101]:
%pip install -q google-generativeai python-dotenv


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [102]:
from pathlib import Path
import os, json
import pandas as pd
from pydantic import BaseModel, Field
import google.generativeai as genai
from dotenv import load_dotenv

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings


In [103]:
# Name of the Gemini model used for every generation request in this notebook.
LLM_NAME = "gemini-1.5-flash"

# Load variables from a local .env file, then read the API key from the
# process environment; fail fast when it is absent or empty.
load_dotenv()
GEMINI_KEY = os.environ.get("GEMINI_API_KEY")
if GEMINI_KEY is None or GEMINI_KEY == "":
    raise RuntimeError("GEMINI_API_KEY missing in .env")

# Register the key with the SDK and create the shared model handle.
genai.configure(api_key=GEMINI_KEY)
gemini_model = genai.GenerativeModel(LLM_NAME)


In [104]:

# Tunable inputs for this stage: the source PDF and the splitter settings.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200
PDF_PATH = Path("../../Sources/book.pdf")

# Build the loader and the splitter up front; neither touches the file yet
# as far as this cell is concerned — the work happens in the two calls below.
loader = PyPDFLoader(str(PDF_PATH))
splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

# One document per PDF page, then overlapping chunks derived from those pages.
pages = loader.load()
docs = splitter.split_documents(pages)


In [105]:

EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
PERSIST_DIR = Path("./chroma_store")
COLLECTION_NAME = "textbook_chunks"

# Open (or create) the persisted collection first. Calling
# Chroma.from_documents unconditionally appends to an existing collection, so
# every re-run of this cell stored another copy of each chunk and the
# retriever then returned duplicate hits.
db = Chroma(
    collection_name=COLLECTION_NAME,
    embedding_function=embeddings,
    persist_directory=str(PERSIST_DIR),
)

# Index the chunks only when the collection is still empty. To rebuild after
# changing the PDF, the chunking parameters or the embedding model, delete
# PERSIST_DIR and re-run this cell.
if not db.get(limit=1)["ids"]:
    db = Chroma.from_documents(
        documents=docs,
        embedding=embeddings,
        persist_directory=str(PERSIST_DIR),
        collection_name=COLLECTION_NAME,
    )


In [106]:

# Lookup table used by prepare_context: keys are page numbers as strings,
# values are the section names reported for those pages.
PAGE_TO_SECTION_PATH = Path("./helper/page_to_section.json")

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open(PAGE_TO_SECTION_PATH, "r", encoding="utf-8") as f:
    page_to_section = json.load(f)


In [107]:

# Response schema handed to Gemini as `response_schema` in answer_question.
# Documented with comments rather than a docstring on purpose: a class
# docstring would become part of the generated schema.
# NOTE(review): the description below asks for 100-300 words, whereas
# PROMPT_TEMPLATE asks for 50-400 words — confirm which limit is intended.
class QAResponse(BaseModel):
    # The generated answer text; answer_question reads it back as "answer".
    answer: str = Field(..., description="Answer in 100-300 words")


In [108]:

def prepare_context(query: str, k: int = 3, page_offset: int = 12):
    """
    Retrieve the top-k chunks for a query and map each one to a textbook section.

    Args:
        query: The question used for the similarity search.
        k: Number of chunks requested from the retriever.
        page_offset: Value subtracted from the chunk's ``page_label`` to get
            the page number used as a key in ``page_to_section`` (defaults to
            12, the front-matter offset this notebook has always used).

    Returns:
        A dict with ``"question"`` plus, for the n-th *kept* chunk (1-based),
        the keys ``"chunk{n}"``, ``"page number {n}"`` and ``"section {n}"``,
        each holding a one-element list. Numbering is always contiguous, so
        callers that walk ``chunk1, chunk2, ...`` until a key is missing see
        every kept chunk.
    """
    retriever = db.as_retriever(search_kwargs={"k": k})
    results = retriever.invoke(query)

    output = {"question": query}

    # Counts only the chunks actually emitted. Numbering by retrieval rank
    # would leave a gap whenever a chunk is skipped, and every consumer stops
    # at the first missing "chunk{n}" key.
    kept = 0

    for doc in results:
        # 1. Get the page label from the chunk metadata.
        raw_page = doc.metadata.get("page_label")

        # No page label: the chunk cannot be mapped to a section, skip it.
        if raw_page is None:
            continue

        # 2. Adjust the page number (remove the front-matter offset).
        try:
            adjusted_page = int(raw_page) - page_offset
        except (TypeError, ValueError):
            # Non-numeric label (e.g. roman-numeral front matter): it cannot
            # be mapped either, so skip it instead of crashing the whole run.
            continue

        kept += 1
        page_str = str(adjusted_page)

        # 3. Find the textbook section for this page.
        section_name = page_to_section.get(page_str, "Unknown_Section")

        # 4. Add this chunk's info to the output.
        output[f"chunk{kept}"] = [doc.page_content]   # the actual text
        output[f"page number {kept}"] = [page_str]    # adjusted page number
        output[f"section {kept}"] = [section_name]    # matching section

    return output


In [109]:

# Prompt sent to Gemini by answer_question; filled with `question` and
# `context` via str.format. The pieces are joined by implicit string
# concatenation, so the first sentence needs its own trailing space —
# without it the model received "...not more than that.If the context...".
# NOTE(review): QAResponse.answer's description asks for 100-300 words while
# this prompt asks for 50-400 — confirm which limit is intended.
PROMPT_TEMPLATE = (
    "You are a psychology tutor. Using ONLY the context, answer the question in atleast 50 words and atmost 400 words, not more than that. "
    "If the context is insufficient, state what is missing.\n\n"
    "Question: {question}\n\nContext:\n{context}\n\n"
    "Write only the answer. Do not include any citations, page numbers, or section names in the answer text."
)


In [110]:

def format_context_for_prompt(context_data):
    """
    Join the retrieved chunk texts into one prompt-ready string.

    Walks the keys ``chunk1``, ``chunk2``, ... in order, stops at the first
    missing key, and takes the first element stored under each one. Page and
    section entries are ignored. Texts are separated by a blank line.
    """
    texts = []
    position = 1
    while True:
        key = f"chunk{position}"
        if key not in context_data:
            break
        texts.append(context_data[key][0])
        position += 1
    return "\n\n".join(texts)


In [111]:

def answer_question(question: str, k: int = 3):
    """
    Answer one question from retrieved textbook context.

    Retrieves context with ``prepare_context``, fills ``PROMPT_TEMPLATE`` and
    asks ``gemini_model`` for a JSON response shaped like ``QAResponse``.

    Returns a dict with:
        ``question``      the input question,
        ``answer``        the stripped answer text; if anything raised while
                          calling the model this is ``"Error: <message>"``,
        ``sources_used``  one ``{"chunk", "page", "section"}`` dict per
                          retrieved chunk (``"?"`` where a value is missing).
    """
    context_data = prepare_context(question, k=k)
    prompt = PROMPT_TEMPLATE.format(
        question=question,
        context=format_context_for_prompt(context_data),
    )

    try:
        response = gemini_model.generate_content(
            prompt,
            generation_config={
                "response_mime_type": "application/json",
                "response_schema": QAResponse,
            },
        )
        # Prefer an already-parsed object when the response exposes one.
        parsed = getattr(response, "parsed", None)
        if parsed:
            answer_text = parsed.answer
        else:
            # Otherwise decode the JSON body ourselves; if it is not usable
            # JSON, fall back to the raw response text.
            try:
                answer_text = json.loads(response.text).get("answer", "")
            except Exception:
                answer_text = response.text or ""
    except Exception as exc:
        # Best effort: report the failure in the answer field rather than raising.
        answer_text = f"Error: {exc}"

    # Collect page/section provenance for each contiguous chunk entry.
    sources = []
    position = 1
    while True:
        if f"chunk{position}" not in context_data:
            break
        page_entry = context_data.get(f"page number {position}", ["?"])
        section_entry = context_data.get(f"section {position}", ["?"])
        sources.append({
            "chunk": position,
            "page": page_entry[0],
            "section": section_entry[0],
        })
        position += 1

    cleaned_answer = (answer_text or "").strip()
    return {
        "question": question,
        "answer": cleaned_answer,
        "sources_used": sources,
    }


In [112]:

# Smoke test: answer a single question and list where its context came from.
test_q = "What is the scientific method in psychology?"
res = answer_question(question=test_q, k=3)

# Answer first, then the "Sources:" heading on its own line.
print(res["answer"], "Sources:", sep="\n")
for s in res["sources_used"]:
    print(f"- {s['section']} (p.{s['page']})")


The scientific method in psychology involves testing ideas (theories and hypotheses) against real-world observations.  This process allows for the advancement of scientific knowledge within the field.  It is a cyclical process where observations lead to the formation of theories, which then generate testable hypotheses. The results of these tests inform whether the theory should be revised, refined, or rejected, leading to further testing and refinement of knowledge.  Essentially, it's a systematic approach to understanding behavior and mental processes through empirical evidence and rigorous testing.
Sources:
- psychological_research/why_is_research_important (p.39)
- psychological_research/why_is_research_important (p.39)
- psychological_research/why_is_research_important (p.39)


In [113]:

# Questions for the batch run; each entry is read later via its "query_id"
# and "question" keys.
QUERIES_PATH = Path("../../Sources/queries.json")

# Decode explicitly as UTF-8 so question text is read identically on every
# platform instead of relying on the default locale encoding.
with open(QUERIES_PATH, "r", encoding="utf-8") as f:
    queries = json.load(f)

print(f"Loaded {len(queries)} questions")

def _page_sort_key(page):
    """Order page strings numerically; anything that is not an integer sorts last."""
    try:
        return int(page)
    except (TypeError, ValueError):
        return float("inf")


def process_all_questions(queries_list, output_filename="submission_solution2.csv", k=3):
    """
    Answer every question and write the results to a CSV file.

    Args:
        queries_list: Iterable of dicts with ``"query_id"`` and ``"question"``.
        output_filename: Path of the CSV file to write.
        k: Number of chunks to retrieve per question.

    Returns:
        The DataFrame that was written, with columns ``ID``, ``context``,
        ``answer`` and ``references``. ``references`` is a JSON string holding
        the de-duplicated ``sections`` (in retrieval order) and ``pages``
        (sorted numerically) of the retrieved chunks.
    """
    # Fixed column order, so an empty query list still produces a CSV header.
    columns = ["ID", "context", "answer", "references"]
    rows = []
    for q in queries_list:
        # Retrieved context, kept so it can be stored next to the answer.
        context_data = prepare_context(q["question"], k=k)
        formatted_context = format_context_for_prompt(context_data)

        # Get the answer
        r = answer_question(q["question"], k=k)

        # Extract unique sections and pages for the references column.
        sections = []
        pages = []
        i = 1
        while f"chunk{i}" in context_data:
            page = context_data.get(f"page number {i}", ["?"])[0]
            section = context_data.get(f"section {i}", ["?"])[0]
            if page != "?" and page not in pages:
                pages.append(page)
            if section != "?" and section not in sections:
                sections.append(section)
            i += 1

        # Numeric sort. int() also understands negative page strings, which
        # an isdigit() check would have pushed to the end of the list.
        pages = sorted(pages, key=_page_sort_key)

        # Create references JSON
        references = json.dumps({
            "sections": sections,
            "pages": pages
        })

        rows.append({
            "ID": q["query_id"],
            "context": formatted_context,
            "answer": r["answer"],
            "references": references
        })

    df = pd.DataFrame(rows, columns=columns)
    df.to_csv(output_filename, index=False)
    print(f"Saved -> {output_filename} ({len(rows)} rows)")
    return df

# Confirms the cell ran to the end, i.e. process_all_questions is now defined.
print("Batch processor ready (competition format)")


Loaded 50 questions
Batch processor ready (competition format)


In [114]:
# Run the full batch over the loaded questions and write the submission CSV.
results_df = process_all_questions(
    queries_list=queries,
    output_filename="submission2.csv",
    k=3,
)

Saved -> submission2.csv (50 rows)


The final submission file is [submission2.csv](submission2.csv).