**Installing Dependencies**

See requirements.txt, or the install cell below, for the required dependencies.

In [None]:
# === Cell 1: Install Dependencies ===
%pip install -q transformers accelerate langchain langchain-community langchain-core langchain-chroma instructor langchain-huggingface pypdf chromadb requests==2.32.4

**Importing necessary libraries**

Standard library: pathlib, json. PyTorch: torch.

LangChain: PDF loader, text splitter, embeddings, vector DB.

Instructor & Pydantic: enforce structured outputs.



In [3]:
# === Cell 2: Imports ===
from pathlib import Path
import json
import torch

# LangChain
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# Instructor
import instructor
from pydantic import BaseModel, Field
from typing import List


**Load and Split Textbook PDF**

Load book.pdf → 753 pages.

Split into 3384 chunks of up to 800 characters each, with a 100-character overlap between consecutive chunks.

Each chunk keeps the metadata of the page it came from (including the page number).

In [4]:
# === Cell 3: Load PDF + Page-aware Chunking ===

# Chunking parameters, both measured in characters
CHUNK_SIZE = 800
CHUNK_OVERLAP = 100

# Location of the textbook PDF
PDF_PATH = Path("/content/book.pdf")

# Read the PDF; the loader returns one document per page, with page metadata attached
loader = PyPDFLoader(str(PDF_PATH))
pages = loader.load()
print(f"Loaded {len(pages)} pages from textbook") #753 pages.

# Every page document is split on its own, so a chunk never spans two pages
splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
docs = splitter.split_documents(pages)
print(f"Split into {len(docs)} chunks") #3384 chunks.




Loaded 753 pages from textbook
Split into 3384 chunks


**Build Chroma Vector Database**

Convert each chunk into a 384-dimensional embedding using all-MiniLM-L6-v2.

Store them in ChromaDB for fast semantic search.

Persist the database to disk. A collection name plays the same role as a table name in a relational database.

In [None]:
# === Cell 4: Build Vector Database (Chroma) ===

# Sentence-embedding model used to vectorise every chunk
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)

# On-disk location and collection name of the Chroma store
PERSIST_DIR = Path("./chroma_store")
COLLECTION_NAME = "textbook_chunks"

# Deterministic IDs, one per chunk position. Without explicit IDs every run
# inserts the chunks under fresh random IDs, so re-running this cell against
# the persisted directory would store each chunk again and retrieval would
# return duplicates. Stable IDs make a re-run overwrite the existing entries.
# NOTE(review): if the PDF or the chunking parameters change so that fewer
# chunks are produced, delete PERSIST_DIR first to drop stale entries.
chunk_ids = [f"chunk-{i}" for i in range(len(docs))]

# Build / Persist database
db = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    ids=chunk_ids,
    persist_directory=str(PERSIST_DIR),
    collection_name=COLLECTION_NAME,
)

print("Chroma vector DB built and persisted")


**Load Page-to-Section Mapping**

page_to_section.json is a file created from the table of contents of the book which maps textbook page numbers to the sections. Even though the file is already in JSON format, it’s just text on disk.
json.load() is what actually converts it into a usable Python dictionary.

page_to_section.json maps page numbers → textbook sections.

Example: "79": "biopsychology/cells_of_the_nervous_system".

In [7]:
# === Cell 5: Load Page-to-Sections Mapping ===

# JSON file mapping textbook page numbers (as strings) to section names
PAGE_TO_SECTION_PATH = Path("/content/page_to_section.json")

# Read explicitly as UTF-8 so decoding does not depend on the platform's
# default locale encoding.
with PAGE_TO_SECTION_PATH.open("r", encoding="utf-8") as f:
    page_to_section = json.load(f)

print("Loaded page_to_sections.json")
# Show the first few entries as a quick sanity check of the mapping
print(f"Sample mapping: {list(page_to_section.items())[:5]}")


Loaded page_to_sections.json
Sample mapping: [('1', 'preface'), ('2', 'preface'), ('3', 'preface'), ('4', 'preface'), ('5', 'preface')]


**Instructor Schema**

We use Pydantic's `Field` function to describe the structure of the LLM's response.

... means required field.

In [None]:
# === Cell 6: Define Instructor Schema ===

# Pydantic model describing the structured answer expected from the LLM
class QAResponse(BaseModel):
    # Required field (Ellipsis default); the description carries the word-count guidance
    answer: str = Field(
        default=...,
        description="The answer in at least 200 words and at most 400 words",
    )

print("QAResponse schema defined")


**Helper Functions**

These functions are used in the retrieval phase.

`adjust_page_number` converts a PDF page number into the textbook page number by subtracting an offset of 12.

`lookup_section` looks up the section name for a page number (the key) in page_to_section.json.

In [10]:
# === Utility Functions ===

def adjust_page_number(raw_page: str, offset: int = 12) -> str:
    """Return the textbook page number for a raw PDF page number.

    The raw value is parsed as an integer, reduced by ``offset``, and
    handed back as a string.
    """
    textbook_page = int(raw_page) - offset
    return f"{textbook_page}"

def lookup_section(page_number: str, mapping: dict) -> str:
    """Return the section stored under ``page_number`` in ``mapping``.

    Falls back to ``"Unknown_Section"`` when the page is not a key of the mapping.
    """
    if page_number in mapping:
        return mapping[page_number]
    return "Unknown_Section"


**Retrieval Function**

1. For a given query, retrieve top-k relevant chunks from ChromaDB.
2. Get Page number from metadata
3. Adjust page number
4. Map with sections
5. Output Chunk text and metadata (page number, section name)






In [11]:
# === Retriever Function ===

def prepare_context(query: str, k: int = 3):
    """
    Retrieve the top-k chunks for a query and annotate them with page and section.

    Args:
        query: Question text used to search the vector store.
        k: Number of chunks requested from the retriever.

    Returns:
        A dict holding ``"question"`` plus, for every usable chunk at
        retrieval rank ``i`` (1-based), the single-item lists ``"chunk{i}"``,
        ``"page number {i}"`` and ``"section {i}"``. Chunks without a numeric
        ``page_label`` are left out, so the ranks present may have gaps.
    """
    retriever = db.as_retriever(search_kwargs={"k": k})
    results = retriever.invoke(query)

    output = {"question": query}

    for idx, doc in enumerate(results, start=1):
        # 1. Get the page number from metadata; skip the chunk if there is none
        raw_page = doc.metadata.get("page_label")
        if raw_page is None:
            continue

        # 2. Adjust the page number (remove front matter offset)
        try:
            page_str = adjust_page_number(raw_page)
        except (TypeError, ValueError):
            # A label that is not an integer (e.g. roman-numeral front matter)
            # cannot be mapped to a textbook page; treat it like a missing
            # label instead of aborting the whole retrieval.
            continue

        # 3. Find the textbook section for this page
        section_name = lookup_section(page_str, page_to_section)

        # 4. Add this chunk's info into the output
        output[f"chunk{idx}"] = [doc.page_content]   # the actual text
        output[f"page number {idx}"] = [page_str]    # adjusted page number
        output[f"section {idx}"] = [section_name]    # matching section

    return output


**Load LLM using HuggingFaceModels**

Using TinyLlama, which has 1.1B parameters and a 2048-token context window (approx. 1500 words).

In [None]:
# === Cell A: Load TinyLlama-Chat Model ===

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Identifier of the chat model checkpoint to load
LLM_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Tokenizer (text <-> tokens) and causal language model from the same checkpoint;
# device_map="auto" leaves device placement to the loader
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL)
model = AutoModelForCausalLM.from_pretrained(LLM_MODEL, device_map="auto")

# Text-generation pipeline built on the already-loaded model and tokenizer
chat_pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

print("TinyLlama-Chat model loaded and pipeline created")


**Answer Function**

(Chat + Unicode Cleaning + Page/Section Mapping)


In [13]:
# === Unified Answer Function (Chat Format + Enhancements) ===

import unicodedata

# Typographic punctuation that NFKD normalization does not decompose. Without
# this table the ASCII encoding step would delete these characters outright
# (e.g. "don’t" would become "dont").
_ASCII_PUNCTUATION = str.maketrans({
    "\u2018": "'", "\u2019": "'", "\u201a": "'", "\u201b": "'",   # single quotes
    "\u201c": '"', "\u201d": '"', "\u201e": '"', "\u201f": '"',   # double quotes
    "\u2010": "-", "\u2011": "-", "\u2012": "-", "\u2013": "-",   # hyphens / en dash
    "\u2014": "-", "\u2015": "-", "\u2212": "-",                  # em dash, bar, minus
})


def clean_text(text: str) -> str:
    """
    Reduce text to plain ASCII while keeping it readable.

    Curly quotes and dashes are first mapped to their ASCII counterparts.
    NFKD normalization then splits ligatures and accented letters into base
    characters, and anything still outside ASCII is dropped.

    Args:
        text: Arbitrary unicode text.

    Returns:
        An ASCII-only version of ``text``.
    """
    normalized = unicodedata.normalize("NFKD", text.translate(_ASCII_PUNCTUATION))
    return normalized.encode("ascii", "ignore").decode("ascii")

def answer_question_chat(question: str) -> dict:
    """
    Answer a question from retrieved textbook context using the chat model.

    Retrieves context with ``prepare_context``, builds a chat-style prompt in
    which every chunk is tagged with its textbook page, generates an answer
    with ``chat_pipe``, and reduces the answer to ASCII with ``clean_text``.

    Args:
        question: The question to answer.

    Returns:
        A dict with ``"answer"`` (the cleaned generated text) and
        ``"references"``, which holds ``"pages"`` (unique page numbers as
        strings, sorted numerically) and ``"sections"`` (the section of each
        of those pages, in the same order).
    """
    # Step 1: Retrieve context
    retrieved = prepare_context(question)

    # Ranks actually present in the result. Scanning the keys avoids assuming
    # exactly three chunks and copes with ranks that retrieval skipped.
    ranks = [key[len("chunk"):] for key in retrieved if key.startswith("chunk")]

    context_blocks = []
    section_by_page = {}
    for rank in ranks:
        chunk_text = retrieved[f"chunk{rank}"][0]
        page = retrieved.get(f"page number {rank}", [""])[0]
        section = retrieved.get(f"section {rank}", ["Unknown_Section"])[0]
        if page:
            # Tag the chunk with its page: the prompt asks for [p. <number>]
            # citations, which the model can only produce if the context
            # actually carries the page numbers.
            context_blocks.append(f"[p. {page}] {chunk_text}")
            section_by_page.setdefault(page, section)  # dedup pages
        else:
            context_blocks.append(chunk_text)

    context_str = "\n\n".join(context_blocks)

    # Unique pages in numeric order, with their sections in matching order
    unique_pages = sorted(section_by_page, key=int)
    sections = [section_by_page[p] for p in unique_pages]

    # Step 2: Build chat-style prompt
    prompt = f"""
<|system|> You are an academic assistant with expertise in psychology.
Use ONLY the context given below, answer in 200-400 words, academic style,
summarize instead of copying. Cite textbook pages inline like [p. <number>].
</s>
<|user|> Question: {question}

Context:
{context_str}</s>
<|assistant|>
"""

    # Step 3: Generate raw answer
    outputs = chat_pipe(prompt, max_new_tokens=600, do_sample=True)
    raw_answer = outputs[0]["generated_text"]

    # Step 4: Extract and clean. The generated text may echo the prompt, so
    # keep only what follows the first assistant marker when it is present.
    if "<|assistant|>" in raw_answer:
        answer = raw_answer.split("<|assistant|>", 1)[-1].strip()
    else:
        answer = raw_answer.strip()

    answer = clean_text(answer)

    return {
        "answer": answer,
        "references": {
            "pages": unique_pages,
            "sections": sections
        }
    }



**Generate and save the submission.csv file**

Generate answers for the questions in queries.json and save them to a csv file.

In [None]:
# === Generate and Save Clean Submission CSV ===

import csv

import pandas as pd

# Input questions and output file
QUERIES_PATH = "/content/queries.json"
SUBMISSION_PATH = "submission.csv"

# Load queries.json (read explicitly as UTF-8 so decoding does not depend on
# the platform's default encoding)
with open(QUERIES_PATH, "r", encoding="utf-8") as f:
    queries = json.load(f)

rows = []
for idx, q in enumerate(queries, start=1):
    # Fall back to the 1-based position when a query carries no explicit id
    qid = q.get("id", idx)
    qtext = q["question"]

    # Retrieval runs again inside answer_question_chat; this call only
    # recovers the raw chunk text for the "context" column.
    retrieved = prepare_context(qtext)
    context_str = "\n\n".join([retrieved.get(f"chunk{i}", [""])[0] for i in range(1, 4)])
    result = answer_question_chat(qtext)

    rows.append({
        "ID": qid,
        "context": context_str.replace('"', "'"),
        "answer": result["answer"].replace('"', "'"),
        "references": json.dumps(result["references"])
    })

# Save directly as clean CSV (Excel/Sheets friendly)
df = pd.DataFrame(rows, columns=["ID", "context", "answer", "references"])
df.to_csv(SUBMISSION_PATH, index=False, quoting=csv.QUOTE_ALL, escapechar="\\")

# Report the file that was actually written (the old message named a different file)
print(f"{SUBMISSION_PATH} created with {len(df)} rows")


Here is the final submission.csv file [submission.csv](submission1.csv)