In [None]:

%pip install -qU \
  "langchain>=0.3" \
  "langchain-community>=0.3" \
  "langchain-text-splitters>=0.3" \
  "langchain-chroma>=0.1.2" \
  "langchain-huggingface>=0.1.0" \
  "chromadb>=0.5" \
  "transformers>=4.44" \
  "sentence-transformers>=3.0" \
  pdfplumber




Note: you may need to restart the kernel to use updated packages.


In [2]:
from pathlib import Path
import json
import pandas as pd

# LangChain core
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate

# New split packages (2025+)
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.pdf import PDFPlumberLoader
from langchain_community.document_transformers import LongContextReorder

from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline

# Models
from sentence_transformers import CrossEncoder
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

# ---------------- config you may edit ----------------
PDF_PATH        = Path("../Sources/book.pdf")          # <— point to your textbook PDF
QUERIES_PATH    = Path("../Sources/queries.json")          # expects [{"id": "...", "question": "..."}]
SUBMISSION_CSV  = Path("./submission.csv")

PERSIST_DIR     = Path("../Code File/chroma_db")             # Chroma on-disk store
COLLECTION_NAME = "psychology_textbook"

# RAG knobs (tuned for Llama-3.2 3B Instruct)
CHUNK_SIZE      = 800     # ~200 tokens
CHUNK_OVERLAP   = 200
CANDIDATE_K     = 30
FINAL_K         = 3

DEVICE = "mps" if torch.backends.mps.is_available() else "cpu"
print("Using device:", DEVICE)

EMBED_MODEL     = "sentence-transformers/all-MiniLM-L6-v2"
RERANK_MODEL    = "cross-encoder/ms-marco-MiniLM-L-6-v2"
LLM_MODEL       = "microsoft/phi-2"           # CPU-friendly, instruction-tuned

SEED            = 42


  from .autonotebook import tqdm as notebook_tqdm


Using device: mps


PDFPlumberLoader returns one doc per page with detailed metadata (incl. page index).

In [3]:
assert PDF_PATH.exists(), f"PDF not found at {PDF_PATH.resolve()}"

loader = PDFPlumberLoader(str(PDF_PATH))
pages: list[Document] = loader.load()  # one Document per page; includes page metadata
len(pages), pages[1].metadata


(753,
 {'source': '../Sources/book.pdf',
  'file_path': '../Sources/book.pdf',
  'page': 1,
  'total_pages': 753,
  'CreationDate': "D:20220224092550-06'00'",
  'ModDate': "D:20220301111804-06'00'",
  'Producer': 'Prince 14.2 (www.princexml.com)',
  'Title': 'Psychology 2e'})

split into chunks (preserve page metadata)

In [4]:
# Recursive splitter keeps larger structures intact where possible.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    separators=["\n\n", "\n", " ", ""],  # fallback cascade
)

chunks: list[Document] = splitter.split_documents(pages)

# carry forward page numbers for citations
for d in chunks:
    # Ensure we retain the original page id if present (pdfplumber sets 'page' or 'page_number' keys)
    pg = d.metadata.get("page", d.metadata.get("page_number"))
    if pg is not None:
        d.metadata["page"] = int(pg)

len(chunks), chunks[0].metadata


(3794,
 {'source': '../Sources/book.pdf',
  'file_path': '../Sources/book.pdf',
  'page': 2,
  'total_pages': 753,
  'CreationDate': "D:20220224092550-06'00'",
  'ModDate': "D:20220301111804-06'00'",
  'Producer': 'Prince 14.2 (www.princexml.com)',
  'Title': 'Psychology 2e'})

embeddings & Chroma index (persisted)

Use HuggingFaceEmbeddings from the partner package; Chroma’s current integration lives in langchain-chroma, supports local persistence. Normalizing embeddings is a common practice for cosine similarity.

In [5]:
embeddings = HuggingFaceEmbeddings(
    model_name=EMBED_MODEL,
    model_kwargs={"device": DEVICE},
    encode_kwargs={"normalize_embeddings": True},  # good for cosine similarity
)

# Fresh or existing DB
vectorstore = Chroma(
    collection_name=COLLECTION_NAME,
    embedding_function=embeddings,
    persist_directory=str(PERSIST_DIR),
)

# Add only if empty to avoid duplicates when re-running
if vectorstore._collection.count() == 0:
    vectorstore.add_documents(chunks)
    vectorstore.persist()

print("Chroma collection:", COLLECTION_NAME, "size:", vectorstore._collection.count())


Chroma collection: psychology_textbook size: 2882


In [6]:
# Fast recall → semantic retriever
retriever = vectorstore.as_retriever(search_kwargs={"k": CANDIDATE_K})

# Precise ranking → cross-encoder
reranker = CrossEncoder(RERANK_MODEL, device=DEVICE)

def cross_encoder_rerank(query: str, docs: list[Document], top_k: int = FINAL_K) -> list[Document]:
    if not docs:
        return []
    pairs = [(query, d.page_content) for d in docs]
    scores = reranker.predict(pairs)  # higher = more relevant
    ranked = sorted(zip(docs, scores), key=lambda x: float(x[1]), reverse=True)
    return [d for d, _ in ranked[:top_k]]


In [7]:
def prepare_context(query: str) -> tuple[list[Document], str, list[int]]:
    # 1) retrieve candidates
    candidates = retriever.invoke(query)
    # 2) rerank with cross-encoder
    top_docs = cross_encoder_rerank(query, candidates, top_k=FINAL_K)
    # 3) build context string (keep original order from reranker)
    context = "\n\n---\n\n".join(d.page_content.strip() for d in top_docs)
    # 4) collect unique page numbers for citations
    pages = sorted({int(d.metadata.get("page")) for d in top_docs if d.metadata.get("page") is not None})
    return top_docs, context, pages


In [8]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_huggingface import HuggingFacePipeline



# Load tokenizer + model
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL)
model     = AutoModelForCausalLM.from_pretrained(LLM_MODEL).to(DEVICE)

# Build generation pipeline
gen_pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if DEVICE != "cpu" else -1,
    max_new_tokens=280,      # ~200 words
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    repetition_penalty=1.15,
    return_full_text=False   # ensures we only get the generated part
)

llm = HuggingFacePipeline(pipeline=gen_pipe)

print("✅ Gemma-2B text-generation pipeline ready on", DEVICE)


Loading checkpoint shards: 100%|██████████| 2/2 [00:27<00:00, 13.74s/it]


RuntimeError: MPS backend out of memory (MPS allocated: 9.06 GiB, other allocations: 384.00 KiB, max allowed: 9.07 GiB). Tried to allocate 25.00 MiB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
from langchain_core.prompts import PromptTemplate

prompt = PromptTemplate.from_template(
    """Answer the following question in an academic style using only the provided context.
Write at least 200 words, structured with introduction, explanation, and conclusion.
Do not copy text verbatim; paraphrase and expand ideas clearly.
Cite textbook pages inline like [p. <number>] when relevant.

Question:
{question}

Context:
{context}

Answer:
"""
)


In [None]:
def answer_question(question: str) -> dict:
    docs, context, pages = prepare_context(question)
    prompt_str = prompt.format(question=question, context=context)

    outputs = gen_pipe(prompt_str)

    # Gemma returns "generated_text"
    answer = outputs[0]["generated_text"].strip()

    return {
        "answer": answer,
        "context": context,
        "references": {"pages": pages}
    }

# Quick test
test_out = answer_question("What is the scientific method in psychology?")
test_out
