In [1]:
import fitz  
import json
import os
from tqdm import tqdm

# Input/output locations for the PDF chunking stage.
RAW_DIR = "data/aml_raw"
META_FILE = os.path.join(RAW_DIR, "metadata.jsonl")  # one JSON record per line
OUT_FILE = "data/aml_chunks.jsonl"

# Chunk sizing thresholds, measured in characters of the accumulated page buffer.
MIN_CHARS = 800 # a trigger-based split is only taken once the buffer holds at least this many characters
MAX_CHARS = 1400 # the buffer is flushed as a chunk as soon as it reaches this size (whole pages are added, so chunks can exceed it)

In [2]:
# function to load metadata
def load_metadata(path):
    """Read a JSON Lines metadata file into a list of records.

    Args:
        path: Path to a UTF-8 text file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    meta = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            # Blank lines (e.g. a trailing newline at the end of the file)
            # are not records; skipping them avoids a spurious decode error.
            if not line:
                continue
            meta.append(json.loads(line))
    return meta

In [3]:
# function to extract text from PDF pages
def extract_pages(pdf_path):
    """Extract the plain text of every non-empty page of a PDF.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        A list of ``(page_number, text)`` tuples. Page numbers are 1-based
        and pages whose stripped text is empty are omitted, so the numbers
        in the result are not necessarily contiguous.
    """
    pages = []
    # Open the document as a context manager so the underlying file handle
    # is released even if text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        for i, page in enumerate(doc):
            text = page.get_text("text").strip()
            if text:
                pages.append((i + 1, text))
    return pages

In [4]:
# function to determine if a split should occur
def should_split(text):
    """Tell whether a page starts a new logical section.

    A page is treated as a section boundary when it contains any of the
    trigger phrases below (case-sensitive substring match).

    Args:
        text: Extracted text of one page.

    Returns:
        True if at least one trigger phrase occurs in ``text``.
    """
    triggers = (
        "Definition", "Theorem", "Lemma",
        "Proof", "Algorithm", "Recap",
        # The typographic apostrophe is written as an escape so the literal
        # cannot be corrupted by a file re-encoding; the plain ASCII
        # apostrophe variant is accepted as well.
        "Today\u2019s lecture", "Today's lecture",
        "Overview",
    )
    return any(t in text for t in triggers)

In [5]:
def _is_logistics_page(page_num, text, max_page=5):
    """Return True when an early page looks like course logistics.

    Only pages numbered ``max_page`` or lower are inspected; later pages are
    always kept. The keyword test is a case-insensitive substring match.
    """
    if page_num > max_page:
        return False
    low = text.lower()
    # Parentheses make the intended grouping explicit ("and" binds tighter
    # than "or", so this matches the un-parenthesised original).
    return (
        "administrative" in low
        or ("seminar" in low and "room" in low)
        or "uploaded" in low
        or "attendance" in low
        or ("exam" in low and "date" in low)
    )


def _emit_chunk(out, doc_meta, pdf_path, chunk_id, page_start, page_end, text):
    """Write one chunk record as a JSON line to the open file ``out``."""
    chunk = {
        "chunk_id": f"{doc_meta['doc_id']}_p{page_start}_c{chunk_id}",
        "doc_id": doc_meta["doc_id"],
        "type": doc_meta["type"],
        "index": doc_meta["index"],
        "page_start": page_start,
        "page_end": page_end,
        "source_file": os.path.basename(pdf_path),
        "text": text.strip(),
    }
    out.write(json.dumps(chunk, ensure_ascii=False) + "\n")


metadata = load_metadata(META_FILE)

with open(OUT_FILE, "w", encoding="utf-8") as out:
    for doc_meta in tqdm(metadata, desc="Chunking PDFs"):
        pdf_path = doc_meta["source_file"]
        if not os.path.exists(pdf_path):
            # Still skipped, but no longer silently: a missing source file
            # means the corpus is incomplete.
            tqdm.write(f"Skipping missing PDF: {pdf_path}")
            continue

        buffer = ""
        page_start = None   # first page contributing to the current buffer
        page_end = None     # last page actually appended to the current buffer
        chunk_id = 0

        for page_num, text in extract_pages(pdf_path):
            if _is_logistics_page(page_num, text):
                continue

            # A section trigger closes the current chunk *before* this page
            # is added, provided enough text has been accumulated.
            if should_split(text) and len(buffer) >= MIN_CHARS:
                # page_end is the last page really in the buffer; using
                # page_num - 1 would be wrong whenever the previous page
                # was empty or filtered out.
                _emit_chunk(out, doc_meta, pdf_path, chunk_id, page_start, page_end, buffer)
                chunk_id += 1
                buffer = ""
                page_start = None

            if page_start is None:
                page_start = page_num
            buffer += "\n" + text
            page_end = page_num

            # Size-based flush: pages are appended whole, so a chunk may
            # overshoot MAX_CHARS by up to one page.
            if len(buffer) >= MAX_CHARS:
                _emit_chunk(out, doc_meta, pdf_path, chunk_id, page_start, page_end, buffer)
                chunk_id += 1
                buffer = ""
                page_start = None

        # Whatever is left after the last page becomes the final chunk.
        if buffer.strip():
            _emit_chunk(out, doc_meta, pdf_path, chunk_id, page_start, page_end, buffer)

Chunking PDFs: 100%|██████████| 25/25 [00:04<00:00,  5.33it/s]


In [6]:
import json

# Cleaning stage: reads the raw chunks and writes a copy with tiny chunks merged.
# NOTE(review): OUT_FILE (and IN_FILE) are also assigned in other cells, so their
# value depends on which cell ran last.
IN_FILE = "data/aml_chunks.jsonl"
OUT_FILE = "data/aml_chunks.cleaned.jsonl"

MIN_LEN = 80  # chunks whose stripped text is shorter than this many characters are merged into a neighbour

In [7]:
def read_jsonl(path):
    """Load every record of a JSON Lines file.

    Lines are stripped of surrounding whitespace and blank lines are ignored.

    Args:
        path: Path to a UTF-8 encoded ``.jsonl`` file.

    Returns:
        The decoded objects as a list, in file order.
    """
    with open(path, "r", encoding="utf-8") as handle:
        stripped = (raw.strip() for raw in handle)
        return [json.loads(entry) for entry in stripped if entry]

In [8]:
def write_jsonl(path, items):
    """Write ``items`` to ``path`` as JSON Lines (one object per line).

    The file is overwritten. Non-ASCII characters are written as-is rather
    than as ``\\u`` escapes.

    Args:
        path: Destination file path.
        items: Iterable of JSON-serializable objects.
    """
    with open(path, "w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in items
        )

In [9]:
def merge_small_into_next(chunks, min_len=80):
    """Fold chunks with very little text into a neighbouring chunk.

    A chunk whose stripped text is shorter than ``min_len`` is merged, in
    order of preference:

    1. forward, into the following chunk when it has the same ``doc_id``
       (the short text is prepended and ``page_start`` is widened);
    2. backward, into the last kept chunk when it has the same ``doc_id``
       (the short text is appended and ``page_end`` is widened);
    3. otherwise it is dropped.

    The chunk that absorbs another records the absorbed ``chunk_id`` (and
    anything that chunk had itself absorbed) under ``"merged_from"``.

    Args:
        chunks: List of chunk dicts. The dicts are shallow-copied, so the
            caller's objects are left untouched.
        min_len: Minimum stripped text length for a chunk to stand alone.

    Returns:
        A new list of chunk dicts with the short chunks merged away.
    """

    def _pick(reducer, *values):
        # min/max that tolerates missing (None) page numbers.
        present = [v for v in values if v is not None]
        return reducer(present) if present else None

    def _record_merge(target, donor):
        # Keep the donor's own lineage so cascaded merges stay traceable.
        lineage = list(target.get("merged_from", []))
        lineage.extend(donor.get("merged_from", []))
        lineage.append(donor.get("chunk_id"))
        target["merged_from"] = lineage

    pending = [dict(c) for c in chunks]
    out = []

    for i, cur in enumerate(pending):
        cur_text = (cur.get("text") or "").strip()

        if len(cur_text) >= min_len:
            out.append(cur)
            continue

        nxt = pending[i + 1] if i + 1 < len(pending) else None
        if nxt is not None and nxt.get("doc_id") == cur.get("doc_id"):
            # Merge forward. The next chunk is NOT skipped: it is visited on
            # the following iteration with the combined text, so it is either
            # kept or (if still too short) merged onward in turn.
            nxt_text = (nxt.get("text") or "").strip()
            nxt["text"] = (cur_text + "\n" + nxt_text).strip()
            nxt["page_start"] = _pick(min, cur.get("page_start"), nxt.get("page_start"))
            _record_merge(nxt, cur)
        elif out and out[-1].get("doc_id") == cur.get("doc_id"):
            # Can't merge forward -> merge backward into the previous kept chunk.
            prev = out[-1]
            prev["text"] = ((prev.get("text") or "").strip() + "\n" + cur_text).strip()
            prev["page_end"] = _pick(max, prev.get("page_end"), cur.get("page_end"))
            _record_merge(prev, cur)
        # else: no neighbour from the same document -> the short chunk is dropped.

    return out


In [10]:
# Run the cleaning stage: load the raw chunks, merge away the tiny ones,
# persist the result and report how many chunks remain.
chunks = read_jsonl(IN_FILE)
cleaned_chunks = merge_small_into_next(chunks, min_len=MIN_LEN)
write_jsonl(OUT_FILE, cleaned_chunks)

n_before, n_after = len(chunks), len(cleaned_chunks)
print(f"Cleaned chunks written to {OUT_FILE}, from {n_before} to {n_after} chunks.")

Cleaned chunks written to data/aml_chunks.cleaned.jsonl, from 165 to 164 chunks.


In [11]:
import json, re

# Filtering stage: reads the cleaned chunks and writes the final corpus.
IN_FILE = "data/aml_chunks.cleaned.jsonl"
OUT_FILE = "data/aml_chunks.final.jsonl"

# Regular expressions for course-logistics text. They are OR-ed together into
# a single case-insensitive pattern below; one match anywhere in a chunk's
# text is enough for the chunk to count as a hit.
DROP_PATTERNS = [
    r"\badministrative\b",
    r"\bexam date\b",
    r"\blecture\b\s*:\s*",   # the word "lecture" followed by a colon
    r"\bseminar\b\s*:\s*",   # the word "seminar" followed by a colon
    r"\broom\b\s*\d+",       # "room" followed by digits
    r"\bschedule\b",
]

drop_re = re.compile("|".join(DROP_PATTERNS), re.IGNORECASE)

# Copy IN_FILE to OUT_FILE, leaving out chunks that end on an early page
# (page_end <= 12) and match one of the logistics patterns. Restricting the
# filter to early pages limits false positives in the lecture body.
kept, dropped = 0, 0

with open(IN_FILE, "r", encoding="utf-8") as fin, open(OUT_FILE, "w", encoding="utf-8") as fout:
    for raw in fin:
        obj = json.loads(raw)
        body = (obj.get("text") or "").strip()

        # A chunk without page_end is treated as late (9999) and always kept.
        if obj.get("page_end", 9999) <= 12 and drop_re.search(body):
            dropped += 1
        else:
            fout.write(json.dumps(obj, ensure_ascii=False) + "\n")
            kept += 1

print(f"kept: {kept} dropped: {dropped} -> {OUT_FILE}")


kept: 157 dropped: 7 -> data/aml_chunks.final.jsonl


In [12]:
import json
import re
from dataclasses import dataclass
from typing import List, Dict, Any

import numpy as np
from tqdm import tqdm
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer

import faiss  # assuming this now works on your system

# Final chunk corpus consumed by the retrievers.
DATA_FILE = "data/aml_chunks.final.jsonl"
# Sentence-embedding model identifier.
# NOTE(review): in the visible cells this constant is never passed to
# EmbeddingRetriever, which falls back to its own default of the same value.
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"


  from .autonotebook import tqdm as notebook_tqdm





In [13]:
# Words ignored by the lexical tokenizer: common function words plus a few
# question-phrasing words ("explain", "define", "idea", "main", ...).
STOPWORDS = {
    "what", "is", "the", "a", "an", "in", "on", "of", "to", "and", "or",
    "for", "with", "as", "this", "that", "it", "be", "are", "was", "were",
    "do", "does", "did", "about", "behind", "explain", "define", "idea",
    "main",
}

# A token is a maximal run of ASCII letters, digits or underscores.
_word_re = re.compile(r"[A-Za-z0-9_]+")


def tokenize(text: str) -> List[str]:
    """Split ``text`` into lower-cased tokens for lexical matching.

    Tokens of length 1 and tokens listed in ``STOPWORDS`` are discarded.

    Args:
        text: Arbitrary input text.

    Returns:
        The surviving tokens in order of appearance.
    """
    kept = []
    for match in _word_re.finditer(text):
        token = match.group(0).lower()
        if len(token) > 1 and token not in STOPWORDS:
            kept.append(token)
    return kept


def load_chunks(path: str) -> List[Dict[str, Any]]:
    """Read chunk records from a JSON Lines file.

    Args:
        path: Path to a UTF-8 file with one JSON object per line.

    Returns:
        One decoded object per non-blank line, in file order.
    """
    records: List[Dict[str, Any]] = []
    with open(path, "r", encoding="utf-8") as fh:
        for raw in map(str.strip, fh):
            if not raw:
                continue
            records.append(json.loads(raw))
    return records


In [14]:
from dataclasses import dataclass
from typing import Dict, Any

@dataclass
class SearchResult:
    """A single retrieved chunk together with its score and origin."""

    score: float
    chunk: Dict[str, Any]
    retriever: str  # label of the retriever that produced this result

    def cite(self) -> str:
        """Format a short citation: source file plus page or page range.

        Returns:
            ``"<file> p.<n>"`` when ``page_start`` equals ``page_end``,
            otherwise ``"<file> p.<start>-<end>"``. Missing fields are
            rendered as empty strings.
        """
        source = self.chunk.get("source_file", "")
        first = self.chunk.get("page_start", "")
        last = self.chunk.get("page_end", "")
        if first == last:
            return f"{source} p.{first}"
        return f"{source} p.{first}-{last}"


In [15]:
class BM25Retriever:
    """Lexical retriever that ranks chunks with BM25 over tokenized text."""

    def __init__(self, chunks):
        """Tokenize every chunk's ``text`` and build the BM25 index.

        Args:
            chunks: List of chunk dicts; a missing ``"text"`` counts as "".
        """
        self.chunks = chunks
        self.tokens = []
        for chunk in chunks:
            self.tokens.append(tokenize(chunk.get("text", "")))
        self.bm25 = BM25Okapi(self.tokens)

    def search(self, query, k=5, type_filter="lecture"):
        """Return the best-scoring chunks for ``query``.

        Args:
            query: Free-text query; tokenized the same way as the chunks.
            k: Number of results after which the scan stops.
            type_filter: When truthy, only chunks whose ``"type"`` equals
                this value are returned.

        Returns:
            A list of ``SearchResult`` objects labelled ``"bm25"``, in
            descending score order.
        """
        scores = self.bm25.get_scores(tokenize(query))

        hits = []
        # argsort is ascending, so walk it reversed for best-first order.
        for pos in np.argsort(scores)[::-1]:
            pos = int(pos)
            candidate = self.chunks[pos]
            if type_filter and candidate.get("type") != type_filter:
                continue
            hits.append(SearchResult(float(scores[pos]), candidate, "bm25"))
            if len(hits) >= k:
                break
        return hits


In [16]:
class EmbeddingRetriever:
    """Dense retriever: dot-product search over normalized sentence embeddings."""

    def __init__(self, chunks, model_name="sentence-transformers/all-MiniLM-L6-v2"):
        """Load the embedding model and encode every chunk's ``text``.

        Args:
            chunks: List of chunk dicts; a missing ``"text"`` counts as "".
            model_name: Identifier passed to ``SentenceTransformer``.
        """
        self.chunks = chunks
        self.model = SentenceTransformer(model_name)

        batch_size = 64
        corpus = [chunk.get("text", "") for chunk in chunks]
        encoded = []
        for start in tqdm(range(0, len(corpus), batch_size), desc="Embedding chunks"):
            encoded.append(
                self.model.encode(
                    corpus[start:start + batch_size],
                    normalize_embeddings=True,
                )
            )

        self.embs = np.vstack(encoded).astype("float32")

    def search(self, query, k=5, type_filter="lecture"):
        """Return the chunks whose embeddings score highest against ``query``.

        Args:
            query: Free-text query, encoded with the same model.
            k: Number of results after which the scan stops.
            type_filter: When truthy, only chunks whose ``"type"`` equals
                this value are returned.

        Returns:
            A list of ``SearchResult`` objects labelled ``"emb"``, in
            descending score order.
        """
        query_vec = self.model.encode([query], normalize_embeddings=True)[0]
        # Both sides are encoded with normalize_embeddings=True, so this
        # matrix-vector product is the similarity score per chunk.
        scores = self.embs @ query_vec

        hits = []
        for pos in np.argsort(scores)[::-1]:
            pos = int(pos)
            candidate = self.chunks[pos]
            if type_filter and candidate.get("type") != type_filter:
                continue
            hits.append(SearchResult(float(scores[pos]), candidate, "emb"))
            if len(hits) >= k:
                break
        return hits


In [17]:
# Load the final corpus and build both retrievers over the same chunk list.
chunks = load_chunks(DATA_FILE)
print(f"Loaded {len(chunks)} chunks")

bm25 = BM25Retriever(chunks)
# Pass the configured model explicitly so changing EMBED_MODEL_NAME actually
# takes effect instead of being silently overridden by the class default.
emb = EmbeddingRetriever(chunks, model_name=EMBED_MODEL_NAME)


Loaded 157 chunks


  attn_output = torch.nn.functional.scaled_dot_product_attention(
Embedding chunks: 100%|██████████| 3/3 [00:03<00:00,  1.16s/it]


In [66]:
def pretty_print(label, results, max_chars=240):
    """Print a compact, human-readable listing of search results.

    For each result, one line with score, document id and citation is
    printed, followed by a single-line preview of the chunk text.

    Args:
        label: Heading printed above the listing.
        results: Iterable of result objects exposing ``score``, ``chunk``
            (with ``"text"`` and ``"doc_id"`` keys) and ``cite()``.
        max_chars: Preview length; longer text is cut and suffixed "...".
    """
    print(f"\n=== {label} ===")
    for hit in results:
        # Flatten newlines so the preview stays on one line.
        flat = " ".join(hit.chunk["text"].split("\n")).strip()
        print(f"- score={hit.score:.4f} | {hit.chunk['doc_id']} | {hit.cite()}")
        preview = flat[:max_chars]
        if len(flat) > max_chars:
            preview += "..."
        print(f"  {preview}")

# Side-by-side smoke test: run a few course questions through both retrievers.
queries = [
    "What is generalization error in statistical learning?",
    "Explain gamma-weak learnability",
    "What is the idea behind AdaBoost?",
    "What does hard-margin SVM optimize?",
]

for q in queries:
    print(f"\nQUERY: {q}")
    for label, retriever in (("BM25", bm25), ("Embeddings", emb)):
        pretty_print(label, retriever.search(q))



QUERY: What is generalization error in statistical learning?

=== BM25 ===
- score=8.9361 | aml_lecture_03 | Lecture03.pdf p.1-3
  Advanced Machine Learning Bogdan Alexe,  bogdan.alexe@fmi.unibuc.ro University of Bucharest, 2nd semester, 2024-2025 Recap • A Formal Model – The Statistical learning framework – papaya tasting learning scenario, classification task: tasty ...
- score=6.8099 | aml_lecture_01 | Lecture01.pdf p.56-61
  • Survey of prominent methods and approaches with strong  theoretical foundations such as: – Boosting – SVMs – neural networks? (loose bounds, work in progress) – etc Course Structure – Part 2 Usefulness of Theoretical  Machine Learning Per...
- score=6.0135 | aml_lecture_08 | Lecture08.pdf p.1-6
  Advanced Machine Learning Bogdan Alexe,  bogdan.alexe@fmi.unibuc.ro University of Bucharest, 2nd semester, 2024-2025 The fundamental theorem of  statistical learning The fundamental theorem of statistical learning Theorem (The Fundamental T...
- sc

In [18]:
def rrf_fuse(
    bm25_res: List[SearchResult],
    emb_res: List[SearchResult],
    k: int = 5,
    rrf_k: int = 60
) -> List[SearchResult]:
    """Combine two ranked lists with Reciprocal Rank Fusion.

    Each result contributes ``1 / (rrf_k + rank)`` (rank starting at 1) to
    the fused score of its chunk; a chunk present in both lists receives
    both contributions. Chunks are identified by ``chunk["chunk_id"]``.

    Args:
        bm25_res: Ranked results from the lexical retriever.
        emb_res: Ranked results from the embedding retriever.
        k: Number of fused results after which collection stops.
        rrf_k: Rank-smoothing constant of the RRF formula.

    Returns:
        ``SearchResult`` objects labelled ``"hybrid"`` whose ``score`` is the
        fused RRF score, in descending score order.
    """
    fused_score = {}
    representative = {}

    def _accumulate(ranked, overwrite):
        # Add this list's reciprocal-rank contributions. `overwrite` decides
        # whether a hit replaces an already stored representative.
        for position, hit in enumerate(ranked, start=1):
            cid = hit.chunk["chunk_id"]
            fused_score[cid] = fused_score.get(cid, 0.0) + 1.0 / (rrf_k + position)
            if overwrite or cid not in representative:
                representative[cid] = hit

    _accumulate(bm25_res, overwrite=True)
    _accumulate(emb_res, overwrite=False)

    fused = []
    for cid in sorted(fused_score, key=fused_score.get, reverse=True):
        fused.append(
            SearchResult(
                score=float(fused_score[cid]),
                chunk=representative[cid].chunk,
                retriever="hybrid",
            )
        )
        if len(fused) >= k:
            break
    return fused

def retrieve(query: str, mode: str = "hybrid", k: int = 5, type_filter: str = "lecture") -> List[SearchResult]:
    """Run a query through the selected retrieval strategy.

    Args:
        query: Free-text question.
        mode: ``"bm25"``, ``"emb"`` or ``"hybrid"``.
        k: Number of results requested.
        type_filter: Chunk ``"type"`` to restrict results to.

    Returns:
        The ranked results of the chosen retriever. In hybrid mode each
        retriever is asked for ``max(k, 8)`` candidates, which are then
        fused down to ``k`` with ``rrf_fuse``.

    Raises:
        ValueError: If ``mode`` is not one of the supported values.
    """
    if mode == "hybrid":
        pool = max(k, 8)
        lexical = bm25.search(query, k=pool, type_filter=type_filter)
        dense = emb.search(query, k=pool, type_filter=type_filter)
        return rrf_fuse(lexical, dense, k=k)
    elif mode == "bm25":
        return bm25.search(query, k=k, type_filter=type_filter)
    elif mode == "emb":
        return emb.search(query, k=k, type_filter=type_filter)
    else:
        raise ValueError("mode must be one of: bm25, emb, hybrid")


In [19]:
import os, json, re, math
import numpy as np
import requests
from dataclasses import dataclass
from typing import List, Dict, Any, Optional
from tqdm import tqdm
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer

# Final chunk corpus consumed by the retrievers.
DATA_FILE = "data/aml_chunks.final.jsonl"

# Groq OpenAI-compatible endpoint, credentials and model.
GROQ_BASE_URL="https://api.groq.com/openai/v1/"
# Read the key from a local secrets file; the context manager closes the file
# right away instead of leaving the handle open until garbage collection.
# GROQ_API_KEY is None when the file has no "GROQ_API_KEY" entry.
with open("secrets.json", "r", encoding="utf-8") as _secrets_file:
    GROQ_API_KEY = json.load(_secrets_file).get("GROQ_API_KEY")
GROQ_MODEL = "llama-3.3-70b-versatile"

In [20]:
def build_context(results: List["SearchResult"], max_chars: int = 6000) -> str:
    """Assemble numbered, cited context snippets for the LLM prompt.

    Each result becomes a block ``"[i] (<citation>)\\n<text>\\n"``. Blocks are
    added in rank order until the next one would push the total past
    ``max_chars``. If even the top-ranked block does not fit, its text is
    truncated to fit rather than returning an empty context.

    Args:
        results: Ranked results exposing ``chunk`` (with ``"text"``) and
            ``cite()``.
        max_chars: Character budget for the blocks (the newlines that join
            the blocks are not counted).

    Returns:
        The blocks joined by newlines; "" when there are no results.
    """
    parts = []
    total = 0
    for i, r in enumerate(results, start=1):
        txt = (r.chunk.get("text", "") or "").strip()
        header = f"[{i}] ({r.cite()})\n"
        block = f"{header}{txt}\n"
        if total + len(block) > max_chars:
            if not parts:
                # Nothing has been added yet: keep as much of the best
                # snippet as the budget allows instead of sending no context.
                room = max_chars - len(header) - 1
                if room > 0:
                    parts.append(f"{header}{txt[:room]}\n")
            break
        parts.append(block)
        total += len(block)
    return "\n".join(parts)

# System message sent with every RAG request. It restricts the model to the
# supplied snippets, fixes the refusal sentence for unanswerable questions,
# and asks for inline [n] citations that match build_context's numbering.
SYSTEM_PROMPT = """You are a course assistant for Advanced Machine Learning.
Rules:
- Answer ONLY using the provided context snippets.
- If the context does not contain the answer, say: "Not in the provided course materials."
- Cite sources inline using the snippet numbers like [1], [2] and include the file+page already shown in those snippets.
- Do not invent equations that are not explicitly present in the context. Prefer conceptual explanations.
- Be concise and accurate.
"""


In [21]:
import time
import requests

def groq_chat(
    messages,
    model=GROQ_MODEL,
    temperature=0.2,
    max_tokens=400,
    max_retries=5
) -> str:
    """Send a chat-completion request to Groq and return the reply text.

    Rate-limited responses (HTTP 429) are retried up to ``max_retries``
    attempts in total. The wait before a retry is the server's
    ``Retry-After`` value when it is a number of seconds, otherwise an
    exponential back-off of ``2 ** attempt`` seconds.

    Args:
        messages: Chat messages as a list of ``{"role", "content"}`` dicts.
        model: Model identifier.
        temperature: Sampling temperature.
        max_tokens: Upper bound on generated tokens.
        max_retries: Maximum number of request attempts.

    Returns:
        The ``content`` of the first choice's message.

    Raises:
        requests.HTTPError: For any non-429 error status.
        RuntimeError: If every attempt was rate limited.
    """
    # GROQ_BASE_URL may end with "/"; strip it so the path has no "//".
    url = f"{GROQ_BASE_URL.rstrip('/')}/chat/completions"
    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
        "stream": False,
    }

    for attempt in range(max_retries):
        r = requests.post(url, headers=headers, json=payload, timeout=60)

        if r.status_code == 429:
            # No point sleeping when there is no attempt left to make.
            if attempt == max_retries - 1:
                break
            try:
                wait = max(0.0, float(r.headers.get("Retry-After")))
            except (TypeError, ValueError):
                # Header absent or not a plain number of seconds.
                wait = 2 ** attempt
            print(f"Rate limited. Sleeping {wait}s...")
            time.sleep(wait)
            continue

        r.raise_for_status()
        return r.json()["choices"][0]["message"]["content"]

    raise RuntimeError("Groq API failed after retries")


In [22]:
def rag_answer(
    query: str,
    mode: str = "hybrid",      # "bm25" | "emb" | "hybrid"
    k: int = 5,
    type_filter: str = "lecture",
    model: str = GROQ_MODEL
) -> dict:
    """Answer a question with retrieval-augmented generation.

    Retrieves chunks for ``query``, formats them as numbered context
    snippets, and asks the chat model to answer under ``SYSTEM_PROMPT``.

    Args:
        query: The user's question.
        mode: Retrieval strategy forwarded to ``retrieve``.
        k: Number of chunks to retrieve.
        type_filter: Chunk ``"type"`` to restrict retrieval to.
        model: Chat model identifier forwarded to ``groq_chat``.

    Returns:
        A dict echoing the request parameters, plus ``"retrieved"`` (one
        summary dict per retrieved chunk, ranked from 1) and ``"answer"``
        (the model's reply).
    """
    results = retrieve(query, mode=mode, k=k, type_filter=type_filter)
    context = build_context(results)

    user_prompt = (
        f"Question: {query}\n"
        "\n"
        "Context snippets:\n"
        f"{context}\n"
        "\n"
        "Write the answer now following the rules."
    )
    system_turn = {"role": "system", "content": SYSTEM_PROMPT}
    user_turn = {"role": "user", "content": user_prompt}

    answer = groq_chat(
        [system_turn, user_turn],
        model=model,
        temperature=0.2,
        max_tokens=450,
    )

    retrieved = []
    for rank, hit in enumerate(results, start=1):
        retrieved.append({
            "rank": rank,
            "retriever": hit.retriever,
            "score": hit.score,
            "chunk_id": hit.chunk["chunk_id"],
            "doc_id": hit.chunk["doc_id"],
            "cite": hit.cite(),
        })

    return {
        "query": query,
        "mode": mode,
        "k": k,
        "type_filter": type_filter,
        "retrieved": retrieved,
        "answer": answer,
    }


In [71]:
# Ask one question through each retrieval mode and print the answers.
q = "What is generalization error (true risk) in statistical learning?"
for mode in ("bm25", "emb", "hybrid"):
    out = rag_answer(q, mode=mode, k=5, type_filter="lecture")
    print("\n" + "=" * 80, f"MODE: {mode}", out["answer"], sep="\n")



MODE: bm25
The generalization error (true risk) in statistical learning is defined as LD,f (h), which is the probability that a hypothesis h does not predict the correct label on a random data point x generated by the underlying probability distribution D over X [1] (Lecture02.pdf p.8-11). It is also referred to as the true error of h or the real risk of h. This error is unknown to the learner as they do not know the distribution D and the target function f [1] (Lecture02.pdf p.8-11).

MODE: emb
The generalization error, also known as the true risk, is defined as the probability that a hypothesis h does not predict the correct label on a random data point x generated by the underlying probability distribution D over X [1] (Lecture02.pdf p.8-11). It is denoted as LD,f (h) and represents the true error of h. The goal is to find a hypothesis h such that LD,f (h) is small, which means that h generalizes well to new, unseen data [1] (Lecture02.pdf p.8-11).

MODE: hybrid
The generalization 

In [23]:
def llm_only_answer(query: str) -> str:
    """Answer ``query`` with the chat model alone, without retrieved context.

    Args:
        query: The user's question.

    Returns:
        The model's reply (temperature 0.7, at most 300 tokens).
    """
    system_turn = {"role": "system", "content": "Answer the question as best as you can."}
    user_turn = {"role": "user", "content": query}
    return groq_chat([system_turn, user_turn], temperature=0.7, max_tokens=300)


In [24]:
# Questions used to compare the LLM-only baseline with the three RAG modes.
# NOTE(review): the last two (Adam, transformers) look like deliberate
# out-of-syllabus probes for the "Not in the provided course materials."
# refusal — confirm against the course content.
EVAL_QUESTIONS = [
    "What is generalization error (true risk)?",
    "Define gamma-weak learnability.",
    "What is a hypothesis in statistical learning?",
    "What is the main idea of AdaBoost?",
    "What does hard-margin SVM optimize?",
    "What is uniform convergence and why is it important?",
    "What does the fundamental theorem of statistical learning state?",
    "How is VC dimension related to sample complexity?",
    "What is the Adam optimizer?",
    "Explain transformers in deep learning."
]

def run_eval():
    """Run every evaluation question through the baseline and all RAG modes.

    For each question the LLM-only answer is printed, followed by the RAG
    answer for each of the modes "bm25", "emb" and "hybrid".

    Returns:
        A list with one dict per (question, mode) pair containing the
        question, the mode, the RAG answer and the retrieved-chunk
        summaries. The LLM-only answer is printed but not included.
    """
    collected = []
    separator = "\n" + "=" * 90

    for question in EVAL_QUESTIONS:
        print(separator)
        print("QUESTION:", question)
        print("\nLLM-only:\n", llm_only_answer(question))

        for mode in ("bm25", "emb", "hybrid"):
            rag = rag_answer(question, mode=mode, k=5, type_filter="lecture")
            print(f"\nRAG ({mode}):\n", rag["answer"])
            collected.append({
                "question": question,
                "mode": mode,
                "answer": rag["answer"],
                "retrieved": rag["retrieved"],
            })

    return collected

eval_results = run_eval()



QUESTION: What is generalization error (true risk)?

LLM-only:
 Generalization error, also known as true risk, is a concept in machine learning and statistical modeling that refers to the difference between the expected performance of a model on a training dataset and its performance on unseen, new data.

In other words, generalization error measures how well a model is able to generalize its learning from the training data to make accurate predictions on new, unseen data. It is a measure of the model's ability to avoid overfitting, which occurs when a model is too complex and fits the training data too closely, but fails to perform well on new data.

The generalization error is typically denoted as:

R(h) = E[(h(x) - y)^2]

where:
- R(h) is the true risk or generalization error
- h(x) is the prediction made by the model
- y is the true label or value
- E is the expected value over the distribution of the data

The goal of machine learning is to minimize the generalization error, whic