# Generative Benchmark: End-to-End Notebook

## 1. Setup & Imports

In [None]:
%pip install -q datasets pyarrow tiktoken

In [None]:
%pip install -q ipywidgets

## 2. Load Data
We’ll fetch **500 articles** from the AG News corpus.
Each record is already a clean news snippet, which is ideal for a lightweight benchmark.

In [None]:
from pathlib import Path
import pandas as pd
from datasets import load_dataset 

# Working folders: the HuggingFace download cache and our processed outputs.
RAW_DIR = Path("data") / "raw"
PROC_DIR = Path("data") / "processed"
RAW_DIR.mkdir(exist_ok=True, parents=True)
PROC_DIR.mkdir(exist_ok=True, parents=True)

# Number of leading rows of the train split to pull.
SAMPLE_SIZE = 500

ag_ds = load_dataset(
    "ag_news",
    cache_dir=str(RAW_DIR),
    split="train[:{}]".format(SAMPLE_SIZE),
)

print("fetched {:,} articles".format(len(ag_ds)))

In [None]:
def to_doc(row, idx: int) -> dict:
    """
    Build the benchmark record for one dataset row.

    The result has two keys, in this order:
      * 'doc_id'  - "ag_" followed by `idx` zero-padded to four digits
      * 'content' - the row's "text" value with surrounding whitespace removed
    """
    doc_id = "ag_{:04d}".format(idx)
    content = row["text"].strip()
    return dict(doc_id=doc_id, content=content)

# Iterate the dataset directly instead of indexing it row by row:
# enumerate() supplies the running index used for the doc_id and avoids
# one random-access lookup per article.
docs = [to_doc(row, idx) for idx, row in enumerate(ag_ds)]
# Quick peek at the first record (id plus the first 80 characters).
print(docs[0]["doc_id"], "->", docs[0]["content"][:80], "...")

In [None]:
import numpy as np
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")

# Token count of every document under the cl100k_base encoding.
lengths = [
    len(tokens)
    for tokens in (enc.encode(doc["content"]) for doc in docs)
]

# One-line summary: mean (one decimal), shortest and longest document.
print(
    "avg tokens: {:.1f} | min: {} | max: {}".format(
        np.mean(lengths), np.min(lengths), np.max(lengths)
    )
)

In [None]:
from pathlib import Path
import pandas as pd

# Persist the documents so later sections can reload them from disk.
out_path = Path("data") / "processed" / "docs.parquet"
pd.DataFrame(docs).to_parquet(
    out_path,
    index=False,  # the row index carries no information
)
print("saved -> ", out_path)

## 3. Chunk / Embed / Index

In [None]:
%pip install -q sentence-transformers chromadb langchain langchain-community tiktoken

In [None]:
pip install -U langchain-huggingface langchain-chroma

In [None]:
import pandas as pd
from src.chunker import chunk_documents

# Reload the saved corpus as a list of row dicts, then split it into chunks.
docs = (
    pd.read_parquet("data/processed/docs.parquet")
    .to_dict(orient="records")
)
chunks = chunk_documents(docs, chunk_overlap=50, chunk_size=400)

print("{} docs -> {} chunks".format(len(docs), len(chunks)))
print("example chunk id :", chunks[0]["id"])

In [None]:
from src.indexer import embed_and_index

# Embed the chunks and store them in the "ag_miniLM" collection under
# data/chroma; the returned object is reused by the evaluation sections.
retriever_vs = embed_and_index(
    chunks,
    persist_path="data/chroma",
    collection_name="ag_miniLM",
)

# NOTE(review): `_collection` is a private attribute of the returned store.
print("collection size:", retriever_vs._collection.count())

In [None]:
%pip install -q huggingface_hub jsonlines tqdm

## 4. Synthetic Query Generation (fast)

We’ll use **google/flan-t5-small** locally, in batches of 16, to produce
one question per chunk in just a few minutes.

In [None]:
%pip install -q transformers accelerate jsonlines tqdm

import textwrap, pathlib, jsonlines, tqdm
from transformers import pipeline
from src.chunker import chunk_documents
import pandas as pd

In [None]:
# Rebuild docs and chunks from disk so this section does not depend on
# variables left over from Section 3.
docs = pd.read_parquet("data/processed/docs.parquet").to_dict("records")
chunks = chunk_documents(docs, chunk_overlap=50, chunk_size=400)

# Local text2text pipeline; device=-1 keeps inference on the CPU.
generator = pipeline(
    "text2text-generation",
    device=-1,
    model="google/flan-t5-small",
)

# Prompt template: the passage is substituted in, wrapped in double quotes.
PROMPT = 'generate question: "{passage}"'

In [None]:
out_path = pathlib.Path("data/queries.jsonl")
batch_size = 16

if out_path.exists():
    # A finished file from an earlier run acts as the cache.
    print("Cached queries found:", out_path)
else:
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # Write to a scratch file and rename it only after every batch succeeded.
    # Otherwise an interrupted run would leave a truncated queries.jsonl that
    # the cache check above would silently accept on the next run.
    tmp_path = out_path.with_name(out_path.name + ".tmp")

    with jsonlines.open(tmp_path, "w") as writer:
        for start in tqdm.trange(0, len(chunks), batch_size, desc="batches"):
            batch = chunks[start : start + batch_size]
            prompts = [PROMPT.format(passage=c["text"].strip()) for c in batch]
            # Pass batch_size explicitly: without it the pipeline feeds the
            # prompts to the model one at a time even though we hand it a list.
            outputs = generator(
                prompts,
                max_new_tokens=48,
                batch_size=batch_size,
            )
            # One generated question per chunk, linked back to its chunk and
            # to the source document via parent_id.
            for c, out in zip(batch, outputs):
                q = out["generated_text"].strip()
                writer.write(
                    {"query": q, "chunk_id": c["id"], "parent_id": c["parent_id"]}
                )

    # Publish the complete file under its final name.
    tmp_path.replace(out_path)
    print("Saved", len(chunks), "queries to", out_path)

In [None]:
import random, jsonlines
# Read inside a context manager so the file handle is closed as soon as the
# rows are in memory.
with jsonlines.open(out_path) as reader:
    rows = list(reader)
print("Total queries:", len(rows))
# Show up to five random queries together with the chunk they came from.
for r in random.sample(rows, min(5, len(rows))):
    print(f"[{r['chunk_id']}] {r['query']}")

In [None]:
%pip install -q matplotlib

## 5. Evaluation (Recall@k)

We now measure how often a chunk from the correct source document (matched on `parent_id`) appears in the top k results for each synthetic query.  
We report Recall@1, @3 and @5, then save a zoomed bar chart under `figures/recall_baseline_zoom.png`.

In [None]:
import jsonlines
import numpy as np
import matplotlib.pyplot as plt
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma as LCChroma

# load golden queries (context manager closes the file once they are read)
with jsonlines.open("data/queries.jsonl") as reader:
    queries = list(reader)
print(f"Loaded {len(queries)} synthetic queries")

In [None]:
# sanity check: evaluation needs both the golden queries and the vector store.
# Each prerequisite gets its own message so the hint points at the cell that
# is actually missing.
assert "queries" in globals(), \
    "You need to load data/queries.jsonl first (first cell of Section 5)"
assert "retriever_vs" in globals(), \
    "You need to run Section 3 (chunking & indexing) first"

def recall_at_k(queries, retriever, k):
    """
    Fraction of queries whose gold document shows up in the top-k results.

    queries:   sequence of dicts with keys "query" and "parent_id"
    retriever: object exposing .invoke(query) that returns documents carrying
               a metadata["parent_id"] entry
    k:         top-k cutoff; only the first k returned documents are counted

    Returns a float in [0, 1]; 0.0 when `queries` is empty.
    """
    if not queries:
        # Nothing to evaluate: avoid dividing by zero.
        return 0.0

    hits = 0
    for entry in queries:
        # Enforce the cutoff here as well, so the score is a true Recall@k
        # even if the retriever was configured to return more than k results.
        retrieved = retriever.invoke(entry["query"])[:k]
        gold = entry["parent_id"]
        if any(doc.metadata["parent_id"] == gold for doc in retrieved):
            hits += 1
    return hits / len(queries)

In [None]:
# Cutoffs to evaluate and the score obtained at each one.
ks = [1, 3, 5]
results = {}

for k in ks:
    # Fresh retriever view over the baseline store, configured with this k.
    retriever = retriever_vs.as_retriever(search_kwargs=dict(k=k))
    results[k] = score = recall_at_k(queries, retriever, k)
    print("Recall@{}: {:.3f}".format(k, score))

In [None]:
import pathlib
pathlib.Path("figures").mkdir(exist_ok=True)

scores = [results[k] for k in ks]

# Zoom into the 0.88-1 band (ticks every 0.02) only when every bar is visible
# there. If any score is at or below 0.88 the zoom would hide that bar, so
# fall back to the full 0-1 axis with ticks every 0.1.
if min(scores) > 0.88:
    y_floor, y_step = 0.88, 0.02
else:
    y_floor, y_step = 0.0, 0.1

plt.figure(figsize=(5, 3))
plt.bar([str(k) for k in ks], scores)

plt.ylim(y_floor, 1)
# Half a step past 1.0 so the top tick itself is included.
yt = np.arange(y_floor, 1.0 + y_step / 2, y_step)
plt.yticks(yt, [f"{y:.2f}" for y in yt])

plt.title("Baseline Recall@k")
plt.xlabel("k")
plt.ylabel("Recall")
plt.grid(axis="y", alpha=0.3)

# save and show
plt.tight_layout()
plt.savefig("figures/recall_baseline_zoom.png", dpi=150)
plt.show()

## 6. Tweak Experiment - Embedding-Model Swap

We leave the existing index intact and then build a second retriever with  
`multi-qa-MiniLM-L6-dot-v1`. Finally, we recompute Recall@{1,3,5} and plot  
baseline vs. multi-QA side by side. No changes to earlier cells required.

In [None]:
# sanity check: both Section 3 artefacts must already be in the namespace
assert all(name in globals() for name in ("chunks", "retriever_vs")), \
    "You need to run Section 3 (chunking & indexing) first"

from src.indexer import embed_and_index

# Same chunks and same persist folder as the baseline; only the embedding
# model (and therefore the collection name) differs.
alt_retriever = embed_and_index(
    chunks,
    model_name="multi-qa-MiniLM-L6-dot-v1",
    collection_name="ag_multiQA",
    persist_path="data/chroma",
)

print("Built alternative retriever with multi-qa-MiniLM-L6-dot-v1")
    

In [None]:
ks = [1, 3, 5]

# Recall@k of the baseline store, one entry per cutoff.
baseline_scores = {}
for k in ks:
    baseline_view = retriever_vs.as_retriever(search_kwargs={"k": k})
    baseline_scores[k] = recall_at_k(queries, baseline_view, k)

# Same measurement for the store built with the alternative embedding model.
alt_scores = {}
for k in ks:
    alt_view = alt_retriever.as_retriever(search_kwargs={"k": k})
    alt_scores[k] = recall_at_k(queries, alt_view, k)

print("Baseline recall:", baseline_scores)
print("Multi-QA Recall:", alt_scores)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pathlib

pathlib.Path("figures").mkdir(exist_ok=True)

# One group of two bars per cutoff: baseline on the left, alternative on the
# right, each shifted by half a bar width around the group centre.
x = np.arange(len(ks))
width = 0.35

plt.figure(figsize=(6, 4))
plt.bar(x - width / 2, [baseline_scores[k] for k in ks],
        width, label="all-MiniLM-L6-v2")
plt.bar(x + width / 2, [alt_scores[k] for k in ks],
        width, label="multi-qa-MiniLM")

plt.xticks(x, [str(k) for k in ks])
plt.ylim(0, 1)
plt.xlabel("k")
plt.ylabel("Recall")
plt.title("Recall@k: Baseline vs multi-qa-MiniLM")
plt.legend()
plt.grid(axis="y", alpha=0.3)
plt.tight_layout()

# Spell out the .png extension like the other figures do, instead of relying
# on matplotlib's default output format to pick it.
plt.savefig("figures/recall_comparison_embedding.png", dpi=150)
plt.show()

## 7. Bonus Task - Matrix Benchmark of Embedding & Chunking
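We benchmark every combination of 2 embedding models and 3 chunking configurations (chunk size / overlap), a 2 × 3 matrix, and report Recall@1, @3 and @5 for each combination as a heatmap and a grouped bar chart.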

In [None]:
from itertools import product
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import jsonlines, tqdm

from src.chunker import chunk_documents
from src.indexer import embed_and_index

# load the golden queries (as dicts); the context manager closes the file
with jsonlines.open("data/queries.jsonl") as reader:
    queries = list(reader)
# each entry is now a dict with keys "query", "chunk_id" and "parent_id"
# (the fields written by the query-generation cell); only "query" and
# "parent_id" are used for scoring below

#re-define recall_at_k to match this structure
def recall_at_k(queries, retriever, k):
    """
    Fraction of queries whose gold document shows up in the top-k results.

    queries: list of dicts with keys "query" and "parent_id"
    retriever: a LangChain retriever (with .invoke(...)) whose results carry
               a metadata["parent_id"] entry
    k: top-k cutoff; only the first k returned documents are counted

    Returns a float in [0, 1]; 0.0 when `queries` is empty.
    """
    if not queries:
        # Nothing to evaluate: avoid dividing by zero.
        return 0.0

    hits = 0
    for entry in queries:
        q = entry["query"]
        gold = entry["parent_id"]
        # Enforce the cutoff here as well, so the score is a true Recall@k
        # even if the retriever was configured to return more than k results.
        retrieved = retriever.invoke(q)[:k]
        if any(d.metadata["parent_id"] == gold for d in retrieved):
            hits += 1
    return hits / len(queries)

# Benchmark grid: every embedding model is paired with every
# (chunk_size, chunk_overlap) configuration and scored at every cutoff.
embed_models = ["all-MiniLM-L6-v2", "multi-qa-MiniLM-L6-dot-v1"]
chunk_configs = [(400, 50), (200, 100), (100, 25)]
ks = [1, 3, 5]
chunk_labels = [
    "{}/{}".format(size, overlap) for size, overlap in chunk_configs
]

# One empty models x chunk-configs table per cutoff, filled in by the grid run.
results = {}
for k in ks:
    results[k] = pd.DataFrame(
        index=embed_models,
        columns=chunk_labels,
        dtype=float,
    )

# For every (model, chunk config) pair: chunk the corpus again, build an
# index in its own collection, then score it at each cutoff.
for model_name in embed_models:
    for cs, ov in chunk_configs:
        label = f"{cs}/{ov}"
        print(f">> {model_name} | chunk={label} →", end=" ")

        # chunk the documents with this size/overlap
        alt_chunks = chunk_documents(docs, chunk_overlap=ov, chunk_size=cs)
        print(f"{len(alt_chunks)} chunks")

        # collection name: model name (slashes replaced) + size + overlap
        collection = "_".join([model_name.replace("/", "_"), str(cs), str(ov)])
        retriever = embed_and_index(
            alt_chunks,
            model_name=model_name,
            persist_path="data/chroma_grid",
            collection_name=collection,
        )

        # store Recall@k in the table cell for this model / chunk config
        for k in ks:
            retr = retriever.as_retriever(search_kwargs=dict(k=k))
            score = recall_at_k(queries, retr, k)
            results[k].at[model_name, label] = score
            print("    Recall@{}: {:.3f}".format(k, score))
        print()

import pathlib

# savefig() does not create missing folders, so make sure the target exists
# even when the earlier plotting cells were skipped.
pathlib.Path("figures").mkdir(exist_ok=True)

for k, df in results.items():
    mat = df.values.astype(float)

    # heatmap: rows are embedding models, columns are chunk configurations
    plt.figure(figsize=(5, 3))
    im = plt.imshow(mat, vmin=0, vmax=1, aspect="auto")
    plt.colorbar(im, label="Recall")
    plt.yticks(range(len(embed_models)), embed_models)
    plt.xticks(range(len(chunk_labels)), chunk_labels, rotation=45)
    plt.title(f"Heatmap Recall@{k}")
    plt.xlabel("chunk_size/overlap")
    plt.tight_layout()
    plt.savefig(f"figures/recall_matrix_k{k}_heatmap.png", dpi=150)
    plt.show()

    # grouped bar chart: one group per embedding model, one bar per chunk config
    x = np.arange(len(embed_models))
    width = 0.25
    # Centre each group on its tick for any number of chunk configs
    # (for three configs this yields the offsets -width, 0, +width).
    offsets = (np.arange(len(chunk_labels)) - (len(chunk_labels) - 1) / 2) * width
    plt.figure(figsize=(6, 3))
    for offset, lab in zip(offsets, chunk_labels):
        scores = df[lab].astype(float).values
        plt.bar(x + offset, scores, width, label=lab)

    plt.ylim(0, 1)
    plt.xticks(x, embed_models)
    plt.ylabel("Recall")
    plt.title(f"Recall@{k} by Embedding & Chunking")
    plt.legend(title="chunk/overlap", bbox_to_anchor=(1.02, 1), loc="upper left")
    plt.tight_layout()
    plt.savefig(f"figures/recall_matrix_k{k}_barchart.png", dpi=150)
    plt.show()