# Generative Benchmark: End-to-End Notebook

## 1. Setup & Imports

## 5. Evaluation (Recall@k)
## 6. Tweak Experiment

In [None]:
%pip install -q datasets pyarrow tiktoken

In [None]:
%pip install -q ipywidgets

## 2. Load Data
We’ll fetch the first **500 articles** from the training split of the AG News corpus.
Each record is already a clean news snippet, which is ideal for a lightweight benchmark.

In [None]:
from pathlib import Path
import pandas as pd
from datasets import load_dataset 

# Number of leading training rows to load.
SAMPLE_SIZE = 500

# On-disk layout: RAW_DIR doubles as the dataset cache directory,
# PROC_DIR holds derived artifacts.
RAW_DIR = Path("data") / "raw"
PROC_DIR = Path("data") / "processed"
PROC_DIR.mkdir(parents=True, exist_ok=True)
RAW_DIR.mkdir(parents=True, exist_ok=True)

# The slice in `split` restricts the load to the first SAMPLE_SIZE train rows.
ag_ds = load_dataset("ag_news", split=f"train[:{SAMPLE_SIZE}]", cache_dir=str(RAW_DIR))

print(f"fetched {len(ag_ds):,} articles")

In [None]:
def to_doc(row, idx: int) -> dict:
    """
    Build a {'doc_id', 'content'} record from one dataset row.

    'doc_id' is "ag_" followed by idx zero-padded to at least four digits;
    'content' is the row's "text" value with surrounding whitespace removed.
    """
    doc_id = f"ag_{idx:04d}"
    content = row["text"].strip()
    return dict(doc_id=doc_id, content=content)

# Convert every dataset row into a doc record. Iterating the dataset with
# enumerate() replaces the index loop; each yielded element is expected to be
# the same row mapping that ag_ds[i] returns.
docs = [to_doc(row, idx) for idx, row in enumerate(ag_ds)]
# Quick sanity check: show the first id and the first 80 characters of its text.
print(docs[0]["doc_id"], "->", docs[0]["content"][:80], "...")

In [None]:
import numpy as np
import tiktoken

# Tokenize every document with the cl100k_base encoding and record how many
# tokens each one produces.
enc = tiktoken.get_encoding("cl100k_base")
lengths = [
    len(token_ids)
    for token_ids in (enc.encode(doc["content"]) for doc in docs)
]

# Summarize the token-count distribution on a single line.
print(
    f"avg tokens: {np.mean(lengths):.1f} | "
    f"min: {np.min(lengths)} | "
    f"max: {np.max(lengths)}"
)

In [None]:
from pathlib import Path
import pandas as pd

# Persist the doc records as a Parquet file (DataFrame index not written).
out_path = Path("data") / "processed" / "docs.parquet"
pd.DataFrame(data=docs).to_parquet(out_path, index=False)
print("saved -> ", out_path)

## 3. Chunk / Embed / Index

In [None]:
%pip install -q sentence-transformers chromadb langchain langchain-community tiktoken

In [None]:
pip install -U langchain-huggingface langchain-chroma

In [None]:
import pandas as pd
from src.chunker import chunk_documents

# Reload the saved documents as a list of per-row dicts, then split them with
# the project chunker (400 / 50 are passed as chunk_size / chunk_overlap).
docs = pd.read_parquet("data/processed/docs.parquet").to_dict("records")
chunks = chunk_documents(docs, chunk_overlap=50, chunk_size=400)

# Report the doc -> chunk fan-out and show one chunk id as a sanity check.
print(f"{len(docs)} docs -> {len(chunks)} chunks")
print("example chunk id :", chunks[0]["id"])

In [None]:
from src.indexer import embed_and_index

# Build the vector store from the chunks. embed_and_index comes from
# src.indexer (not shown here); presumably it embeds each chunk and stores it
# in the named collection persisted under persist_path — confirm against that
# module.
retriever_vs = embed_and_index(
    chunks,
    collection_name="ag_miniLM",
    persist_path="data/chroma",
)

# NOTE(review): _collection is an underscore-prefixed (non-public) attribute
# of the returned object, so this sanity check may break if the wrapped
# library changes its internals.
print("collection size:", retriever_vs._collection.count())

In [None]:
%pip install -q huggingface_hub jsonlines tqdm

## 4. Synthetic Query Generation (fast)

We’ll use **google/flan-t5-small** locally, in batches of 16, to produce
one question per chunk in just a few minutes.

In [None]:
%pip install -q transformers accelerate jsonlines tqdm

import textwrap, pathlib, jsonlines, tqdm
from transformers import pipeline
from src.chunker import chunk_documents
import pandas as pd

In [None]:
# Rebuild docs and chunks from the saved Parquet file so this section can run
# without the in-memory state of the earlier chunking cell.
docs = pd.read_parquet("data/processed/docs.parquet").to_dict(orient="records")
chunks = chunk_documents(docs, chunk_size=400, chunk_overlap=50)

# plain T5 format; {passage} is filled in with each chunk's text
PROMPT = "generate question: \"{passage}\""

# Local text2text pipeline used to turn passages into questions.
generator = pipeline(
    task="text2text-generation",
    model="google/flan-t5-small",
    device=-1,  # force cpu
)

In [None]:
# Generate one synthetic question per chunk and cache the results as JSON Lines.
out_path = pathlib.Path("data/queries.jsonl")
batch_size = 16

if out_path.exists():
    # A completed earlier run left its output behind; reuse it.
    print("Cached queries found:", out_path)
else:
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # Write to a sibling temp file and rename it only after every batch has
    # been written. An interrupted run then never leaves a partial file that
    # the exists() check above would later mistake for a complete cache.
    tmp_path = out_path.with_name(out_path.name + ".tmp")
    with jsonlines.open(tmp_path, "w") as writer:
        for i in tqdm.trange(0, len(chunks), batch_size, desc="batches"):
            batch = chunks[i : i + batch_size]
            prompts = [PROMPT.format(passage=c["text"].strip()) for c in batch]
            # Hand the whole slice of prompts to the pipeline in one call.
            # NOTE(review): passing a list may not by itself batch the model's
            # forward pass unless `batch_size=` is also given to the pipeline
            # call — confirm against the transformers pipeline docs.
            outputs = generator(prompts, max_new_tokens=48)
            for c, out in zip(batch, outputs):
                q = out["generated_text"].strip()
                # Keep both ids so each query can be traced back to its chunk
                # and to the chunk's parent.
                writer.write(
                    {"query": q, "chunk_id": c["id"], "parent_id": c["parent_id"]}
                )
    # Publish the finished file under its final name.
    tmp_path.replace(out_path)
    print("Saved", len(chunks), "queries to", out_path)

In [None]:
import random, jsonlines
# Load every cached query; the with-block closes the underlying file handle
# once the rows have been read.
with jsonlines.open(out_path) as reader:
    rows = list(reader)
print("Total queries:", len(rows))
# Show up to five random queries as a spot check.
for r in random.sample(rows, min(5, len(rows))):
    print(f"[{r['chunk_id']}] {r['query']}")