# Notebook ② – Synthetic Q‑A & Report Dataset Creator

This notebook builds a 50‑item gold set for evaluation.  
You can increase `NUM_EXAMPLES` if you want a larger dev set.

In [3]:
import random, json, pandas as pd, numpy as np, datetime as dt
from pathlib import Path
from llama_index.core import StorageContext, load_index_from_storage, QueryBundle

NUM_EXAMPLES = 50

# ✅ Create storage context and load index from disk
storage_context = StorageContext.from_defaults(persist_dir="lade_chroma")
index = load_index_from_storage(storage_context)
retriever = index.as_retriever(search_kwargs={"k": 1})


Loading llama_index.core.storage.kvstore.simple_kvstore from lade_chroma\docstore.json.


FileNotFoundError: [Errno 2] No such file or directory: 'c:/Users/Admin/Documents/Research LaDe/lade_rag_pipeline/lade_chroma/docstore.json'

In [None]:
def kpi_count_packages(doc_json):
    return len(json.loads(doc_json))

templates = [
    "How many packages did courier {cid} deliver on {date}?",
    "Total parcels for courier {cid} on {date}?"
]
examples = []
rng = random.Random(42)

for _ in range(NUM_EXAMPLES):
    node = random.choice(index.docstore.get_nodes())
    cid = node.metadata['courier']
    date = node.metadata['date']
    question = rng.choice(templates).format(cid=cid, date=date)
    answer = kpi_count_packages(node.text)
    examples.append({
        "question": question,
        "answer": str(answer),
        "doc_id": node.node_id,
        "context": node.text
    })
pd.DataFrame(examples).to_csv("eval_qa50.csv", index=False)
print("Saved eval_qa50.csv")


### Optional polishing with GPT‑4o  
Uncomment and set `OPENAI_API_KEY` to paraphrase numeric answers into full sentences.

In [None]:
# import os, openai, pandas as pd
# openai.api_key = os.getenv("OPENAI_API_KEY")
# df = pd.read_csv("eval_qa50.csv")
# def polish(row):
#     prompt = f"Write a one‑sentence answer: {row['answer']} packages."
#     return openai.ChatCompletion.create(model="gpt-4o", messages=[{"role":"user","content":prompt}]).choices[0].message.content
# df['answer'] = df.apply(polish, axis=1)
# df.to_csv("eval_qa50_polished.csv", index=False)


> **Next**: run `RAG_Pipeline_Evaluation.ipynb`.