In [3]:
from google.colab import drive
drive.mount('/content/drive')

import os
print(os.listdir('/content/drive/MyDrive/rag-pgh'))
from pathlib import Path
import json, re, string, collections, math

# --- Paths ---
root = Path("/content/drive/MyDrive/rag-pgh")
q_path   = root/"data/test/questions.txt"
ref_path = root/"data/test/reference_answers.json"

paths = {
    "sparse": root/"data/test/system_output_sparse.json",
    "dense":  root/"data/test/system_output_dense.json",
    "hybrid": root/"data/test/system_output_test.json",
}

# --- Normalization ---
def _normalize_text(s: str) -> str:
    s = s.lower().strip()
    s = s.translate(str.maketrans("", "", string.punctuation))
    return " ".join(s.split())

def _f1_single(pred: str, gold: str) -> float:
    ptoks = _normalize_text(pred).split()
    gtoks = _normalize_text(gold).split()
    if not ptoks and not gtoks: return 1.0
    if not ptoks or not gtoks:  return 0.0
    common = collections.Counter(ptoks) & collections.Counter(gtoks)
    num_same = sum(common.values())
    if num_same == 0: return 0.0
    precision = num_same / len(ptoks)
    recall    = num_same / len(gtoks)
    return 2 * precision * recall / (precision + recall)

def _best_em_f1(pred: str, gold_variants: str):
    """gold_variants may contain multiple references separated by ';'."""
    vars_ = [g.strip() for g in gold_variants.split(";") if g.strip()]
    if not vars_: return 0, 0.0
    pnorm = _normalize_text(pred)
    best_em, best_f1 = 0, 0.0
    for g in vars_:
        gnorm = _normalize_text(g)
        em = int(pnorm == gnorm)
        f1 = _f1_single(pred, g)
        best_em = max(best_em, em)
        best_f1 = max(best_f1, f1)
    return best_em, best_f1

# --- Buckets for analysis  ---
_MONTH = r"(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)"
_DATE  = rf"(\b{_MONTH}\b|\b{_MONTH}[a-z]*\b|\b\d{{1,2}}/\d{{1,2}}\b|\b\d{{4}}\b)"

_BUCKETS = {
    "when/date":   re.compile(rf"\b(when|date|year|on)\b|{_DATE}", re.I),

    "who/person":  re.compile(r"\b(who|artist|singer|performer|mayor)\b", re.I),

    #
    "where/venue": re.compile(r"\b(where|located|venue)\b|\b(Hall|Center|Park|Museum|Arena|Theater|Theatre|Stadium)\b", re.I),

    # events
    "events":      re.compile(r"\b(event|concert|festival|show)\b|\bPPG\b", re.I),

    "cmu-facts":   re.compile(r"\b(cmu|carnegie mellon)\b", re.I),
    "pgh-facts":   re.compile(r"\b(pittsburgh)\b", re.I),
}

_BUCKET_ORDER = ["when/date", "who/person", "where/venue", "events", "cmu-facts", "pgh-facts"]

def bucket_question(q: str) -> str:
    for name in _BUCKET_ORDER:
        if _BUCKETS[name].search(q):
            return name
    return "other"

# --- evaluation for a single system output ---
def evaluate_one(sys_path: Path):
    questions = [l.strip() for l in q_path.read_text(encoding="utf-8").splitlines() if l.strip()]
    gold = json.loads(ref_path.read_text(encoding="utf-8"))
    pred = json.loads(sys_path.read_text(encoding="utf-8"))

    exp = [str(i+1) for i in range(len(questions))]
    assert list(gold.keys()) == exp, "reference_answers.json keys must be '1'..'N'"
    assert list(pred.keys()) == exp, "system_output.json keys must be '1'..'N'"

    perq = []
    ems, f1s = [], []
    nonempty = nonunknown = 0

    for i, q in enumerate(questions, start=1):
        k = str(i)
        p = pred.get(k, "")
        g = gold.get(k, "")
        em, f1 = _best_em_f1(p, g)
        ems.append(em); f1s.append(f1)
        nonempty   += int(bool(p.strip()))
        nonunknown += int(p.strip().lower() not in ("unknown", ""))

        perq.append({
            "id": i, "bucket": bucket(q), "question": q,
            "pred": p, "gold": g, "EM": em, "F1": f1
        })

    # macro
    EM = sum(ems)/len(ems)
    F1 = sum(f1s)/len(f1s)

    # per bucket
    by_b = collections.defaultdict(lambda: {"n":0,"EM":0.0,"F1":0.0})
    for r in perq:
        b = r["bucket"]; by_b[b]["n"] += 1; by_b[b]["EM"] += r["EM"]; by_b[b]["F1"] += r["F1"]
    buckets = {b: {"n":v["n"], "EM": v["EM"]/v["n"] if v["n"] else 0.0,
                        "F1": v["F1"]/v["n"] if v["n"] else 0.0}
               for b,v in by_b.items()}

    return {
        "N": len(questions),
        "EM": EM, "F1": F1,
        "nonempty_rate": nonempty/len(questions),
        "nonunknown_rate": nonunknown/len(questions),
        "per_question": perq,
        "per_bucket": buckets
    }

def print_summary(name: str, r: dict):
    print(f"== {name} ==")
    print(f"N={r['N']}  EM={r['EM']:.3f}  F1={r['F1']:.3f}  non-empty={r['nonempty_rate']:.3f}  non-'unknown'={r['nonunknown_rate']:.3f}")
    if r["per_bucket"]:
        print("Per-bucket:")
        for b, v in sorted(r["per_bucket"].items()):
            print(f"  {b:12s}  n={v['n']:>3}  EM={v['EM']:.3f}  F1={v['F1']:.3f}")
    print()

results = {}
for name, p in paths.items():
    if p.exists():
        results[name] = evaluate_one(p)
        print_summary(name, results[name])
    else:
        print(f"[warn] Missing file for {name}: {p}")

# --- Simple ablation table ---
print("Ablation (EM/F1):")
for name in ["sparse", "dense", "hybrid"]:
    if name in results:
        print(f"  {name:6s}  EM={results[name]['EM']:.3f}   F1={results[name]['F1']:.3f}")

# --- Significance tests: paired t-test on F1, McNemar on EM ---
def paired_t_test_F1(resA, resB):
    A = [r["F1"] for r in resA["per_question"]]
    B = [r["F1"] for r in resB["per_question"]]
    assert len(A)==len(B)
    diffs = [a-b for a,b in zip(A,B)]
    n = len(diffs)
    mean_d = sum(diffs)/n
    var_d  = sum((d-mean_d)**2 for d in diffs)/(n-1) if n>1 else 0.0
    se     = (var_d/n)**0.5 if n>0 else 1.0
    t = mean_d/se if se>0 else 0.0
    # Normal approx to p-value (fine at N>=30)
    from math import erf, sqrt
    p = 2*(1 - 0.5*(1+erf(abs(t)/sqrt(2))))
    return t, p

def mcnemar_EM(resA, resB):
    A = [r["EM"] for r in resA["per_question"]]
    B = [r["EM"] for r in resB["per_question"]]
    b01 = sum(1 for a,b in zip(A,B) if a==0 and b==1)  # B correct, A wrong
    b10 = sum(1 for a,b in zip(A,B) if a==1 and b==0)  # A correct, B wrong
    # Continuity-corrected chi^2
    chi2 = ((abs(b01-b10)-1)**2) / (b01+b10) if (b01+b10)>0 else 0.0
    from math import erf, sqrt
    p = 1 - erf((chi2/2)**0.5)
    return b01, b10, chi2, p

if "hybrid" in results and "sparse" in results:
    t,p = paired_t_test_F1(results["hybrid"], results["sparse"])
    b01,b10,chi2,pm = mcnemar_EM(results["hybrid"], results["sparse"])
    print("\nHybrid vs Sparse:")
    print(f"  paired t-test on F1: t={t:.3f}, p≈{p:.4f}")
    print(f"  McNemar on EM: b01={b01}, b10={b10}, chi2={chi2:.3f}, p≈{pm:.4f}")

if "hybrid" in results and "dense" in results:
    t,p = paired_t_test_F1(results["hybrid"], results["dense"])
    b01,b10,chi2,pm = mcnemar_EM(results["hybrid"], results["dense"])
    print("\nHybrid vs Dense:")
    print(f"  paired t-test on F1: t={t:.3f}, p≈{p:.4f}")
    print(f"  McNemar on EM: b01={b01}, b10={b10}, chi2={chi2:.3f}, p≈{pm:.4f}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
['data', 'scripts', 'seeds.txt', 'rag.py', 'system_outputs', 'index', '__pycache__', 'run_rag_colab.ipynb']
== sparse ==
N=240  EM=0.087  F1=0.192  non-empty=1.000  non-'unknown'=0.867
Per-bucket:
  cmu-facts     n= 39  EM=0.154  F1=0.283
  events        n= 19  EM=0.053  F1=0.118
  other         n= 22  EM=0.091  F1=0.158
  pgh-facts     n= 68  EM=0.029  F1=0.098
  when/date     n= 31  EM=0.065  F1=0.231
  where/venue   n= 57  EM=0.105  F1=0.220
  who/person    n=  4  EM=0.500  F1=0.714

== dense ==
N=240  EM=0.121  F1=0.222  non-empty=1.000  non-'unknown'=0.871
Per-bucket:
  cmu-facts     n= 39  EM=0.128  F1=0.228
  events        n= 19  EM=0.105  F1=0.294
  other         n= 22  EM=0.091  F1=0.176
  pgh-facts     n= 68  EM=0.088  F1=0.137
  when/date     n= 31  EM=0.097  F1=0.228
  where/venue   n= 57  EM=0.140  F1=0.274
  who/person    n=  4  EM=0.750  F1=0.7