# Evaluation Metrics – t-test, CI, and Cohen's d

In [None]:

# Evaluation Metrics – t-test, CI, and Cohen's d
# This notebook loads (or synthesises) experiment CSVs and computes statistics.
import os
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

# Root working directory for this notebook's data (hard-coded absolute path).
base_dir = "/mnt/data"
# Subdirectory that holds the per-benchmark result CSVs.
csv_dir = os.path.join(base_dir, "csv")

# Create the CSV directory (and any missing parents) up front;
# exist_ok=True makes this a no-op when it is already there.
os.makedirs(csv_dir, exist_ok=True)

# If CSVs are missing, synthesise demo data approximating the paper's numbers.
def maybe_make_demo_csv(path, task, baseline_rate, mitigated_rate, n_prompts=2000, n_runs=20, seed=7):
    """Write a synthetic per-prompt results CSV to *path* unless it already exists.

    Each of the ``n_runs`` runs contributes ``n_prompts`` rows. The two
    hallucination flags are independent Bernoulli draws with the given rates,
    so runs differ only by sampling noise (the rates themselves are constant).

    Args:
        path: Destination CSV file. If it exists, nothing is written.
        task: Task label stored in the ``task`` column and used as the
            ``prompt_id`` prefix.
        baseline_rate: Probability that a baseline answer is flagged.
        mitigated_rate: Probability that a mitigated answer is flagged.
        n_prompts: Number of rows generated per run.
        n_runs: Number of runs; ``run_id`` goes from 1 to ``n_runs``.
        seed: Seed for the NumPy random generator (makes output reproducible).

    Returns:
        None. The only effect is the file written at *path*.
    """
    if os.path.exists(path):
        return

    columns = [
        "prompt_id",
        "run_id",
        "task",
        "is_hallucination_baseline",
        "is_hallucination_mitigated",
        "latency_ms",
    ]
    rng = np.random.default_rng(seed)
    frames = []
    for run_id in range(1, n_runs + 1):
        baseline_flags = rng.binomial(1, baseline_rate, size=n_prompts)
        mitigated_flags = rng.binomial(1, mitigated_rate, size=n_prompts)
        # Draw the whole run's latencies as an array: a scalar draw is a plain
        # Python float, which has no .clip() method. Clamp to [800, 1800] ms
        # and truncate to whole milliseconds.
        latency_ms = np.clip(
            rng.normal(1200, 120, size=n_prompts), 800, 1800
        ).astype(int)
        frames.append(pd.DataFrame({
            "prompt_id": [f"{task}_{run_id}_{i:04d}" for i in range(1, n_prompts + 1)],
            "run_id": np.full(n_prompts, run_id, dtype=int),
            "task": [task] * n_prompts,
            "is_hallucination_baseline": baseline_flags,
            "is_hallucination_mitigated": mitigated_flags,
            "latency_ms": latency_ms,
        }, columns=columns))

    # With zero runs there is nothing to concatenate; still emit the header.
    if frames:
        demo = pd.concat(frames, ignore_index=True)
    else:
        demo = pd.DataFrame(columns=columns)
    demo.to_csv(path, index=False)

# One results file per benchmark, side by side in the CSV directory.
gsm_path, tqa_path = (
    os.path.join(csv_dir, filename)
    for filename in ("gsm8k_results.csv", "truthfulqa_results.csv")
)

# Synthesise any file that is not on disk yet; existing files are left alone.
maybe_make_demo_csv(path=gsm_path, task="GSM8K", baseline_rate=0.38, mitigated_rate=0.29)
maybe_make_demo_csv(path=tqa_path, task="TruthfulQA", baseline_rate=0.62, mitigated_rate=0.43)

# Load and compute stats
def compute_stats(path):
    """Summarise baseline vs. mitigated hallucination rates from a results CSV.

    The CSV must contain ``task``, ``run_id``, ``is_hallucination_baseline``
    and ``is_hallucination_mitigated`` columns. Rows are averaged per
    ``(task, run_id)`` so that each run contributes one rate per condition,
    and all statistics are computed across those per-run rates.

    Args:
        path: Path of the CSV file to read.

    Returns:
        A dict with keys ``baseline_mean``, ``mitigated_mean``, ``mean_diff``
        (baseline minus mitigated), ``ci_95`` (a ``(low, high)`` tuple for the
        mean difference), ``t_stat`` and ``p_value`` (paired t-test across
        runs), and ``cohen_d``. Values are plain Python floats; with fewer
        than two runs the spread-based entries are NaN.
    """
    df = pd.read_csv(path)
    per_run = df.groupby(["task", "run_id"]).mean(numeric_only=True)
    b = per_run["is_hallucination_baseline"].values
    m = per_run["is_hallucination_mitigated"].values

    # Paired t-test across runs.
    t_stat, p_val = stats.ttest_rel(b, m)

    # Cohen's d standardised by the average of the two condition variances
    # (an approximation for paired data; it ignores the pairing correlation).
    pooled_sd = np.sqrt((b.std(ddof=1) ** 2 + m.std(ddof=1) ** 2) / 2)
    d = (b.mean() - m.mean()) / pooled_sd

    # 95% CI for the mean paired difference. The critical value comes from
    # the t distribution with n-1 degrees of freedom so the interval agrees
    # with the paired t-test above; the normal value 1.96 is too narrow when
    # there are only a handful of runs.
    diff = b - m
    n_runs = len(diff)
    mean_diff = diff.mean()
    se = diff.std(ddof=1) / np.sqrt(n_runs)
    t_crit = stats.t.ppf(0.975, df=n_runs - 1)
    ci_low = mean_diff - t_crit * se
    ci_high = mean_diff + t_crit * se

    return {
        "baseline_mean": float(b.mean()), "mitigated_mean": float(m.mean()),
        "mean_diff": float(mean_diff), "ci_95": (float(ci_low), float(ci_high)),
        "t_stat": float(t_stat), "p_value": float(p_val), "cohen_d": float(d),
    }

# Summarise both benchmarks first, then echo each raw result dict.
gsm_stats, tqa_stats = compute_stats(gsm_path), compute_stats(tqa_path)

for _heading, _summary in (("GSM8K:", gsm_stats), ("TruthfulQA:", tqa_stats)):
    print(_heading, _summary)

# Grouped bar chart: for each task, the baseline rate next to the mitigated rate.
labels = ["GSM8K", "TruthfulQA"]
baseline_vals = [summary["baseline_mean"] for summary in (gsm_stats, tqa_stats)]
mitig_vals = [summary["mitigated_mean"] for summary in (gsm_stats, tqa_stats)]

# One tick per task; the two bars of width w sit either side of the tick.
x, w = np.arange(len(labels)), 0.35

plt.figure(figsize=(6, 4))
plt.bar(x - w / 2, baseline_vals, width=w, label="Baseline")
plt.bar(x + w / 2, mitig_vals, width=w, label="With Basins")
plt.title("Baseline vs With Basins")
plt.ylabel("Hallucination rate (mean across runs)")
plt.xticks(ticks=x, labels=labels)
plt.legend()
plt.tight_layout()
plt.show()
