# Evaluation Demo (Module-based): Precision@k, Expert Ratings, Uncertainty Alignment, and Agreement

This notebook demonstrates the evaluation workflow using CSV-based synthetic demonstration data.

It:
1) Loads evaluation CSVs under `data/demo_evaluation/`
2) Computes retrieval metrics (citation precision@k)
3) Aggregates expert ratings (factuality, interpretability)
4) Computes inter-rater agreement (pairwise Cohen's κ)
5) Computes uncertainty alignment (system vs expert majority vote)
6) Produces a Table-1-like report table and summary statistics
7) Optionally exports outputs to `outputs/`


In [None]:
## 1) Imports
from pathlib import Path
import pandas as pd

# --- evaluation modules (core logic lives in /evaluation, notebook just calls) ---
from evaluation import (
    parse_relevance_list,
    add_precision_at_k,
    pairwise_cohens_kappa,
    aggregate_expert_scores,
    compute_expert_confidence_majority,
    compute_uncertainty_alignment,
    build_report_table,
    compute_summary_stats,
    export_outputs,
)

In [None]:
## 2)  Load CSV demo data
# Paths are resolved relative to the notebook's working directory.
REPO_ROOT = Path("..")  # notebooks/ is one level below repo root
EVAL_DIR = REPO_ROOT / "data" / "demo_evaluation"

# Fail fast with an explicit exception instead of `assert`: assertions are
# stripped when Python runs with -O, which would defer the failure to a less
# helpful error raised from the first read_csv call.
if not EVAL_DIR.exists():
    raise FileNotFoundError(f"demo_evaluation folder not found: {EVAL_DIR.resolve()}")

# One DataFrame per demo CSV; their required columns are validated in the
# schema-check cell that follows.
scenarios = pd.read_csv(EVAL_DIR / "scenarios.csv")
retrieval_relevance = pd.read_csv(EVAL_DIR / "retrieval_relevance.csv")
expert_ratings = pd.read_csv(EVAL_DIR / "expert_ratings.csv")
system_confidence = pd.read_csv(EVAL_DIR / "system_confidence.csv")

# Parse relevance_list: "1|0|1" -> [1,0,1]
retrieval_relevance["relevance_list"] = retrieval_relevance["relevance_list"].apply(parse_relevance_list)

# Last expression of the cell: preview the first rows of every table.
scenarios.head(), retrieval_relevance.head(), expert_ratings.head(), system_confidence.head()

In [None]:
def require_columns(df: pd.DataFrame, required: list[str], df_name: str) -> None:
    """Validate that *df* exposes every column listed in *required*.

    Args:
        df: Table whose column labels are inspected.
        required: Column names that must be present in ``df.columns``.
        df_name: Human-readable table name used in the error message.

    Raises:
        ValueError: If at least one required column is absent. The message
            lists the absent columns (in the order given by *required*) and
            every column that is available.
    """
    absent = [name for name in required if name not in df.columns]
    if not absent:
        return

    available = list(df.columns)
    message = (
        f"[SchemaError] {df_name} is missing columns: {absent}\n"
        f"Available columns: {available}"
    )
    raise ValueError(message)

# --- Schema checks (fail fast, avoid silent errors) ---
# Every table is validated before any metric is computed. require_columns
# raises ValueError for the first table that lacks a required column, so the
# success message below is only printed when all four tables pass.
require_columns(
    df=scenarios,
    df_name="scenarios.csv",
    required=["scenario_id", "pollutant", "outcome", "query"],
)

require_columns(
    df=retrieval_relevance,
    df_name="retrieval_relevance.csv",
    required=["scenario_id", "k", "relevance_list"],
)

require_columns(
    df=expert_ratings,
    df_name="expert_ratings.csv",
    required=["scenario_id", "expert", "factuality", "interpretability", "expert_confidence"],
)

require_columns(
    df=system_confidence,
    df_name="system_confidence.csv",
    required=["scenario_id", "system_confidence"],
)

print("Schema check passed ✅")


In [None]:
## 3) Compute citation precision@k
# Rebind retrieval_relevance to the table returned by add_precision_at_k.
# `relevance_list` holds the lists parsed in the loading cell; the returned
# table is expected to carry a `precision_at_k` column, which the merge step
# that follows selects.
# NOTE(review): the exact precision formula lives in the evaluation module —
# confirm there how `k` and the relevance flags are combined.
retrieval_relevance = add_precision_at_k(
    retrieval_relevance,
    relevance_col="relevance_list",
    out_col="precision_at_k",
)

# Last expression of the cell: display the updated table.
retrieval_relevance

In [None]:
## 4) Merge retrieval results with scenarios
# Left join keeps every scenario row, even when it has no retrieval entry;
# only the key, k and the precision score are carried over.
results = pd.merge(
    scenarios,
    retrieval_relevance[["scenario_id", "k", "precision_at_k"]],
    how="left",
    on="scenario_id",
)

# Last expression of the cell: display the merged table.
results

In [None]:
## 5) Aggregate expert ratings per scenario
# Aggregate the per-expert ratings, keyed by scenario_id.
# NOTE(review): the output columns are defined in the evaluation module; the
# summary-statistics cell reads `factuality_mean` and `interpretability_mean`
# from the merged results, so they are presumably produced here — confirm.
expert_agg = aggregate_expert_scores(
    expert_ratings,
    scenario_col="scenario_id",
    factuality_col="factuality",
    interpretability_col="interpretability",
)

# Last expression of the cell: display the aggregated table.
expert_agg

In [None]:
## 6) Merge expert aggregates and system confidence labels into results
# Attach the per-scenario expert aggregates, then the system confidence
# labels. Both are left joins on scenario_id, so no existing result row is
# dropped when a scenario is missing from either table.
results = pd.merge(results, expert_agg, how="left", on="scenario_id")
results = pd.merge(results, system_confidence, how="left", on="scenario_id")

# Last expression of the cell: display the merged table.
results


In [None]:
## 7) Inter-rater agreement: pairwise Cohen’s κ
# Agreement is computed separately for each rating dimension, with identical
# scenario / expert key columns for both calls.
kappa_factuality = pairwise_cohens_kappa(
    expert_ratings,
    label_col="factuality",
    scenario_col="scenario_id",
    expert_col="expert",
)
kappa_interpretability = pairwise_cohens_kappa(
    expert_ratings,
    label_col="interpretability",
    scenario_col="scenario_id",
    expert_col="expert",
)

# Last expression of the cell: display both agreement results.
kappa_factuality, kappa_interpretability


In [None]:
## 8) Uncertainty alignment: system vs expert majority vote
# Derive one expert confidence label per scenario from the individual
# `expert_confidence` ratings.
# NOTE(review): tie-breaking for the majority vote is defined in the
# evaluation module — confirm how ties are resolved.
expert_conf_majority = compute_expert_confidence_majority(
    expert_ratings,
    scenario_col="scenario_id",
    conf_col="expert_confidence",
)

# Compare the system's confidence label with the expert majority label.
# The returned table is expected to expose `scenario_id`,
# `expert_confidence_majority` and `aligned`, which a later cell selects.
ua = compute_uncertainty_alignment(
    system_confidence_df=system_confidence,
    expert_majority_df=expert_conf_majority,
    scenario_col="scenario_id",
    system_conf_col="system_confidence",
    expert_conf_col="expert_confidence_majority",
)

# Last expression of the cell: display both tables.
expert_conf_majority, ua


In [None]:
## 9) Combine uncertainty alignment into results
# Bring the expert majority label and the alignment flag into results.
# The left join on scenario_id keeps every existing result row.
results = pd.merge(
    results,
    ua[["scenario_id", "expert_confidence_majority", "aligned"]],
    how="left",
    on="scenario_id",
)

# Last expression of the cell: display the merged table.
results


In [None]:
## 10) Build Table-1-like report table
# Build the report table from the fully merged results.
# NOTE(review): column selection and formatting are defined in the evaluation
# module — confirm which columns of `results` it requires.
report_table = build_report_table(results)
# Last expression of the cell: display the report table.
report_table


In [None]:
## 11) Summary statistics (mean ± SD + alignment rate + κ means)
# Compute summary statistics from the merged results and the two κ results.
# The *_col arguments name columns of `results`: `precision_at_k` and
# `aligned` are merged in by earlier cells.
# NOTE(review): `factuality_mean` / `interpretability_mean` are presumably
# produced by aggregate_expert_scores — confirm they exist in `results`.
summary_stats = compute_summary_stats(
    results=results,
    kappa_factuality=kappa_factuality,
    kappa_interpretability=kappa_interpretability,
    precision_col="precision_at_k",
    factuality_col="factuality_mean",
    interpretability_col="interpretability_mean",
    alignment_col="aligned",
)

# Last expression of the cell: display the summary statistics.
summary_stats


In [None]:
## 12) Export outputs
# Outputs go to <repo root>/outputs, resolved relative to the notebook's
# working directory via REPO_ROOT.
# NOTE(review): it is not visible here whether export_outputs creates OUT_DIR
# when it does not exist, or which file names it writes — confirm in the
# evaluation module.
OUT_DIR = REPO_ROOT / "outputs"
export_outputs(
    out_dir=OUT_DIR,
    report_table=report_table,
    summary_stats=summary_stats,
    kappa_factuality=kappa_factuality,
    kappa_interpretability=kappa_interpretability,
)
# Print the absolute path so the export location is unambiguous.
print("Exported outputs to:", OUT_DIR.resolve())