In [None]:
# Setup
import pandas as pd

from rofa.papers.from_answers_to_hypotheses import analysis


In [None]:
# Get run artifacts
from pathlib import Path
from urllib.parse import urlparse

from rofa.core.io import download, unpack_zip

# Run locations. Each run can be given either as a local folder or as a release
# asset URL; leave the option you are not using as an empty string. When both
# are set, `resolve_run_input` (defined below) returns the local folder and the
# URL is never downloaded.
# NOTE(review): the folder paths below are machine-specific absolute Windows
# paths — edit them (or blank them and fill in the URLs) before running elsewhere.

# Greedy run (local folder OR release asset URL)
run_dir_greedy = r"G:\My Drive\rofa_runs\runs\greedy_c1548993ddf6_20260108_165715"
greedy_asset_url = ""

# k-sample ensemble run (local folder OR release asset URL)
run_dir_k_sample = r"G:\My Drive\rofa_runs\runs\k_sample_ensemble_c1548993ddf6_20260108_203546"
k_sample_asset_url = ""

def _download_and_unpack(asset_url: str) -> str:
    """Fetch a run archive from ``asset_url`` and unpack it under ``runs/``.

    The archive is saved as ``runs/<filename>``, where ``<filename>`` is the last
    component of the URL path (``run.zip`` when the path has none). It is then
    unpacked into ``runs/<filename stem>``. If the filename has no extension the
    stem equals the filename, so ``runs/<filename>_unpacked`` is used instead to
    keep the unpack directory distinct from the archive file.

    Args:
        asset_url: URL handed to ``download``; only its path component is used
            to derive the local filename.

    Returns:
        The unpack directory as a string, relative to the current working
        directory.
    """
    runs_root = Path("runs")
    runs_root.mkdir(exist_ok=True)
    filename = Path(urlparse(asset_url).path).name or "run.zip"
    zip_path = runs_root / filename
    download(asset_url, str(zip_path))
    run_dir = runs_root / zip_path.stem
    if run_dir == zip_path:
        # Extension-less asset name: stem == name, so the unpack directory would
        # be the very path the archive was just written to.
        run_dir = runs_root / f"{zip_path.name}_unpacked"
    unpack_zip(str(zip_path), str(run_dir))
    return str(run_dir)

def resolve_run_input(run_dir: str, asset_url: str) -> str:
    """Choose where a run should be loaded from.

    Args:
        run_dir: Local run folder; used as-is whenever it is non-empty.
        asset_url: Release asset URL; only consulted when ``run_dir`` is empty.

    Returns:
        ``run_dir`` if given, otherwise the directory produced by
        ``_download_and_unpack(asset_url)``, or ``""`` when neither is provided.
    """
    if not run_dir:
        # No local folder: fall back to the asset URL, if there is one.
        return _download_and_unpack(asset_url) if asset_url else ""
    return run_dir

# Resolve the greedy run first, then the k-sample run; anything that resolves
# to an empty string is dropped.
run_inputs = list(
    filter(
        None,
        (
            resolve_run_input(local_dir, url)
            for local_dir, url in (
                (run_dir_greedy, greedy_asset_url),
                (run_dir_k_sample, k_sample_asset_url),
            )
        ),
    )
)

# Both runs are required, so stop here unless two locations were resolved.
if len(run_inputs) < 2:
    raise ValueError("Provide both greedy and k-sample ensemble runs.")


In [None]:
# Load + validate
# load_paper_runs takes the resolved run locations and returns the greedy frame,
# the k-sample (branches) frame and a metadata mapping.
df_greedy, df_branches, metadata = analysis.load_paper_runs(run_inputs)

# Columns each run must provide for the validation below to pass.
required_greedy_cols = {"gold", "prediction", "is_correct"}
required_branch_cols = {"gold", "leader", "leader_correct", "max_frac", "branch_preds"}

# Both gaps are computed up front; a problem with the greedy run is reported first.
missing_greedy = required_greedy_cols - set(df_greedy.columns)
missing_branches = required_branch_cols - set(df_branches.columns)
if missing_greedy:
    raise ValueError(f"Greedy run missing required columns: {missing_greedy}")
elif missing_branches:
    raise ValueError(f"k-sample run missing required columns: {missing_branches}")

# Sanity printout: frame shapes and the run locations recorded in the metadata.
print(f"df_greedy: {df_greedy.shape}")
print(f"df_branches: {df_branches.shape}")
print(f"Resolved runs: {metadata['resolved_runs']}")


In [None]:
# R1: greedy accuracy
# One-row metric/value table; the export cell (R11) reads the value back from it.
df_greedy_accuracy = pd.DataFrame(
    dict(
        metric=["greedy_accuracy"],
        value=[analysis.accuracy_greedy(df_greedy)],
    )
)
df_greedy_accuracy


In [None]:
# R2: leader accuracy
# One-row metric/value table; the export cell (R11) reads the value back from it.
df_leader_accuracy = pd.DataFrame(
    dict(
        metric=["leader_accuracy"],
        value=[analysis.accuracy_leader(df_branches)],
    )
)
df_leader_accuracy


In [None]:
# R3: distribution of max_frac
# Move the distribution's index into a regular column, then give the two
# resulting columns fixed names for display and for the CSV export.
df_max_frac = (
    analysis.max_frac_distribution(df_branches)
    .reset_index()
    .set_axis(["max_frac_bin", "count"], axis=1)
)
df_max_frac


In [None]:
# Figure 1: accuracy vs internal consensus (max_frac_exact)
import matplotlib.pyplot as plt

df_max_frac_exact = analysis.accuracy_by_max_frac_exact(df_branches)

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(
    df_max_frac_exact["max_frac_exact"],
    df_max_frac_exact["accuracy"],
    marker="o",
)
# NOTE(review): "N=10" in the x-label is hard-coded text — confirm it matches
# the number of samples in the k-sample run being plotted.
ax.set(
    xlabel="max_frac_exact (leader fraction, N=10)",
    ylabel="Accuracy",
    title="Accuracy vs Internal Consensus (max_frac_exact)",
    ylim=(0.0, 1.0),
)
fig.tight_layout()
# The PNG is written to the current working directory.
fig.savefig("figure1_max_frac_exact.png", dpi=300)
plt.show()
df_max_frac_exact


In [None]:
# R4: unanimous stats
# Keep the raw stats object in its own variable: the export cell (R11) stores it
# under the "unanimous" key of paper_report.json.
unanimous_stats = analysis.unanimous_stats(df_branches)
# Wrapped in a list so the stats display as a single table row
# (assumes the stats object is a dict-like record — TODO confirm).
df_unanimous = pd.DataFrame([unanimous_stats])
df_unanimous


In [None]:
# R5: near-unanimous stats
# threshold=0.9 is forwarded to analysis.near_unanimous_stats; presumably it is the
# minimum max_frac that counts as "near-unanimous" — TODO confirm in the analysis module.
# The raw stats are also stored under "near_unanimous" in paper_report.json by the
# export cell (R11).
near_unanimous_stats = analysis.near_unanimous_stats(df_branches, threshold=0.9)
# Wrapped in a list so the stats display as a single table row
# (assumes the stats object is a dict-like record — TODO confirm).
df_near_unanimous = pd.DataFrame([near_unanimous_stats])
df_near_unanimous


In [None]:
# R6: top-2 coverage
# The export cell (R11) reads the first entry of this table's "value" column as the
# top-2 coverage figure, so compute_table_top2 is expected to return a "value" column.
df_top2 = analysis.compute_table_top2(df_branches)
df_top2


In [None]:
# R7: R/W/Other breakdown by max_frac bins
# Displayed here and written to rw_other_breakdown.csv by the export cell (R11).
df_rw_other = analysis.rw_other_breakdown(df_branches)
df_rw_other


In [None]:
# R8: error modes (unanimous wrong)
# df_unanimous_wrong keeps the full result; only its first rows are displayed
# (head() defaults to five) to keep the cell output short.
# NOTE(review): assumes unanimous_wrong returns a DataFrame — confirm.
df_unanimous_wrong = analysis.unanimous_wrong(df_branches)
df_unanimous_wrong.head()


In [None]:
# R9: majority vote does not help (greedy vs leader)
# Join the two runs on every candidate key column that both frames carry.
merge_keys = [
    key
    for key in ["id", "index", "question"]
    if key in df_greedy.columns and key in df_branches.columns
]
if not merge_keys:
    raise ValueError("No shared keys available to merge greedy and k-sample runs.")

df_merged = df_greedy.merge(df_branches, on=merge_keys, suffixes=("_greedy", "_branches"))

# merge() appends the suffixes to any non-key column present in BOTH frames, so
# a correctness column keeps its plain name only when the other run lacks it.
# Look up whichever name the merge actually produced instead of assuming the
# plain one (which raises KeyError when the column was renamed).
greedy_correct_col = (
    "is_correct" if "is_correct" in df_merged.columns else "is_correct_greedy"
)
leader_correct_col = (
    "leader_correct"
    if "leader_correct" in df_merged.columns
    else "leader_correct_branches"
)

# Missing correctness values are counted as incorrect.
greedy_correct = df_merged[greedy_correct_col].fillna(False).astype(bool)
leader_correct = df_merged[leader_correct_col].fillna(False).astype(bool)

# Both accuracies are computed over the merged rows only, i.e. the questions
# that appear in both runs.
df_majority_vote = pd.DataFrame(
    {
        "metric": ["greedy_accuracy", "leader_accuracy"],
        "value": [greedy_correct.mean(), leader_correct.mean()],
    }
)
df_majority_vote


In [None]:
# R10: subject-wise breakdown (optional)
# Subject-level accuracy for each run, each computed from that run's own
# correctness column.
df_subject_greedy = analysis.subject_accuracy(df_greedy, accuracy_field="is_correct")
df_subject_branches = analysis.subject_accuracy(df_branches, accuracy_field="leader_correct")
# One column per run. If subject_accuracy returns a Series indexed by subject
# (not visible here — TODO confirm), the constructor aligns the two on that index.
df_subject_breakdown = pd.DataFrame(
    {
        "greedy_accuracy": df_subject_greedy,
        "leader_accuracy": df_subject_branches,
    }
)
# Show at most the first 20 rows; the full table is written to
# subject_accuracy.csv by the export cell (R11).
df_subject_breakdown.head(20)


In [None]:
# R11: export paper tables
import json

# Reports are grouped by the k-sample run's folder name, under notebooks/reports/
# relative to the current working directory.
run_id = Path(metadata["resolved_runs"]["k_sample_ensemble"]).name
report_dir = Path("notebooks") / "reports" / run_id
report_dir.mkdir(parents=True, exist_ok=True)


def _json_default(value):
    """Fallback encoder for values ``json`` cannot serialise natively.

    The stats objects come from the analysis module and may carry NumPy scalars
    or arrays; anything exposing ``tolist()`` is converted to plain Python
    values. Any other type raises ``TypeError``, as ``json`` would without a
    fallback.
    """
    if hasattr(value, "tolist"):
        return value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


# .iloc[0] takes the first row by position, so the lookup does not depend on the
# table happening to have an index label equal to 0.
paper_report = {
    "greedy_accuracy": float(df_greedy_accuracy["value"].iloc[0]),
    "leader_accuracy": float(df_leader_accuracy["value"].iloc[0]),
    "unanimous": unanimous_stats,
    "near_unanimous": near_unanimous_stats,
    "top2_coverage": float(df_top2["value"].iloc[0]),
}

with open(report_dir / "paper_report.json", "w", encoding="utf-8") as f:
    json.dump(paper_report, f, indent=2, default=_json_default)

# df_max_frac already holds its bins as a regular column (it was built with
# reset_index), so its index is omitted; the other two tables keep theirs.
df_max_frac.to_csv(report_dir / "max_frac_distribution.csv", index=False)
df_rw_other.to_csv(report_dir / "rw_other_breakdown.csv")
df_subject_breakdown.to_csv(report_dir / "subject_accuracy.csv")

print("Saved reports to", report_dir)


## Add your own analysis below
