In [None]:
# Setup
import pandas as pd

# Editable-install the project into the kernel's environment. "." is resolved
# against the kernel's current working directory, so this only succeeds when
# the notebook is started from the repository root (the directory that holds
# setup.py / pyproject.toml); otherwise pip reports that "." is not a Python
# project and the import below relies on a previously installed copy.
%pip install -e .

# Analysis helpers used by the results cells (R1-R11) below.
from rofa.papers.from_answers_to_hypotheses import analysis


c:\Users\lavre
Defaulting to user installation because normal site-packages is not writeable
Obtaining file:///C:/Users/lavre
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: C:\Users\lavre\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip
ERROR: file:///C:/Users/lavre does not appear to be a Python project: neither 'setup.py' nor 'pyproject.toml' found.


In [23]:
# Get run artifacts
import os
from pathlib import Path
from urllib.parse import urlparse

from rofa.core.io import download, unpack_zip

# Run locations. Each run is given either as a local folder or as a release
# asset URL; when both are set, the local folder is used and the URL ignored.
# The literals below are machine-specific defaults. Override them without
# editing the notebook through the environment variables named here (set a
# *_RUN_DIR variable to an empty string to fall back to the asset URL).

# Greedy run (local folder OR release asset URL)
run_dir_greedy = os.environ.get(
    "ROFA_GREEDY_RUN_DIR",
    r"G:\My Drive\rofa_runs\runs\greedy_c9dcafe07752_20260109_171240",
)
greedy_asset_url = os.environ.get("ROFA_GREEDY_ASSET_URL", "")

# k-sample ensemble run (local folder OR release asset URL)
run_dir_k_sample = os.environ.get(
    "ROFA_K_SAMPLE_RUN_DIR",
    r"G:\My Drive\rofa_runs\runs\k_sample_ensemble_c9dcafe07752_20260109_185709",
)
k_sample_asset_url = os.environ.get("ROFA_K_SAMPLE_ASSET_URL", "")

def _download_and_unpack(asset_url: str) -> str:
    """Fetch a zipped run from ``asset_url`` and unpack it under ``runs/``.

    The archive is written to ``runs/<name>.zip`` (via ``download``) and then
    extracted into ``runs/<name>`` (via ``unpack_zip``). ``<name>`` is taken
    from the last component of the URL path, or ``run`` when the path has none.

    Args:
        asset_url: URL of the archive to fetch.

    Returns:
        The extraction directory as a string, relative to the current working
        directory.
    """
    runs_root = Path("runs")
    runs_root.mkdir(exist_ok=True)
    filename = Path(urlparse(asset_url).path).name or "run.zip"
    # Guarantee a ".zip" suffix. For a name without one, Path.stem is the whole
    # name, so the extraction directory would be the very path the archive was
    # just written to and unpacking would collide with the downloaded file.
    if Path(filename).suffix.lower() != ".zip":
        filename = f"{filename}.zip"
    zip_path = runs_root / filename
    download(asset_url, str(zip_path))
    run_dir = runs_root / zip_path.stem
    unpack_zip(str(zip_path), str(run_dir))
    return str(run_dir)

def resolve_run_input(run_dir: str, asset_url: str) -> str:
    """Pick the location to load a run from.

    A non-empty ``run_dir`` always wins and is returned unchanged. Otherwise a
    non-empty ``asset_url`` is downloaded and unpacked, and the resulting
    directory is returned. With neither given, the result is an empty string.
    """
    if not run_dir and asset_url:
        return _download_and_unpack(asset_url)
    return run_dir or ""

# Resolve the greedy run first, then the k-sample run, keeping only the
# entries that resolved to a non-empty location.
run_inputs = list(
    filter(
        None,
        (
            resolve_run_input(local_dir, url)
            for local_dir, url in (
                (run_dir_greedy, greedy_asset_url),
                (run_dir_k_sample, k_sample_asset_url),
            )
        ),
    )
)

# Both runs are needed by the cells that follow.
if len(run_inputs) < 2:
    raise ValueError("Provide both greedy and k-sample ensemble runs.")


In [24]:
# Load + validate
# load_paper_runs yields the greedy frame, the k-sample (branches) frame, and
# a metadata mapping whose "resolved_runs" entry is printed at the end.
df_greedy, df_branches, metadata = analysis.load_paper_runs(run_inputs)

# Columns this notebook requires from each frame.
required_greedy_cols = {"gold", "prediction", "is_correct"}
required_branch_cols = {"gold", "leader", "leader_correct", "max_frac", "branch_preds"}

# Work out what is absent from both frames, then fail fast (greedy first).
missing_greedy = required_greedy_cols.difference(df_greedy.columns)
missing_branches = required_branch_cols.difference(df_branches.columns)
if missing_greedy:
    raise ValueError(f"Greedy run missing required columns: {missing_greedy}")
if missing_branches:
    raise ValueError(f"k-sample run missing required columns: {missing_branches}")

# Quick sanity summary of what was loaded.
print("df_greedy:", df_greedy.shape)
print("df_branches:", df_branches.shape)
print("Resolved runs:", metadata["resolved_runs"])


df_greedy: (400, 14)
df_branches: (400, 16)
Resolved runs: {'greedy': 'G:\\My Drive\\rofa_runs\\runs\\greedy_c9dcafe07752_20260109_171240', 'k_sample_ensemble': 'G:\\My Drive\\rofa_runs\\runs\\k_sample_ensemble_c9dcafe07752_20260109_185709'}


In [25]:
# R1: greedy accuracy
# One-row metric/value table; the export cell (R11) reads its "value" entry.
df_greedy_accuracy = pd.DataFrame(
    [{"metric": "greedy_accuracy", "value": analysis.accuracy_greedy(df_greedy)}]
)
df_greedy_accuracy


Unnamed: 0,metric,value
0,greedy_accuracy,0.6575


In [26]:
# R2: leader accuracy
# One-row metric/value table; the export cell (R11) reads its "value" entry.
df_leader_accuracy = pd.DataFrame(
    [{"metric": "leader_accuracy", "value": analysis.accuracy_leader(df_branches)}]
)
df_leader_accuracy


Unnamed: 0,metric,value
0,leader_accuracy,0.6675


In [27]:
# R3: distribution of max_frac
# Flatten the helper's result into a two-column table: bin label and count.
max_frac_counts = analysis.max_frac_distribution(df_branches)
df_max_frac = max_frac_counts.reset_index()
df_max_frac.columns = ["max_frac_bin", "count"]
df_max_frac


Unnamed: 0,max_frac_bin,count
0,"(-0.001, 0.5]",51
1,"(0.5, 0.8]",127
2,"(0.8, 0.9]",71
3,"(0.9, 1.0]",151


In [None]:
# Figure 1: accuracy vs internal consensus (max_frac_exact)
import matplotlib.pyplot as plt

# Frame providing the "max_frac_exact" and "accuracy" columns plotted below.
df_max_frac_exact = analysis.accuracy_by_max_frac_exact(df_branches)

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(
    df_max_frac_exact["max_frac_exact"],
    df_max_frac_exact["accuracy"],
    marker="o",
)
ax.set(
    xlabel="max_frac_exact (leader fraction, N=10)",
    ylabel="Accuracy",
    title="Accuracy vs Internal Consensus (max_frac_exact)",
    ylim=(0.0, 1.0),
)
fig.tight_layout()
# Saved relative to the kernel's working directory.
fig.savefig("figure1_max_frac_exact.png", dpi=300)
plt.show()
df_max_frac_exact


In [28]:
# R4: unanimous stats
# Summary of the unanimous items, shown as a one-row table (the recorded
# output has "count" and "accuracy" columns). The raw `unanimous_stats`
# result is kept as well because the export cell (R11) writes it into
# paper_report.json.
unanimous_stats = analysis.unanimous_stats(df_branches)
df_unanimous = pd.DataFrame([unanimous_stats])
df_unanimous


Unnamed: 0,count,accuracy
0,151,0.86755


In [29]:
# R5: near-unanimous stats
# Same one-row summary as R4, computed with threshold=0.9. How the threshold
# is compared against max_frac (inclusive or exclusive) is decided inside
# analysis.near_unanimous_stats -- TODO confirm before quoting the count.
# The raw result is reused by the export cell (R11).
near_unanimous_stats = analysis.near_unanimous_stats(df_branches, threshold=0.9)
df_near_unanimous = pd.DataFrame([near_unanimous_stats])
df_near_unanimous


Unnamed: 0,count,accuracy
0,221,0.841629


In [30]:
# R6: top-2 coverage
# Table returned by the helper (the recorded output has "metric" and "value"
# columns); the export cell (R11) reads its first "value" entry.
df_top2 = analysis.compute_table_top2(df_branches)
df_top2


Unnamed: 0,metric,value
0,top2_coverage,0.805


In [31]:
# R7: R/W/Other breakdown by max_frac bins
# Per-bin counts for each label, one row per max_frac bin and one column per
# label (R, W, Other) in the recorded output. The label definitions live in
# analysis.rw_other_breakdown. Exported as rw_other_breakdown.csv in R11.
df_rw_other = analysis.rw_other_breakdown(df_branches)
df_rw_other


  .groupby(["bin", "label"])


label,Other,R,W
bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(-0.001, 0.5]",51,0,0
"(0.5, 0.8]",77,29,21
"(0.8, 0.9]",0,55,16
"(0.9, 1.0]",0,131,20


In [32]:
# R8: error modes (unanimous wrong)
# Rows selected by analysis.unanimous_wrong; in the recorded output these have
# max_frac == 1.0 and leader_correct == False. Only the first five rows are
# displayed -- inspect df_unanimous_wrong directly for the full set.
df_unanimous_wrong = analysis.unanimous_wrong(df_branches)
df_unanimous_wrong.head()


Unnamed: 0,index,picked_index,id,gold,branch_preds,leader,max_frac,valid_n,none_n,variation_ratio,entropy_bits,correct_fraction,leader_correct,class,subject_name,timestamp
4,4,5,d8a284c5-175a-4439-8ef2-5ff24b41f3cc,B,"[A, A, A, A, A, A, A, A, A, A]",A,1.0,10,0,0.0,0.0,0.0,False,unanimous,Microbiology,2026-01-09 19:09:52.062184+00:00
8,8,9,e2dd93d8-8047-4e9a-9389-e2b1781f8a35,B,"[A, A, A, A, A, A, A, A, A, A]",A,1.0,10,0,0.0,0.0,0.0,False,unanimous,Surgery,2026-01-09 19:19:06.349856+00:00
11,11,12,157f841a-269b-4207-825d-2a62dd8cd197,B,"[A, A, A, A, A, A, A, A, A, A]",A,1.0,10,0,0.0,0.0,0.0,False,unanimous,Pediatrics,2026-01-09 19:26:27.763699+00:00
28,28,29,1609ebf3-8709-4e7f-abac-53de9b01a173,B,"[C, C, C, C, C, C, C, C, C, C]",C,1.0,10,0,0.0,0.0,0.0,False,unanimous,Pediatrics,2026-01-09 20:11:55.853987+00:00
33,33,34,17180bef-99d8-42c7-9add-372aae82e08e,C,"[B, B, B, B, B, B, B, B, B, B]",B,1.0,10,0,0.0,0.0,0.0,False,unanimous,Pathology,2026-01-09 20:23:29.196693+00:00


In [33]:
# R9: majority vote does not help (greedy vs leader)
# Compare both strategies on the same questions: join the two runs on whichever
# identifier columns they share, then score each side over the joined rows.
merge_keys = [
    key
    for key in ["id", "index", "question"]
    if key in df_greedy.columns and key in df_branches.columns
]
if not merge_keys:
    raise ValueError("No shared keys available to merge greedy and k-sample runs.")

# validate="one_to_one" makes pandas raise MergeError (a ValueError subclass)
# when a key is duplicated on either side, instead of silently multiplying
# rows and skewing both means.
df_merged = df_greedy.merge(
    df_branches,
    on=merge_keys,
    suffixes=("_greedy", "_branches"),
    validate="one_to_one",
)

# The default inner join drops questions that appear in only one run. Surface
# that, because the accuracies below are then computed on a different
# population than R1/R2.
if len(df_merged) != len(df_greedy) or len(df_merged) != len(df_branches):
    print(
        f"Warning: merged {len(df_merged)} rows from {len(df_greedy)} greedy and "
        f"{len(df_branches)} k-sample rows; unmatched questions are excluded."
    )

# Missing correctness flags are counted as incorrect.
greedy_correct = df_merged["is_correct"].fillna(False).astype(bool)
leader_correct = df_merged["leader_correct"].fillna(False).astype(bool)
df_majority_vote = pd.DataFrame(
    {
        "metric": ["greedy_accuracy", "leader_accuracy"],
        "value": [greedy_correct.mean(), leader_correct.mean()],
    }
)
df_majority_vote


Unnamed: 0,metric,value
0,greedy_accuracy,0.6575
1,leader_accuracy,0.6675


In [34]:
# R10: subject-wise breakdown (optional)
# Per-subject accuracy for each strategy: greedy is scored on "is_correct",
# the k-sample leader on "leader_correct".
df_subject_greedy = analysis.subject_accuracy(df_greedy, accuracy_field="is_correct")
df_subject_branches = analysis.subject_accuracy(df_branches, accuracy_field="leader_correct")
# Side-by-side table with one column per strategy; the recorded output is
# indexed by subject_name. Exported as subject_accuracy.csv in R11.
df_subject_breakdown = pd.DataFrame(
    {
        "greedy_accuracy": df_subject_greedy,
        "leader_accuracy": df_subject_branches,
    }
)
# Display is capped at the first 20 rows.
df_subject_breakdown.head(20)


Unnamed: 0_level_0,greedy_accuracy,leader_accuracy
subject_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Anaesthesia,0.705882,0.705882
Anatomy,0.608696,0.608696
Biochemistry,0.869565,0.913043
Dental,0.434783,0.478261
ENT,0.571429,0.619048
Forensic Medicine,0.478261,0.478261
Gynaecology & Obstetrics,0.695652,0.695652
Medicine,0.478261,0.521739
Microbiology,0.608696,0.695652
Ophthalmology,0.608696,0.521739


In [None]:
# R11: export paper tables
import json
from typing import cast

# Reports are grouped under a folder named after the run directory, preferring
# the k-sample ensemble run and falling back to the greedy run.
runs = cast(dict, metadata['resolved_runs'])
run_path = runs.get('k_sample_ensemble') or runs.get('greedy')
# Reject empty strings as well as None: an empty path has an empty name, which
# would send every export into the shared notebooks/reports root.
if not run_path:
    raise ValueError("No run path found in metadata.")
run_id = Path(run_path).name
# Relative to the kernel's working directory.
report_dir = Path("notebooks") / "reports" / run_id
print("Saving reports to", report_dir)
report_dir.mkdir(parents=True, exist_ok=True)

# Headline numbers from R1, R2, R4, R5 and R6. .iloc[0] takes the first row by
# position, so it does not depend on the tables carrying a 0 index label.
paper_report = {
    "greedy_accuracy": float(df_greedy_accuracy["value"].iloc[0]),
    "leader_accuracy": float(df_leader_accuracy["value"].iloc[0]),
    "unanimous": unanimous_stats,
    "near_unanimous": near_unanimous_stats,
    "top2_coverage": float(df_top2["value"].iloc[0]),
}

with open(report_dir / "paper_report.json", "w", encoding="utf-8") as f:
    json.dump(paper_report, f, indent=2)

# Tables from R3, R7 and R10. The last two keep their index in the CSV (bin
# and subject labels in the recorded outputs); R3's index is dropped.
df_max_frac.to_csv(report_dir / "max_frac_distribution.csv", index=False)
df_rw_other.to_csv(report_dir / "rw_other_breakdown.csv")
df_subject_breakdown.to_csv(report_dir / "subject_accuracy.csv")

print("Saved reports to", report_dir)


Saving reports to notebooks\reports\k_sample_ensemble_c9dcafe07752_20260109_185709
Saved reports to notebooks\reports\k_sample_ensemble_c9dcafe07752_20260109_185709


## Add your own analysis below
