# Reproducing "From Answers to Hypotheses"

This notebook reproduces all figures and statistical analyses reported in:

> Victor Lavrenko. *From Answers to Hypotheses: Internal Consensus and Its Limits in Large Language Models*. 2026.

The results correspond to the tagged release:
`paper/from-answers-to-hypotheses-v1`

Repository:
https://github.com/victorlavrenko/rofa

In [None]:
# install ROFA package
import subprocess
import sys
from pathlib import Path

if (Path.cwd().parent.parent / "pyproject.toml").is_file():
    %pip install -e "../.."
else:
    if not Path("rofa").is_dir():
        !git clone https://github.com/victorlavrenko/rofa
    %pip install -e "rofa"

# Setup
import pandas as pd

from rofa.papers.from_answers_to_hypotheses import analysis, notebook_helpers

In [None]:
# Run artifacts for the two decoding setups (greedy and k-sample).
# Each setup has a local run directory and a release-asset URL.
# NOTE(review): an empty run directory presumably makes the helper fall back to
# the URL -- confirm in notebook_helpers.resolve_run_inputs.
run_dir_greedy = r""
greedy_asset_url = "https://github.com/victorlavrenko/rofa/releases/download/paper%2Ffrom-answers-to-hypotheses-v1/rofa-from-answers-to-hypotheses-runs-v1-greedy.zip"

run_dir_k_sample = r""
k_sample_asset_url = "https://github.com/victorlavrenko/rofa/releases/download/paper%2Ffrom-answers-to-hypotheses-v1/rofa-from-answers-to-hypotheses-runs-v1-branches10.zip"

# Argument order matters: greedy (dir, url) first, then k-sample (dir, url).
run_inputs = notebook_helpers.resolve_run_inputs(
    run_dir_greedy,
    greedy_asset_url,
    run_dir_k_sample,
    k_sample_asset_url,
)


In [None]:
# Load the paper runs, then validate and summarise them before any analysis.
paper_runs = analysis.load_paper_runs(run_inputs)
(df_greedy, df_branches, metadata) = paper_runs

# Column validation runs first; the summary is printed afterwards.
notebook_helpers.validate_required_columns(df_greedy, df_branches)
notebook_helpers.print_run_summary(df_greedy, df_branches, metadata)

In [None]:
# R1: greedy accuracy, stored as a one-row (metric, value) table.
greedy_accuracy = analysis.accuracy_greedy(df_greedy)
df_greedy_accuracy = pd.DataFrame(
    {
        "metric": ["greedy_accuracy"],
        "value": [greedy_accuracy],
    }
)
df_greedy_accuracy

In [None]:
# R2: leader accuracy, stored as a one-row (metric, value) table.
leader_accuracy = analysis.accuracy_leader(df_branches)
df_leader_accuracy = pd.DataFrame(
    {
        "metric": ["leader_accuracy"],
        "value": [leader_accuracy],
    }
)
df_leader_accuracy

In [None]:
# R3: distribution of max_frac.
# The index of the distribution becomes a regular column, and the two resulting
# columns are labelled in a single chained expression.
df_max_frac = (
    analysis.max_frac_distribution(df_branches)
    .reset_index()
    .set_axis(["max_frac_bin", "count"], axis=1)
)
df_max_frac

In [None]:
# Figure 1: accuracy vs internal consensus (max_frac_exact).
# NOTE(review): the second argument is presumably the output image path -- confirm
# in notebook_helpers.plot_accuracy_vs_consensus.
figure1_filename = "figure1_max_frac_exact.png"
df_max_frac_exact = notebook_helpers.plot_accuracy_vs_consensus(df_branches, figure1_filename)
df_max_frac_exact

In [None]:
# R4: unanimous stats.
# `unanimous_stats` keeps the raw result because the export cell passes it on;
# `df_unanimous` is the same result wrapped as a one-row table for display.
unanimous_stats = analysis.unanimous_stats(df_branches)
unanimous_rows = [unanimous_stats]
df_unanimous = pd.DataFrame(unanimous_rows)
df_unanimous

In [None]:
# R5: near-unanimous stats.
# The threshold handed to analysis.near_unanimous_stats is named so it is easy to
# spot and adjust; the raw result is kept because the export cell passes it on.
near_unanimous_threshold = 0.9
near_unanimous_stats = analysis.near_unanimous_stats(
    df_branches,
    threshold=near_unanimous_threshold,
)
df_near_unanimous = pd.DataFrame([near_unanimous_stats])
df_near_unanimous

In [None]:
# R6: top-2 coverage
# Table computed from df_branches by analysis.compute_table_top2; it is displayed
# here and also passed to the R11 export cell.
df_top2 = analysis.compute_table_top2(df_branches)
df_top2

In [None]:
# R7: R/W/Other breakdown by max_frac bins
# Table computed from df_branches by analysis.rw_other_breakdown; it is displayed
# here and also passed to the R11 export cell.
df_rw_other = analysis.rw_other_breakdown(df_branches)
df_rw_other

In [None]:
# R8: error modes (unanimous wrong)
# The full result stays in df_unanimous_wrong; only its first rows are displayed.
df_unanimous_wrong = analysis.unanimous_wrong(df_branches)
df_unanimous_wrong.head()

In [None]:
# R9: majority vote does not help (greedy vs leader)
# Comparison table built from both runs (df_greedy and df_branches) by
# notebook_helpers.majority_vote_table.
df_majority_vote = notebook_helpers.majority_vote_table(df_greedy, df_branches)
df_majority_vote

In [None]:
# R10: subject-wise breakdown (optional)
# NOTE: optional as an analysis step, but the R11 export cell passes
# df_subject_breakdown to the report writer, so run this cell before exporting.
# Only the first 20 rows are displayed; the full table stays in df_subject_breakdown.
df_subject_breakdown = notebook_helpers.subject_breakdown(df_greedy, df_branches)
df_subject_breakdown.head(20)

In [None]:
# R11: export paper tables.
# Everything below was produced by the earlier R1-R10 cells; the order of the
# tuple is the positional order the export helper receives after `metadata`.
report_inputs = (
    df_greedy_accuracy,
    df_leader_accuracy,
    unanimous_stats,
    near_unanimous_stats,
    df_top2,
    df_max_frac,
    df_rw_other,
    df_subject_breakdown,
)
report_dir = notebook_helpers.export_paper_reports(metadata, *report_inputs)
print("Saved reports to", report_dir)

## Add your own analysis below
