# ROFA — Data Generation Notebook

This notebook runs the **data generation stage** for the paper  
*From Answers to Hypotheses: Internal Consensus and Its Limits in Large Language Models*.

It executes model inference under fixed decoding settings and produces
versioned run artifacts (JSON/JSONL) that capture:
- per-question model outputs,
- alternative sampled hypotheses,
- metadata required for downstream analysis.

## Purpose

The outputs of this notebook are **inputs** to the analysis and reproduction
pipeline implemented in `20_paper_reproduce.ipynb`.
They are not required for reproducing the paper figures if you use the
pre-generated release artifacts.

## Usage modes

- **Reproduce paper results (recommended):**  
  Skip this notebook and download the released run artifacts from GitHub.

- **Regenerate data (optional):**  
  Run this notebook to regenerate model outputs, e.g. to:
  - test alternative decoding parameters,
  - evaluate new models,
  - extend experiments beyond the paper.

## Notes

- Generation can be time- and compute-intensive.
- Results depend on model checkpoints, decoding parameters, and random seeds.
- This notebook is typically executed in Colab or a GPU-enabled environment.

In [None]:
# Model identifier; passed to load_tokenizer / load_model_with_fallback in the
# environment cell and recorded in every GenerationConfig.
MODEL_ID = "HPAI-BSC/Llama3.1-Aloe-Beta-8B"
# Recommended medical models (examples only):
# MODEL_ID = "HPAI-BSC/Llama3.1-Aloe-Beta-8B"
# MODEL_ID = "HPAI-BSC/Qwen2.5-Aloe-Beta-7B"
# MODEL_ID = "m42-health/Llama3-Med42-8B"
# MODEL_ID = "BioMistral/BioMistral-7B"
# MODEL_ID = "google/medgemma-1.5-4b-it"

# Dataset name/split and sampling parameters forwarded to create_question_set
# (as "seed", "n", "subjects") and to GenerationConfig.
DATASET_NAME = "openlifescienceai/medmcqa"
DATASET_SPLIT = "validation"
SEED = 42
N = 400  # requested number of questions; also the `n` of every planned run
SUBJECTS = 20  # presumably the number of distinct subjects to draw from — TODO confirm
MAX_NEW_TOKENS = 1024  # forwarded as GenerationConfig.max_new_tokens

# Branch sampling settings.
# Passed to BranchSamplingEnsemble and recorded in the config of
# "k_sample_ensemble" runs only (greedy runs record None for these).
N_BRANCHES = 10
TEMPERATURE = 0.8
TOP_P = 0.8
TOP_K = 50


In [None]:
# Install the ROFA package (editable) if it is not already installed.
import importlib.metadata

try: 
  # Raises PackageNotFoundError when no installed distribution is named "rofa".
  importlib.metadata.distribution("rofa")
except importlib.metadata.PackageNotFoundError: 
  from pathlib import Path
  # A pyproject.toml two levels above the working directory is taken to mean the
  # notebook runs from inside a repository checkout: install that checkout.
  if (Path.cwd().parent.parent / "pyproject.toml").is_file():
      %pip install -e "../.."
  else:
      # Otherwise clone the repository into ./rofa (unless that directory
      # already exists) and install from the clone.
      if not Path("rofa").is_dir():
          !git clone https://github.com/victorlavrenko/rofa
      %pip install -e "rofa"

In [None]:
# Cell 1 — Validate environment (Drive + output + model)
import importlib.util

# Probe for the package without importing rofa.core itself.
# find_spec() on a dotted name imports the parent package first, so when `rofa`
# is not importable at all (e.g. an editable install that only becomes visible
# after a runtime restart) it raises ModuleNotFoundError instead of returning
# None. Treat that case as "not available" so the restart hint below is shown
# instead of a raw traceback.
try:
    _rofa_core_spec = importlib.util.find_spec("rofa.core")
except ModuleNotFoundError:
    _rofa_core_spec = None

if _rofa_core_spec is None:
    print(
        "\n⚠️  Runtime restart required\n\n"
        "ROFA has just been installed, but the Python runtime has not been restarted yet.\n\n"
        "Please restart the runtime via:\n"
        "  Runtime (or ▼ after Run all) → Restart runtime (or Restart runtime and run all)\n"
        "This is expected behaviour in Google Colab."
    )
    # Stop this cell (and "Run all") here; nothing below can work without the package.
    raise SystemExit

import os
from pathlib import Path

from rofa.core.model import load_model_with_fallback, load_tokenizer

# Pick the output base: Google Drive when google.colab is importable,
# otherwise a local folder relative to the working directory.
try:
    from google.colab import drive
    # Mount Drive only when the mount point directory is not already present.
    if not os.path.isdir("/content/drive"):
        drive.mount("/content/drive")
    OUT_BASE = "/content/drive/MyDrive/rofa_runs"
except ImportError:
    OUT_BASE = "./rofa_runs"

out_base = Path(OUT_BASE)
out_base.mkdir(parents=True, exist_ok=True)
assert out_base.exists() and out_base.is_dir(), "Output base not available"
# Write and delete a probe file so an unwritable location fails here,
# before any model time is spent.
(out_base / "tmp_write_check.txt").write_text("ok")
(out_base / "tmp_write_check.txt").unlink()

# Load the tokenizer and model once; the generation cell passes these objects
# into every GenerationConfig.
tokenizer = load_tokenizer(MODEL_ID)
model = load_model_with_fallback(MODEL_ID)
print(f"Model ready: {MODEL_ID}")

In [None]:
# Cell 1b — Model preflight
from rofa.core.model_id import to_slug

# The slug is used as the per-model directory name under the run root and is
# recorded in every GenerationConfig.
model_slug = to_slug(MODEL_ID)
print(f"Selected MODEL_ID: {MODEL_ID}")
print(f"Model slug: {model_slug}")

# Gated-model checklist (e.g., google/medgemma-1.5-4b-it):
# 1) Open the model page on Hugging Face while logged in.
# 2) Click Agree/Access to accept terms (if required).
# 3) Create a Hugging Face access token (read scope).
# 4) Authenticate here (paste token manually or read from env):
#    from huggingface_hub import login
#    login(token="...")
# 5) Alternatively: os.environ["HF_TOKEN"] = "..."
# Never hardcode secrets into the notebook.


In [None]:
# Cell 2 — Create a fixed question set (IDs)
from pathlib import Path

from rofa.core.question_set import create_question_set, save_question_set

# Build the question set from the dataset selector (first dict) and the
# sampling parameters (second dict).
question_set = create_question_set(
    {"dataset_name": DATASET_NAME, "dataset_split": DATASET_SPLIT},
    {
        "seed": SEED,
        "n": N,
        "subjects": SUBJECTS,
        # Per-subject cap: the even share N / SUBJECTS plus 10 % headroom, plus one.
        # NOTE(review): "/" makes this a float (about 23.0 with the defaults) —
        # confirm create_question_set accepts a non-integer cap.
        "max_per_subject": N / SUBJECTS * 1.1 + 1,
    },
)

# Persist the question set under OUT_BASE/question_sets/<qs_id>.json; the
# generation cell hands this path to GenerationConfig.question_set_path.
qs_dir = Path(OUT_BASE) / "question_sets"
qs_dir.mkdir(parents=True, exist_ok=True)
question_set_id = question_set.qs_id
qs_path = qs_dir / f"{question_set_id}.json"
save_question_set(question_set, str(qs_path))

print(f"Saved question set: {question_set_id} -> {qs_path}")

In [None]:
# Cell 3 — Run generation (auto resume/expand/new)
from datetime import datetime, timezone
from pathlib import Path
import json
import os

from rofa.core.io import load_manifest, load_progress
from rofa.core.runner import run_generation
from rofa.core.schemas import GenerationConfig
from rofa.papers.from_answers_to_hypotheses.methods import (
    BranchSamplingEnsemble,
    GreedyDecode,
)

# Directory scanned for existing runs and passed as GenerationConfig.out_dir:
# OUT_BASE/runs/from_answers_to_hypotheses/<model_slug>/
run_root = Path(OUT_BASE) / "runs" / "from_answers_to_hypotheses" / model_slug
run_root.mkdir(parents=True, exist_ok=True)

def _default_run_id(prefix: str) -> str:
    """Return a new run id: ``<prefix>_<question_set_id>_<UTC YYYYmmdd_HHMMSS>``.

    The question set id is read from the notebook-level ``question_set_id``.
    """
    now_utc = datetime.now(tz=timezone.utc)
    return f"{prefix}_{question_set_id}_{now_utc:%Y%m%d_%H%M%S}"

def _dir_mtime(path: Path) -> float:
    latest = path.stat().st_mtime
    for root, _, files in os.walk(path):
        for name in files:
            candidate = Path(root) / name
            try:
                latest = max(latest, candidate.stat().st_mtime)
            except FileNotFoundError:
                continue
    return latest

def _question_total(run_dir: Path) -> int | None:
    qs_path = run_dir / "question_set.json"
    if not qs_path.exists():
        return None
    with qs_path.open("r", encoding="utf-8") as f:
        payload = json.load(f)
    return len(payload.get("examples", []))

def _count_lines(path: Path) -> int | None:
    if not path.exists():
        return None
    with path.open("r", encoding="utf-8") as f:
        return sum(1 for _ in f)

def _latest_run(method: str):
    """Describe the most recently modified run of ``method`` under ``run_root``.

    A child directory of ``run_root`` qualifies when ``load_manifest`` returns
    a manifest whose ``method`` equals the requested one. Among those, the
    directory with the newest ``_dir_mtime`` wins (the first one seen on ties).

    Returns ``None`` when no run qualifies; otherwise a dict with keys
    ``run_id`` (directory name), ``run_dir``, ``total`` (question count or
    ``None``) and ``complete``. A run is complete when its question total is
    known and either the recorded progress position — or, if there is none,
    the number of lines in ``summary.jsonl`` — has reached that total.
    """
    newest_dir = None
    newest_mtime = None
    for entry in run_root.iterdir():
        if not entry.is_dir():
            continue
        manifest = load_manifest(str(entry / "manifest.json"))
        if manifest is None or manifest.method != method:
            continue
        entry_mtime = _dir_mtime(entry)
        # Strict ">" keeps the earliest-listed directory when mtimes tie.
        if newest_dir is None or entry_mtime > newest_mtime:
            newest_dir, newest_mtime = entry, entry_mtime

    if newest_dir is None:
        return None

    total = _question_total(newest_dir)
    progress = load_progress(str(newest_dir / "progress.json"))
    summary_count = _count_lines(newest_dir / "summary.jsonl")
    position = progress.get("position") if progress else None

    if total is None:
        complete = False
    elif position is not None:
        # The progress position takes precedence over the summary line count.
        complete = position >= total
    elif summary_count is not None:
        complete = summary_count >= total
    else:
        complete = False

    return {
        "run_id": newest_dir.name,
        "run_dir": newest_dir,
        "total": total,
        "complete": complete,
    }

def _build_config(method: str, run_id: str, *, resume: bool | None, expand: bool, n: int):
    """Assemble the ``GenerationConfig`` for a single planned run.

    ``"greedy"`` uses ``GreedyDecode`` and does not write full records; any
    other method uses ``BranchSamplingEnsemble`` built from the notebook's
    branch sampling settings and writes full records. The sampling settings
    are recorded in the config only when ``method`` is ``"k_sample_ensemble"``
    and are ``None`` otherwise.

    ``resume``, ``expand`` and ``n`` are forwarded unchanged; the rest comes
    from notebook-level values (model, tokenizer, dataset, seed, paths).
    """
    sampling_kwargs = {
        "n_branches": N_BRANCHES,
        "temperature": TEMPERATURE,
        "top_p": TOP_P,
        "top_k": TOP_K,
    }
    is_greedy = method == "greedy"
    method_impl = GreedyDecode() if is_greedy else BranchSamplingEnsemble(**sampling_kwargs)

    # Same keys either way; values are blanked to None unless this is the
    # k-sample ensemble.
    if method == "k_sample_ensemble":
        recorded_sampling = sampling_kwargs
    else:
        recorded_sampling = dict.fromkeys(sampling_kwargs)

    return GenerationConfig(
        method=method,
        model_id=MODEL_ID,
        model_slug=model_slug,
        out_dir=str(run_root),
        run_id=run_id,
        resume=resume,
        expand=expand,
        seed=SEED,
        max_new_tokens=MAX_NEW_TOKENS,
        n=n,
        subjects=SUBJECTS,
        dataset_name=DATASET_NAME,
        dataset_split=DATASET_SPLIT,
        question_set_path=str(qs_path),
        progress=True,
        write_full_records=not is_greedy,
        tokenizer=tokenizer,
        model=model,
        method_impl=method_impl,
        **recorded_sampling,
    )

def _schedule_new_runs():
    """Return plan entries for one fresh greedy run and one fresh ensemble run.

    Each entry is ``(method, run_id, resume, expand, n)`` with ``resume=None``,
    ``expand=False`` and ``n=N``; the greedy entry comes first.
    """
    return [
        (method, _default_run_id(method), None, False, N)
        for method in ("greedy", "k_sample_ensemble")
    ]

# Inspect the most recent run of each method, then plan what to do.
# Each plan entry is (method, run_id, resume, expand, n).
greedy = _latest_run("greedy")
branches = _latest_run("k_sample_ensemble")
planned_runs = []

if greedy and not greedy["complete"]:
    # Greedy run is unfinished: resume it. Also resume an unfinished ensemble
    # run, or start one if none exists; a complete ensemble run is left alone.
    planned_runs.append(("greedy", greedy["run_id"], True, False, N))
    if branches and not branches["complete"]:
        planned_runs.append(("k_sample_ensemble", branches["run_id"], True, False, N))
    elif branches is None:
        planned_runs.append(("k_sample_ensemble", _default_run_id("k_sample_ensemble"), None, False, N))
elif greedy and greedy["complete"]:
    if not branches or not branches["complete"]:
        # Greedy is done but the ensemble is missing or unfinished:
        # resume the existing ensemble run, or start a new one.
        run_id = branches["run_id"] if branches else _default_run_id("k_sample_ensemble")
        resume = True if branches else None
        planned_runs.append(("k_sample_ensemble", run_id, resume, False, N))
    else:
        # Both runs are complete.
        greedy_total = greedy["total"]
        branches_total = branches["total"]
        if greedy_total and branches_total and greedy_total < N and branches_total < N:
            # Both were generated for fewer than N questions: expand both in place.
            planned_runs.append(("greedy", greedy["run_id"], True, True, N))
            planned_runs.append(("k_sample_ensemble", branches["run_id"], True, True, N))
        else:
            # Nothing to resume or expand: start a fresh pair of runs.
            planned_runs.extend(_schedule_new_runs())
else:
    # No greedy run found: start a fresh pair of runs (an existing ensemble
    # run is not resumed in this case).
    planned_runs.extend(_schedule_new_runs())

# Print the whole plan first so it is visible before any generation starts.
print("Planned runs:")
for method, run_id, resume, expand, n in planned_runs:
    print(f"- {method} | run_id={run_id} | resume={resume} | expand={expand} | n={n}")

# Execute the planned runs in order.
for method, run_id, resume, expand, n in planned_runs:
    config = _build_config(method, run_id, resume=resume, expand=expand, n=n)
    run_generation(config)


## Continue or expand a run

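The generation cell auto-detects the most recent `greedy` and `k_sample_ensemble` runs under
`OUT_BASE/runs/from_answers_to_hypotheses/<model_slug>/` and decides whether to resume, expand, or start fresh:
an existing unfinished run is resumed; if both runs are complete but were generated for fewer than `N`
questions, both are expanded to `N`; otherwise (including when no greedy run exists yet) new runs are started.
Update the config values above (like `N`, `N_BRANCHES`, or `MAX_NEW_TOKENS`) and rerun the generation cell.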


## Publish your run artifacts to GitHub Releases (manual)

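1. Open Google Drive (or the local `./rofa_runs` folder when not running in Colab) and locate your run folder under `OUT_BASE/runs/from_answers_to_hypotheses/<model_slug>/<run_id>/`.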
2. Download the run folder as a `.zip`.
3. Create a new GitHub Release in your repository.
4. Upload the `.zip` as a release asset.
5. Paste the asset URL into the analysis notebook so it can download the artifacts.
