# Tutorial: Planner Finetune and Evaluation Lab

Audience:
- Researchers running planner-only tuning experiments with fixed worker/verifier/generator endpoints.

Learning goals:
- Build experiment grids and launch commands reproducibly.
- Parse rollout artifacts and summarize planning metrics.


## Outline

1. Define experiment matrix
2. Generate training commands
3. Parse rollout artifacts
4. Compare checkpoints using planning-aware metrics


In [None]:
from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
import json
import itertools

# Root of the local AgentFlow checkout. This path is machine-specific:
# edit it to point at your own clone before running the notebook.
REPO = Path("/Users/admin/TuanDung/repos/AgentFlow")
# Directory under the repo root that is searched for rollout JSON files.
ROLLOUT_ROOT = REPO.joinpath("rollout_data")
# Bare expression so the notebook echoes the resolved repo path.
REPO


In [None]:
@dataclass
class Experiment:
    """One planner-tuning configuration in the experiment grid."""

    # Run identifier; rendered into the launch command as EXPERIMENT_NAME.
    name: str
    # Rendered into the launch command as TRAIN_TEMPERATURE.
    train_temp: float
    # Rendered into the launch command as TOOL_STEPS.
    max_steps: int
    # Rendered into the launch command as actor_rollout_ref.rollout.n.
    rollout_n: int

# Full sweep: 3 temperatures x 2 step budgets x 2 rollout counts = 12 runs.
# The right-most clause varies fastest, so entries are ordered by
# temperature, then step budget, then rollout count.
grid = [
    Experiment(
        name=f"planner_t{t}_s{s}_r{r}",
        train_temp=t,
        max_steps=s,
        rollout_n=r,
    )
    for t in (0.3, 0.5, 0.7)
    for s in (2, 3)
    for r in (4, 8)
]

# Notebook display: grid size and the first configuration.
(len(grid), grid[0])


In [None]:
def build_train_command(exp: Experiment) -> str:
    """Build the shell command that launches one planner-tuning run.

    Args:
        exp: Experiment whose fields are rendered as ``KEY=value`` overrides.

    Returns:
        A single shell line that changes into ``REPO`` and invokes
        ``train/train_agent.py`` with the experiment's settings.
    """
    # Function-scope import keeps this cell runnable on its own.
    import shlex

    # Planner-only tuning with fixed worker/verifier/generator is configured in model-engine routing.
    # Adjust keys according to your `train/config.yaml` conventions.

    # Quote the free-form pieces so a checkout path or run name containing
    # spaces or shell metacharacters still yields a valid command. Strings
    # made only of shell-safe characters pass through unchanged.
    repo_dir = shlex.quote(str(REPO))
    run_name = shlex.quote(str(exp.name))
    return (
        f"cd {repo_dir} && python train/train_agent.py "
        f"EXPERIMENT_NAME={run_name} "
        f"TRAIN_TEMPERATURE={exp.train_temp} "
        f"TOOL_STEPS={exp.max_steps} "
        f"actor_rollout_ref.rollout.n={exp.rollout_n}"
    )

# Preview only: render launch commands for the first five grid entries.
commands = list(map(build_train_command, grid[:5]))
commands


## Step 3 - Parse rollout artifacts

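This parser reads every `rollout_*.json` file under `rollout_data/` and computes lightweight planning metrics: the number of parsed rollouts, the mean reward, and the mean step count.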


In [None]:
def iter_rollout_files(root: Path) -> list[Path]:
    """Return all rollout JSON files found anywhere under *root*.

    Args:
        root: Directory to search recursively.

    Returns:
        Paths of regular files named ``rollout_*.json``, sorted so repeated
        runs see the files in the same order. Empty when *root* is missing.
    """
    if not root.exists():
        return []
    # rglob yields entries in filesystem order and also matches directories;
    # keep regular files only and sort for a reproducible listing.
    return sorted(p for p in root.rglob("rollout_*.json") if p.is_file())

def _read_rollout_metrics(fp):
    """Return ``(reward, step_count)`` for one rollout file, or ``None`` if unusable."""
    try:
        data = json.loads(Path(fp).read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # Unreadable file, undecodable bytes, or invalid JSON
        # (JSONDecodeError and UnicodeDecodeError are ValueError subclasses).
        return None
    if not isinstance(data, dict):
        # Valid JSON but not an object, so there are no fields to read.
        return None

    total_result = data.get("total_result", {})
    raw_steps = total_result.get("step_count", 0) if isinstance(total_result, dict) else 0
    try:
        return float(data.get("reward", 0.0)), int(raw_steps)
    except (TypeError, ValueError, OverflowError):
        # Non-numeric values such as null, or a non-finite step count.
        return None


def summarize_rollouts(files):
    """Aggregate reward and step-count statistics over rollout JSON files.

    Each file is expected to hold a JSON object with an optional ``reward``
    (default 0.0) and an optional ``total_result`` object carrying
    ``step_count`` (default 0). Files that cannot be read, are not valid
    JSON, are not a JSON object, or hold non-numeric values are skipped
    instead of aborting the whole summary.

    Args:
        files: Iterable of paths to rollout JSON files.

    Returns:
        Dict with ``n`` (number of usable files), ``mean_reward`` and
        ``mean_step_count``; both means are rounded to 4 decimals and are
        0.0 when no file was usable.
    """
    rewards = []
    step_counts = []
    for fp in files:
        metrics = _read_rollout_metrics(fp)
        if metrics is None:
            continue
        reward, step_count = metrics
        rewards.append(reward)
        step_counts.append(step_count)

    n = len(rewards)
    if not n:
        return {"n": 0, "mean_reward": 0.0, "mean_step_count": 0.0}

    return {
        "n": n,
        "mean_reward": round(sum(rewards) / n, 4),
        "mean_step_count": round(sum(step_counts) / n, 4),
    }

# Collect the rollout files under ROLLOUT_ROOT and aggregate their metrics.
files = iter_rollout_files(ROLLOUT_ROOT)
summary = summarize_rollouts(files)
# Bare expression so the notebook displays the summary dict.
summary


In [None]:
def planning_score(mean_reward: float, mean_step_count: float, alpha: float = 0.1) -> float:
    """Return an efficiency-adjusted score: reward minus a per-step penalty.

    Args:
        mean_reward: Average reward over the evaluated rollouts.
        mean_step_count: Average number of steps per rollout.
        alpha: Penalty weight applied to each step.

    Returns:
        ``mean_reward - alpha * mean_step_count``.
    """
    # Longer plans cost more: the penalty grows linearly with step count.
    step_penalty = alpha * mean_step_count
    return mean_reward - step_penalty

# Score the aggregate summary using the default step penalty (alpha=0.1).
score = planning_score(summary["mean_reward"], summary["mean_step_count"])
# Displayed as the cell output; the score is rounded only for presentation.
{"summary": summary, "planning_score": round(score, 4)}


## Exercises

1. Add tool-failure rate and verifier-stop quality into the summary function.
2. Compare top-3 checkpoints by both `mean_reward` and `planning_score`.
3. Export results to CSV and attach confidence intervals over seeds.

Validation note:
- This notebook does not run heavy finetuning directly; it builds launch commands and evaluation harnesses.
- Run training in a terminal or on a GPU cluster, then reopen this notebook for analysis.
