From cca62c2da6c4e8b8d72730bba59e48d5e6ba17d9 Mon Sep 17 00:00:00 2001
From: huangruiteng <huangruiteng@bytedance.com>
Date: Tue, 12 May 2026 22:59:56 +0800
Subject: [PATCH 01/17] benchmark: add tau2 eval scaffold

---
 benchmark/tau2/.gitignore              |   1 +
 benchmark/tau2/README.md               |  50 ++++++++
 benchmark/tau2/config/baseline.yaml    |  52 +++++++++
 benchmark/tau2/config/prewrite.yaml    |  12 ++
 benchmark/tau2/run_full_eval.sh        |  53 +++++++++
 benchmark/tau2/scripts/parity_check.py |  51 ++++++++
 benchmark/tau2/scripts/preflight.py    |  84 +++++++++++++
 benchmark/tau2/scripts/run_eval.py     | 156 +++++++++++++++++++++++++
 benchmark/tau2/scripts/summarize.py    |  50 ++++++++
 benchmark/tau2/scripts/tau2_common.py  | 117 +++++++++++++++++++
 10 files changed, 626 insertions(+)
 create mode 100644 benchmark/tau2/.gitignore
 create mode 100644 benchmark/tau2/README.md
 create mode 100644 benchmark/tau2/config/baseline.yaml
 create mode 100644 benchmark/tau2/config/prewrite.yaml
 create mode 100755 benchmark/tau2/run_full_eval.sh
 create mode 100755 benchmark/tau2/scripts/parity_check.py
 create mode 100755 benchmark/tau2/scripts/preflight.py
 create mode 100755 benchmark/tau2/scripts/run_eval.py
 create mode 100755 benchmark/tau2/scripts/summarize.py
 create mode 100755 benchmark/tau2/scripts/tau2_common.py

diff --git a/benchmark/tau2/.gitignore b/benchmark/tau2/.gitignore
new file mode 100644
index 000000000..1cd791b52
--- /dev/null
+++ b/benchmark/tau2/.gitignore
@@ -0,0 +1 @@
+result/
diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md
new file mode 100644
index 000000000..ac47ffa63
--- /dev/null
+++ b/benchmark/tau2/README.md
@@ -0,0 +1,50 @@
+# TAU-2 Benchmark
+
+This directory contains a small OpenViking-style entry point for TAU-2 memory
+evaluation. The first version is intentionally narrow:
+
+- no-memory control;
+- fresh OpenViking memory baseline;
+- trajectory / procedure-view treatment;
+- optional pre-write recall.
+
+Category rerank and other harness-only diagnostics are not migrated here yet.
+
+## Layout
+
+```text
+benchmark/tau2/
+├── config/
+│   ├── baseline.yaml
+│   └── prewrite.yaml
+├── scripts/
+│   ├── preflight.py
+│   ├── run_eval.py
+│   ├── summarize.py
+│   └── parity_check.py
+└── run_full_eval.sh
+```
+
+Generated artifacts are written to `benchmark/tau2/result/<run_id>/`.
+
+## Quick Start
+
+Plan the default benchmark without running TAU-2:
+
+```bash
+python benchmark/tau2/scripts/preflight.py --config benchmark/tau2/config/baseline.yaml
+python benchmark/tau2/scripts/run_eval.py --config benchmark/tau2/config/baseline.yaml --plan-only
+```
+
+Run with execution enabled after TAU-2, model credentials, and OpenViking are
+configured:
+
+```bash
+benchmark/tau2/run_full_eval.sh --config benchmark/tau2/config/prewrite.yaml --execute
+```
+
+## Evidence Boundary
+
+Only completed `retail + airline` runs with the same config, same seeds/repeats,
+and non-empty artifacts should be read as benchmark evidence. Partial runs,
+single-task probes, or runs missing OpenViking corpus identity are diagnostics.
diff --git a/benchmark/tau2/config/baseline.yaml b/benchmark/tau2/config/baseline.yaml
new file mode 100644
index 000000000..16a452d4a
--- /dev/null
+++ b/benchmark/tau2/config/baseline.yaml
@@ -0,0 +1,52 @@
+benchmark:
+  name: tau2_openviking_baseline
+  domains:
+    - retail
+    - airline
+  train_split_name: train
+  eval_split_name: test
+  repeat_count: 4
+  task_max_concurrency: 10
+  strategy_concurrency: 4
+  max_steps: 200
+  reasoning_effort: high
+
+paths:
+  tau2_repo: ${TAU2_REPO:-data/external_benchmarks/tau2-bench}
+  output_dir: benchmark/tau2/result
+
+model:
+  agent_llm: ${TAU2_AGENT_LLM:-doubao-seed-1-6-250615}
+  user_llm: ${TAU2_USER_LLM:-doubao-seed-1-6-250615}
+  evaluator_llm: ${TAU2_EVALUATOR_LLM:-doubao-seed-1-6-250615}
+  temperature: 0.0
+
+openviking:
+  url: ${OPENVIKING_URL:-http://localhost:1933}
+  account: ${OPENVIKING_ACCOUNT:-default}
+  agent_id: ${OPENVIKING_AGENT_ID:-tau2-openviking-agent}
+  retrieval_top_k: 4
+  replay_write_policy: read_only
+
+strategies:
+  - id: no_memory
+    label: No memory
+    memory_backend: none
+    train_required: false
+  - id: memory_v2_experience_only
+    label: OpenViking Memory V2 experience-only
+    memory_backend: openviking
+    train_required: true
+    train_memory_mode: experience_only
+    retrieval_mode: first_user
+  - id: trajectory_procedure_view
+    label: OpenViking trajectory procedure view
+    memory_backend: openviking
+    train_required: true
+    train_memory_mode: trajectory_procedure_view
+    operation_mode: add_only
+    retrieval_mode: first_user
+
+features:
+  prewrite_recall:
+    enabled: false
diff --git a/benchmark/tau2/config/prewrite.yaml b/benchmark/tau2/config/prewrite.yaml
new file mode 100644
index 000000000..965f09963
--- /dev/null
+++ b/benchmark/tau2/config/prewrite.yaml
@@ -0,0 +1,12 @@
+extends: baseline.yaml
+
+benchmark:
+  name: tau2_openviking_prewrite
+
+features:
+  prewrite_recall:
+    enabled: true
+    decision_nodes:
+      - before_write_tool_call
+    max_memories: 4
+    evidence_boundary: runtime_retrieval_trace_required
diff --git a/benchmark/tau2/run_full_eval.sh b/benchmark/tau2/run_full_eval.sh
new file mode 100755
index 000000000..8abf41235
--- /dev/null
+++ b/benchmark/tau2/run_full_eval.sh
@@ -0,0 +1,53 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+PYTHON_BIN="${PYTHON_BIN:-python3}"
+CONFIG="$SCRIPT_DIR/config/baseline.yaml"
+EXECUTE=false
+RUN_ID=""
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --config)
+      CONFIG="$2"
+      shift 2
+      ;;
+    --run-id)
+      RUN_ID="$2"
+      shift 2
+      ;;
+    --execute)
+      EXECUTE=true
+      shift
+      ;;
+    --help|-h)
+      cat <<'EOF'
+Usage:
+  benchmark/tau2/run_full_eval.sh [--config PATH] [--run-id ID] [--execute]
+
+Without --execute the script only writes preflight and run_plan artifacts.
+EOF
+      exit 0
+      ;;
+    *)
+      echo "Unknown argument: $1" >&2
+      exit 1
+      ;;
+  esac
+done
+
+# Pin one run id here: preflight.py and run_eval.py each default to their own
+# timestamp, which can differ and split artifacts across two result directories.
+RUN_ID="${RUN_ID:-$(date -u +%Y%m%dT%H%M%SZ)}"
+RUN_ARGS=(--run-id "$RUN_ID")  # never empty, so "${RUN_ARGS[@]}" is safe under set -u
+
+cd "$REPO_ROOT"
+"$PYTHON_BIN" "$SCRIPT_DIR/scripts/preflight.py" --config "$CONFIG" "${RUN_ARGS[@]}"
+
+if [[ "$EXECUTE" == true ]]; then
+  "$PYTHON_BIN" "$SCRIPT_DIR/scripts/run_eval.py" --config "$CONFIG" "${RUN_ARGS[@]}" --execute
+else
+  "$PYTHON_BIN" "$SCRIPT_DIR/scripts/run_eval.py" --config "$CONFIG" "${RUN_ARGS[@]}" --plan-only
+fi
diff --git a/benchmark/tau2/scripts/parity_check.py b/benchmark/tau2/scripts/parity_check.py
new file mode 100755
index 000000000..d513715f0
--- /dev/null
+++ b/benchmark/tau2/scripts/parity_check.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+from typing import Any
+
+from tau2_common import write_json
+
+
+def _load_json(path: Path) -> Any:
+    return json.loads(path.read_text(encoding="utf-8"))
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description="Compare OpenViking TAU-2 artifacts against a harness reference.")
+    parser.add_argument("--ov-run-plan", type=Path, required=True)
+    parser.add_argument("--harness-run-plan", type=Path, required=True)
+    parser.add_argument("--output", type=Path, required=True)
+    args = parser.parse_args()
+
+    ov_plan = _load_json(args.ov_run_plan)
+    harness_plan = _load_json(args.harness_run_plan)
+
+    ov_cells = ov_plan.get("cells") or []
+    harness_cells = harness_plan.get("cells") or harness_plan.get("treatments") or []
+    report = {
+        "status": "ok" if len(ov_cells) == len(harness_cells) else "mismatch",
+        "ov_run_plan": str(args.ov_run_plan.resolve()),
+        "harness_run_plan": str(args.harness_run_plan.resolve()),
+        "ov_cell_count": len(ov_cells),
+        "harness_cell_count": len(harness_cells),
+        "checks": {
+            "cell_count_match": len(ov_cells) == len(harness_cells),
+        },
+        "notes": [
+            "Initial parity is intentionally structural.",
+            "Train payload, retrieval trace, and scoreboard parity should be added as each migration layer lands.",
+        ],
+    }
+    write_json(args.output, report)
+    if report["status"] != "ok":
+        print(f"[parity][WARN] wrote mismatch report: {args.output}")
+        return 1
+    print(f"[parity][OK] wrote {args.output}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/benchmark/tau2/scripts/preflight.py b/benchmark/tau2/scripts/preflight.py
new file mode 100755
index 000000000..32f78c3a3
--- /dev/null
+++ b/benchmark/tau2/scripts/preflight.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import importlib.util
+import sys
+from pathlib import Path
+from typing import Any
+
+from tau2_common import domains, load_config, output_dir, run_id, split_file, strategy_ids, tau2_repo, write_json
+
+
+def _check_import(module: str) -> dict[str, Any]:
+    spec = importlib.util.find_spec(module)
+    return {"module": module, "ok": spec is not None}
+
+
+def _split_status(config: dict[str, Any]) -> list[dict[str, Any]]:
+    rows = []
+    for domain in domains(config):
+        path = split_file(config, domain)
+        rows.append(
+            {
+                "domain": domain,
+                "path": str(path),
+                "exists": path.is_file(),
+            }
+        )
+    return rows
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description="Preflight TAU-2 benchmark config.")
+    parser.add_argument("--config", type=Path, default=Path(__file__).parents[1] / "config" / "baseline.yaml")
+    parser.add_argument("--run-id", default=run_id())
+    parser.add_argument("--strict", action="store_true", help="Require optional runtime imports and TAU-2 split files.")
+    args = parser.parse_args()
+
+    config = load_config(args.config)
+    out = output_dir(config, args.run_id)
+
+    errors: list[str] = []
+    try:
+        resolved_strategy_ids = strategy_ids(config)  # validate once; reused in the report below
+    except Exception as exc:
+        errors.append(str(exc))
+        resolved_strategy_ids = []  # keep going so the failure is recorded in preflight.json
+    split_rows = _split_status(config)
+    if args.strict:
+        for row in split_rows:
+            if not row["exists"]:
+                errors.append(f"missing split file for {row['domain']}: {row['path']}")
+
+    import_rows = [_check_import("openviking"), _check_import("openviking_cli"), _check_import("tau2")]
+    if args.strict:
+        for row in import_rows:
+            if not row["ok"]:
+                errors.append(f"missing Python module: {row['module']}")
+
+    report = {
+        "status": "failed" if errors else "ok",
+        "config": str(args.config.resolve()),
+        "run_id": args.run_id,
+        "tau2_repo": str(tau2_repo(config)),
+        "domains": domains(config),
+        "strategies": resolved_strategy_ids,
+        "strict": args.strict,
+        "imports": import_rows,
+        "split_files": split_rows,
+        "errors": errors,
+    }
+    write_json(out / "preflight.json", report)
+
+    if errors:
+        for error in errors:
+            print(f"[preflight][ERROR] {error}", file=sys.stderr)
+        print(f"[preflight] wrote {out / 'preflight.json'}", file=sys.stderr)
+        return 1
+    print(f"[preflight][OK] wrote {out / 'preflight.json'}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/benchmark/tau2/scripts/run_eval.py b/benchmark/tau2/scripts/run_eval.py
new file mode 100755
index 000000000..8ab4c41f0
--- /dev/null
+++ b/benchmark/tau2/scripts/run_eval.py
@@ -0,0 +1,156 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import subprocess
+import sys
+from pathlib import Path
+from typing import Any
+
+from tau2_common import domains, load_config, output_dir, run_id, split_file, strategy_ids, tau2_repo, write_json
+
+
+def _tau2_command(config: dict[str, Any], *, domain: str, strategy: dict[str, Any], repeat_index: int, run_label: str) -> list[str]:
+    benchmark = config["benchmark"]
+    model = config["model"]
+    command = [
+        "tau2",
+        "run",
+        "--domain",
+        domain,
+        "--task-split-name",
+        str(benchmark.get("eval_split_name", "test")),
+        "--num-trials",
+        "1",
+        "--max-steps",
+        str(benchmark.get("max_steps", 200)),
+        "--max-concurrency",
+        str(benchmark.get("task_max_concurrency", 10)),
+        "--agent-llm",
+        str(model["agent_llm"]),
+        "--user-llm",
+        str(model["user_llm"]),
+        "--save-to",
+        run_label,
+    ]
+
+    reasoning_effort = benchmark.get("reasoning_effort")
+    if reasoning_effort:
+        command.extend(["--agent-llm-args", f'{{"temperature":0.0,"reasoning_effort":"{reasoning_effort}"}}'])
+        command.extend(["--user-llm-args", f'{{"temperature":0.0,"reasoning_effort":"{reasoning_effort}"}}'])
+
+    if strategy.get("memory_backend") == "none":
+        command.extend(["--memory-backend", "none"])
+    else:
+        command.extend(["--memory-backend", "openviking"])
+        command.extend(["--memory-retrieval-mode", str(strategy.get("retrieval_mode", "first_user"))])
+        command.extend(["--memory-replay-write-policy", str(config.get("openviking", {}).get("replay_write_policy", "read_only"))])
+
+    if config.get("features", {}).get("prewrite_recall", {}).get("enabled"):
+        command.append("--enable-prewrite-recall")
+
+    return command
+
+
+def _build_plan(config: dict[str, Any], configured_run_id: str) -> dict[str, Any]:
+    repeat_count = int(config["benchmark"].get("repeat_count", 4))
+    strategies = config.get("strategies") or []
+    cells = []
+    for domain in domains(config):
+        split_path = split_file(config, domain)
+        for strategy in strategies:
+            for repeat_index in range(repeat_count):
+                run_label = f"{configured_run_id}_{domain}_{strategy['id']}_r{repeat_index + 1}"
+                cells.append(
+                    {
+                        "domain": domain,
+                        "strategy_id": strategy["id"],
+                        "strategy_label": strategy.get("label", strategy["id"]),
+                        "repeat_index": repeat_index + 1,
+                        "run_label": run_label,
+                        "train_required": bool(strategy.get("train_required")),
+                        "memory_backend": strategy.get("memory_backend"),
+                        "split_file": str(split_path),
+                        "command": _tau2_command(
+                            config,
+                            domain=domain,
+                            strategy=strategy,
+                            repeat_index=repeat_index,
+                            run_label=run_label,
+                        ),
+                    }
+                )
+    return {
+        "schema_version": "openviking.tau2.run_plan.v0",
+        "run_id": configured_run_id,
+        "status": "planned",
+        "strategy_ids": strategy_ids(config),
+        "domains": domains(config),
+        "cell_count": len(cells),
+        "cells": cells,
+    }
+
+
+def _execute_cells(plan: dict[str, Any], repo: Path, out: Path) -> list[dict[str, Any]]:
+    rows = []
+    for cell in plan["cells"]:
+        print(f"[tau2] running {cell['run_label']}")
+        completed = subprocess.run(
+            cell["command"],
+            cwd=repo,
+            text=True,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            check=False,
+        )
+        row = {
+            "run_label": cell["run_label"],
+            "domain": cell["domain"],
+            "strategy_id": cell["strategy_id"],
+            "returncode": completed.returncode,
+            "stdout_tail": completed.stdout[-4000:],
+            "stderr_tail": completed.stderr[-4000:],
+        }
+        rows.append(row)
+        write_json(out / "cell_results" / f"{cell['run_label']}.json", row)
+        if completed.returncode != 0:
+            raise RuntimeError(f"cell failed: {cell['run_label']} returncode={completed.returncode}")
+    return rows
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description="Plan or run TAU-2 benchmark cells.")
+    parser.add_argument("--config", type=Path, default=Path(__file__).parents[1] / "config" / "baseline.yaml")
+    parser.add_argument("--run-id", default=run_id())
+    parser.add_argument("--plan-only", action="store_true", help="Only write run_plan.json.")
+    parser.add_argument("--execute", action="store_true", help="Execute planned cells.")
+    args = parser.parse_args()
+
+    if args.plan_only and args.execute:
+        raise SystemExit("--plan-only and --execute are mutually exclusive")
+
+    config = load_config(args.config)
+    out = output_dir(config, args.run_id)
+    out.mkdir(parents=True, exist_ok=True)
+    plan = _build_plan(config, args.run_id)
+    write_json(out / "run_plan.json", plan)
+    write_json(out / "resolved_config.json", config)
+    print(f"[tau2] wrote {out / 'run_plan.json'}")
+
+    if args.execute:
+        try:
+            rows = _execute_cells(plan, tau2_repo(config), out)
+            plan["status"] = "succeeded"
+            plan["executed_cell_count"] = len(rows)
+            write_json(out / "run_plan.json", plan)
+        except Exception as exc:
+            plan["status"] = "failed"
+            plan["error"] = str(exc)
+            write_json(out / "run_plan.json", plan)
+            print(f"[tau2][ERROR] {exc}", file=sys.stderr)
+            return 1
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/benchmark/tau2/scripts/summarize.py b/benchmark/tau2/scripts/summarize.py
new file mode 100755
index 000000000..5e07c03f2
--- /dev/null
+++ b/benchmark/tau2/scripts/summarize.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+from statistics import mean
+from typing import Any
+
+from tau2_common import write_json
+
+
+def _load_json(path: Path) -> Any:
+    return json.loads(path.read_text(encoding="utf-8"))
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description="Summarize TAU-2 cell result JSON files.")
+    parser.add_argument("--run-dir", type=Path, required=True)
+    args = parser.parse_args()
+
+    run_dir = args.run_dir.expanduser().resolve()
+    rows = []
+    for path in sorted((run_dir / "cell_results").glob("*.json")):
+        row = _load_json(path)
+        rows.append(row)
+
+    returncodes = [row.get("returncode") for row in rows]
+    summary = {
+        "run_dir": str(run_dir),
+        "cell_count": len(rows),
+        "succeeded_cell_count": sum(1 for code in returncodes if code == 0),
+        "failed_cell_count": sum(1 for code in returncodes if code != 0),
+        "returncodes": returncodes,
+        "average_reward": None,
+        "notes": [
+            "This summarizer only aggregates wrapper cell status in the initial PR.",
+            "TAU-2 reward parsing is added once the execution artifact shape is fixed.",
+        ],
+    }
+    rewards = [row.get("reward") for row in rows if isinstance(row.get("reward"), (int, float))]
+    if rewards:
+        summary["average_reward"] = mean(rewards)
+    write_json(run_dir / "summary.json", summary)
+    print(f"[tau2] wrote {run_dir / 'summary.json'}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/benchmark/tau2/scripts/tau2_common.py b/benchmark/tau2/scripts/tau2_common.py
new file mode 100755
index 000000000..5399fffee
--- /dev/null
+++ b/benchmark/tau2/scripts/tau2_common.py
@@ -0,0 +1,117 @@
+from __future__ import annotations
+
+import json
+import os
+import re
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+
+import yaml
+
+
+TAU2_DIR = Path(__file__).resolve().parents[1]
+REPO_ROOT = TAU2_DIR.parents[1]
+
+
+_ENV_PATTERN = re.compile(r"\$\{([^}:]+)(?::-([^}]*))?\}")
+
+
+def run_id() -> str:
+    return datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
+
+
+def render_env(value: Any) -> Any:
+    if isinstance(value, str):
+        def replace(match: re.Match[str]) -> str:
+            name = match.group(1)
+            default = match.group(2) or ""
+            return os.environ.get(name) or default  # shell ":-" semantics: unset OR empty uses the default
+
+        return _ENV_PATTERN.sub(replace, value)
+    if isinstance(value, list):
+        return [render_env(item) for item in value]
+    if isinstance(value, dict):
+        return {key: render_env(item) for key, item in value.items()}
+    return value
+
+
+def deep_merge(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
+    merged = dict(base)
+    for key, value in override.items():
+        if (
+            key in merged
+            and isinstance(merged[key], dict)
+            and isinstance(value, dict)
+        ):
+            merged[key] = deep_merge(merged[key], value)
+        else:
+            merged[key] = value
+    return merged
+
+
+def load_config(path: Path) -> dict[str, Any]:
+    path = path.expanduser().resolve()
+    with path.open("r", encoding="utf-8") as handle:
+        raw = yaml.safe_load(handle) or {}
+    if not isinstance(raw, dict):
+        raise ValueError(f"Config must be a mapping: {path}")
+
+    parent_name = raw.pop("extends", None)
+    if parent_name:
+        parent_path = (path.parent / str(parent_name)).resolve()
+        parent = load_config(parent_path)
+        raw = deep_merge(parent, raw)
+    return render_env(raw)
+
+
+def resolve_path(path_value: str | Path, *, base: Path | None = None) -> Path:
+    path = Path(path_value).expanduser()
+    if path.is_absolute():
+        return path
+    return ((base or REPO_ROOT) / path).resolve()
+
+
+def output_dir(config: dict[str, Any], configured_run_id: str) -> Path:
+    raw = config.get("paths", {}).get("output_dir", TAU2_DIR / "result")
+    return resolve_path(raw) / configured_run_id
+
+
+def write_json(path: Path, payload: Any) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(
+        json.dumps(payload, ensure_ascii=False, indent=2, sort_keys=True) + "\n",
+        encoding="utf-8",
+    )
+
+
+def strategy_ids(config: dict[str, Any]) -> list[str]:
+    strategies = config.get("strategies") or []
+    if not isinstance(strategies, list):
+        raise ValueError("strategies must be a list")
+    ids = []
+    for item in strategies:
+        if not isinstance(item, dict) or not item.get("id"):
+            raise ValueError("each strategy must be a mapping with id")
+        ids.append(str(item["id"]))
+    if len(ids) != len(set(ids)):
+        raise ValueError(f"duplicate strategy ids: {ids}")
+    return ids
+
+
+def domains(config: dict[str, Any]) -> list[str]:
+    values = config.get("benchmark", {}).get("domains") or []
+    if not isinstance(values, list) or not values:
+        raise ValueError("benchmark.domains must be a non-empty list")
+    return [str(item) for item in values]
+
+
+def tau2_repo(config: dict[str, Any]) -> Path:
+    raw = config.get("paths", {}).get("tau2_repo")
+    if not raw:
+        raise ValueError("paths.tau2_repo is required")
+    return resolve_path(raw)
+
+
+def split_file(config: dict[str, Any], domain: str) -> Path:
+    return tau2_repo(config) / "data" / "tau2" / "domains" / domain / "split_tasks.json"

From a132c3bbabbf7b95f2de7c1630ce5d1b7f7c3bb1 Mon Sep 17 00:00:00 2001
From: huangruiteng <huangruiteng@bytedance.com>
Date: Tue, 12 May 2026 23:17:26 +0800
Subject: [PATCH 02/17] benchmark: gate pending tau2 memory adapter

---
 benchmark/tau2/README.md            | 15 +++++
 benchmark/tau2/config/baseline.yaml |  4 ++
 benchmark/tau2/run_full_eval.sh     |  9 ++-
 benchmark/tau2/scripts/run_eval.py  | 98 ++++++++++++++++++++++-------
 4 files changed, 102 insertions(+), 24 deletions(-)

diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md
index ac47ffa63..344b3e6b4 100644
--- a/benchmark/tau2/README.md
+++ b/benchmark/tau2/README.md
@@ -36,6 +36,17 @@ python benchmark/tau2/scripts/preflight.py --config benchmark/tau2/config/baseli
 python benchmark/tau2/scripts/run_eval.py --config benchmark/tau2/config/baseline.yaml --plan-only
 ```
 
+Plan a one-cell upstream TAU-2 smoke:
+
+```bash
+benchmark/tau2/run_full_eval.sh \
+  --config benchmark/tau2/config/baseline.yaml \
+  --domain retail \
+  --strategy-id no_memory \
+  --num-tasks 1 \
+  --repeat-count 1
+```
+
 Run with execution enabled after TAU-2, model credentials, and OpenViking are
 configured:
 
@@ -43,6 +54,10 @@ configured:
 benchmark/tau2/run_full_eval.sh --config benchmark/tau2/config/prewrite.yaml --execute
 ```
 
+The initial no-memory cells use upstream TAU-2 CLI flags only. OpenViking memory
+cells are kept in the same plan but marked adapter-pending, and `--execute` fails
+on them until the TAU-2 agent adapter is wired into this benchmark directory.
+
 ## Evidence Boundary
 
 Only completed `retail + airline` runs with the same config, same seeds/repeats,
diff --git a/benchmark/tau2/config/baseline.yaml b/benchmark/tau2/config/baseline.yaml
index 16a452d4a..81e1a2133 100644
--- a/benchmark/tau2/config/baseline.yaml
+++ b/benchmark/tau2/config/baseline.yaml
@@ -9,6 +9,8 @@ benchmark:
   task_max_concurrency: 10
   strategy_concurrency: 4
   max_steps: 200
+  agent: llm_agent
+  user: user_simulator
   reasoning_effort: high
 
 paths:
@@ -36,12 +38,14 @@ strategies:
   - id: memory_v2_experience_only
     label: OpenViking Memory V2 experience-only
     memory_backend: openviking
+    adapter_status: pending
     train_required: true
     train_memory_mode: experience_only
     retrieval_mode: first_user
   - id: trajectory_procedure_view
     label: OpenViking trajectory procedure view
     memory_backend: openviking
+    adapter_status: pending
     train_required: true
     train_memory_mode: trajectory_procedure_view
     operation_mode: add_only
diff --git a/benchmark/tau2/run_full_eval.sh b/benchmark/tau2/run_full_eval.sh
index 8abf41235..d03df3f84 100755
--- a/benchmark/tau2/run_full_eval.sh
+++ b/benchmark/tau2/run_full_eval.sh
@@ -7,6 +7,7 @@ PYTHON_BIN="${PYTHON_BIN:-python3}"
 CONFIG="$SCRIPT_DIR/config/baseline.yaml"
 EXECUTE=false
 RUN_ID=""
+RUN_EVAL_EXTRA=()
 
 while [[ $# -gt 0 ]]; do
   case "$1" in
@@ -22,6 +23,10 @@ while [[ $# -gt 0 ]]; do
       EXECUTE=true
       shift
       ;;
+    --domain|--repeat-count|--strategy-id|--task-id|--num-tasks)
+      RUN_EVAL_EXTRA+=("$1" "$2")
+      shift 2
+      ;;
     --help|-h)
       cat <<'EOF'
 Usage:
@@ -47,7 +52,7 @@ cd "$REPO_ROOT"
 "$PYTHON_BIN" "$SCRIPT_DIR/scripts/preflight.py" --config "$CONFIG" "${RUN_ARGS[@]}"
 
 if [[ "$EXECUTE" == true ]]; then
-  "$PYTHON_BIN" "$SCRIPT_DIR/scripts/run_eval.py" --config "$CONFIG" "${RUN_ARGS[@]}" --execute
+  "$PYTHON_BIN" "$SCRIPT_DIR/scripts/run_eval.py" --config "$CONFIG" "${RUN_ARGS[@]}" "${RUN_EVAL_EXTRA[@]}" --execute
 else
-  "$PYTHON_BIN" "$SCRIPT_DIR/scripts/run_eval.py" --config "$CONFIG" "${RUN_ARGS[@]}" --plan-only
+  "$PYTHON_BIN" "$SCRIPT_DIR/scripts/run_eval.py" --config "$CONFIG" "${RUN_ARGS[@]}" "${RUN_EVAL_EXTRA[@]}" --plan-only
 fi
diff --git a/benchmark/tau2/scripts/run_eval.py b/benchmark/tau2/scripts/run_eval.py
index 8ab4c41f0..3d6d517c0 100755
--- a/benchmark/tau2/scripts/run_eval.py
+++ b/benchmark/tau2/scripts/run_eval.py
@@ -10,14 +10,30 @@
 from tau2_common import domains, load_config, output_dir, run_id, split_file, strategy_ids, tau2_repo, write_json
 
 
-def _tau2_command(config: dict[str, Any], *, domain: str, strategy: dict[str, Any], repeat_index: int, run_label: str) -> list[str]:
+def _tau2_command(
+    config: dict[str, Any],
+    *,
+    domain: str,
+    strategy: dict[str, Any],
+    run_label: str,
+    task_ids: list[str] | None,
+    num_tasks: int | None,
+) -> list[str] | None:
     benchmark = config["benchmark"]
     model = config["model"]
+
+    if strategy.get("memory_backend") != "none":
+        return None
+
     command = [
         "tau2",
         "run",
         "--domain",
         domain,
+        "--agent",
+        str(benchmark.get("agent", "llm_agent")),
+        "--user",
+        str(benchmark.get("user", "user_simulator")),
         "--task-split-name",
         str(benchmark.get("eval_split_name", "test")),
         "--num-trials",
@@ -39,28 +55,52 @@ def _tau2_command(config: dict[str, Any], *, domain: str, strategy: dict[str, An
         command.extend(["--agent-llm-args", f'{{"temperature":0.0,"reasoning_effort":"{reasoning_effort}"}}'])
         command.extend(["--user-llm-args", f'{{"temperature":0.0,"reasoning_effort":"{reasoning_effort}"}}'])
 
-    if strategy.get("memory_backend") == "none":
-        command.extend(["--memory-backend", "none"])
-    else:
-        command.extend(["--memory-backend", "openviking"])
-        command.extend(["--memory-retrieval-mode", str(strategy.get("retrieval_mode", "first_user"))])
-        command.extend(["--memory-replay-write-policy", str(config.get("openviking", {}).get("replay_write_policy", "read_only"))])
-
-    if config.get("features", {}).get("prewrite_recall", {}).get("enabled"):
-        command.append("--enable-prewrite-recall")
+    if task_ids:
+        command.append("--task-ids")
+        command.extend(task_ids)
+    elif num_tasks is not None:
+        command.extend(["--num-tasks", str(num_tasks)])
 
     return command
 
 
-def _build_plan(config: dict[str, Any], configured_run_id: str) -> dict[str, Any]:
-    repeat_count = int(config["benchmark"].get("repeat_count", 4))
+def _build_plan(
+    config: dict[str, Any],
+    configured_run_id: str,
+    *,
+    selected_domains: set[str] | None,
+    selected_strategy_ids: set[str] | None,
+    task_ids: list[str] | None,
+    num_tasks: int | None,
+    repeat_count_override: int | None,
+) -> dict[str, Any]:
+    repeat_count = repeat_count_override or int(config["benchmark"].get("repeat_count", 4))
     strategies = config.get("strategies") or []
+    if selected_strategy_ids:
+        unknown = selected_strategy_ids - set(strategy_ids(config))
+        if unknown:
+            raise ValueError(f"unknown strategy ids: {sorted(unknown)}")
+        strategies = [strategy for strategy in strategies if strategy["id"] in selected_strategy_ids]
     cells = []
-    for domain in domains(config):
+    plan_domains = domains(config)
+    if selected_domains:
+        unknown_domains = selected_domains - set(plan_domains)
+        if unknown_domains:
+            raise ValueError(f"unknown domains: {sorted(unknown_domains)}")
+        plan_domains = [domain for domain in plan_domains if domain in selected_domains]
+    for domain in plan_domains:
         split_path = split_file(config, domain)
         for strategy in strategies:
             for repeat_index in range(repeat_count):
                 run_label = f"{configured_run_id}_{domain}_{strategy['id']}_r{repeat_index + 1}"
+                command = _tau2_command(
+                    config,
+                    domain=domain,
+                    strategy=strategy,
+                    run_label=run_label,
+                    task_ids=task_ids,
+                    num_tasks=num_tasks,
+                )
                 cells.append(
                     {
                         "domain": domain,
@@ -70,14 +110,10 @@ def _build_plan(config: dict[str, Any], configured_run_id: str) -> dict[str, Any
                         "run_label": run_label,
                         "train_required": bool(strategy.get("train_required")),
                         "memory_backend": strategy.get("memory_backend"),
+                        "adapter_status": strategy.get("adapter_status", "ready"),
+                        "executable": command is not None,
                         "split_file": str(split_path),
-                        "command": _tau2_command(
-                            config,
-                            domain=domain,
-                            strategy=strategy,
-                            repeat_index=repeat_index,
-                            run_label=run_label,
-                        ),
+                        "command": command,
                     }
                 )
     return {
@@ -85,7 +121,7 @@ def _build_plan(config: dict[str, Any], configured_run_id: str) -> dict[str, Any
         "run_id": configured_run_id,
         "status": "planned",
         "strategy_ids": strategy_ids(config),
-        "domains": domains(config),
+        "domains": plan_domains,
         "cell_count": len(cells),
         "cells": cells,
     }
@@ -94,6 +130,11 @@ def _build_plan(config: dict[str, Any], configured_run_id: str) -> dict[str, Any
 def _execute_cells(plan: dict[str, Any], repo: Path, out: Path) -> list[dict[str, Any]]:
     rows = []
     for cell in plan["cells"]:
+        if not cell.get("executable"):
+            raise RuntimeError(
+                f"cell is not executable yet: {cell['run_label']} "
+                f"(strategy_id={cell['strategy_id']}, adapter_status={cell.get('adapter_status')})"
+            )
         print(f"[tau2] running {cell['run_label']}")
         completed = subprocess.run(
             cell["command"],
@@ -122,6 +163,11 @@ def main() -> int:
     parser = argparse.ArgumentParser(description="Plan or run TAU-2 benchmark cells.")
     parser.add_argument("--config", type=Path, default=Path(__file__).parents[1] / "config" / "baseline.yaml")
     parser.add_argument("--run-id", default=run_id())
+    parser.add_argument("--domain", action="append", help="Run only this configured domain; may be repeated.")
+    parser.add_argument("--repeat-count", type=int, help="Override benchmark.repeat_count for smoke runs.")
+    parser.add_argument("--strategy-id", action="append", help="Run only this strategy id; may be repeated.")
+    parser.add_argument("--task-id", action="append", help="Run only this TAU-2 task id; may be repeated.")
+    parser.add_argument("--num-tasks", type=int, help="Run the first N tasks from the selected split.")
     parser.add_argument("--plan-only", action="store_true", help="Only write run_plan.json.")
     parser.add_argument("--execute", action="store_true", help="Execute planned cells.")
     args = parser.parse_args()
@@ -132,7 +178,15 @@ def main() -> int:
     config = load_config(args.config)
     out = output_dir(config, args.run_id)
     out.mkdir(parents=True, exist_ok=True)
-    plan = _build_plan(config, args.run_id)
+    plan = _build_plan(
+        config,
+        args.run_id,
+        selected_domains=set(args.domain) if args.domain else None,
+        selected_strategy_ids=set(args.strategy_id) if args.strategy_id else None,
+        task_ids=args.task_id,
+        num_tasks=args.num_tasks,
+        repeat_count_override=args.repeat_count,
+    )
     write_json(out / "run_plan.json", plan)
     write_json(out / "resolved_config.json", config)
     print(f"[tau2] wrote {out / 'run_plan.json'}")

From b68e45922ab86b14c28263b4d2616f40b99164b5 Mon Sep 17 00:00:00 2001
From: huangruiteng <huangruiteng@bytedance.com>
Date: Tue, 12 May 2026 23:25:48 +0800
Subject: [PATCH 03/17] benchmark: use litellm provider model default

---
 benchmark/tau2/README.md            | 3 +++
 benchmark/tau2/config/baseline.yaml | 6 +++---
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md
index 344b3e6b4..5821a83ac 100644
--- a/benchmark/tau2/README.md
+++ b/benchmark/tau2/README.md
@@ -54,6 +54,9 @@ configured:
 benchmark/tau2/run_full_eval.sh --config benchmark/tau2/config/prewrite.yaml --execute
 ```
 
+When using Doubao through an OpenAI-compatible endpoint, set `OPENAI_API_KEY`
+and `OPENAI_API_BASE` for LiteLLM before running upstream TAU-2.
+
 The initial no-memory cells use upstream TAU-2 CLI flags only. OpenViking memory
 cells are kept in the same plan, but marked adapter-pending until the TAU-2
 agent adapter is wired in this benchmark directory.
diff --git a/benchmark/tau2/config/baseline.yaml b/benchmark/tau2/config/baseline.yaml
index 81e1a2133..c1e23e9a9 100644
--- a/benchmark/tau2/config/baseline.yaml
+++ b/benchmark/tau2/config/baseline.yaml
@@ -18,9 +18,9 @@ paths:
   output_dir: benchmark/tau2/result
 
 model:
-  agent_llm: ${TAU2_AGENT_LLM:-doubao-seed-1-6-250615}
-  user_llm: ${TAU2_USER_LLM:-doubao-seed-1-6-250615}
-  evaluator_llm: ${TAU2_EVALUATOR_LLM:-doubao-seed-1-6-250615}
+  agent_llm: ${TAU2_AGENT_LLM:-openai/doubao-seed-2-0-pro-260215}
+  user_llm: ${TAU2_USER_LLM:-openai/doubao-seed-2-0-pro-260215}
+  evaluator_llm: ${TAU2_EVALUATOR_LLM:-openai/doubao-seed-2-0-pro-260215}
   temperature: 0.0
 
 openviking:

From 37e9b5039af9e8bff4648bf715762b5ad5741be2 Mon Sep 17 00:00:00 2001
From: huangruiteng <huangruiteng@bytedance.com>
Date: Tue, 12 May 2026 23:35:47 +0800
Subject: [PATCH 04/17] benchmark: fold preflight into tau2 runner

---
 benchmark/tau2/README.md               |  8 +--
 benchmark/tau2/run_full_eval.sh        | 20 +++++-
 benchmark/tau2/scripts/parity_check.py | 51 ----------------
 benchmark/tau2/scripts/preflight.py    | 84 --------------------------
 benchmark/tau2/scripts/run_eval.py     | 44 ++++++++++++++
 benchmark/tau2/scripts/summarize.py    | 50 ---------------
 6 files changed, 65 insertions(+), 192 deletions(-)
 delete mode 100755 benchmark/tau2/scripts/parity_check.py
 delete mode 100755 benchmark/tau2/scripts/preflight.py
 delete mode 100755 benchmark/tau2/scripts/summarize.py

diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md
index 5821a83ac..561a69e63 100644
--- a/benchmark/tau2/README.md
+++ b/benchmark/tau2/README.md
@@ -18,10 +18,8 @@ benchmark/tau2/
 │   ├── baseline.yaml
 │   └── prewrite.yaml
 ├── scripts/
-│   ├── preflight.py
 │   ├── run_eval.py
-│   ├── summarize.py
-│   └── parity_check.py
+│   └── tau2_common.py
 └── run_full_eval.sh
 ```
 
@@ -32,10 +30,12 @@ Generated artifacts are written to `benchmark/tau2/result/<run_id>/`.
 Plan the default benchmark without running TAU-2:
 
 ```bash
-python benchmark/tau2/scripts/preflight.py --config benchmark/tau2/config/baseline.yaml
 python benchmark/tau2/scripts/run_eval.py --config benchmark/tau2/config/baseline.yaml --plan-only
 ```
 
+Add `--preflight` or `--strict-preflight` when you want the runner to write a
+small environment/config check next to the run plan.
+
 Plan a one-cell upstream TAU-2 smoke:
 
 ```bash
diff --git a/benchmark/tau2/run_full_eval.sh b/benchmark/tau2/run_full_eval.sh
index d03df3f84..22936e3d3 100755
--- a/benchmark/tau2/run_full_eval.sh
+++ b/benchmark/tau2/run_full_eval.sh
@@ -6,6 +6,8 @@ REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
 PYTHON_BIN="${PYTHON_BIN:-python3}"
 CONFIG="$SCRIPT_DIR/config/baseline.yaml"
 EXECUTE=false
+PREFLIGHT=false
+STRICT_PREFLIGHT=false
 RUN_ID=""
 RUN_EVAL_EXTRA=()
 
@@ -23,6 +25,14 @@ while [[ $# -gt 0 ]]; do
       EXECUTE=true
       shift
       ;;
+    --preflight)
+      PREFLIGHT=true
+      shift
+      ;;
+    --strict-preflight)
+      STRICT_PREFLIGHT=true
+      shift
+      ;;
     --domain|--repeat-count|--strategy-id|--task-id|--num-tasks)
       RUN_EVAL_EXTRA+=("$1" "$2")
       shift 2
@@ -30,9 +40,9 @@ while [[ $# -gt 0 ]]; do
     --help|-h)
       cat <<'EOF'
 Usage:
-  benchmark/tau2/run_full_eval.sh [--config PATH] [--run-id ID] [--execute]
+  benchmark/tau2/run_full_eval.sh [--config PATH] [--run-id ID] [--execute] [--preflight]
 
-Without --execute the script only writes preflight and run_plan artifacts.
+Without --execute the script only writes run_plan artifacts.
 EOF
       exit 0
       ;;
@@ -49,7 +59,11 @@ if [[ -n "$RUN_ID" ]]; then
 fi
 
 cd "$REPO_ROOT"
-"$PYTHON_BIN" "$SCRIPT_DIR/scripts/preflight.py" --config "$CONFIG" "${RUN_ARGS[@]}"
+if [[ "$STRICT_PREFLIGHT" == true ]]; then
+  RUN_EVAL_EXTRA+=(--strict-preflight)
+elif [[ "$PREFLIGHT" == true ]]; then
+  RUN_EVAL_EXTRA+=(--preflight)
+fi
 
 if [[ "$EXECUTE" == true ]]; then
   "$PYTHON_BIN" "$SCRIPT_DIR/scripts/run_eval.py" --config "$CONFIG" "${RUN_ARGS[@]}" "${RUN_EVAL_EXTRA[@]}" --execute
diff --git a/benchmark/tau2/scripts/parity_check.py b/benchmark/tau2/scripts/parity_check.py
deleted file mode 100755
index d513715f0..000000000
--- a/benchmark/tau2/scripts/parity_check.py
+++ /dev/null
@@ -1,51 +0,0 @@
-#!/usr/bin/env python3
-from __future__ import annotations
-
-import argparse
-import json
-from pathlib import Path
-from typing import Any
-
-from tau2_common import write_json
-
-
-def _load_json(path: Path) -> Any:
-    return json.loads(path.read_text(encoding="utf-8"))
-
-
-def main() -> int:
-    parser = argparse.ArgumentParser(description="Compare OpenViking TAU-2 artifacts against a harness reference.")
-    parser.add_argument("--ov-run-plan", type=Path, required=True)
-    parser.add_argument("--harness-run-plan", type=Path, required=True)
-    parser.add_argument("--output", type=Path, required=True)
-    args = parser.parse_args()
-
-    ov_plan = _load_json(args.ov_run_plan)
-    harness_plan = _load_json(args.harness_run_plan)
-
-    ov_cells = ov_plan.get("cells") or []
-    harness_cells = harness_plan.get("cells") or harness_plan.get("treatments") or []
-    report = {
-        "status": "ok" if len(ov_cells) == len(harness_cells) else "mismatch",
-        "ov_run_plan": str(args.ov_run_plan.resolve()),
-        "harness_run_plan": str(args.harness_run_plan.resolve()),
-        "ov_cell_count": len(ov_cells),
-        "harness_cell_count": len(harness_cells),
-        "checks": {
-            "cell_count_match": len(ov_cells) == len(harness_cells),
-        },
-        "notes": [
-            "Initial parity is intentionally structural.",
-            "Train payload, retrieval trace, and scoreboard parity should be added as each migration layer lands.",
-        ],
-    }
-    write_json(args.output, report)
-    if report["status"] != "ok":
-        print(f"[parity][WARN] wrote mismatch report: {args.output}")
-        return 1
-    print(f"[parity][OK] wrote {args.output}")
-    return 0
-
-
-if __name__ == "__main__":
-    raise SystemExit(main())
diff --git a/benchmark/tau2/scripts/preflight.py b/benchmark/tau2/scripts/preflight.py
deleted file mode 100755
index 32f78c3a3..000000000
--- a/benchmark/tau2/scripts/preflight.py
+++ /dev/null
@@ -1,84 +0,0 @@
-#!/usr/bin/env python3
-from __future__ import annotations
-
-import argparse
-import importlib.util
-import sys
-from pathlib import Path
-from typing import Any
-
-from tau2_common import domains, load_config, output_dir, run_id, split_file, strategy_ids, tau2_repo, write_json
-
-
-def _check_import(module: str) -> dict[str, Any]:
-    spec = importlib.util.find_spec(module)
-    return {"module": module, "ok": spec is not None}
-
-
-def _split_status(config: dict[str, Any]) -> list[dict[str, Any]]:
-    rows = []
-    for domain in domains(config):
-        path = split_file(config, domain)
-        rows.append(
-            {
-                "domain": domain,
-                "path": str(path),
-                "exists": path.is_file(),
-            }
-        )
-    return rows
-
-
-def main() -> int:
-    parser = argparse.ArgumentParser(description="Preflight TAU-2 benchmark config.")
-    parser.add_argument("--config", type=Path, default=Path(__file__).parents[1] / "config" / "baseline.yaml")
-    parser.add_argument("--run-id", default=run_id())
-    parser.add_argument("--strict", action="store_true", help="Require optional runtime imports and TAU-2 split files.")
-    args = parser.parse_args()
-
-    config = load_config(args.config)
-    out = output_dir(config, args.run_id)
-
-    errors: list[str] = []
-    try:
-        strategy_ids(config)
-    except Exception as exc:
-        errors.append(str(exc))
-
-    split_rows = _split_status(config)
-    if args.strict:
-        for row in split_rows:
-            if not row["exists"]:
-                errors.append(f"missing split file for {row['domain']}: {row['path']}")
-
-    import_rows = [_check_import("openviking"), _check_import("openviking_cli"), _check_import("tau2")]
-    if args.strict:
-        for row in import_rows:
-            if not row["ok"]:
-                errors.append(f"missing Python module: {row['module']}")
-
-    report = {
-        "status": "failed" if errors else "ok",
-        "config": str(args.config.resolve()),
-        "run_id": args.run_id,
-        "tau2_repo": str(tau2_repo(config)),
-        "domains": domains(config),
-        "strategies": strategy_ids(config),
-        "strict": args.strict,
-        "imports": import_rows,
-        "split_files": split_rows,
-        "errors": errors,
-    }
-    write_json(out / "preflight.json", report)
-
-    if errors:
-        for error in errors:
-            print(f"[preflight][ERROR] {error}", file=sys.stderr)
-        print(f"[preflight] wrote {out / 'preflight.json'}", file=sys.stderr)
-        return 1
-    print(f"[preflight][OK] wrote {out / 'preflight.json'}")
-    return 0
-
-
-if __name__ == "__main__":
-    raise SystemExit(main())
diff --git a/benchmark/tau2/scripts/run_eval.py b/benchmark/tau2/scripts/run_eval.py
index 3d6d517c0..2a4e9b812 100755
--- a/benchmark/tau2/scripts/run_eval.py
+++ b/benchmark/tau2/scripts/run_eval.py
@@ -2,6 +2,7 @@
 from __future__ import annotations
 
 import argparse
+import importlib.util
 import subprocess
 import sys
 from pathlib import Path
@@ -159,6 +160,42 @@ def _execute_cells(plan: dict[str, Any], repo: Path, out: Path) -> list[dict[str
     return rows
 
 
+def _preflight(config: dict[str, Any], out: Path, *, strict: bool) -> int:
+    errors: list[str] = []
+    split_rows = []
+    for domain in domains(config):
+        path = split_file(config, domain)
+        exists = path.is_file()
+        split_rows.append({"domain": domain, "path": str(path), "exists": exists})
+        if strict and not exists:
+            errors.append(f"missing split file for {domain}: {path}")
+
+    import_rows = []
+    for module in ("openviking", "openviking_cli", "tau2"):
+        ok = importlib.util.find_spec(module) is not None
+        import_rows.append({"module": module, "ok": ok})
+        if strict and not ok:
+            errors.append(f"missing Python module: {module}")
+
+    report = {
+        "status": "failed" if errors else "ok",
+        "strict": strict,
+        "tau2_repo": str(tau2_repo(config)),
+        "domains": domains(config),
+        "strategies": strategy_ids(config),
+        "imports": import_rows,
+        "split_files": split_rows,
+        "errors": errors,
+    }
+    write_json(out / "preflight.json", report)
+    if errors:
+        for error in errors:
+            print(f"[preflight][ERROR] {error}", file=sys.stderr)
+        return 1
+    print(f"[preflight][OK] wrote {out / 'preflight.json'}")
+    return 0
+
+
 def main() -> int:
     parser = argparse.ArgumentParser(description="Plan or run TAU-2 benchmark cells.")
     parser.add_argument("--config", type=Path, default=Path(__file__).parents[1] / "config" / "baseline.yaml")
@@ -168,6 +205,8 @@ def main() -> int:
     parser.add_argument("--strategy-id", action="append", help="Run only this strategy id; may be repeated.")
     parser.add_argument("--task-id", action="append", help="Run only this TAU-2 task id; may be repeated.")
     parser.add_argument("--num-tasks", type=int, help="Run the first N tasks from the selected split.")
+    parser.add_argument("--preflight", action="store_true", help="Write a lightweight environment/config preflight report.")
+    parser.add_argument("--strict-preflight", action="store_true", help="Fail if optional runtime imports or split files are missing.")
     parser.add_argument("--plan-only", action="store_true", help="Only write run_plan.json.")
     parser.add_argument("--execute", action="store_true", help="Execute planned cells.")
     args = parser.parse_args()
@@ -178,6 +217,11 @@ def main() -> int:
     config = load_config(args.config)
     out = output_dir(config, args.run_id)
     out.mkdir(parents=True, exist_ok=True)
+    if args.preflight or args.strict_preflight:
+        preflight_status = _preflight(config, out, strict=args.strict_preflight)
+        if args.strict_preflight and preflight_status != 0:
+            return preflight_status
+
     plan = _build_plan(
         config,
         args.run_id,
diff --git a/benchmark/tau2/scripts/summarize.py b/benchmark/tau2/scripts/summarize.py
deleted file mode 100755
index 5e07c03f2..000000000
--- a/benchmark/tau2/scripts/summarize.py
+++ /dev/null
@@ -1,50 +0,0 @@
-#!/usr/bin/env python3
-from __future__ import annotations
-
-import argparse
-import json
-from pathlib import Path
-from statistics import mean
-from typing import Any
-
-from tau2_common import write_json
-
-
-def _load_json(path: Path) -> Any:
-    return json.loads(path.read_text(encoding="utf-8"))
-
-
-def main() -> int:
-    parser = argparse.ArgumentParser(description="Summarize TAU-2 cell result JSON files.")
-    parser.add_argument("--run-dir", type=Path, required=True)
-    args = parser.parse_args()
-
-    run_dir = args.run_dir.expanduser().resolve()
-    rows = []
-    for path in sorted((run_dir / "cell_results").glob("*.json")):
-        row = _load_json(path)
-        rows.append(row)
-
-    returncodes = [row.get("returncode") for row in rows]
-    summary = {
-        "run_dir": str(run_dir),
-        "cell_count": len(rows),
-        "succeeded_cell_count": sum(1 for code in returncodes if code == 0),
-        "failed_cell_count": sum(1 for code in returncodes if code != 0),
-        "returncodes": returncodes,
-        "average_reward": None,
-        "notes": [
-            "This summarizer only aggregates wrapper cell status in the initial PR.",
-            "TAU-2 reward parsing is added once the execution artifact shape is fixed.",
-        ],
-    }
-    rewards = [row.get("reward") for row in rows if isinstance(row.get("reward"), (int, float))]
-    if rewards:
-        summary["average_reward"] = mean(rewards)
-    write_json(run_dir / "summary.json", summary)
-    print(f"[tau2] wrote {run_dir / 'summary.json'}")
-    return 0
-
-
-if __name__ == "__main__":
-    raise SystemExit(main())

From 95ea695b8f14079dac9c2dc3dfcb24503a28fc2a Mon Sep 17 00:00:00 2001
From: huangruiteng <huangruiteng@bytedance.com>
Date: Wed, 13 May 2026 00:03:50 +0800
Subject: [PATCH 05/17] benchmark: document tau2 dependency setup

---
 benchmark/tau2/.gitignore                 |  4 ++
 benchmark/tau2/README.md                  | 54 +++++++++++++++
 benchmark/tau2/config/baseline.yaml       |  7 ++
 benchmark/tau2/config/official.yaml       |  7 ++
 benchmark/tau2/scripts/run_eval.py        | 42 +++++++++++-
 benchmark/tau2/scripts/setup_tau2_repo.sh | 82 +++++++++++++++++++++++
 benchmark/tau2/scripts/tau2_common.py     | 72 ++++++++++++++++++++
 7 files changed, 265 insertions(+), 3 deletions(-)
 create mode 100644 benchmark/tau2/config/official.yaml
 create mode 100755 benchmark/tau2/scripts/setup_tau2_repo.sh

diff --git a/benchmark/tau2/.gitignore b/benchmark/tau2/.gitignore
index 1cd791b52..2577e5885 100644
--- a/benchmark/tau2/.gitignore
+++ b/benchmark/tau2/.gitignore
@@ -1 +1,5 @@
 result/
+.env.tau2
+.external/
+.venv-tau2/
+__pycache__/
diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md
index 561a69e63..4b6fb8c10 100644
--- a/benchmark/tau2/README.md
+++ b/benchmark/tau2/README.md
@@ -16,9 +16,11 @@ Category rerank and other harness-only diagnostics are not migrated here yet.
 benchmark/tau2/
 ├── config/
 │   ├── baseline.yaml
+│   ├── official.yaml
 │   └── prewrite.yaml
 ├── scripts/
 │   ├── run_eval.py
+│   ├── setup_tau2_repo.sh
 │   └── tau2_common.py
 └── run_full_eval.sh
 ```
@@ -27,6 +29,31 @@ Generated artifacts are written to `benchmark/tau2/result/<run_id>/`.
 
 ## Quick Start
 
+This benchmark delegates task simulation and scoring to an external TAU-2
+checkout. Point the runner at that checkout and CLI explicitly when they are not
+on the default path:
+
+```bash
+export TAU2_REPO=/path/to/tau2-bench
+export TAU2_CLI=/path/to/tau2
+```
+
+For a local one-command setup, clone and install TAU-2 into git-ignored benchmark
+directories:
+
+```bash
+benchmark/tau2/scripts/setup_tau2_repo.sh
+source benchmark/tau2/.env.tau2
+```
+
+Use `TAU2_REF` or `--ref` when you need a TAU-2 branch that already contains the
+confirmation-aware user simulator prompt:
+
+```bash
+benchmark/tau2/scripts/setup_tau2_repo.sh --ref <branch-or-commit>
+source benchmark/tau2/.env.tau2
+```
+
 Plan the default benchmark without running TAU-2:
 
 ```bash
@@ -36,6 +63,18 @@ python benchmark/tau2/scripts/run_eval.py --config benchmark/tau2/config/baselin
 Add `--preflight` or `--strict-preflight` when you want the runner to write a
 small environment/config check next to the run plan.
 
+After setup, verify the local TAU-2 link and write a one-cell run plan:
+
+```bash
+benchmark/tau2/run_full_eval.sh \
+  --config benchmark/tau2/config/baseline.yaml \
+  --strict-preflight \
+  --domain retail \
+  --strategy-id no_memory \
+  --task-id 5 \
+  --repeat-count 1
+```
+
 Plan a one-cell upstream TAU-2 smoke:
 
 ```bash
@@ -61,6 +100,21 @@ The initial no-memory cells use upstream TAU-2 CLI flags only. OpenViking memory
 cells are kept in the same plan, but marked adapter-pending until the TAU-2
 agent adapter is wired in this benchmark directory.
 
+## User Simulator Policy
+
+The runner defaults to the official TAU-2 user simulator when
+`eval.user_simulator_policy` is omitted. The bundled OpenViking memory benchmark
+config sets `confirmation_aware`, because a memory benchmark should not treat
+user confirmation as task completion before the backend write has happened.
+
+`confirmation_aware` does not monkey-patch TAU-2 from this directory. It requires
+the configured `TAU2_REPO` to contain the corresponding upstream TAU-2 simulator
+prompt fix. `--strict-preflight` fails fast when that prompt is not detected, so
+the artifact cannot silently claim confirmation-aware semantics while running an
+older official simulator.
+
+Use `config/official.yaml` when you need an official-user-simulator parity run.
+
 ## Evidence Boundary
 
 Only completed `retail + airline` runs with the same config, same seeds/repeats,
diff --git a/benchmark/tau2/config/baseline.yaml b/benchmark/tau2/config/baseline.yaml
index c1e23e9a9..bb233417b 100644
--- a/benchmark/tau2/config/baseline.yaml
+++ b/benchmark/tau2/config/baseline.yaml
@@ -15,8 +15,15 @@ benchmark:
 
 paths:
   tau2_repo: ${TAU2_REPO:-data/external_benchmarks/tau2-bench}
+  tau2_cli: ${TAU2_CLI:-tau2}
   output_dir: benchmark/tau2/result
 
+eval:
+  # The runner falls back to the official policy if this field is omitted. The
+  # OpenViking memory benchmark opts into the confirmation-aware TAU-2 prompt,
+  # which requires the referenced TAU-2 checkout to contain that upstream fix.
+  user_simulator_policy: confirmation_aware
+
 model:
   agent_llm: ${TAU2_AGENT_LLM:-openai/doubao-seed-2-0-pro-260215}
   user_llm: ${TAU2_USER_LLM:-openai/doubao-seed-2-0-pro-260215}
diff --git a/benchmark/tau2/config/official.yaml b/benchmark/tau2/config/official.yaml
new file mode 100644
index 000000000..d10bee872
--- /dev/null
+++ b/benchmark/tau2/config/official.yaml
@@ -0,0 +1,7 @@
+extends: baseline.yaml
+
+benchmark:
+  name: tau2_openviking_official_user_simulator
+
+eval:
+  user_simulator_policy: official
diff --git a/benchmark/tau2/scripts/run_eval.py b/benchmark/tau2/scripts/run_eval.py
index 2a4e9b812..31fda233c 100755
--- a/benchmark/tau2/scripts/run_eval.py
+++ b/benchmark/tau2/scripts/run_eval.py
@@ -8,7 +8,20 @@
 from pathlib import Path
 from typing import Any
 
-from tau2_common import domains, load_config, output_dir, run_id, split_file, strategy_ids, tau2_repo, write_json
+from tau2_common import (
+    domains,
+    load_config,
+    output_dir,
+    run_id,
+    simulator_policy_report,
+    split_file,
+    strategy_ids,
+    tau2_cli,
+    tau2_context,
+    tau2_repo,
+    user_simulator_policy,
+    write_json,
+)
 
 
 def _tau2_command(
@@ -27,7 +40,7 @@ def _tau2_command(
         return None
 
     command = [
-        "tau2",
+        tau2_cli(config),
         "run",
         "--domain",
         domain,
@@ -76,6 +89,7 @@ def _build_plan(
     repeat_count_override: int | None,
 ) -> dict[str, Any]:
     repeat_count = repeat_count_override or int(config["benchmark"].get("repeat_count", 4))
+    policy_report = simulator_policy_report(config)
     strategies = config.get("strategies") or []
     if selected_strategy_ids:
         unknown = selected_strategy_ids - set(strategy_ids(config))
@@ -113,6 +127,8 @@ def _build_plan(
                         "memory_backend": strategy.get("memory_backend"),
                         "adapter_status": strategy.get("adapter_status", "ready"),
                         "executable": command is not None,
+                        "user_simulator_policy": user_simulator_policy(config),
+                        "user_simulator_policy_supported": policy_report["supported"],
                         "split_file": str(split_path),
                         "command": command,
                     }
@@ -123,12 +139,20 @@ def _build_plan(
         "status": "planned",
         "strategy_ids": strategy_ids(config),
         "domains": plan_domains,
+        "tau2": tau2_context(config),
+        "simulator_policy": policy_report,
         "cell_count": len(cells),
         "cells": cells,
     }
 
 
 def _execute_cells(plan: dict[str, Any], repo: Path, out: Path) -> list[dict[str, Any]]:
+    policy_report = plan.get("simulator_policy") or {}
+    if not policy_report.get("supported", False):
+        raise RuntimeError(
+            "configured user simulator policy is not supported by this TAU-2 checkout: "
+            f"{policy_report}"
+        )
     rows = []
     for cell in plan["cells"]:
         if not cell.get("executable"):
@@ -162,6 +186,17 @@ def _execute_cells(plan: dict[str, Any], repo: Path, out: Path) -> list[dict[str
 
 def _preflight(config: dict[str, Any], out: Path, *, strict: bool) -> int:
     errors: list[str] = []
+    tau2_info = tau2_context(config)
+    policy_report = simulator_policy_report(config)
+    if strict and not tau2_info["tau2_repo_exists"]:
+        errors.append(f"missing TAU-2 repo: {tau2_info['tau2_repo']}")
+    if strict and not tau2_info["tau2_cli_resolved"]:
+        errors.append(f"missing TAU-2 CLI: {tau2_info['tau2_cli']}")
+    if strict and not policy_report["supported"]:
+        errors.append(
+            "configured confirmation-aware user simulator policy requires a TAU-2 "
+            f"checkout with the prompt fix: {policy_report['prompt_files']}"
+        )
     split_rows = []
     for domain in domains(config):
         path = split_file(config, domain)
@@ -180,7 +215,8 @@ def _preflight(config: dict[str, Any], out: Path, *, strict: bool) -> int:
     report = {
         "status": "failed" if errors else "ok",
         "strict": strict,
-        "tau2_repo": str(tau2_repo(config)),
+        "tau2": tau2_info,
+        "simulator_policy": policy_report,
         "domains": domains(config),
         "strategies": strategy_ids(config),
         "imports": import_rows,
diff --git a/benchmark/tau2/scripts/setup_tau2_repo.sh b/benchmark/tau2/scripts/setup_tau2_repo.sh
new file mode 100755
index 000000000..3cee2655a
--- /dev/null
+++ b/benchmark/tau2/scripts/setup_tau2_repo.sh
@@ -0,0 +1,82 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+TAU2_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
+DEFAULT_REPO_DIR="$TAU2_DIR/.external/tau2-bench"
+DEFAULT_VENV_DIR="$TAU2_DIR/.venv-tau2"
+
+REPO_URL="${TAU2_REPO_URL:-https://github.com/sierra-research/tau2-bench.git}"
+REPO_DIR="${TAU2_REPO:-$DEFAULT_REPO_DIR}"
+VENV_DIR="${TAU2_VENV:-$DEFAULT_VENV_DIR}"
+REF="${TAU2_REF:-}"
+INSTALL=true
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --repo-url)
+      REPO_URL="$2"
+      shift 2
+      ;;
+    --repo-dir)
+      REPO_DIR="$2"
+      shift 2
+      ;;
+    --venv)
+      VENV_DIR="$2"
+      shift 2
+      ;;
+    --ref)
+      REF="$2"
+      shift 2
+      ;;
+    --no-install)
+      INSTALL=false
+      shift
+      ;;
+    --help|-h)
+      cat <<'EOF'
+Usage:
+  benchmark/tau2/scripts/setup_tau2_repo.sh [--repo-url URL] [--repo-dir DIR] [--venv DIR] [--ref REF] [--no-install]
+
+Clones TAU-2 into a local ignored directory and optionally installs it into a
+local virtualenv. The script writes benchmark/tau2/.env.tau2 with TAU2_REPO and
+TAU2_CLI for the benchmark runner.
+EOF
+      exit 0
+      ;;
+    *)
+      echo "Unknown argument: $1" >&2
+      exit 1
+      ;;
+  esac
+done
+
+mkdir -p "$(dirname "$REPO_DIR")"
+if [[ ! -d "$REPO_DIR/.git" ]]; then
+  git clone "$REPO_URL" "$REPO_DIR"
+else
+  git -C "$REPO_DIR" fetch --all --prune
+fi
+
+if [[ -n "$REF" ]]; then
+  git -C "$REPO_DIR" checkout "$REF"
+fi
+
+TAU2_CLI="tau2"
+if [[ "$INSTALL" == true ]]; then
+  python3 -m venv "$VENV_DIR"
+  "$VENV_DIR/bin/python" -m pip install --upgrade pip
+  "$VENV_DIR/bin/python" -m pip install -e "$REPO_DIR"
+  TAU2_CLI="$VENV_DIR/bin/tau2"
+fi
+
+cat > "$TAU2_DIR/.env.tau2" <<EOF
+export TAU2_REPO="$REPO_DIR"
+export TAU2_CLI="$TAU2_CLI"
+EOF
+
+echo "[tau2-setup] repo: $REPO_DIR"
+echo "[tau2-setup] cli:  $TAU2_CLI"
+echo "[tau2-setup] wrote $TAU2_DIR/.env.tau2"
+echo "[tau2-setup] next: source $TAU2_DIR/.env.tau2"
diff --git a/benchmark/tau2/scripts/tau2_common.py b/benchmark/tau2/scripts/tau2_common.py
index 5399fffee..0502e83c4 100755
--- a/benchmark/tau2/scripts/tau2_common.py
+++ b/benchmark/tau2/scripts/tau2_common.py
@@ -3,6 +3,8 @@
 import json
 import os
 import re
+import shutil
+import subprocess
 from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any
@@ -113,5 +115,75 @@ def tau2_repo(config: dict[str, Any]) -> Path:
     return resolve_path(raw)
 
 
+def tau2_cli(config: dict[str, Any]) -> str:
+    return str(config.get("paths", {}).get("tau2_cli") or "tau2")
+
+
+def _git_commit(path: Path) -> str | None:
+    if not path.exists():
+        return None
+    completed = subprocess.run(
+        ["git", "-C", str(path), "rev-parse", "HEAD"],
+        text=True,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.DEVNULL,
+        check=False,
+    )
+    if completed.returncode != 0:
+        return None
+    return completed.stdout.strip() or None
+
+
+def tau2_context(config: dict[str, Any]) -> dict[str, Any]:
+    repo = tau2_repo(config)
+    cli = tau2_cli(config)
+    return {
+        "tau2_repo": str(repo),
+        "tau2_repo_exists": repo.exists(),
+        "tau2_commit": _git_commit(repo),
+        "tau2_cli": cli,
+        "tau2_cli_resolved": shutil.which(cli),
+    }
+
+
+def user_simulator_policy(config: dict[str, Any]) -> str:
+    policy = config.get("eval", {}).get("user_simulator_policy", "official")
+    policy = str(policy)
+    if policy not in {"official", "confirmation_aware"}:
+        raise ValueError(
+            "eval.user_simulator_policy must be 'official' or 'confirmation_aware'"
+        )
+    return policy
+
+
+def simulator_policy_report(config: dict[str, Any]) -> dict[str, Any]:
+    policy = user_simulator_policy(config)
+    repo = tau2_repo(config)
+    prompt_paths = [
+        repo / "data" / "tau2" / "user_simulator" / "simulation_guidelines.md",
+        repo / "data" / "tau2" / "user_simulator" / "simulation_guidelines_tools.md",
+    ]
+    prompt_text = "\n".join(
+        path.read_text(encoding="utf-8") for path in prompt_paths if path.is_file()
+    )
+    confirmation_aware_prompt = (
+        "do not emit" in prompt_text
+        and "###STOP###" in prompt_text
+        and "confirm" in prompt_text.lower()
+    )
+    supported = policy == "official" or confirmation_aware_prompt
+    return {
+        "user_simulator_policy": policy,
+        "supported": supported,
+        "confirmation_aware_prompt_detected": confirmation_aware_prompt,
+        "prompt_files": [str(path) for path in prompt_paths],
+        "claim_boundary": (
+            "official_tau2_user_simulator"
+            if policy == "official"
+            else "requires_tau2_confirmation_aware_user_simulator_prompt"
+        ),
+    }
+
+
 def split_file(config: dict[str, Any], domain: str) -> Path:
     return tau2_repo(config) / "data" / "tau2" / "domains" / domain / "split_tasks.json"

From e59a4a052279af30fba19fd4c269e2552f00bc14 Mon Sep 17 00:00:00 2001
From: huangruiteng <huangruiteng@bytedance.com>
Date: Wed, 13 May 2026 00:24:41 +0800
Subject: [PATCH 06/17] benchmark: simplify tau2 simulator patch

---
 benchmark/tau2/README.md              | 22 +++-----
 benchmark/tau2/scripts/tau2_common.py | 79 ++++++++++++++++++++++-----
 2 files changed, 73 insertions(+), 28 deletions(-)

diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md
index 4b6fb8c10..198e869c7 100644
--- a/benchmark/tau2/README.md
+++ b/benchmark/tau2/README.md
@@ -46,14 +46,6 @@ benchmark/tau2/scripts/setup_tau2_repo.sh
 source benchmark/tau2/.env.tau2
 ```
 
-Use `TAU2_REF` or `--ref` when you need a TAU-2 branch that already contains the
-confirmation-aware user simulator prompt:
-
-```bash
-benchmark/tau2/scripts/setup_tau2_repo.sh --ref <branch-or-commit>
-source benchmark/tau2/.env.tau2
-```
-
 Plan the default benchmark without running TAU-2:
 
 ```bash
@@ -107,13 +99,15 @@ The runner default is the official TAU-2 user simulator if
 config sets `confirmation_aware`, because a memory benchmark should not treat
 user confirmation as task completion before the backend write has happened.
 
-`confirmation_aware` does not monkey-patch TAU-2 from this directory. It requires
-the configured `TAU2_REPO` to contain the corresponding upstream TAU-2 simulator
-prompt fix. `--strict-preflight` fails fast when that prompt is not detected, so
-the artifact cannot silently claim confirmation-aware semantics while running an
-older official simulator.
+`confirmation_aware` applies a small idempotent prompt patch to the configured
+TAU-2 checkout before planning or running. The patch appends the confirmation
+boundary from [sierra-research/tau2-bench#297](https://github.com/sierra-research/tau2-bench/pull/297)
+to the TAU-2 user simulator guidelines when it is not already present, and the
+run artifacts record whether the patch was applied.
 
-Use `config/official.yaml` when you need an official-user-simulator parity run.
+Use `config/official.yaml` with a clean TAU-2 checkout when you need an
+official-user-simulator parity run. If the checkout was already patched, the
+artifact records that boundary instead of labeling the run pure official.
 
 ## Evidence Boundary
 
diff --git a/benchmark/tau2/scripts/tau2_common.py b/benchmark/tau2/scripts/tau2_common.py
index 0502e83c4..0281d4285 100755
--- a/benchmark/tau2/scripts/tau2_common.py
+++ b/benchmark/tau2/scripts/tau2_common.py
@@ -14,6 +14,21 @@
 
 TAU2_DIR = Path(__file__).resolve().parents[1]
 REPO_ROOT = TAU2_DIR.parents[1]
+CONFIRMATION_AWARE_UPSTREAM_PR = "https://github.com/sierra-research/tau2-bench/pull/297"
+CONFIRMATION_AWARE_MARKER = "OpenViking TAU-2 confirmation-aware user simulator patch"
+CONFIRMATION_AWARE_APPENDIX = f"""
+
+## {CONFIRMATION_AWARE_MARKER}
+
+Reference: {CONFIRMATION_AWARE_UPSTREAM_PR}
+
+- If the agent asks you to confirm, authorize, or approve a backend action,
+  reply with the requested confirmation but do not emit `###STOP###` in the
+  same turn.
+- Emit `###STOP###` only after the agent clearly reports that the requested
+  backend action has been completed, or when the official transfer /
+  out-of-scope rules apply.
+"""
 
 
 _ENV_PATTERN = re.compile(r"\$\{([^}:]+)(?::-([^}]*))?\}")
@@ -146,6 +161,37 @@ def tau2_context(config: dict[str, Any]) -> dict[str, Any]:
     }
 
 
+def _prompt_paths(repo: Path) -> list[Path]:
+    return [
+        repo / "data" / "tau2" / "user_simulator" / "simulation_guidelines.md",
+        repo / "data" / "tau2" / "user_simulator" / "simulation_guidelines_tools.md",
+    ]
+
+
+def _has_confirmation_aware_prompt(prompt_text: str) -> bool:
+    return (
+        CONFIRMATION_AWARE_MARKER in prompt_text
+        or (
+            "do not emit" in prompt_text
+            and "###STOP###" in prompt_text
+            and "confirm" in prompt_text.lower()
+        )
+    )
+
+
+def _ensure_confirmation_aware_prompt(repo: Path) -> bool:
+    patched = False
+    for path in _prompt_paths(repo):
+        if not path.is_file():
+            continue
+        text = path.read_text(encoding="utf-8")
+        if _has_confirmation_aware_prompt(text):
+            continue
+        path.write_text(text.rstrip() + CONFIRMATION_AWARE_APPENDIX + "\n", encoding="utf-8")
+        patched = True
+    return patched
+
+
 def user_simulator_policy(config: dict[str, Any]) -> str:
     policy = config.get("eval", {}).get("user_simulator_policy", "official")
     policy = str(policy)
@@ -159,29 +205,34 @@ def user_simulator_policy(config: dict[str, Any]) -> str:
 def simulator_policy_report(config: dict[str, Any]) -> dict[str, Any]:
     policy = user_simulator_policy(config)
     repo = tau2_repo(config)
-    prompt_paths = [
-        repo / "data" / "tau2" / "user_simulator" / "simulation_guidelines.md",
-        repo / "data" / "tau2" / "user_simulator" / "simulation_guidelines_tools.md",
-    ]
+    patch_applied = policy == "confirmation_aware" and _ensure_confirmation_aware_prompt(repo)
+    patch_mode = "direct_prompt_append" if patch_applied else "none"
+    if policy == "confirmation_aware":
+        if not patch_applied:
+            patch_mode = "upstream_or_existing_prompt"
+
+    prompt_paths = _prompt_paths(repo)
     prompt_text = "\n".join(
         path.read_text(encoding="utf-8") for path in prompt_paths if path.is_file()
     )
-    confirmation_aware_prompt = (
-        "do not emit" in prompt_text
-        and "###STOP###" in prompt_text
-        and "confirm" in prompt_text.lower()
-    )
+    confirmation_aware_prompt = _has_confirmation_aware_prompt(prompt_text)
     supported = policy == "official" or confirmation_aware_prompt
+    claim_boundary = "confirmation_aware_user_simulator_prompt"
+    if policy == "official":
+        claim_boundary = (
+            "official_policy_with_confirmation_aware_checkout"
+            if confirmation_aware_prompt
+            else "official_tau2_user_simulator"
+        )
     return {
         "user_simulator_policy": policy,
         "supported": supported,
         "confirmation_aware_prompt_detected": confirmation_aware_prompt,
+        "confirmation_aware_upstream_pr": CONFIRMATION_AWARE_UPSTREAM_PR,
+        "patch_applied": patch_applied,
+        "patch_mode": patch_mode,
         "prompt_files": [str(path) for path in prompt_paths],
-        "claim_boundary": (
-            "official_tau2_user_simulator"
-            if policy == "official"
-            else "requires_tau2_confirmation_aware_user_simulator_prompt"
-        ),
+        "claim_boundary": claim_boundary,
     }
 
 

From 32b9b42d371b90a9860e0dceb136b0c7c0a13bd3 Mon Sep 17 00:00:00 2001
From: huangruiteng <huangruiteng@bytedance.com>
Date: Wed, 13 May 2026 00:30:43 +0800
Subject: [PATCH 07/17] benchmark: keep simulator patch prompt clean

---
 benchmark/tau2/README.md              |  8 ++++----
 benchmark/tau2/scripts/tau2_common.py | 16 ++++------------
 2 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md
index 198e869c7..f8e4dc642 100644
--- a/benchmark/tau2/README.md
+++ b/benchmark/tau2/README.md
@@ -100,10 +100,10 @@ config sets `confirmation_aware`, because a memory benchmark should not treat
 user confirmation as task completion before the backend write has happened.
 
 `confirmation_aware` applies a small idempotent prompt patch to the configured
-TAU-2 checkout before planning or running. The patch appends the confirmation
-boundary from [sierra-research/tau2-bench#297](https://github.com/sierra-research/tau2-bench/pull/297)
-to the TAU-2 user simulator guidelines when it is not already present, and the
-run artifacts record whether the patch was applied.
+TAU-2 checkout before planning or running. The patch appends only the behavioral
+confirmation boundary to the TAU-2 user simulator guidelines; metadata such as
+the upstream PR link is kept in run artifacts, not in the simulator prompt.
+Reference: [sierra-research/tau2-bench#297](https://github.com/sierra-research/tau2-bench/pull/297).
 
 Use `config/official.yaml` with a clean TAU-2 checkout when you need an
 official-user-simulator parity run. If the checkout was already patched, the
diff --git a/benchmark/tau2/scripts/tau2_common.py b/benchmark/tau2/scripts/tau2_common.py
index 0281d4285..ae00e4c6c 100755
--- a/benchmark/tau2/scripts/tau2_common.py
+++ b/benchmark/tau2/scripts/tau2_common.py
@@ -15,12 +15,7 @@
 TAU2_DIR = Path(__file__).resolve().parents[1]
 REPO_ROOT = TAU2_DIR.parents[1]
 CONFIRMATION_AWARE_UPSTREAM_PR = "https://github.com/sierra-research/tau2-bench/pull/297"
-CONFIRMATION_AWARE_MARKER = "OpenViking TAU-2 confirmation-aware user simulator patch"
-CONFIRMATION_AWARE_APPENDIX = f"""
-
-## {CONFIRMATION_AWARE_MARKER}
-
-Reference: {CONFIRMATION_AWARE_UPSTREAM_PR}
+CONFIRMATION_AWARE_APPENDIX = """
 
 - If the agent asks you to confirm, authorize, or approve a backend action,
   reply with the requested confirmation but do not emit `###STOP###` in the
@@ -169,13 +164,10 @@ def _prompt_paths(repo: Path) -> list[Path]:
 
 
 def _has_confirmation_aware_prompt(prompt_text: str) -> bool:
+    normalized = " ".join(prompt_text.split())
     return (
-        CONFIRMATION_AWARE_MARKER in prompt_text
-        or (
-            "do not emit" in prompt_text
-            and "###STOP###" in prompt_text
-            and "confirm" in prompt_text.lower()
-        )
+        "reply with the requested confirmation" in normalized
+        and "do not emit `###STOP###` in the same turn" in normalized
     )
 
 

From 7ef274334d7a1ae4557c40671d2ba372dfa2d80a Mon Sep 17 00:00:00 2001
From: huangruiteng <huangruiteng@bytedance.com>
Date: Wed, 13 May 2026 00:35:46 +0800
Subject: [PATCH 08/17] benchmark: clarify simulator patch config

---
 benchmark/tau2/config/baseline.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmark/tau2/config/baseline.yaml b/benchmark/tau2/config/baseline.yaml
index bb233417b..602545ee4 100644
--- a/benchmark/tau2/config/baseline.yaml
+++ b/benchmark/tau2/config/baseline.yaml
@@ -20,8 +20,8 @@ paths:
 
 eval:
   # The runner default is official if this field is omitted. The OpenViking
-  # memory benchmark config opts into the confirmation-aware TAU-2 prompt when
-  # the referenced TAU-2 checkout contains that upstream fix.
+  # memory benchmark config opts into a confirmation-aware TAU-2 user simulator
+  # prompt; run_eval.py applies that small prompt patch idempotently when needed.
   user_simulator_policy: confirmation_aware
 
 model:

From 00bd6add35d5ce31a5fb8047cbd45871b67c595f Mon Sep 17 00:00:00 2001
From: huangruiteng <huangruiteng@bytedance.com>
Date: Wed, 13 May 2026 00:51:21 +0800
Subject: [PATCH 09/17] benchmark: clarify tau2 adapter boundary

---
 benchmark/tau2/README.md           | 18 ++++++++++++++++++
 benchmark/tau2/scripts/run_eval.py | 10 ++++++++++
 2 files changed, 28 insertions(+)

diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md
index f8e4dc642..143b32540 100644
--- a/benchmark/tau2/README.md
+++ b/benchmark/tau2/README.md
@@ -92,6 +92,24 @@ The initial no-memory cells use upstream TAU-2 CLI flags only. OpenViking memory
 cells are kept in the same plan, but marked adapter-pending until the TAU-2
 agent adapter is wired in this benchmark directory.
 
+## Memory Adapter Boundary
+
+The first PR keeps memory strategies visible in `run_plan.json` without
+pretending they are executable through upstream TAU-2 flags. `no_memory` cells
+can run immediately through the external TAU-2 CLI. OpenViking memory cells are
+planned with corpus / strategy metadata and `adapter_status: pending`; the plan
+also records `non_executable_reason` for those cells.
+
+The next adapter step should register a TAU-2 agent entry point that can:
+
+- train by writing TAU-2 training conversations into OpenViking sessions;
+- evaluate by retrieving OpenViking memory at the configured decision node;
+- emit enough artifact metadata to identify the OpenViking account, agent,
+  corpus, retrieval mode, and simulator policy used by each cell.
+
+Until that adapter exists, `--execute` is expected to fail fast if a selected
+cell needs OpenViking memory.
+
 ## User Simulator Policy
 
 The runner default is the official TAU-2 user simulator if
diff --git a/benchmark/tau2/scripts/run_eval.py b/benchmark/tau2/scripts/run_eval.py
index 31fda233c..67a229a5f 100755
--- a/benchmark/tau2/scripts/run_eval.py
+++ b/benchmark/tau2/scripts/run_eval.py
@@ -116,6 +116,12 @@ def _build_plan(
                     task_ids=task_ids,
                     num_tasks=num_tasks,
                 )
+                non_executable_reason = None
+                if command is None:
+                    non_executable_reason = (
+                        "OpenViking memory strategy requires a TAU-2 agent adapter; "
+                        "this benchmark scaffold only executes upstream TAU-2 no-memory cells."
+                    )
                 cells.append(
                     {
                         "domain": domain,
@@ -131,8 +137,10 @@ def _build_plan(
                         "user_simulator_policy_supported": policy_report["supported"],
                         "split_file": str(split_path),
                         "command": command,
+                        "non_executable_reason": non_executable_reason,
                     }
                 )
+    executable_cell_count = sum(1 for cell in cells if cell["executable"])
     return {
         "schema_version": "openviking.tau2.run_plan.v0",
         "run_id": configured_run_id,
@@ -142,6 +150,8 @@ def _build_plan(
         "tau2": tau2_context(config),
         "simulator_policy": policy_report,
         "cell_count": len(cells),
+        "executable_cell_count": executable_cell_count,
+        "pending_cell_count": len(cells) - executable_cell_count,
         "cells": cells,
     }
 

From 85d536342adc9af02c30eb6227b775d365f9bcf1 Mon Sep 17 00:00:00 2001
From: huangruiteng <huangruiteng@bytedance.com>
Date: Wed, 13 May 2026 02:37:49 +0800
Subject: [PATCH 10/17] benchmark: wire tau2 memory v2 eval

---
 benchmark/tau2/README.md                     |  55 ++-
 benchmark/tau2/config/baseline.yaml          |   2 +-
 benchmark/tau2/run_full_eval.sh              |   2 +-
 benchmark/tau2/scripts/run_eval.py           | 222 +++++++++-
 benchmark/tau2/scripts/run_memory_v2_eval.py | 412 +++++++++++++++++++
 5 files changed, 668 insertions(+), 25 deletions(-)
 create mode 100644 benchmark/tau2/scripts/run_memory_v2_eval.py

diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md
index 143b32540..6b617cb48 100644
--- a/benchmark/tau2/README.md
+++ b/benchmark/tau2/README.md
@@ -4,11 +4,13 @@ This directory contains a small OpenViking-style entry point for TAU-2 memory
 evaluation. The first version is intentionally narrow:
 
 - no-memory control;
-- fresh OpenViking memory baseline;
+- fresh OpenViking Memory V2 experience-only baseline;
 - trajectory / procedure-view treatment;
 - optional pre-write recall.
 
 Category rerank and other harness-only diagnostics are not migrated here yet.
+The Memory V2 baseline is wired end to end; trajectory / procedure-view remains
+visible in the plan but adapter-pending.
 
 ## Layout
 
@@ -85,30 +87,49 @@ configured:
 benchmark/tau2/run_full_eval.sh --config benchmark/tau2/config/prewrite.yaml --execute
 ```
 
+Run the Memory V2 8-trial baseline (`retail + airline` x 4 repeats):
+
+```bash
+benchmark/tau2/run_full_eval.sh \
+  --config benchmark/tau2/config/baseline.yaml \
+  --strategy-id memory_v2_experience_only \
+  --execute
+```
+
+For a small E2E smoke, keep both the eval and train slices tiny:
+
+```bash
+benchmark/tau2/run_full_eval.sh \
+  --config benchmark/tau2/config/baseline.yaml \
+  --domain retail \
+  --strategy-id memory_v2_experience_only \
+  --num-tasks 1 \
+  --train-num-tasks 1 \
+  --repeat-count 1 \
+  --execute
+```
+
 When using Doubao through an OpenAI-compatible endpoint, set `OPENAI_API_KEY`
 and `OPENAI_API_BASE` for LiteLLM before running upstream TAU-2.
 
-The initial no-memory cells use upstream TAU-2 CLI flags only. OpenViking memory
-cells are kept in the same plan, but marked adapter-pending until the TAU-2
-agent adapter is wired in this benchmark directory.
+Start the OpenViking service before executing memory cells, and verify it with
+`ov status`. For evidence runs, use a clean OpenViking workspace/config and set
+`OPENVIKING_URL` explicitly so local custom memory templates do not pollute the
+Memory V2 baseline.
 
 ## Memory Adapter Boundary
 
-The first PR keeps memory strategies visible in `run_plan.json` without
-pretending they are executable through upstream TAU-2 flags. `no_memory` cells
-can run immediately through the external TAU-2 CLI. OpenViking memory cells are
-planned with corpus / strategy metadata and `adapter_status: pending`; the plan
-also records `non_executable_reason` for those cells.
-
-The next adapter step should register a TAU-2 agent entry point that can:
+`no_memory` cells run through the external TAU-2 CLI. `memory_v2_experience_only`
+cells run through a small TAU-2 agent adapter in this directory:
 
 - train by writing TAU-2 training conversations into OpenViking sessions;
-- evaluate by retrieving OpenViking memory at the configured decision node;
-- emit enough artifact metadata to identify the OpenViking account, agent,
+- evaluate by retrieving OpenViking experience memory at the first user turn;
+- emit artifact metadata to identify the OpenViking account, agent,
   corpus, retrieval mode, and simulator policy used by each cell.
 
-Until that adapter exists, `--execute` is expected to fail fast if a selected
-cell needs OpenViking memory.
+The trajectory / procedure-view treatment is kept in the same plan but remains
+`adapter_status: pending`; `--execute` fails fast if that strategy is selected
+before its adapter is implemented.
 
 ## User Simulator Policy
 
@@ -132,3 +153,7 @@ artifact records that boundary instead of labeling the run pure official.
 Only completed `retail + airline` runs with the same config, same seeds/repeats,
 and non-empty artifacts should be read as benchmark evidence. Partial runs,
 single-task probes, or missing OpenViking corpus identity are diagnostics.
+Executed runs write per-cell JSON under `cell_results/` and a strategy/domain
+aggregate in `scoreboard.json`. Memory training artifacts are shared by
+domain and strategy under `memory_corpora/`, so repeated eval cells reuse the
+same fresh corpus instead of rewriting it.
diff --git a/benchmark/tau2/config/baseline.yaml b/benchmark/tau2/config/baseline.yaml
index 602545ee4..08c0a6bdd 100644
--- a/benchmark/tau2/config/baseline.yaml
+++ b/benchmark/tau2/config/baseline.yaml
@@ -45,7 +45,7 @@ strategies:
   - id: memory_v2_experience_only
     label: OpenViking Memory V2 experience-only
     memory_backend: openviking
-    adapter_status: pending
+    adapter_status: ready
     train_required: true
     train_memory_mode: experience_only
     retrieval_mode: first_user
diff --git a/benchmark/tau2/run_full_eval.sh b/benchmark/tau2/run_full_eval.sh
index 22936e3d3..ca69a7a32 100755
--- a/benchmark/tau2/run_full_eval.sh
+++ b/benchmark/tau2/run_full_eval.sh
@@ -33,7 +33,7 @@ while [[ $# -gt 0 ]]; do
       STRICT_PREFLIGHT=true
       shift
       ;;
-    --domain|--repeat-count|--strategy-id|--task-id|--num-tasks)
+    --domain|--repeat-count|--strategy-id|--task-id|--num-tasks|--train-num-tasks)
       RUN_EVAL_EXTRA+=("$1" "$2")
       shift 2
       ;;
diff --git a/benchmark/tau2/scripts/run_eval.py b/benchmark/tau2/scripts/run_eval.py
index 67a229a5f..8b6a81a4a 100755
--- a/benchmark/tau2/scripts/run_eval.py
+++ b/benchmark/tau2/scripts/run_eval.py
@@ -3,6 +3,7 @@
 
 import argparse
 import importlib.util
+import json
 import subprocess
 import sys
 from pathlib import Path
@@ -24,18 +25,125 @@
 )
 
 
+def _reward(sim: dict[str, Any]) -> float:
+    info = sim.get("reward_info") or {}
+    value = info.get("reward", sim.get("reward", 0.0))
+    try:
+        return float(value)
+    except (TypeError, ValueError):
+        return 0.0
+
+
+def _db_match(sim: dict[str, Any]) -> bool | None:
+    info = sim.get("reward_info") or {}
+    db = info.get("db_check") or {}
+    if isinstance(db, dict):
+        if "score" in db:
+            return bool(db["score"])
+        if "db_match" in db:
+            return bool(db["db_match"])
+    return sim.get("db_match")
+
+
+def _metrics_from_tau2_results(results_path: Path) -> dict[str, Any]:
+    data = json.loads(results_path.read_text(encoding="utf-8"))
+    sims = data.get("simulations") or []
+    rewards = [_reward(sim) for sim in sims]
+    db_values = [_db_match(sim) for sim in sims]
+    db_known = [value for value in db_values if value is not None]
+    return {
+        "simulation_count": len(sims),
+        "avg_reward": sum(rewards) / len(rewards) if rewards else 0.0,
+        "db_match_rate": (sum(1 for value in db_known if value) / len(db_known)) if db_known else None,
+    }
+
+
 def _tau2_command(
     config: dict[str, Any],
     *,
     domain: str,
     strategy: dict[str, Any],
+    configured_run_id: str,
     run_label: str,
     task_ids: list[str] | None,
     num_tasks: int | None,
+    train_num_tasks: int | None,
 ) -> list[str] | None:
     benchmark = config["benchmark"]
     model = config["model"]
 
+    reasoning_effort = benchmark.get("reasoning_effort")
+    agent_llm_args = '{"temperature":0.0}'
+    user_llm_args = '{"temperature":0.0}'
+    if reasoning_effort:
+        agent_llm_args = f'{{"temperature":0.0,"reasoning_effort":"{reasoning_effort}"}}'
+        user_llm_args = f'{{"temperature":0.0,"reasoning_effort":"{reasoning_effort}"}}'
+
+    if (
+        strategy.get("memory_backend") == "openviking"
+        and strategy.get("train_memory_mode") == "experience_only"
+    ):
+        openviking = config["openviking"]
+        account = f"{openviking['account']}-{configured_run_id}-{domain}-{strategy['id']}"
+        agent_id = f"{openviking['agent_id']}-{domain}-{strategy['id']}"
+        user = f"tau2-{domain}-{strategy['id']}"
+        search_uri = f"viking://agent/{agent_id}/memories/experiences"
+        command = [
+            sys.executable,
+            str(Path(__file__).with_name("run_memory_v2_eval.py")),
+            "--tau2-repo",
+            str(tau2_repo(config)),
+            "--run-dir",
+            str(output_dir(config, configured_run_id) / "memory_cells" / run_label),
+            "--corpus-dir",
+            str(
+                output_dir(config, configured_run_id)
+                / "memory_corpora"
+                / f"{domain}_{strategy['id']}"
+            ),
+            "--run-label",
+            run_label,
+            "--domain",
+            domain,
+            "--train-split-name",
+            str(benchmark.get("train_split_name", "train")),
+            "--eval-split-name",
+            str(benchmark.get("eval_split_name", "test")),
+            "--max-steps",
+            str(benchmark.get("max_steps", 200)),
+            "--max-concurrency",
+            str(benchmark.get("task_max_concurrency", 10)),
+            "--agent-llm",
+            str(model["agent_llm"]),
+            "--user-llm",
+            str(model["user_llm"]),
+            "--agent-llm-args",
+            agent_llm_args,
+            "--user-llm-args",
+            user_llm_args,
+            "--openviking-url",
+            str(openviking["url"]),
+            "--openviking-account",
+            account,
+            "--openviking-user",
+            user,
+            "--openviking-agent-id",
+            agent_id,
+            "--search-uri",
+            search_uri,
+            "--retrieval-top-k",
+            str(openviking.get("retrieval_top_k", 4)),
+        ]
+        if task_ids:
+            for task_id in task_ids:
+                command.extend(["--task-id", task_id])
+        elif num_tasks is not None:
+            command.extend(["--num-tasks", str(num_tasks)])
+        train_num_tasks = train_num_tasks if train_num_tasks is not None else strategy.get("train_num_tasks")
+        if train_num_tasks is not None:
+            command.extend(["--train-num-tasks", str(train_num_tasks)])
+        return command
+
     if strategy.get("memory_backend") != "none":
         return None
 
@@ -64,10 +172,8 @@ def _tau2_command(
         run_label,
     ]
 
-    reasoning_effort = benchmark.get("reasoning_effort")
-    if reasoning_effort:
-        command.extend(["--agent-llm-args", f'{{"temperature":0.0,"reasoning_effort":"{reasoning_effort}"}}'])
-        command.extend(["--user-llm-args", f'{{"temperature":0.0,"reasoning_effort":"{reasoning_effort}"}}'])
+    command.extend(["--agent-llm-args", agent_llm_args])
+    command.extend(["--user-llm-args", user_llm_args])
 
     if task_ids:
         command.append("--task-ids")
@@ -86,6 +192,7 @@ def _build_plan(
     selected_strategy_ids: set[str] | None,
     task_ids: list[str] | None,
     num_tasks: int | None,
+    train_num_tasks: int | None,
     repeat_count_override: int | None,
 ) -> dict[str, Any]:
     repeat_count = repeat_count_override or int(config["benchmark"].get("repeat_count", 4))
@@ -112,15 +219,17 @@ def _build_plan(
                     config,
                     domain=domain,
                     strategy=strategy,
+                    configured_run_id=configured_run_id,
                     run_label=run_label,
                     task_ids=task_ids,
                     num_tasks=num_tasks,
+                    train_num_tasks=train_num_tasks,
                 )
                 non_executable_reason = None
                 if command is None:
                     non_executable_reason = (
-                        "OpenViking memory strategy requires a TAU-2 agent adapter; "
-                        "this benchmark scaffold only executes upstream TAU-2 no-memory cells."
+                        "This OpenViking memory strategy is planned but not wired to "
+                        "the TAU-2 adapter in this PR."
                     )
                 cells.append(
                     {
@@ -156,6 +265,90 @@ def _build_plan(
     }
 
 
+def _cell_artifacts(cell: dict[str, Any], repo: Path, out: Path) -> dict[str, str]:
+    if cell["strategy_id"] == "memory_v2_experience_only":
+        run_dir = out / "memory_cells" / cell["run_label"]
+        corpus_dir = out / "memory_corpora" / f"{cell['domain']}_{cell['strategy_id']}"
+        return {
+            "summary": str(run_dir / f"{cell['run_label']}.summary.json"),
+            "results": str(run_dir / f"{cell['run_label']}.json"),
+            "retrieval_trace": str(run_dir / f"{cell['run_label']}.retrieval_trace.jsonl"),
+            "corpus_manifest": str(corpus_dir / "corpus_manifest.json"),
+        }
+    return {
+        "results": str(repo / "data" / "simulations" / f"{cell['run_label']}.json")
+    }
+
+
+def _cell_metrics(cell: dict[str, Any], artifacts: dict[str, str]) -> dict[str, Any] | None:
+    if cell["strategy_id"] == "memory_v2_experience_only":
+        summary_path = Path(artifacts["summary"])
+        if not summary_path.is_file():
+            return None
+        summary = json.loads(summary_path.read_text(encoding="utf-8"))
+        return summary.get("metrics")
+
+    results_path = Path(artifacts["results"])
+    if not results_path.is_file():
+        return None
+    return _metrics_from_tau2_results(results_path)
+
+
+def _summarize(rows: list[dict[str, Any]]) -> dict[str, Any]:
+    def weighted(rows_for_group: list[dict[str, Any]]) -> dict[str, Any]:
+        metric_rows = [row for row in rows_for_group if row.get("metrics")]
+        sim_count = sum(int(row["metrics"].get("simulation_count") or 0) for row in metric_rows)
+        reward_sum = sum(
+            float(row["metrics"].get("avg_reward") or 0.0)
+            * int(row["metrics"].get("simulation_count") or 0)
+            for row in metric_rows
+        )
+        db_weighted_rows = [
+            row
+            for row in metric_rows
+            if row["metrics"].get("db_match_rate") is not None
+            and int(row["metrics"].get("simulation_count") or 0) > 0
+        ]
+        db_weight = sum(int(row["metrics"].get("simulation_count") or 0) for row in db_weighted_rows)
+        db_sum = sum(
+            float(row["metrics"]["db_match_rate"])
+            * int(row["metrics"].get("simulation_count") or 0)
+            for row in db_weighted_rows
+        )
+        return {
+            "cell_count": len(rows_for_group),
+            "completed_cell_count": len(metric_rows),
+            "simulation_count": sim_count,
+            "avg_reward": reward_sum / sim_count if sim_count else None,
+            "db_match_rate": db_sum / db_weight if db_weight else None,
+        }
+
+    by_strategy: dict[str, dict[str, Any]] = {}
+    for row in rows:
+        strategy_id = row["strategy_id"]
+        strategy_summary = by_strategy.setdefault(
+            strategy_id,
+            {
+                "strategy_id": strategy_id,
+                "domains": {},
+                "task_weighted_total": {},
+            },
+        )
+        strategy_summary["domains"].setdefault(row["domain"], []).append(row)
+
+    for strategy_summary in by_strategy.values():
+        all_rows = []
+        for domain, domain_rows in list(strategy_summary["domains"].items()):
+            strategy_summary["domains"][domain] = weighted(domain_rows)
+            all_rows.extend(domain_rows)
+        strategy_summary["task_weighted_total"] = weighted(all_rows)
+
+    return {
+        "schema_version": "openviking.tau2.scoreboard.v0",
+        "strategies": by_strategy,
+    }
+
+
 def _execute_cells(plan: dict[str, Any], repo: Path, out: Path) -> list[dict[str, Any]]:
     policy_report = plan.get("simulator_policy") or {}
     if not policy_report.get("supported", False):
@@ -187,6 +380,8 @@ def _execute_cells(plan: dict[str, Any], repo: Path, out: Path) -> list[dict[str
             "stdout_tail": completed.stdout[-4000:],
             "stderr_tail": completed.stderr[-4000:],
         }
+        row["artifacts"] = _cell_artifacts(cell, repo, out)
+        row["metrics"] = _cell_metrics(cell, row["artifacts"])
         rows.append(row)
         write_json(out / "cell_results" / f"{cell['run_label']}.json", row)
         if completed.returncode != 0:
@@ -251,8 +446,17 @@ def main() -> int:
     parser.add_argument("--strategy-id", action="append", help="Run only this strategy id; may be repeated.")
     parser.add_argument("--task-id", action="append", help="Run only this TAU-2 task id; may be repeated.")
     parser.add_argument("--num-tasks", type=int, help="Run the first N tasks from the selected split.")
-    parser.add_argument("--preflight", action="store_true", help="Write a lightweight environment/config preflight report.")
-    parser.add_argument("--strict-preflight", action="store_true", help="Fail if optional runtime imports or split files are missing.")
+    parser.add_argument("--train-num-tasks", type=int, help="Train OpenViking memory on the first N train tasks.")
+    parser.add_argument(
+        "--preflight",
+        action="store_true",
+        help="Write a lightweight environment/config preflight report.",
+    )
+    parser.add_argument(
+        "--strict-preflight",
+        action="store_true",
+        help="Fail if optional runtime imports or split files are missing.",
+    )
     parser.add_argument("--plan-only", action="store_true", help="Only write run_plan.json.")
     parser.add_argument("--execute", action="store_true", help="Execute planned cells.")
     args = parser.parse_args()
@@ -275,6 +479,7 @@ def main() -> int:
         selected_strategy_ids=set(args.strategy_id) if args.strategy_id else None,
         task_ids=args.task_id,
         num_tasks=args.num_tasks,
+        train_num_tasks=args.train_num_tasks,
         repeat_count_override=args.repeat_count,
     )
     write_json(out / "run_plan.json", plan)
@@ -287,6 +492,7 @@ def main() -> int:
             plan["status"] = "succeeded"
             plan["executed_cell_count"] = len(rows)
             write_json(out / "run_plan.json", plan)
+            write_json(out / "scoreboard.json", _summarize(rows))
         except Exception as exc:
             plan["status"] = "failed"
             plan["error"] = str(exc)
diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py
new file mode 100644
index 000000000..ff0d6f32c
--- /dev/null
+++ b/benchmark/tau2/scripts/run_memory_v2_eval.py
@@ -0,0 +1,412 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+import time
+from pathlib import Path
+from typing import Any
+
+
+AGENT_NAME = "openviking_memory_agent"
+REPO_ROOT = Path(__file__).resolve().parents[3]
+
+
+def _json(text: str) -> dict[str, Any]:
+    return {} if not text else json.loads(text)  # an empty string parses to an empty mapping
+
+
+def _write_json(path: Path, payload: Any) -> None:  # pretty-printed, key-sorted JSON plus a trailing newline
+    path.parent.mkdir(parents=True, exist_ok=True)  # UTF-8 is pinned below: ensure_ascii=False emits raw non-ASCII
+    path.write_text(json.dumps(payload, ensure_ascii=False, indent=2, sort_keys=True) + "\n", encoding="utf-8")
+
+
+def _add_tau2_to_path(tau2_repo: Path) -> None:
+    src_dir = tau2_repo / "src"  # use the src/ directory when the checkout has one
+    tau2_root = src_dir if src_dir.is_dir() else tau2_repo
+    sys.path[:0] = [str(tau2_root), str(REPO_ROOT)]  # TAU-2 entry first, then this repository's root
+
+
+def _save_to_arg(path: Path) -> str:
+    # TAU-2's run_domain adds ".json" to save_to on its own, so a caller-supplied
+    # JSON path is handed over without its suffix to keep artifact paths stable.
+    return str(path if path.suffix != ".json" else path.with_suffix(""))
+
+
+def _reward(sim: dict[str, Any]) -> float:
+    fallback = sim.get("reward", 0.0)  # top-level reward is used only when reward_info has none
+    raw = (sim.get("reward_info") or {}).get("reward", fallback)
+    try:  # anything float() cannot convert counts as 0.0
+        return float(raw)
+    except (TypeError, ValueError):
+        return 0.0
+
+
+def _db_match(sim: dict[str, Any]) -> bool | None:
+    """Return the recorded db_check verdict as a bool, else whatever sim.get("db_match") holds."""
+    reward_info = sim.get("reward_info") or {}
+    db_check = reward_info.get("db_check") or {}
+    if isinstance(db_check, dict):
+        for key in ("score", "db_match"):  # "score" takes precedence over "db_match"
+            if key in db_check:
+                return bool(db_check[key])
+    return sim.get("db_match")
+
+
+def _metrics(results_path: Path) -> dict[str, Any]:
+    """Summarize a results JSON file: simulation count, mean reward, and DB-match rate."""
+    sims = json.loads(results_path.read_text()).get("simulations") or []
+    rewards = list(map(_reward, sims))
+    db_known = [flag for flag in map(_db_match, sims) if flag is not None]  # drop unknown verdicts
+    db_hits = sum(bool(flag) for flag in db_known)
+    return {
+        "simulation_count": len(sims),
+        "avg_reward": sum(rewards) / len(rewards) if rewards else 0.0,
+        "db_match_rate": db_hits / len(db_known) if db_known else None,
+    }
+
+
+def _message_text(message: dict[str, Any]) -> tuple[str, str]:
+    """Render a simulation message dict as a (role, text) pair; every non-user role becomes "assistant"."""
+    role = str(message.get("role") or "assistant")
+    if role == "user":
+        return "user", str(message.get("content") or "")
+    if role == "tool":
+        return "assistant", f"Tool result: {message.get('content') or ''}"
+    rendered = []
+    for call in message.get("tool_calls") or []:
+        # `or {}` also covers a "function" key that is present but null, not just a missing key.
+        name = call.get("name") or (call.get("function") or {}).get("name") or "unknown_tool"
+        arguments = call.get("arguments") or (call.get("function") or {}).get("arguments") or {}
+        rendered.append(f"{name}({json.dumps(arguments, ensure_ascii=False, sort_keys=True)})")
+    if rendered:
+        return "assistant", "Assistant tool call: " + "; ".join(rendered)
+    return "assistant", str(message.get("content") or "")
+
+
+def _run_tau2(
+    *,
+    tau2_repo: Path,
+    domain: str,
+    split: str,
+    task_ids: list[str] | None,
+    num_tasks: int | None,
+    trials: int,
+    max_steps: int,
+    max_concurrency: int,
+    agent: str,
+    user: str,
+    agent_llm: str,
+    user_llm: str,
+    agent_llm_args: dict[str, Any],
+    user_llm_args: dict[str, Any],
+    seed: int,
+    save_to: Path,
+):
+    _add_tau2_to_path(tau2_repo)  # puts the TAU-2 checkout on sys.path ahead of the tau2 imports below
+    from tau2.data_model.simulation import RunConfig
+    from tau2.run import run_domain
+
+    if save_to.exists():  # remove any results file already present at save_to before the new run
+        save_to.unlink()
+    return run_domain(  # the keyword arguments above are mapped one-to-one onto RunConfig fields
+        RunConfig(
+            domain=domain,
+            task_split_name=split,
+            task_ids=task_ids,
+            num_tasks=num_tasks,
+            agent=agent,
+            llm_agent=agent_llm,
+            llm_args_agent=agent_llm_args,
+            user=user,
+            llm_user=user_llm,
+            llm_args_user=user_llm_args,
+            num_trials=trials,
+            max_steps=max_steps,
+            save_to=_save_to_arg(save_to),  # a ".json" suffix is stripped here; see _save_to_arg
+            max_concurrency=max_concurrency,
+            seed=seed,
+            log_level="INFO",
+        )
+    )
+
+
+def _client(args: argparse.Namespace):
+    import openviking as ov  # imported inside the function rather than at module import time
+
+    client = ov.SyncHTTPClient(  # connection settings all come from the --openviking-* CLI arguments
+        url=args.openviking_url,
+        api_key="",  # always sent empty; identity is carried by user / agent_id / account below
+        user=args.openviking_user,
+        agent_id=args.openviking_agent_id,
+        account=args.openviking_account,
+        timeout=args.openviking_timeout,
+        extra_headers={},
+    )
+    client.initialize()
+    return client  # the caller is responsible for calling close()
+
+
+def _wait_task(client: Any, task_id: str | None, timeout: int) -> dict[str, Any]:
+    """Poll client.get_task every 2s until "completed"; raise on failed/cancelled or after `timeout` seconds."""
+    if not task_id:
+        return {"status": "no_task"}
+    last, deadline = None, time.monotonic() + timeout  # monotonic clock: unaffected by wall-clock adjustments
+    while time.monotonic() < deadline:
+        last = client.get_task(task_id)
+        status = (last or {}).get("status")
+        if status == "completed":
+            return last or {"status": status}
+        if status in {"failed", "cancelled"}:
+            raise RuntimeError(f"OpenViking task {task_id} {status}: {last}")
+        time.sleep(2)
+    raise TimeoutError(f"OpenViking task {task_id} did not finish within {timeout}s: {last}")
+
+
+def _train(args: argparse.Namespace, train_results: Path, corpus_manifest: Path) -> dict[str, Any]:
+    if corpus_manifest.is_file() and not args.force_train:  # reuse an existing manifest unless --force-train
+        return json.loads(corpus_manifest.read_text())
+
+    _run_tau2(  # run the train split with args.base_agent, one trial per task
+        tau2_repo=args.tau2_repo,
+        domain=args.domain,
+        split=args.train_split_name,
+        task_ids=args.train_task_ids,
+        num_tasks=args.train_num_tasks,
+        trials=1,
+        max_steps=args.max_steps,
+        max_concurrency=args.max_concurrency,
+        agent=args.base_agent,
+        user=args.user,
+        agent_llm=args.agent_llm,
+        user_llm=args.user_llm,
+        agent_llm_args=args.agent_llm_args,
+        user_llm_args=args.user_llm_args,
+        seed=args.seed,
+        save_to=train_results,
+    )
+
+    data = json.loads(train_results.read_text())  # results file produced by the run above
+    client = _client(args)
+    committed = []  # one record per committed session, written into the manifest below
+    try:
+        for sim in data.get("simulations") or []:  # one OpenViking session per training simulation
+            session_id = f"tau2-{args.domain}-train-{sim.get('task_id')}-trial-{sim.get('trial', 0)}"
+            created = client.create_session(session_id=session_id)
+            sid = created.get("session_id", session_id)  # prefer the id returned by create_session
+            for msg in sim.get("messages") or []:
+                role, text = _message_text(msg)
+                if not text.strip():  # skip messages that render to empty text
+                    continue
+                client.add_message(
+                    sid,
+                    role=role,
+                    parts=[{"type": "text", "text": text}],
+                    created_at=msg.get("timestamp"),
+                )
+            result = client.commit_session(sid, telemetry=True)
+            task = _wait_task(client, result.get("task_id"), args.openviking_wait_timeout)  # blocks until done
+            committed.append(
+                {
+                    "session_id": sid,
+                    "task_id": sim.get("task_id"),
+                    "commit_status": result.get("status"),
+                    "openviking_task_id": result.get("task_id"),
+                    "openviking_task_status": task.get("status"),
+                }
+            )
+    finally:
+        client.close()  # always release the client, even when a commit or wait raises
+
+    manifest = {
+        "domain": args.domain,
+        "train_results": str(train_results),
+        "openviking": {
+            "url": args.openviking_url,
+            "account": args.openviking_account,
+            "user": args.openviking_user,
+            "agent_id": args.openviking_agent_id,
+            "search_uri": args.search_uri,
+        },
+        "committed_sessions": committed,
+        "committed_session_count": len(committed),
+    }
+    _write_json(corpus_manifest, manifest)  # its presence lets a later call return early at the top
+    return manifest
+
+
+def _register_memory_agent(args: argparse.Namespace, trace_path: Path) -> None:
+    _add_tau2_to_path(args.tau2_repo)
+
+    from tau2.agent.llm_agent import LLMAgent, LLMAgentState
+    from tau2.data_model.message import MultiToolMessage, SystemMessage
+    from tau2.registry import registry
+    from tau2.utils.llm_utils import generate
+
+    class OpenVikingMemoryAgent(LLMAgent):
+        def get_init_state(self, message_history=None):
+            state = super().get_init_state(message_history)
+            state.system_messages.append(
+                SystemMessage(role="system", content="<openviking_memory_not_loaded/>")
+            )
+            return state
+
+        def _retrieve(self, query: str) -> tuple[str, list[dict[str, Any]]]:
+            client = _client(args)
+            rows: list[dict[str, Any]] = []
+            try:
+                result = client.search(query=query, target_uri=args.search_uri, limit=args.retrieval_top_k)
+                memories = list(getattr(result, "memories", []) or [])
+                blocks = []
+                for index, match in enumerate(memories[: args.retrieval_top_k], 1):
+                    uri = getattr(match, "uri", "")
+                    text = ""
+                    try:
+                        text = client.read(uri)
+                    except Exception:
+                        text = getattr(match, "abstract", "") or getattr(match, "overview", "") or ""
+                    rows.append(
+                        {
+                            "uri": uri,
+                            "score": getattr(match, "score", None),
+                            "level": getattr(match, "level", None),
+                            "text_chars": len(text),
+                        }
+                    )
+                    if text.strip():
+                        blocks.append(f"Memory {index} ({uri}):\n{text.strip()}")
+                return "\n\n".join(blocks), rows
+            finally:
+                client.close()
+
+        def generate_next_message(self, message, state: LLMAgentState):
+            if isinstance(message, MultiToolMessage):
+                state.messages.extend(message.tool_messages)
+            else:
+                state.messages.append(message)
+            marker_index = next(
+                (
+                    i
+                    for i, item in enumerate(state.system_messages)
+                    if isinstance(item, SystemMessage) and item.content == "<openviking_memory_not_loaded/>"
+                ),
+                None,
+            )
+            role = getattr(message, "role", "")
+            role_value = getattr(role, "value", role)
+            if marker_index is not None and str(role_value) == "user":
+                query = str(getattr(message, "content", "") or "")
+                block, matches = self._retrieve(query)
+                prompt = (
+                    "No OpenViking memory matched this user request."
+                    if not block
+                    else "Use these OpenViking experience memories only when they match the current task:\n\n"
+                    + block
+                )
+                state.system_messages[marker_index] = SystemMessage(role="system", content=prompt)
+                with trace_path.open("a", encoding="utf-8") as handle:
+                    handle.write(
+                        json.dumps(
+                            {
+                                "query": query,
+                                "match_count": len(matches),
+                                "matches": matches,
+                            },
+                            ensure_ascii=False,
+                            sort_keys=True,
+                        )
+                        + "\n"
+                    )
+
+            assistant_message = generate(
+                model=self.llm,
+                tools=self.tools,
+                messages=state.system_messages + state.messages,
+                **self.llm_args,
+            )
+            state.messages.append(assistant_message)
+            return assistant_message, state
+
+    if AGENT_NAME not in registry.get_agents():
+        registry.register_agent(OpenVikingMemoryAgent, AGENT_NAME)
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description="Run TAU-2 with OpenViking Memory V2.")
+    parser.add_argument("--tau2-repo", type=Path, required=True)
+    parser.add_argument("--run-dir", type=Path, required=True)
+    parser.add_argument("--corpus-dir", type=Path)
+    parser.add_argument("--run-label", required=True)
+    parser.add_argument("--domain", required=True)
+    parser.add_argument("--train-split-name", default="train")
+    parser.add_argument("--eval-split-name", default="test")
+    parser.add_argument("--task-id", dest="task_ids", action="append")
+    parser.add_argument("--num-tasks", type=int)
+    parser.add_argument("--train-task-id", dest="train_task_ids", action="append")
+    parser.add_argument("--train-num-tasks", type=int)
+    parser.add_argument("--max-steps", type=int, default=200)
+    parser.add_argument("--max-concurrency", type=int, default=10)
+    parser.add_argument("--seed", type=int, default=300)
+    parser.add_argument("--base-agent", default="llm_agent")
+    parser.add_argument("--user", default="user_simulator")
+    parser.add_argument("--agent-llm", required=True)
+    parser.add_argument("--user-llm", required=True)
+    parser.add_argument("--agent-llm-args", type=_json, default={})
+    parser.add_argument("--user-llm-args", type=_json, default={})
+    parser.add_argument("--openviking-url", required=True)
+    parser.add_argument("--openviking-account", required=True)
+    parser.add_argument("--openviking-user", required=True)
+    parser.add_argument("--openviking-agent-id", required=True)
+    parser.add_argument("--openviking-timeout", type=float, default=600.0)
+    parser.add_argument("--openviking-wait-timeout", type=int, default=600)
+    parser.add_argument("--search-uri", required=True)
+    parser.add_argument("--retrieval-top-k", type=int, default=4)
+    parser.add_argument("--force-train", action="store_true")
+    args = parser.parse_args()
+
+    args.tau2_repo = args.tau2_repo.resolve()
+    args.run_dir.mkdir(parents=True, exist_ok=True)
+    corpus_dir = args.corpus_dir or args.run_dir
+    corpus_dir.mkdir(parents=True, exist_ok=True)
+    train_results = corpus_dir / "train_results.json"
+    corpus_manifest = corpus_dir / "corpus_manifest.json"
+    eval_results = args.run_dir / f"{args.run_label}.json"
+    trace_path = args.run_dir / f"{args.run_label}.retrieval_trace.jsonl"
+    summary_path = args.run_dir / f"{args.run_label}.summary.json"
+
+    corpus = _train(args, train_results, corpus_manifest)
+    _register_memory_agent(args, trace_path)
+    _run_tau2(
+        tau2_repo=args.tau2_repo,
+        domain=args.domain,
+        split=args.eval_split_name,
+        task_ids=args.task_ids,
+        num_tasks=args.num_tasks,
+        trials=1,
+        max_steps=args.max_steps,
+        max_concurrency=args.max_concurrency,
+        agent=AGENT_NAME,
+        user=args.user,
+        agent_llm=args.agent_llm,
+        user_llm=args.user_llm,
+        agent_llm_args=args.agent_llm_args,
+        user_llm_args=args.user_llm_args,
+        seed=args.seed,
+        save_to=eval_results,
+    )
+    summary = {
+        "run_label": args.run_label,
+        "domain": args.domain,
+        "strategy_id": "memory_v2_experience_only",
+        "corpus": corpus,
+        "eval_results": str(eval_results),
+        "retrieval_trace": str(trace_path),
+        "metrics": _metrics(eval_results),
+    }
+    _write_json(summary_path, summary)
+    print(json.dumps(summary, ensure_ascii=False, sort_keys=True))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())

From 90f040aa589619b37c1a396fd77559ffe30d3d7c Mon Sep 17 00:00:00 2001
From: huangruiteng <huangruiteng@bytedance.com>
Date: Wed, 13 May 2026 05:18:16 +0800
Subject: [PATCH 11/17] benchmark: harden tau2 memory agent tool calls

---
 benchmark/tau2/scripts/run_memory_v2_eval.py | 44 ++++++++++++++++----
 1 file changed, 37 insertions(+), 7 deletions(-)

diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py
index ff0d6f32c..1b8832fc0 100644
--- a/benchmark/tau2/scripts/run_memory_v2_eval.py
+++ b/benchmark/tau2/scripts/run_memory_v2_eval.py
@@ -239,7 +239,7 @@ def _register_memory_agent(args: argparse.Namespace, trace_path: Path) -> None:
     _add_tau2_to_path(args.tau2_repo)
 
     from tau2.agent.llm_agent import LLMAgent, LLMAgentState
-    from tau2.data_model.message import MultiToolMessage, SystemMessage
+    from tau2.data_model.message import AssistantMessage, MultiToolMessage, SystemMessage
     from tau2.registry import registry
     from tau2.utils.llm_utils import generate
 
@@ -279,6 +279,41 @@ def _retrieve(self, query: str) -> tuple[str, list[dict[str, Any]]]:
             finally:
                 client.close()
 
+        def _generate(self, messages):
+            try:
+                return generate(
+                    model=self.llm,
+                    tools=self.tools,
+                    messages=messages,
+                    **self.llm_args,
+                )
+            except json.JSONDecodeError:
+                retry_messages = messages + [
+                    SystemMessage(
+                        role="system",
+                        content=(
+                            "Retry the last assistant step once. If you call a tool, "
+                            "the tool arguments must be syntactically valid JSON."
+                        ),
+                    )
+                ]
+                try:
+                    return generate(
+                        model=self.llm,
+                        tools=self.tools,
+                        messages=retry_messages,
+                        **self.llm_args,
+                    )
+                except json.JSONDecodeError as exc:
+                    return AssistantMessage(
+                        role="assistant",
+                        content="I need to continue with the available task information.",
+                        raw_data={
+                            "openviking_memory_agent_error": "invalid_tool_call_json",
+                            "error": str(exc),
+                        },
+                    )
+
         def generate_next_message(self, message, state: LLMAgentState):
             if isinstance(message, MultiToolMessage):
                 state.messages.extend(message.tool_messages)
@@ -318,12 +353,7 @@ def generate_next_message(self, message, state: LLMAgentState):
                         + "\n"
                     )
 
-            assistant_message = generate(
-                model=self.llm,
-                tools=self.tools,
-                messages=state.system_messages + state.messages,
-                **self.llm_args,
-            )
+            assistant_message = self._generate(state.system_messages + state.messages)
             state.messages.append(assistant_message)
             return assistant_message, state
 

From b52e65b790eb6e02e2d98e15a0db6ab6602bdd7c Mon Sep 17 00:00:00 2001
From: huangruiteng <huangruiteng@bytedance.com>
Date: Wed, 13 May 2026 10:07:59 +0800
Subject: [PATCH 12/17] benchmark: tolerate empty tau2 assistant responses

---
 benchmark/tau2/scripts/run_memory_v2_eval.py | 57 ++++++++++++++------
 1 file changed, 41 insertions(+), 16 deletions(-)

diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py
index 1b8832fc0..6dfe5b6f6 100644
--- a/benchmark/tau2/scripts/run_memory_v2_eval.py
+++ b/benchmark/tau2/scripts/run_memory_v2_eval.py
@@ -280,13 +280,20 @@ def _retrieve(self, query: str) -> tuple[str, list[dict[str, Any]]]:
                 client.close()
 
         def _generate(self, messages):
+            def _is_empty_assistant(response) -> bool:
+                content = str(getattr(response, "content", "") or "")
+                tool_calls = getattr(response, "tool_calls", None) or []
+                return not content.strip() and not tool_calls
+
             try:
-                return generate(
+                response = generate(
                     model=self.llm,
                     tools=self.tools,
                     messages=messages,
                     **self.llm_args,
                 )
+                if not _is_empty_assistant(response):
+                    return response
             except json.JSONDecodeError:
                 retry_messages = messages + [
                     SystemMessage(
@@ -297,22 +304,40 @@ def _generate(self, messages):
                         ),
                     )
                 ]
-                try:
-                    return generate(
-                        model=self.llm,
-                        tools=self.tools,
-                        messages=retry_messages,
-                        **self.llm_args,
-                    )
-                except json.JSONDecodeError as exc:
-                    return AssistantMessage(
-                        role="assistant",
-                        content="I need to continue with the available task information.",
-                        raw_data={
-                            "openviking_memory_agent_error": "invalid_tool_call_json",
-                            "error": str(exc),
-                        },
+            else:
+                retry_messages = messages + [
+                    SystemMessage(
+                        role="system",
+                        content=(
+                            "Retry the last assistant step once. Return either a useful "
+                            "natural language response or a valid tool call; do not return "
+                            "an empty assistant message."
+                        ),
                     )
+                ]
+            try:
+                response = generate(
+                    model=self.llm,
+                    tools=self.tools,
+                    messages=retry_messages,
+                    **self.llm_args,
+                )
+                if not _is_empty_assistant(response):
+                    return response
+                return AssistantMessage(
+                    role="assistant",
+                    content="I need to continue with the available task information.",
+                    raw_data={"openviking_memory_agent_error": "empty_assistant_message"},
+                )
+            except json.JSONDecodeError as exc:
+                return AssistantMessage(
+                    role="assistant",
+                    content="I need to continue with the available task information.",
+                    raw_data={
+                        "openviking_memory_agent_error": "invalid_tool_call_json",
+                        "error": str(exc),
+                    },
+                )
 
         def generate_next_message(self, message, state: LLMAgentState):
             if isinstance(message, MultiToolMessage):

From 1c84468a2e34937508cf96fd38260c644b2cbae4 Mon Sep 17 00:00:00 2001
From: huangruiteng <huangruiteng@bytedance.com>
Date: Wed, 13 May 2026 10:37:42 +0800
Subject: [PATCH 13/17] benchmark: normalize tau2 llm environment

---
 benchmark/tau2/scripts/run_eval.py           |  8 ++++++
 benchmark/tau2/scripts/run_memory_v2_eval.py |  3 +++
 benchmark/tau2/scripts/tau2_common.py        | 28 ++++++++++++++++++++
 3 files changed, 39 insertions(+)

diff --git a/benchmark/tau2/scripts/run_eval.py b/benchmark/tau2/scripts/run_eval.py
index 8b6a81a4a..aaab01cbd 100755
--- a/benchmark/tau2/scripts/run_eval.py
+++ b/benchmark/tau2/scripts/run_eval.py
@@ -13,6 +13,7 @@
     domains,
     load_config,
     output_dir,
+    normalize_litellm_env,
     run_id,
     simulator_policy_report,
     split_file,
@@ -391,12 +392,17 @@ def _execute_cells(plan: dict[str, Any], repo: Path, out: Path) -> list[dict[str
 
 def _preflight(config: dict[str, Any], out: Path, *, strict: bool) -> int:
     errors: list[str] = []
+    llm_env = normalize_litellm_env()
     tau2_info = tau2_context(config)
     policy_report = simulator_policy_report(config)
     if strict and not tau2_info["tau2_repo_exists"]:
         errors.append(f"missing TAU-2 repo: {tau2_info['tau2_repo']}")
     if strict and not tau2_info["tau2_cli_resolved"]:
         errors.append(f"missing TAU-2 CLI: {tau2_info['tau2_cli']}")
+    if strict and not llm_env["has_api_key"]:
+        errors.append("missing LLM API key: set OPENAI_API_KEY or ARK_API_KEY")
+    if strict and not llm_env["has_base_url"]:
+        errors.append("missing OpenAI-compatible base URL: set OPENAI_API_BASE, OPENAI_BASE_URL, or ARK_BASE_URL")
     if strict and not policy_report["supported"]:
         errors.append(
             "configured confirmation-aware user simulator policy requires a TAU-2 "
@@ -421,6 +427,7 @@ def _preflight(config: dict[str, Any], out: Path, *, strict: bool) -> int:
         "status": "failed" if errors else "ok",
         "strict": strict,
         "tau2": tau2_info,
+        "llm_env": llm_env,
         "simulator_policy": policy_report,
         "domains": domains(config),
         "strategies": strategy_ids(config),
@@ -460,6 +467,7 @@ def main() -> int:
     parser.add_argument("--plan-only", action="store_true", help="Only write run_plan.json.")
     parser.add_argument("--execute", action="store_true", help="Execute planned cells.")
     args = parser.parse_args()
+    normalize_litellm_env()
 
     if args.plan_only and args.execute:
         raise SystemExit("--plan-only and --execute are mutually exclusive")
diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py
index 6dfe5b6f6..052c6afd9 100644
--- a/benchmark/tau2/scripts/run_memory_v2_eval.py
+++ b/benchmark/tau2/scripts/run_memory_v2_eval.py
@@ -8,6 +8,8 @@
 from pathlib import Path
 from typing import Any
 
+from tau2_common import normalize_litellm_env
+
 
 AGENT_NAME = "openviking_memory_agent"
 REPO_ROOT = Path(__file__).resolve().parents[3]
@@ -418,6 +420,7 @@ def main() -> int:
     parser.add_argument("--retrieval-top-k", type=int, default=4)
     parser.add_argument("--force-train", action="store_true")
     args = parser.parse_args()
+    normalize_litellm_env()
 
     args.tau2_repo = args.tau2_repo.resolve()
     args.run_dir.mkdir(parents=True, exist_ok=True)
diff --git a/benchmark/tau2/scripts/tau2_common.py b/benchmark/tau2/scripts/tau2_common.py
index ae00e4c6c..15f504cf3 100755
--- a/benchmark/tau2/scripts/tau2_common.py
+++ b/benchmark/tau2/scripts/tau2_common.py
@@ -33,6 +33,34 @@ def run_id() -> str:
     return datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
 
 
+def normalize_litellm_env() -> dict[str, Any]:
+    """Fill missing OPENAI_* variables from their ARK_* / sibling spellings and report what was aliased."""
+    env = os.environ
+    applied = []
+    ark_key = env.get("ARK_API_KEY")
+    if ark_key and not env.get("OPENAI_API_KEY"):
+        env["OPENAI_API_KEY"] = ark_key
+        applied.append("OPENAI_API_KEY<-ARK_API_KEY")
+    api_base, base_url, ark_base = (env.get(k) for k in ("OPENAI_API_BASE", "OPENAI_BASE_URL", "ARK_BASE_URL"))
+    if ark_base and not (api_base or base_url):
+        # Neither OpenAI spelling is set: point both at the ARK endpoint.
+        env["OPENAI_API_BASE"] = env["OPENAI_BASE_URL"] = ark_base
+        applied.append("OPENAI_API_BASE<-ARK_BASE_URL")
+    elif api_base and not base_url:
+        env["OPENAI_BASE_URL"] = api_base
+        applied.append("OPENAI_BASE_URL<-OPENAI_API_BASE")
+    elif base_url and not api_base:
+        env["OPENAI_API_BASE"] = base_url
+        applied.append("OPENAI_API_BASE<-OPENAI_BASE_URL")
+    has_key = any(env.get(k) for k in ("OPENAI_API_KEY", "ARK_API_KEY"))
+    has_base = any(env.get(k) for k in ("OPENAI_API_BASE", "OPENAI_BASE_URL", "ARK_BASE_URL"))
+    return {
+        "aliases": applied,
+        "has_api_key": has_key,
+        "has_base_url": has_base,
+    }
+
+
 def render_env(value: Any) -> Any:
     if isinstance(value, str):
         def replace(match: re.Match[str]) -> str:

From a6b753537e6d284d35df8a1705172c275c93b58f Mon Sep 17 00:00:00 2001
From: huangruiteng <huangruiteng@bytedance.com>
Date: Wed, 13 May 2026 12:18:01 +0800
Subject: [PATCH 14/17] benchmark: add tau2 memory prewrite strategy

---
 benchmark/tau2/README.md                     |  37 +++----
 benchmark/tau2/config/baseline.yaml          |  25 ++---
 benchmark/tau2/config/prewrite.yaml          |  15 +--
 benchmark/tau2/scripts/run_eval.py           |  24 +++--
 benchmark/tau2/scripts/run_memory_v2_eval.py | 105 ++++++++++++++++---
 5 files changed, 131 insertions(+), 75 deletions(-)

diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md
index 6b617cb48..7ebdb0807 100644
--- a/benchmark/tau2/README.md
+++ b/benchmark/tau2/README.md
@@ -3,14 +3,11 @@
 This directory contains a small OpenViking-style entry point for TAU-2 memory
 evaluation. The first version is intentionally narrow:
 
-- no-memory control;
 - fresh OpenViking Memory V2 experience-only baseline;
-- trajectory / procedure-view treatment;
-- optional pre-write recall.
+- Memory V2 pre-write recall treatment.
 
-Category rerank and other harness-only diagnostics are not migrated here yet.
-The Memory V2 baseline is wired end to end; trajectory / procedure-view remains
-visible in the plan but adapter-pending.
+Trajectory / procedure-view prompts, category rerank, and other harness-only
+diagnostics are intentionally left out of this first PR.
 
 ## Layout
 
@@ -64,35 +61,27 @@ benchmark/tau2/run_full_eval.sh \
   --config benchmark/tau2/config/baseline.yaml \
   --strict-preflight \
   --domain retail \
-  --strategy-id no_memory \
+  --strategy-id memory_v2_experience_only \
   --task-id 5 \
   --repeat-count 1
 ```
 
-Plan a one-cell upstream TAU-2 smoke:
+Plan a one-cell Memory V2 pre-write smoke:
 
 ```bash
 benchmark/tau2/run_full_eval.sh \
   --config benchmark/tau2/config/baseline.yaml \
   --domain retail \
-  --strategy-id no_memory \
+  --strategy-id memory_v2_prewrite \
   --num-tasks 1 \
   --repeat-count 1
 ```
 
-Run with execution enabled after TAU-2, model credentials, and OpenViking are
-configured:
-
-```bash
-benchmark/tau2/run_full_eval.sh --config benchmark/tau2/config/prewrite.yaml --execute
-```
-
-Run the Memory V2 8-trial baseline (`retail + airline` x 4 repeats):
+Run the Memory V2 8-trial matrix (`retail + airline` x 2 strategies x 8 repeats):
 
 ```bash
 benchmark/tau2/run_full_eval.sh \
   --config benchmark/tau2/config/baseline.yaml \
-  --strategy-id memory_v2_experience_only \
   --execute
 ```
 
@@ -117,20 +106,18 @@ Start the OpenViking service before executing memory cells, and verify it with
 `OPENVIKING_URL` explicitly so local custom memory templates do not pollute the
 Memory V2 baseline.
 
-## Memory Adapter Boundary
+## Memory Adapter
 
-`no_memory` cells run through the external TAU-2 CLI. `memory_v2_experience_only`
-cells run through a small TAU-2 agent adapter in this directory:
+`memory_v2_experience_only` and `memory_v2_prewrite` cells run through a small
+TAU-2 agent adapter in this directory:
 
 - train by writing TAU-2 training conversations into OpenViking sessions;
 - evaluate by retrieving OpenViking experience memory at the first user turn;
+- for pre-write recall, retrieve again before write-like tool calls and
+  regenerate that step with the matched memories;
 - emit artifact metadata to identify the OpenViking account, agent,
   corpus, retrieval mode, and simulator policy used by each cell.
 
-The trajectory / procedure-view treatment is kept in the same plan but remains
-`adapter_status: pending`; `--execute` fails fast if that strategy is selected
-before its adapter is implemented.
-
 ## User Simulator Policy
 
 The runner default is the official TAU-2 user simulator if
diff --git a/benchmark/tau2/config/baseline.yaml b/benchmark/tau2/config/baseline.yaml
index 08c0a6bdd..2dc8a9d2c 100644
--- a/benchmark/tau2/config/baseline.yaml
+++ b/benchmark/tau2/config/baseline.yaml
@@ -5,9 +5,8 @@ benchmark:
     - airline
   train_split_name: train
   eval_split_name: test
-  repeat_count: 4
+  repeat_count: 8
   task_max_concurrency: 10
-  strategy_concurrency: 4
   max_steps: 200
   agent: llm_agent
   user: user_simulator
@@ -27,7 +26,6 @@ eval:
 model:
   agent_llm: ${TAU2_AGENT_LLM:-openai/doubao-seed-2-0-pro-260215}
   user_llm: ${TAU2_USER_LLM:-openai/doubao-seed-2-0-pro-260215}
-  evaluator_llm: ${TAU2_EVALUATOR_LLM:-openai/doubao-seed-2-0-pro-260215}
   temperature: 0.0
 
 openviking:
@@ -38,26 +36,17 @@ openviking:
   replay_write_policy: read_only
 
 strategies:
-  - id: no_memory
-    label: No memory
-    memory_backend: none
-    train_required: false
   - id: memory_v2_experience_only
     label: OpenViking Memory V2 experience-only
     memory_backend: openviking
-    adapter_status: ready
     train_required: true
+    corpus_id: memory_v2_experience_only
     train_memory_mode: experience_only
     retrieval_mode: first_user
-  - id: trajectory_procedure_view
-    label: OpenViking trajectory procedure view
+  - id: memory_v2_prewrite
+    label: OpenViking Memory V2 pre-write recall
     memory_backend: openviking
-    adapter_status: pending
     train_required: true
-    train_memory_mode: trajectory_procedure_view
-    operation_mode: add_only
-    retrieval_mode: first_user
-
-features:
-  prewrite_recall:
-    enabled: false
+    corpus_id: memory_v2_experience_only
+    train_memory_mode: experience_only
+    retrieval_mode: prewrite
diff --git a/benchmark/tau2/config/prewrite.yaml b/benchmark/tau2/config/prewrite.yaml
index 965f09963..e8b12d9cf 100644
--- a/benchmark/tau2/config/prewrite.yaml
+++ b/benchmark/tau2/config/prewrite.yaml
@@ -3,10 +3,11 @@ extends: baseline.yaml
 benchmark:
   name: tau2_openviking_prewrite
 
-features:
-  prewrite_recall:
-    enabled: true
-    decision_nodes:
-      - before_write_tool_call
-    max_memories: 4
-    evidence_boundary: runtime_retrieval_trace_required
+strategies:
+  - id: memory_v2_prewrite
+    label: OpenViking Memory V2 pre-write recall
+    memory_backend: openviking
+    train_required: true
+    corpus_id: memory_v2_experience_only
+    train_memory_mode: experience_only
+    retrieval_mode: prewrite
diff --git a/benchmark/tau2/scripts/run_eval.py b/benchmark/tau2/scripts/run_eval.py
index aaab01cbd..57a6069e8 100755
--- a/benchmark/tau2/scripts/run_eval.py
+++ b/benchmark/tau2/scripts/run_eval.py
@@ -85,9 +85,10 @@ def _tau2_command(
         and strategy.get("train_memory_mode") == "experience_only"
     ):
         openviking = config["openviking"]
-        account = f"{openviking['account']}-{configured_run_id}-{domain}-{strategy['id']}"
-        agent_id = f"{openviking['agent_id']}-{domain}-{strategy['id']}"
-        user = f"tau2-{domain}-{strategy['id']}"
+        corpus_id = str(strategy.get("corpus_id") or strategy["id"])
+        account = f"{openviking['account']}-{configured_run_id}-{domain}-{corpus_id}"
+        agent_id = f"{openviking['agent_id']}-{domain}-{corpus_id}"
+        user = f"tau2-{domain}-{corpus_id}"
         search_uri = f"viking://agent/{agent_id}/memories/experiences"
         command = [
             sys.executable,
@@ -100,10 +101,12 @@ def _tau2_command(
             str(
                 output_dir(config, configured_run_id)
                 / "memory_corpora"
-                / f"{domain}_{strategy['id']}"
+                / f"{domain}_{corpus_id}"
             ),
             "--run-label",
             run_label,
+            "--strategy-id",
+            strategy["id"],
             "--domain",
             domain,
             "--train-split-name",
@@ -134,6 +137,8 @@ def _tau2_command(
             search_uri,
             "--retrieval-top-k",
             str(openviking.get("retrieval_top_k", 4)),
+            "--retrieval-mode",
+            str(strategy.get("retrieval_mode", "first_user")),
         ]
         if task_ids:
             for task_id in task_ids:
@@ -196,7 +201,7 @@ def _build_plan(
     train_num_tasks: int | None,
     repeat_count_override: int | None,
 ) -> dict[str, Any]:
-    repeat_count = repeat_count_override or int(config["benchmark"].get("repeat_count", 4))
+    repeat_count = repeat_count_override or int(config["benchmark"].get("repeat_count", 8))
     policy_report = simulator_policy_report(config)
     strategies = config.get("strategies") or []
     if selected_strategy_ids:
@@ -241,6 +246,8 @@ def _build_plan(
                         "run_label": run_label,
                         "train_required": bool(strategy.get("train_required")),
                         "memory_backend": strategy.get("memory_backend"),
+                        "corpus_id": strategy.get("corpus_id", strategy["id"]),
+                        "retrieval_mode": strategy.get("retrieval_mode"),
                         "adapter_status": strategy.get("adapter_status", "ready"),
                         "executable": command is not None,
                         "user_simulator_policy": user_simulator_policy(config),
@@ -267,9 +274,10 @@ def _build_plan(
 
 
 def _cell_artifacts(cell: dict[str, Any], repo: Path, out: Path) -> dict[str, str]:
-    if cell["strategy_id"] == "memory_v2_experience_only":
+    if cell.get("memory_backend") == "openviking":
         run_dir = out / "memory_cells" / cell["run_label"]
-        corpus_dir = out / "memory_corpora" / f"{cell['domain']}_{cell['strategy_id']}"
+        corpus_id = str(cell.get("corpus_id") or cell["strategy_id"])
+        corpus_dir = out / "memory_corpora" / f"{cell['domain']}_{corpus_id}"
         return {
             "summary": str(run_dir / f"{cell['run_label']}.summary.json"),
             "results": str(run_dir / f"{cell['run_label']}.json"),
@@ -282,7 +290,7 @@ def _cell_artifacts(cell: dict[str, Any], repo: Path, out: Path) -> dict[str, st
 
 
 def _cell_metrics(cell: dict[str, Any], artifacts: dict[str, str]) -> dict[str, Any] | None:
-    if cell["strategy_id"] == "memory_v2_experience_only":
+    if cell.get("memory_backend") == "openviking":
         summary_path = Path(artifacts["summary"])
         if not summary_path.is_file():
             return None
diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py
index 052c6afd9..a1220a7e6 100644
--- a/benchmark/tau2/scripts/run_memory_v2_eval.py
+++ b/benchmark/tau2/scripts/run_memory_v2_eval.py
@@ -13,6 +13,15 @@
 
 AGENT_NAME = "openviking_memory_agent"
 REPO_ROOT = Path(__file__).resolve().parents[3]
+READ_TOOL_PREFIXES = (
+    "get_",
+    "find_",
+    "list_",
+    "search_",
+    "calculate",
+    "think",
+    "transfer_",
+)
 
 
 def _json(text: str) -> dict[str, Any]:
@@ -69,6 +78,32 @@ def _metrics(results_path: Path) -> dict[str, Any]:
     }
 
 
+def _is_write_tool_call(tool_call: Any) -> bool:
+    name = str(getattr(tool_call, "name", "") or "")
+    return bool(name) and not name.startswith(READ_TOOL_PREFIXES)
+
+
+def _tool_call_query(tool_calls: list[Any], state_messages: list[Any]) -> str:
+    rendered = []
+    for call in tool_calls:
+        rendered.append(
+            f"{getattr(call, 'name', 'unknown_tool')}("
+            f"{json.dumps(getattr(call, 'arguments', {}) or {}, ensure_ascii=False, sort_keys=True)}"
+            ")"
+        )
+    recent_user = [
+        str(getattr(message, "content", "") or "")
+        for message in state_messages[-8:]
+        if str(getattr(message, "role", "")) == "user" and str(getattr(message, "content", "") or "").strip()
+    ]
+    return (
+        "Before executing write-like tool call(s): "
+        + "; ".join(rendered)
+        + "\nRecent user context: "
+        + " | ".join(recent_user[-3:])
+    )
+
+
 def _message_text(message: dict[str, Any]) -> tuple[str, str]:
     role = str(message.get("role") or "assistant")
     if role == "user":
@@ -248,9 +283,10 @@ def _register_memory_agent(args: argparse.Namespace, trace_path: Path) -> None:
     class OpenVikingMemoryAgent(LLMAgent):
         def get_init_state(self, message_history=None):
             state = super().get_init_state(message_history)
-            state.system_messages.append(
-                SystemMessage(role="system", content="<openviking_memory_not_loaded/>")
-            )
+            if args.retrieval_mode == "first_user":
+                state.system_messages.append(
+                    SystemMessage(role="system", content="<openviking_memory_not_loaded/>")
+                )
             return state
 
         def _retrieve(self, query: str) -> tuple[str, list[dict[str, Any]]]:
@@ -281,6 +317,10 @@ def _retrieve(self, query: str) -> tuple[str, list[dict[str, Any]]]:
             finally:
                 client.close()
 
+        def _trace(self, event: dict[str, Any]) -> None:
+            with trace_path.open("a", encoding="utf-8") as handle:
+                handle.write(json.dumps(event, ensure_ascii=False, sort_keys=True) + "\n")
+
         def _generate(self, messages):
             def _is_empty_assistant(response) -> bool:
                 content = str(getattr(response, "content", "") or "")
@@ -366,21 +406,48 @@ def generate_next_message(self, message, state: LLMAgentState):
                     + block
                 )
                 state.system_messages[marker_index] = SystemMessage(role="system", content=prompt)
-                with trace_path.open("a", encoding="utf-8") as handle:
-                    handle.write(
-                        json.dumps(
-                            {
-                                "query": query,
-                                "match_count": len(matches),
-                                "matches": matches,
-                            },
-                            ensure_ascii=False,
-                            sort_keys=True,
-                        )
-                        + "\n"
-                    )
+                self._trace(
+                    {
+                        "decision_node": "first_user",
+                        "query": query,
+                        "match_count": len(matches),
+                        "matches": matches,
+                    }
+                )
 
             assistant_message = self._generate(state.system_messages + state.messages)
+            if args.retrieval_mode == "prewrite":
+                tool_calls = list(getattr(assistant_message, "tool_calls", None) or [])
+                write_calls = [call for call in tool_calls if _is_write_tool_call(call)]
+                if write_calls:
+                    query = _tool_call_query(write_calls, state.messages)
+                    block, matches = self._retrieve(query)
+                    self._trace(
+                        {
+                            "decision_node": "before_write_tool_call",
+                            "query": query,
+                            "match_count": len(matches),
+                            "matches": matches,
+                            "tool_calls": [
+                                {
+                                    "name": getattr(call, "name", ""),
+                                    "arguments": getattr(call, "arguments", {}) or {},
+                                }
+                                for call in write_calls
+                            ],
+                        }
+                    )
+                    if block:
+                        prompt = (
+                            "Before executing the pending write-like tool call, use these "
+                            "OpenViking experience memories only when they match the current task:\n\n"
+                            + block
+                        )
+                        assistant_message = self._generate(
+                            state.system_messages
+                            + state.messages
+                            + [SystemMessage(role="system", content=prompt)]
+                        )
             state.messages.append(assistant_message)
             return assistant_message, state
 
@@ -394,6 +461,7 @@ def main() -> int:
     parser.add_argument("--run-dir", type=Path, required=True)
     parser.add_argument("--corpus-dir", type=Path)
     parser.add_argument("--run-label", required=True)
+    parser.add_argument("--strategy-id", default="memory_v2_experience_only")
     parser.add_argument("--domain", required=True)
     parser.add_argument("--train-split-name", default="train")
     parser.add_argument("--eval-split-name", default="test")
@@ -418,6 +486,7 @@ def main() -> int:
     parser.add_argument("--openviking-wait-timeout", type=int, default=600)
     parser.add_argument("--search-uri", required=True)
     parser.add_argument("--retrieval-top-k", type=int, default=4)
+    parser.add_argument("--retrieval-mode", choices=["first_user", "prewrite"], default="first_user")
     parser.add_argument("--force-train", action="store_true")
     args = parser.parse_args()
     normalize_litellm_env()
@@ -433,6 +502,7 @@ def main() -> int:
     summary_path = args.run_dir / f"{args.run_label}.summary.json"
 
     corpus = _train(args, train_results, corpus_manifest)
+    trace_path.touch()
     _register_memory_agent(args, trace_path)
     _run_tau2(
         tau2_repo=args.tau2_repo,
@@ -455,7 +525,8 @@ def main() -> int:
     summary = {
         "run_label": args.run_label,
         "domain": args.domain,
-        "strategy_id": "memory_v2_experience_only",
+        "strategy_id": args.strategy_id,
+        "retrieval_mode": args.retrieval_mode,
         "corpus": corpus,
         "eval_results": str(eval_results),
         "retrieval_trace": str(trace_path),

From d44b07cd674d8b04da3ff7fabff73257d234cad3 Mon Sep 17 00:00:00 2001
From: huangruiteng <huangruiteng@bytedance.com>
Date: Wed, 13 May 2026 13:00:18 +0800
Subject: [PATCH 15/17] benchmark: support current tau2 runner api

---
 benchmark/tau2/scripts/run_memory_v2_eval.py | 36 ++++++++++++++++----
 1 file changed, 30 insertions(+), 6 deletions(-)

diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py
index a1220a7e6..8de3927e5 100644
--- a/benchmark/tau2/scripts/run_memory_v2_eval.py
+++ b/benchmark/tau2/scripts/run_memory_v2_eval.py
@@ -3,6 +3,7 @@
 
 import argparse
 import json
+import shutil
 import sys
 import time
 from pathlib import Path
@@ -40,11 +41,16 @@ def _add_tau2_to_path(tau2_repo: Path) -> None:
 
 
 def _save_to_arg(path: Path) -> str:
-    # TAU-2 run_domain appends ".json" to save_to. Keep our artifact paths
-    # stable by passing the stem when callers hand us a JSON path.
+    # Some TAU-2 versions append ".json"; newer versions treat save_to as a
+    # run directory and write results.json under it.
     return str(path.with_suffix("") if path.suffix == ".json" else path)
 
 
+def _compat_results_path(path: Path) -> Path:
+    run_dir = path.with_suffix("") if path.suffix == ".json" else path
+    return run_dir / "results.json"
+
+
 def _reward(sim: dict[str, Any]) -> float:
     info = sim.get("reward_info") or {}
     value = info.get("reward", sim.get("reward", 0.0))
@@ -141,13 +147,17 @@ def _run_tau2(
     save_to: Path,
 ):
     _add_tau2_to_path(tau2_repo)
-    from tau2.data_model.simulation import RunConfig
+    from tau2.data_model.simulation import RunConfig, TextRunConfig
     from tau2.run import run_domain
 
+    compat_results = _compat_results_path(save_to)
     if save_to.exists():
         save_to.unlink()
-    return run_domain(
-        RunConfig(
+    if compat_results.parent.is_dir():
+        shutil.rmtree(compat_results.parent)
+    config_cls = TextRunConfig if getattr(RunConfig, "__origin__", None) is not None else RunConfig
+    result = run_domain(
+        config_cls(
             domain=domain,
             task_split_name=split,
             task_ids=task_ids,
@@ -166,6 +176,9 @@ def _run_tau2(
             log_level="INFO",
         )
     )
+    if not save_to.exists() and compat_results.exists():
+        shutil.copyfile(compat_results, save_to)
+    return result
 
 
 def _client(args: argparse.Namespace):
@@ -452,7 +465,18 @@ def generate_next_message(self, message, state: LLMAgentState):
             return assistant_message, state
 
     if AGENT_NAME not in registry.get_agents():
-        registry.register_agent(OpenVikingMemoryAgent, AGENT_NAME)
+        def create_openviking_memory_agent(tools, domain_policy, **kwargs):
+            return OpenVikingMemoryAgent(
+                tools=tools,
+                domain_policy=domain_policy,
+                llm=kwargs.get("llm"),
+                llm_args=kwargs.get("llm_args"),
+            )
+
+        if hasattr(registry, "register_agent"):
+            registry.register_agent(OpenVikingMemoryAgent, AGENT_NAME)
+        else:
+            registry.register_agent_factory(create_openviking_memory_agent, AGENT_NAME)
 
 
 def main() -> int:

From 581594ac780613143eb114a92e1d11a71eb7efd4 Mon Sep 17 00:00:00 2001
From: huangruiteng <huangruiteng@bytedance.com>
Date: Wed, 13 May 2026 14:30:03 +0800
Subject: [PATCH 16/17] benchmark: align tau2 memory prewrite parity

---
 benchmark/tau2/config/baseline.yaml          |   3 +-
 benchmark/tau2/config/prewrite.yaml          |   2 +-
 benchmark/tau2/scripts/run_eval.py           |   9 ++
 benchmark/tau2/scripts/run_memory_v2_eval.py | 124 +++++++++++++++----
 4 files changed, 113 insertions(+), 25 deletions(-)

diff --git a/benchmark/tau2/config/baseline.yaml b/benchmark/tau2/config/baseline.yaml
index 2dc8a9d2c..4c4a5060e 100644
--- a/benchmark/tau2/config/baseline.yaml
+++ b/benchmark/tau2/config/baseline.yaml
@@ -8,6 +8,7 @@ benchmark:
   repeat_count: 8
   task_max_concurrency: 10
   max_steps: 200
+  seed: 300
   agent: llm_agent
   user: user_simulator
   reasoning_effort: high
@@ -49,4 +50,4 @@ strategies:
     train_required: true
     corpus_id: memory_v2_experience_only
     train_memory_mode: experience_only
-    retrieval_mode: prewrite
+    retrieval_mode: first_user_prewrite
diff --git a/benchmark/tau2/config/prewrite.yaml b/benchmark/tau2/config/prewrite.yaml
index e8b12d9cf..834963b41 100644
--- a/benchmark/tau2/config/prewrite.yaml
+++ b/benchmark/tau2/config/prewrite.yaml
@@ -10,4 +10,4 @@ strategies:
     train_required: true
     corpus_id: memory_v2_experience_only
     train_memory_mode: experience_only
-    retrieval_mode: prewrite
+    retrieval_mode: first_user_prewrite
diff --git a/benchmark/tau2/scripts/run_eval.py b/benchmark/tau2/scripts/run_eval.py
index 57a6069e8..5458ba61a 100755
--- a/benchmark/tau2/scripts/run_eval.py
+++ b/benchmark/tau2/scripts/run_eval.py
@@ -69,6 +69,7 @@ def _tau2_command(
     task_ids: list[str] | None,
     num_tasks: int | None,
     train_num_tasks: int | None,
+    seed: int,
 ) -> list[str] | None:
     benchmark = config["benchmark"]
     model = config["model"]
@@ -139,6 +140,8 @@ def _tau2_command(
             str(openviking.get("retrieval_top_k", 4)),
             "--retrieval-mode",
             str(strategy.get("retrieval_mode", "first_user")),
+            "--seed",
+            str(seed),
         ]
         if task_ids:
             for task_id in task_ids:
@@ -176,6 +179,8 @@ def _tau2_command(
         str(model["user_llm"]),
         "--save-to",
         run_label,
+        "--seed",
+        str(seed),
     ]
 
     command.extend(["--agent-llm-args", agent_llm_args])
@@ -202,6 +207,7 @@ def _build_plan(
     repeat_count_override: int | None,
 ) -> dict[str, Any]:
     repeat_count = repeat_count_override or int(config["benchmark"].get("repeat_count", 8))
+    base_seed = int(config["benchmark"].get("seed", 300))
     policy_report = simulator_policy_report(config)
     strategies = config.get("strategies") or []
     if selected_strategy_ids:
@@ -220,6 +226,7 @@ def _build_plan(
         split_path = split_file(config, domain)
         for strategy in strategies:
             for repeat_index in range(repeat_count):
+                seed = base_seed + repeat_index
                 run_label = f"{configured_run_id}_{domain}_{strategy['id']}_r{repeat_index + 1}"
                 command = _tau2_command(
                     config,
@@ -230,6 +237,7 @@ def _build_plan(
                     task_ids=task_ids,
                     num_tasks=num_tasks,
                     train_num_tasks=train_num_tasks,
+                    seed=seed,
                 )
                 non_executable_reason = None
                 if command is None:
@@ -243,6 +251,7 @@ def _build_plan(
                         "strategy_id": strategy["id"],
                         "strategy_label": strategy.get("label", strategy["id"]),
                         "repeat_index": repeat_index + 1,
+                        "seed": seed,
                         "run_label": run_label,
                         "train_required": bool(strategy.get("train_required")),
                         "memory_backend": strategy.get("memory_backend"),
diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py
index 8de3927e5..818802041 100644
--- a/benchmark/tau2/scripts/run_memory_v2_eval.py
+++ b/benchmark/tau2/scripts/run_memory_v2_eval.py
@@ -14,14 +14,20 @@
 
 AGENT_NAME = "openviking_memory_agent"
 REPO_ROOT = Path(__file__).resolve().parents[3]
-READ_TOOL_PREFIXES = (
-    "get_",
-    "find_",
-    "list_",
-    "search_",
-    "calculate",
-    "think",
-    "transfer_",
+WRITE_TOOL_PREFIXES = (
+    "toggle_",
+    "enable_",
+    "disable_",
+    "set_",
+    "reset_",
+    "update_",
+    "modify_",
+    "cancel_",
+    "book_",
+    "exchange_",
+    "return_",
+    "grant_",
+    "reboot_",
 )
 
 
@@ -84,17 +90,29 @@ def _metrics(results_path: Path) -> dict[str, Any]:
     }
 
 
+def _tool_call_name(tool_call: Any) -> str:
+    if isinstance(tool_call, dict):
+        return str(tool_call.get("name") or tool_call.get("function", {}).get("name") or "")
+    return str(getattr(tool_call, "name", "") or "")
+
+
+def _tool_call_arguments(tool_call: Any) -> Any:
+    if isinstance(tool_call, dict):
+        return tool_call.get("arguments") or tool_call.get("function", {}).get("arguments") or {}
+    return getattr(tool_call, "arguments", {}) or {}
+
+
 def _is_write_tool_call(tool_call: Any) -> bool:
-    name = str(getattr(tool_call, "name", "") or "")
-    return bool(name) and not name.startswith(READ_TOOL_PREFIXES)
+    name = _tool_call_name(tool_call)
+    return bool(name) and name.startswith(WRITE_TOOL_PREFIXES)
 
 
 def _tool_call_query(tool_calls: list[Any], state_messages: list[Any]) -> str:
     rendered = []
     for call in tool_calls:
         rendered.append(
-            f"{getattr(call, 'name', 'unknown_tool')}("
-            f"{json.dumps(getattr(call, 'arguments', {}) or {}, ensure_ascii=False, sort_keys=True)}"
+            f"{_tool_call_name(call) or 'unknown_tool'}("
+            f"{json.dumps(_tool_call_arguments(call), ensure_ascii=False, sort_keys=True)}"
             ")"
         )
     recent_user = [
@@ -102,12 +120,18 @@ def _tool_call_query(tool_calls: list[Any], state_messages: list[Any]) -> str:
         for message in state_messages[-8:]
         if str(getattr(message, "role", "")) == "user" and str(getattr(message, "content", "") or "").strip()
     ]
-    return (
-        "Before executing write-like tool call(s): "
-        + "; ".join(rendered)
-        + "\nRecent user context: "
-        + " | ".join(recent_user[-3:])
-    )
+    recent_observations = [
+        str(getattr(message, "content", "") or "")[:600]
+        for message in state_messages[-12:]
+        if str(getattr(message, "role", "")) == "tool" and str(getattr(message, "content", "") or "").strip()
+    ]
+    parts = [
+        "Before executing write-like tool call(s): " + "; ".join(rendered),
+        "Recent user context: " + " | ".join(recent_user[-3:]),
+    ]
+    if recent_observations:
+        parts.append("Recent tool observations: " + " | ".join(recent_observations[-4:]))
+    return "\n".join(parts)
 
 
 def _message_text(message: dict[str, Any]) -> tuple[str, str]:
@@ -213,6 +237,37 @@ def _wait_task(client: Any, task_id: str | None, timeout: int) -> dict[str, Any]
     raise TimeoutError(f"OpenViking task {task_id} did not finish within {timeout}s: {last}")
 
 
+def _probe_corpus(args: argparse.Namespace, client: Any) -> dict[str, Any]:
+    result = client.search(
+        query=f"{args.domain} customer service order reservation booking cancellation exchange return update",
+        target_uri=args.search_uri,
+        limit=args.retrieval_top_k,
+    )
+    memories = list(getattr(result, "memories", []) or [])
+    reads = []
+    for match in memories[: args.retrieval_top_k]:
+        uri = getattr(match, "uri", "")
+        text = ""
+        try:
+            text = client.read(uri)
+        except Exception:
+            text = getattr(match, "abstract", "") or getattr(match, "overview", "") or ""
+        reads.append(
+            {
+                "uri": uri,
+                "score": getattr(match, "score", None),
+                "text_chars": len(text),
+                "non_empty": bool(str(text).strip()),
+            }
+        )
+    return {
+        "query": f"{args.domain} customer service order reservation booking cancellation exchange return update",
+        "match_count": len(memories),
+        "read_non_empty_count": sum(1 for row in reads if row["non_empty"]),
+        "matches": reads,
+    }
+
+
 def _train(args: argparse.Namespace, train_results: Path, corpus_manifest: Path) -> dict[str, Any]:
     if corpus_manifest.is_file() and not args.force_train:
         return json.loads(corpus_manifest.read_text())
@@ -268,6 +323,12 @@ def _train(args: argparse.Namespace, train_results: Path, corpus_manifest: Path)
     finally:
         client.close()
 
+    client = _client(args)
+    try:
+        corpus_probe = _probe_corpus(args, client)
+    finally:
+        client.close()
+
     manifest = {
         "domain": args.domain,
         "train_results": str(train_results),
@@ -280,6 +341,7 @@ def _train(args: argparse.Namespace, train_results: Path, corpus_manifest: Path)
         },
         "committed_sessions": committed,
         "committed_session_count": len(committed),
+        "corpus_probe": corpus_probe,
     }
     _write_json(corpus_manifest, manifest)
     return manifest
@@ -296,7 +358,7 @@ def _register_memory_agent(args: argparse.Namespace, trace_path: Path) -> None:
     class OpenVikingMemoryAgent(LLMAgent):
         def get_init_state(self, message_history=None):
             state = super().get_init_state(message_history)
-            if args.retrieval_mode == "first_user":
+            if args.retrieval_mode in {"first_user", "first_user_prewrite"}:
                 state.system_messages.append(
                     SystemMessage(role="system", content="<openviking_memory_not_loaded/>")
                 )
@@ -334,6 +396,15 @@ def _trace(self, event: dict[str, Any]) -> None:
             with trace_path.open("a", encoding="utf-8") as handle:
                 handle.write(json.dumps(event, ensure_ascii=False, sort_keys=True) + "\n")
 
+        @staticmethod
+        def _trace_injection_fields(block: str, matches: list[dict[str, Any]]) -> dict[str, Any]:
+            injected_count = sum(1 for row in matches if int(row.get("text_chars") or 0) > 0)
+            return {
+                "injected": bool(block.strip()),
+                "injected_count": injected_count if block.strip() else 0,
+                "retrieval_action_taken": "retrieve_and_inject" if block.strip() else "retrieve_no_injection",
+            }
+
         def _generate(self, messages):
             def _is_empty_assistant(response) -> bool:
                 content = str(getattr(response, "content", "") or "")
@@ -425,11 +496,12 @@ def generate_next_message(self, message, state: LLMAgentState):
                         "query": query,
                         "match_count": len(matches),
                         "matches": matches,
+                        **self._trace_injection_fields(block, matches),
                     }
                 )
 
             assistant_message = self._generate(state.system_messages + state.messages)
-            if args.retrieval_mode == "prewrite":
+            if args.retrieval_mode in {"prewrite", "first_user_prewrite"}:
                 tool_calls = list(getattr(assistant_message, "tool_calls", None) or [])
                 write_calls = [call for call in tool_calls if _is_write_tool_call(call)]
                 if write_calls:
@@ -441,10 +513,11 @@ def generate_next_message(self, message, state: LLMAgentState):
                             "query": query,
                             "match_count": len(matches),
                             "matches": matches,
+                            **self._trace_injection_fields(block, matches),
                             "tool_calls": [
                                 {
-                                    "name": getattr(call, "name", ""),
-                                    "arguments": getattr(call, "arguments", {}) or {},
+                                    "name": _tool_call_name(call),
+                                    "arguments": _tool_call_arguments(call),
                                 }
                                 for call in write_calls
                             ],
@@ -510,7 +583,11 @@ def main() -> int:
     parser.add_argument("--openviking-wait-timeout", type=int, default=600)
     parser.add_argument("--search-uri", required=True)
     parser.add_argument("--retrieval-top-k", type=int, default=4)
-    parser.add_argument("--retrieval-mode", choices=["first_user", "prewrite"], default="first_user")
+    parser.add_argument(
+        "--retrieval-mode",
+        choices=["first_user", "prewrite", "first_user_prewrite"],
+        default="first_user",
+    )
     parser.add_argument("--force-train", action="store_true")
     args = parser.parse_args()
     normalize_litellm_env()
@@ -551,6 +628,7 @@ def main() -> int:
         "domain": args.domain,
         "strategy_id": args.strategy_id,
         "retrieval_mode": args.retrieval_mode,
+        "seed": args.seed,
         "corpus": corpus,
         "eval_results": str(eval_results),
         "retrieval_trace": str(trace_path),

From 14c43918877ef13a59a39e08fa22495e5f0a40ac Mon Sep 17 00:00:00 2001
From: huangruiteng <huangruiteng@bytedance.com>
Date: Wed, 13 May 2026 14:53:39 +0800
Subject: [PATCH 17/17] benchmark: make tau2 eval traces safer

---
 benchmark/tau2/scripts/run_memory_v2_eval.py | 54 ++++++++++----------
 benchmark/tau2/scripts/tau2_common.py        |  8 +++
 2 files changed, 36 insertions(+), 26 deletions(-)

diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py
index 818802041..de5ef5441 100644
--- a/benchmark/tau2/scripts/run_memory_v2_eval.py
+++ b/benchmark/tau2/scripts/run_memory_v2_eval.py
@@ -237,6 +237,14 @@ def _wait_task(client: Any, task_id: str | None, timeout: int) -> dict[str, Any]
     raise TimeoutError(f"OpenViking task {task_id} did not finish within {timeout}s: {last}")
 
 
+def _read_memory_text(client: Any, match: Any) -> tuple[str, str | None]:
+    try:
+        return client.read(getattr(match, "uri", "")), None
+    except Exception as exc:
+        fallback = getattr(match, "abstract", "") or getattr(match, "overview", "") or ""
+        return fallback, f"{type(exc).__name__}: {exc}"
+
+
 def _probe_corpus(args: argparse.Namespace, client: Any) -> dict[str, Any]:
     result = client.search(
         query=f"{args.domain} customer service order reservation booking cancellation exchange return update",
@@ -247,19 +255,16 @@ def _probe_corpus(args: argparse.Namespace, client: Any) -> dict[str, Any]:
     reads = []
     for match in memories[: args.retrieval_top_k]:
         uri = getattr(match, "uri", "")
-        text = ""
-        try:
-            text = client.read(uri)
-        except Exception:
-            text = getattr(match, "abstract", "") or getattr(match, "overview", "") or ""
-        reads.append(
-            {
-                "uri": uri,
-                "score": getattr(match, "score", None),
-                "text_chars": len(text),
-                "non_empty": bool(str(text).strip()),
-            }
-        )
+        text, read_error = _read_memory_text(client, match)
+        row = {
+            "uri": uri,
+            "score": getattr(match, "score", None),
+            "text_chars": len(text),
+            "non_empty": bool(str(text).strip()),
+        }
+        if read_error:
+            row["read_error"] = read_error
+        reads.append(row)
     return {
         "query": f"{args.domain} customer service order reservation booking cancellation exchange return update",
         "match_count": len(memories),
@@ -373,19 +378,16 @@ def _retrieve(self, query: str) -> tuple[str, list[dict[str, Any]]]:
                 blocks = []
                 for index, match in enumerate(memories[: args.retrieval_top_k], 1):
                     uri = getattr(match, "uri", "")
-                    text = ""
-                    try:
-                        text = client.read(uri)
-                    except Exception:
-                        text = getattr(match, "abstract", "") or getattr(match, "overview", "") or ""
-                    rows.append(
-                        {
-                            "uri": uri,
-                            "score": getattr(match, "score", None),
-                            "level": getattr(match, "level", None),
-                            "text_chars": len(text),
-                        }
-                    )
+                    text, read_error = _read_memory_text(client, match)
+                    row = {
+                        "uri": uri,
+                        "score": getattr(match, "score", None),
+                        "level": getattr(match, "level", None),
+                        "text_chars": len(text),
+                    }
+                    if read_error:
+                        row["read_error"] = read_error
+                    rows.append(row)
                     if text.strip():
                         blocks.append(f"Memory {index} ({uri}):\n{text.strip()}")
                 return "\n\n".join(blocks), rows
diff --git a/benchmark/tau2/scripts/tau2_common.py b/benchmark/tau2/scripts/tau2_common.py
index 15f504cf3..a8b5ce201 100755
--- a/benchmark/tau2/scripts/tau2_common.py
+++ b/benchmark/tau2/scripts/tau2_common.py
@@ -207,6 +207,9 @@ def _ensure_confirmation_aware_prompt(repo: Path) -> bool:
         text = path.read_text(encoding="utf-8")
         if _has_confirmation_aware_prompt(text):
             continue
+        backup = path.with_suffix(path.suffix + ".openviking.bak")
+        if not backup.exists():
+            backup.write_text(text, encoding="utf-8")
         path.write_text(text.rstrip() + CONFIRMATION_AWARE_APPENDIX + "\n", encoding="utf-8")
         patched = True
     return patched
@@ -252,6 +255,11 @@ def simulator_policy_report(config: dict[str, Any]) -> dict[str, Any]:
         "patch_applied": patch_applied,
         "patch_mode": patch_mode,
         "prompt_files": [str(path) for path in prompt_paths],
+        "backup_files": [
+            str(path.with_suffix(path.suffix + ".openviking.bak"))
+            for path in prompt_paths
+            if path.with_suffix(path.suffix + ".openviking.bak").exists()
+        ],
         "claim_boundary": claim_boundary,
     }