From cca62c2da6c4e8b8d72730bba59e48d5e6ba17d9 Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Tue, 12 May 2026 22:59:56 +0800 Subject: [PATCH 01/17] benchmark: add tau2 eval scaffold --- benchmark/tau2/.gitignore | 1 + benchmark/tau2/README.md | 50 ++++++++ benchmark/tau2/config/baseline.yaml | 52 +++++++++ benchmark/tau2/config/prewrite.yaml | 12 ++ benchmark/tau2/run_full_eval.sh | 53 +++++++++ benchmark/tau2/scripts/parity_check.py | 51 ++++++++ benchmark/tau2/scripts/preflight.py | 84 +++++++++++++ benchmark/tau2/scripts/run_eval.py | 156 +++++++++++++++++++++++++ benchmark/tau2/scripts/summarize.py | 50 ++++++++ benchmark/tau2/scripts/tau2_common.py | 117 +++++++++++++++++++ 10 files changed, 626 insertions(+) create mode 100644 benchmark/tau2/.gitignore create mode 100644 benchmark/tau2/README.md create mode 100644 benchmark/tau2/config/baseline.yaml create mode 100644 benchmark/tau2/config/prewrite.yaml create mode 100755 benchmark/tau2/run_full_eval.sh create mode 100755 benchmark/tau2/scripts/parity_check.py create mode 100755 benchmark/tau2/scripts/preflight.py create mode 100755 benchmark/tau2/scripts/run_eval.py create mode 100755 benchmark/tau2/scripts/summarize.py create mode 100755 benchmark/tau2/scripts/tau2_common.py diff --git a/benchmark/tau2/.gitignore b/benchmark/tau2/.gitignore new file mode 100644 index 000000000..1cd791b52 --- /dev/null +++ b/benchmark/tau2/.gitignore @@ -0,0 +1 @@ +result/ diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md new file mode 100644 index 000000000..ac47ffa63 --- /dev/null +++ b/benchmark/tau2/README.md @@ -0,0 +1,50 @@ +# TAU-2 Benchmark + +This directory contains a small OpenViking-style entry point for TAU-2 memory +evaluation. The first version is intentionally narrow: + +- no-memory control; +- fresh OpenViking memory baseline; +- trajectory / procedure-view treatment; +- optional pre-write recall. + +Category rerank and other harness-only diagnostics are not migrated here yet. 
+ +## Layout + +```text +benchmark/tau2/ +├── config/ +│ ├── baseline.yaml +│ └── prewrite.yaml +├── scripts/ +│ ├── preflight.py +│ ├── run_eval.py +│ ├── summarize.py +│ └── parity_check.py +└── run_full_eval.sh +``` + +Generated artifacts are written to `benchmark/tau2/result/<run_id>/`. + +## Quick Start + +Plan the default benchmark without running TAU-2: + +```bash +python benchmark/tau2/scripts/preflight.py --config benchmark/tau2/config/baseline.yaml +python benchmark/tau2/scripts/run_eval.py --config benchmark/tau2/config/baseline.yaml --plan-only +``` + +Run with execution enabled after TAU-2, model credentials, and OpenViking are +configured: + +```bash +benchmark/tau2/run_full_eval.sh --config benchmark/tau2/config/prewrite.yaml --execute +``` + +## Evidence Boundary + +Only completed `retail + airline` runs with the same config, same seeds/repeats, +and non-empty artifacts should be read as benchmark evidence. Partial runs, +single-task probes, or missing OpenViking corpus identity are diagnostics. 
diff --git a/benchmark/tau2/config/baseline.yaml b/benchmark/tau2/config/baseline.yaml new file mode 100644 index 000000000..16a452d4a --- /dev/null +++ b/benchmark/tau2/config/baseline.yaml @@ -0,0 +1,52 @@ +benchmark: + name: tau2_openviking_baseline + domains: + - retail + - airline + train_split_name: train + eval_split_name: test + repeat_count: 4 + task_max_concurrency: 10 + strategy_concurrency: 4 + max_steps: 200 + reasoning_effort: high + +paths: + tau2_repo: ${TAU2_REPO:-data/external_benchmarks/tau2-bench} + output_dir: benchmark/tau2/result + +model: + agent_llm: ${TAU2_AGENT_LLM:-doubao-seed-1-6-250615} + user_llm: ${TAU2_USER_LLM:-doubao-seed-1-6-250615} + evaluator_llm: ${TAU2_EVALUATOR_LLM:-doubao-seed-1-6-250615} + temperature: 0.0 + +openviking: + url: ${OPENVIKING_URL:-http://localhost:1933} + account: ${OPENVIKING_ACCOUNT:-default} + agent_id: ${OPENVIKING_AGENT_ID:-tau2-openviking-agent} + retrieval_top_k: 4 + replay_write_policy: read_only + +strategies: + - id: no_memory + label: No memory + memory_backend: none + train_required: false + - id: memory_v2_experience_only + label: OpenViking Memory V2 experience-only + memory_backend: openviking + train_required: true + train_memory_mode: experience_only + retrieval_mode: first_user + - id: trajectory_procedure_view + label: OpenViking trajectory procedure view + memory_backend: openviking + train_required: true + train_memory_mode: trajectory_procedure_view + operation_mode: add_only + retrieval_mode: first_user + +features: + prewrite_recall: + enabled: false diff --git a/benchmark/tau2/config/prewrite.yaml b/benchmark/tau2/config/prewrite.yaml new file mode 100644 index 000000000..965f09963 --- /dev/null +++ b/benchmark/tau2/config/prewrite.yaml @@ -0,0 +1,12 @@ +extends: baseline.yaml + +benchmark: + name: tau2_openviking_prewrite + +features: + prewrite_recall: + enabled: true + decision_nodes: + - before_write_tool_call + max_memories: 4 + evidence_boundary: 
runtime_retrieval_trace_required diff --git a/benchmark/tau2/run_full_eval.sh b/benchmark/tau2/run_full_eval.sh new file mode 100755 index 000000000..8abf41235 --- /dev/null +++ b/benchmark/tau2/run_full_eval.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" +PYTHON_BIN="${PYTHON_BIN:-python3}" +CONFIG="$SCRIPT_DIR/config/baseline.yaml" +EXECUTE=false +RUN_ID="" + +while [[ $# -gt 0 ]]; do + case "$1" in + --config) + CONFIG="$2" + shift 2 + ;; + --run-id) + RUN_ID="$2" + shift 2 + ;; + --execute) + EXECUTE=true + shift + ;; + --help|-h) + cat <<'EOF' +Usage: + benchmark/tau2/run_full_eval.sh [--config PATH] [--run-id ID] [--execute] + +Without --execute the script only writes preflight and run_plan artifacts. +EOF + exit 0 + ;; + *) + echo "Unknown argument: $1" >&2 + exit 1 + ;; + esac +done + +RUN_ARGS=() +if [[ -n "$RUN_ID" ]]; then + RUN_ARGS+=(--run-id "$RUN_ID") +fi + +cd "$REPO_ROOT" +"$PYTHON_BIN" "$SCRIPT_DIR/scripts/preflight.py" --config "$CONFIG" "${RUN_ARGS[@]}" + +if [[ "$EXECUTE" == true ]]; then + "$PYTHON_BIN" "$SCRIPT_DIR/scripts/run_eval.py" --config "$CONFIG" "${RUN_ARGS[@]}" --execute +else + "$PYTHON_BIN" "$SCRIPT_DIR/scripts/run_eval.py" --config "$CONFIG" "${RUN_ARGS[@]}" --plan-only +fi diff --git a/benchmark/tau2/scripts/parity_check.py b/benchmark/tau2/scripts/parity_check.py new file mode 100755 index 000000000..d513715f0 --- /dev/null +++ b/benchmark/tau2/scripts/parity_check.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import json +from pathlib import Path +from typing import Any + +from tau2_common import write_json + + +def _load_json(path: Path) -> Any: + return json.loads(path.read_text(encoding="utf-8")) + + +def main() -> int: + parser = argparse.ArgumentParser(description="Compare OpenViking TAU-2 artifacts against a harness reference.") + 
parser.add_argument("--ov-run-plan", type=Path, required=True) + parser.add_argument("--harness-run-plan", type=Path, required=True) + parser.add_argument("--output", type=Path, required=True) + args = parser.parse_args() + + ov_plan = _load_json(args.ov_run_plan) + harness_plan = _load_json(args.harness_run_plan) + + ov_cells = ov_plan.get("cells") or [] + harness_cells = harness_plan.get("cells") or harness_plan.get("treatments") or [] + report = { + "status": "ok" if len(ov_cells) == len(harness_cells) else "mismatch", + "ov_run_plan": str(args.ov_run_plan.resolve()), + "harness_run_plan": str(args.harness_run_plan.resolve()), + "ov_cell_count": len(ov_cells), + "harness_cell_count": len(harness_cells), + "checks": { + "cell_count_match": len(ov_cells) == len(harness_cells), + }, + "notes": [ + "Initial parity is intentionally structural.", + "Train payload, retrieval trace, and scoreboard parity should be added as each migration layer lands.", + ], + } + write_json(args.output, report) + if report["status"] != "ok": + print(f"[parity][WARN] wrote mismatch report: {args.output}") + return 1 + print(f"[parity][OK] wrote {args.output}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/benchmark/tau2/scripts/preflight.py b/benchmark/tau2/scripts/preflight.py new file mode 100755 index 000000000..32f78c3a3 --- /dev/null +++ b/benchmark/tau2/scripts/preflight.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import importlib.util +import sys +from pathlib import Path +from typing import Any + +from tau2_common import domains, load_config, output_dir, run_id, split_file, strategy_ids, tau2_repo, write_json + + +def _check_import(module: str) -> dict[str, Any]: + spec = importlib.util.find_spec(module) + return {"module": module, "ok": spec is not None} + + +def _split_status(config: dict[str, Any]) -> list[dict[str, Any]]: + rows = [] + for domain in domains(config): + path = 
split_file(config, domain) + rows.append( + { + "domain": domain, + "path": str(path), + "exists": path.is_file(), + } + ) + return rows + + +def main() -> int: + parser = argparse.ArgumentParser(description="Preflight TAU-2 benchmark config.") + parser.add_argument("--config", type=Path, default=Path(__file__).parents[1] / "config" / "baseline.yaml") + parser.add_argument("--run-id", default=run_id()) + parser.add_argument("--strict", action="store_true", help="Require optional runtime imports and TAU-2 split files.") + args = parser.parse_args() + + config = load_config(args.config) + out = output_dir(config, args.run_id) + + errors: list[str] = [] + try: + strategy_ids(config) + except Exception as exc: + errors.append(str(exc)) + + split_rows = _split_status(config) + if args.strict: + for row in split_rows: + if not row["exists"]: + errors.append(f"missing split file for {row['domain']}: {row['path']}") + + import_rows = [_check_import("openviking"), _check_import("openviking_cli"), _check_import("tau2")] + if args.strict: + for row in import_rows: + if not row["ok"]: + errors.append(f"missing Python module: {row['module']}") + + report = { + "status": "failed" if errors else "ok", + "config": str(args.config.resolve()), + "run_id": args.run_id, + "tau2_repo": str(tau2_repo(config)), + "domains": domains(config), + "strategies": strategy_ids(config), + "strict": args.strict, + "imports": import_rows, + "split_files": split_rows, + "errors": errors, + } + write_json(out / "preflight.json", report) + + if errors: + for error in errors: + print(f"[preflight][ERROR] {error}", file=sys.stderr) + print(f"[preflight] wrote {out / 'preflight.json'}", file=sys.stderr) + return 1 + print(f"[preflight][OK] wrote {out / 'preflight.json'}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/benchmark/tau2/scripts/run_eval.py b/benchmark/tau2/scripts/run_eval.py new file mode 100755 index 000000000..8ab4c41f0 --- /dev/null +++ 
b/benchmark/tau2/scripts/run_eval.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import subprocess +import sys +from pathlib import Path +from typing import Any + +from tau2_common import domains, load_config, output_dir, run_id, split_file, strategy_ids, tau2_repo, write_json + + +def _tau2_command(config: dict[str, Any], *, domain: str, strategy: dict[str, Any], repeat_index: int, run_label: str) -> list[str]: + benchmark = config["benchmark"] + model = config["model"] + command = [ + "tau2", + "run", + "--domain", + domain, + "--task-split-name", + str(benchmark.get("eval_split_name", "test")), + "--num-trials", + "1", + "--max-steps", + str(benchmark.get("max_steps", 200)), + "--max-concurrency", + str(benchmark.get("task_max_concurrency", 10)), + "--agent-llm", + str(model["agent_llm"]), + "--user-llm", + str(model["user_llm"]), + "--save-to", + run_label, + ] + + reasoning_effort = benchmark.get("reasoning_effort") + if reasoning_effort: + command.extend(["--agent-llm-args", f'{{"temperature":0.0,"reasoning_effort":"{reasoning_effort}"}}']) + command.extend(["--user-llm-args", f'{{"temperature":0.0,"reasoning_effort":"{reasoning_effort}"}}']) + + if strategy.get("memory_backend") == "none": + command.extend(["--memory-backend", "none"]) + else: + command.extend(["--memory-backend", "openviking"]) + command.extend(["--memory-retrieval-mode", str(strategy.get("retrieval_mode", "first_user"))]) + command.extend(["--memory-replay-write-policy", str(config.get("openviking", {}).get("replay_write_policy", "read_only"))]) + + if config.get("features", {}).get("prewrite_recall", {}).get("enabled"): + command.append("--enable-prewrite-recall") + + return command + + +def _build_plan(config: dict[str, Any], configured_run_id: str) -> dict[str, Any]: + repeat_count = int(config["benchmark"].get("repeat_count", 4)) + strategies = config.get("strategies") or [] + cells = [] + for domain in domains(config): + split_path 
= split_file(config, domain) + for strategy in strategies: + for repeat_index in range(repeat_count): + run_label = f"{configured_run_id}_{domain}_{strategy['id']}_r{repeat_index + 1}" + cells.append( + { + "domain": domain, + "strategy_id": strategy["id"], + "strategy_label": strategy.get("label", strategy["id"]), + "repeat_index": repeat_index + 1, + "run_label": run_label, + "train_required": bool(strategy.get("train_required")), + "memory_backend": strategy.get("memory_backend"), + "split_file": str(split_path), + "command": _tau2_command( + config, + domain=domain, + strategy=strategy, + repeat_index=repeat_index, + run_label=run_label, + ), + } + ) + return { + "schema_version": "openviking.tau2.run_plan.v0", + "run_id": configured_run_id, + "status": "planned", + "strategy_ids": strategy_ids(config), + "domains": domains(config), + "cell_count": len(cells), + "cells": cells, + } + + +def _execute_cells(plan: dict[str, Any], repo: Path, out: Path) -> list[dict[str, Any]]: + rows = [] + for cell in plan["cells"]: + print(f"[tau2] running {cell['run_label']}") + completed = subprocess.run( + cell["command"], + cwd=repo, + text=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=False, + ) + row = { + "run_label": cell["run_label"], + "domain": cell["domain"], + "strategy_id": cell["strategy_id"], + "returncode": completed.returncode, + "stdout_tail": completed.stdout[-4000:], + "stderr_tail": completed.stderr[-4000:], + } + rows.append(row) + write_json(out / "cell_results" / f"{cell['run_label']}.json", row) + if completed.returncode != 0: + raise RuntimeError(f"cell failed: {cell['run_label']} returncode={completed.returncode}") + return rows + + +def main() -> int: + parser = argparse.ArgumentParser(description="Plan or run TAU-2 benchmark cells.") + parser.add_argument("--config", type=Path, default=Path(__file__).parents[1] / "config" / "baseline.yaml") + parser.add_argument("--run-id", default=run_id()) + parser.add_argument("--plan-only", 
action="store_true", help="Only write run_plan.json.") + parser.add_argument("--execute", action="store_true", help="Execute planned cells.") + args = parser.parse_args() + + if args.plan_only and args.execute: + raise SystemExit("--plan-only and --execute are mutually exclusive") + + config = load_config(args.config) + out = output_dir(config, args.run_id) + out.mkdir(parents=True, exist_ok=True) + plan = _build_plan(config, args.run_id) + write_json(out / "run_plan.json", plan) + write_json(out / "resolved_config.json", config) + print(f"[tau2] wrote {out / 'run_plan.json'}") + + if args.execute: + try: + rows = _execute_cells(plan, tau2_repo(config), out) + plan["status"] = "succeeded" + plan["executed_cell_count"] = len(rows) + write_json(out / "run_plan.json", plan) + except Exception as exc: + plan["status"] = "failed" + plan["error"] = str(exc) + write_json(out / "run_plan.json", plan) + print(f"[tau2][ERROR] {exc}", file=sys.stderr) + return 1 + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/benchmark/tau2/scripts/summarize.py b/benchmark/tau2/scripts/summarize.py new file mode 100755 index 000000000..5e07c03f2 --- /dev/null +++ b/benchmark/tau2/scripts/summarize.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import json +from pathlib import Path +from statistics import mean +from typing import Any + +from tau2_common import write_json + + +def _load_json(path: Path) -> Any: + return json.loads(path.read_text(encoding="utf-8")) + + +def main() -> int: + parser = argparse.ArgumentParser(description="Summarize TAU-2 cell result JSON files.") + parser.add_argument("--run-dir", type=Path, required=True) + args = parser.parse_args() + + run_dir = args.run_dir.expanduser().resolve() + rows = [] + for path in sorted((run_dir / "cell_results").glob("*.json")): + row = _load_json(path) + rows.append(row) + + returncodes = [row.get("returncode") for row in rows] + summary = { + 
"run_dir": str(run_dir), + "cell_count": len(rows), + "succeeded_cell_count": sum(1 for code in returncodes if code == 0), + "failed_cell_count": sum(1 for code in returncodes if code != 0), + "returncodes": returncodes, + "average_reward": None, + "notes": [ + "This summarizer only aggregates wrapper cell status in the initial PR.", + "TAU-2 reward parsing is added once the execution artifact shape is fixed.", + ], + } + rewards = [row.get("reward") for row in rows if isinstance(row.get("reward"), (int, float))] + if rewards: + summary["average_reward"] = mean(rewards) + write_json(run_dir / "summary.json", summary) + print(f"[tau2] wrote {run_dir / 'summary.json'}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/benchmark/tau2/scripts/tau2_common.py b/benchmark/tau2/scripts/tau2_common.py new file mode 100755 index 000000000..5399fffee --- /dev/null +++ b/benchmark/tau2/scripts/tau2_common.py @@ -0,0 +1,117 @@ +from __future__ import annotations + +import json +import os +import re +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +import yaml + + +TAU2_DIR = Path(__file__).resolve().parents[1] +REPO_ROOT = TAU2_DIR.parents[1] + + +_ENV_PATTERN = re.compile(r"\$\{([^}:]+)(?::-([^}]*))?\}") + + +def run_id() -> str: + return datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ") + + +def render_env(value: Any) -> Any: + if isinstance(value, str): + def replace(match: re.Match[str]) -> str: + name = match.group(1) + default = match.group(2) or "" + return os.environ.get(name, default) + + return _ENV_PATTERN.sub(replace, value) + if isinstance(value, list): + return [render_env(item) for item in value] + if isinstance(value, dict): + return {key: render_env(item) for key, item in value.items()} + return value + + +def deep_merge(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]: + merged = dict(base) + for key, value in override.items(): + if ( + key in merged + and 
isinstance(merged[key], dict) + and isinstance(value, dict) + ): + merged[key] = deep_merge(merged[key], value) + else: + merged[key] = value + return merged + + +def load_config(path: Path) -> dict[str, Any]: + path = path.expanduser().resolve() + with path.open("r", encoding="utf-8") as handle: + raw = yaml.safe_load(handle) or {} + if not isinstance(raw, dict): + raise ValueError(f"Config must be a mapping: {path}") + + parent_name = raw.pop("extends", None) + if parent_name: + parent_path = (path.parent / str(parent_name)).resolve() + parent = load_config(parent_path) + raw = deep_merge(parent, raw) + return render_env(raw) + + +def resolve_path(path_value: str | Path, *, base: Path | None = None) -> Path: + path = Path(path_value).expanduser() + if path.is_absolute(): + return path + return ((base or REPO_ROOT) / path).resolve() + + +def output_dir(config: dict[str, Any], configured_run_id: str) -> Path: + raw = config.get("paths", {}).get("output_dir", TAU2_DIR / "result") + return resolve_path(raw) / configured_run_id + + +def write_json(path: Path, payload: Any) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text( + json.dumps(payload, ensure_ascii=False, indent=2, sort_keys=True) + "\n", + encoding="utf-8", + ) + + +def strategy_ids(config: dict[str, Any]) -> list[str]: + strategies = config.get("strategies") or [] + if not isinstance(strategies, list): + raise ValueError("strategies must be a list") + ids = [] + for item in strategies: + if not isinstance(item, dict) or not item.get("id"): + raise ValueError("each strategy must be a mapping with id") + ids.append(str(item["id"])) + if len(ids) != len(set(ids)): + raise ValueError(f"duplicate strategy ids: {ids}") + return ids + + +def domains(config: dict[str, Any]) -> list[str]: + values = config.get("benchmark", {}).get("domains") or [] + if not isinstance(values, list) or not values: + raise ValueError("benchmark.domains must be a non-empty list") + return [str(item) for item in 
values] + + +def tau2_repo(config: dict[str, Any]) -> Path: + raw = config.get("paths", {}).get("tau2_repo") + if not raw: + raise ValueError("paths.tau2_repo is required") + return resolve_path(raw) + + +def split_file(config: dict[str, Any], domain: str) -> Path: + return tau2_repo(config) / "data" / "tau2" / "domains" / domain / "split_tasks.json" From a132c3bbabbf7b95f2de7c1630ce5d1b7f7c3bb1 Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Tue, 12 May 2026 23:17:26 +0800 Subject: [PATCH 02/17] benchmark: gate pending tau2 memory adapter --- benchmark/tau2/README.md | 15 +++++ benchmark/tau2/config/baseline.yaml | 4 ++ benchmark/tau2/run_full_eval.sh | 9 ++- benchmark/tau2/scripts/run_eval.py | 98 ++++++++++++++++++++++------- 4 files changed, 102 insertions(+), 24 deletions(-) diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md index ac47ffa63..344b3e6b4 100644 --- a/benchmark/tau2/README.md +++ b/benchmark/tau2/README.md @@ -36,6 +36,17 @@ python benchmark/tau2/scripts/preflight.py --config benchmark/tau2/config/baseli python benchmark/tau2/scripts/run_eval.py --config benchmark/tau2/config/baseline.yaml --plan-only ``` +Plan a one-cell upstream TAU-2 smoke: + +```bash +benchmark/tau2/run_full_eval.sh \ + --config benchmark/tau2/config/baseline.yaml \ + --domain retail \ + --strategy-id no_memory \ + --num-tasks 1 \ + --repeat-count 1 +``` + Run with execution enabled after TAU-2, model credentials, and OpenViking are configured: @@ -43,6 +54,10 @@ configured: benchmark/tau2/run_full_eval.sh --config benchmark/tau2/config/prewrite.yaml --execute ``` +The initial no-memory cells use upstream TAU-2 CLI flags only. OpenViking memory +cells are kept in the same plan, but marked adapter-pending until the TAU-2 +agent adapter is wired in this benchmark directory. 
+ ## Evidence Boundary Only completed `retail + airline` runs with the same config, same seeds/repeats, diff --git a/benchmark/tau2/config/baseline.yaml b/benchmark/tau2/config/baseline.yaml index 16a452d4a..81e1a2133 100644 --- a/benchmark/tau2/config/baseline.yaml +++ b/benchmark/tau2/config/baseline.yaml @@ -9,6 +9,8 @@ benchmark: task_max_concurrency: 10 strategy_concurrency: 4 max_steps: 200 + agent: llm_agent + user: user_simulator reasoning_effort: high paths: @@ -36,12 +38,14 @@ strategies: - id: memory_v2_experience_only label: OpenViking Memory V2 experience-only memory_backend: openviking + adapter_status: pending train_required: true train_memory_mode: experience_only retrieval_mode: first_user - id: trajectory_procedure_view label: OpenViking trajectory procedure view memory_backend: openviking + adapter_status: pending train_required: true train_memory_mode: trajectory_procedure_view operation_mode: add_only diff --git a/benchmark/tau2/run_full_eval.sh b/benchmark/tau2/run_full_eval.sh index 8abf41235..d03df3f84 100755 --- a/benchmark/tau2/run_full_eval.sh +++ b/benchmark/tau2/run_full_eval.sh @@ -7,6 +7,7 @@ PYTHON_BIN="${PYTHON_BIN:-python3}" CONFIG="$SCRIPT_DIR/config/baseline.yaml" EXECUTE=false RUN_ID="" +RUN_EVAL_EXTRA=() while [[ $# -gt 0 ]]; do case "$1" in @@ -22,6 +23,10 @@ while [[ $# -gt 0 ]]; do EXECUTE=true shift ;; + --domain|--repeat-count|--strategy-id|--task-id|--num-tasks) + RUN_EVAL_EXTRA+=("$1" "$2") + shift 2 + ;; --help|-h) cat <<'EOF' Usage: @@ -47,7 +52,7 @@ cd "$REPO_ROOT" "$PYTHON_BIN" "$SCRIPT_DIR/scripts/preflight.py" --config "$CONFIG" "${RUN_ARGS[@]}" if [[ "$EXECUTE" == true ]]; then - "$PYTHON_BIN" "$SCRIPT_DIR/scripts/run_eval.py" --config "$CONFIG" "${RUN_ARGS[@]}" --execute + "$PYTHON_BIN" "$SCRIPT_DIR/scripts/run_eval.py" --config "$CONFIG" "${RUN_ARGS[@]}" "${RUN_EVAL_EXTRA[@]}" --execute else - "$PYTHON_BIN" "$SCRIPT_DIR/scripts/run_eval.py" --config "$CONFIG" "${RUN_ARGS[@]}" --plan-only + "$PYTHON_BIN" 
"$SCRIPT_DIR/scripts/run_eval.py" --config "$CONFIG" "${RUN_ARGS[@]}" "${RUN_EVAL_EXTRA[@]}" --plan-only fi diff --git a/benchmark/tau2/scripts/run_eval.py b/benchmark/tau2/scripts/run_eval.py index 8ab4c41f0..3d6d517c0 100755 --- a/benchmark/tau2/scripts/run_eval.py +++ b/benchmark/tau2/scripts/run_eval.py @@ -10,14 +10,30 @@ from tau2_common import domains, load_config, output_dir, run_id, split_file, strategy_ids, tau2_repo, write_json -def _tau2_command(config: dict[str, Any], *, domain: str, strategy: dict[str, Any], repeat_index: int, run_label: str) -> list[str]: +def _tau2_command( + config: dict[str, Any], + *, + domain: str, + strategy: dict[str, Any], + run_label: str, + task_ids: list[str] | None, + num_tasks: int | None, +) -> list[str] | None: benchmark = config["benchmark"] model = config["model"] + + if strategy.get("memory_backend") != "none": + return None + command = [ "tau2", "run", "--domain", domain, + "--agent", + str(benchmark.get("agent", "llm_agent")), + "--user", + str(benchmark.get("user", "user_simulator")), "--task-split-name", str(benchmark.get("eval_split_name", "test")), "--num-trials", @@ -39,28 +55,52 @@ def _tau2_command(config: dict[str, Any], *, domain: str, strategy: dict[str, An command.extend(["--agent-llm-args", f'{{"temperature":0.0,"reasoning_effort":"{reasoning_effort}"}}']) command.extend(["--user-llm-args", f'{{"temperature":0.0,"reasoning_effort":"{reasoning_effort}"}}']) - if strategy.get("memory_backend") == "none": - command.extend(["--memory-backend", "none"]) - else: - command.extend(["--memory-backend", "openviking"]) - command.extend(["--memory-retrieval-mode", str(strategy.get("retrieval_mode", "first_user"))]) - command.extend(["--memory-replay-write-policy", str(config.get("openviking", {}).get("replay_write_policy", "read_only"))]) - - if config.get("features", {}).get("prewrite_recall", {}).get("enabled"): - command.append("--enable-prewrite-recall") + if task_ids: + command.append("--task-ids") + 
command.extend(task_ids) + elif num_tasks is not None: + command.extend(["--num-tasks", str(num_tasks)]) return command -def _build_plan(config: dict[str, Any], configured_run_id: str) -> dict[str, Any]: - repeat_count = int(config["benchmark"].get("repeat_count", 4)) +def _build_plan( + config: dict[str, Any], + configured_run_id: str, + *, + selected_domains: set[str] | None, + selected_strategy_ids: set[str] | None, + task_ids: list[str] | None, + num_tasks: int | None, + repeat_count_override: int | None, +) -> dict[str, Any]: + repeat_count = repeat_count_override or int(config["benchmark"].get("repeat_count", 4)) strategies = config.get("strategies") or [] + if selected_strategy_ids: + unknown = selected_strategy_ids - set(strategy_ids(config)) + if unknown: + raise ValueError(f"unknown strategy ids: {sorted(unknown)}") + strategies = [strategy for strategy in strategies if strategy["id"] in selected_strategy_ids] cells = [] - for domain in domains(config): + plan_domains = domains(config) + if selected_domains: + unknown_domains = selected_domains - set(plan_domains) + if unknown_domains: + raise ValueError(f"unknown domains: {sorted(unknown_domains)}") + plan_domains = [domain for domain in plan_domains if domain in selected_domains] + for domain in plan_domains: split_path = split_file(config, domain) for strategy in strategies: for repeat_index in range(repeat_count): run_label = f"{configured_run_id}_{domain}_{strategy['id']}_r{repeat_index + 1}" + command = _tau2_command( + config, + domain=domain, + strategy=strategy, + run_label=run_label, + task_ids=task_ids, + num_tasks=num_tasks, + ) cells.append( { "domain": domain, @@ -70,14 +110,10 @@ def _build_plan(config: dict[str, Any], configured_run_id: str) -> dict[str, Any "run_label": run_label, "train_required": bool(strategy.get("train_required")), "memory_backend": strategy.get("memory_backend"), + "adapter_status": strategy.get("adapter_status", "ready"), + "executable": command is not None, 
"split_file": str(split_path), - "command": _tau2_command( - config, - domain=domain, - strategy=strategy, - repeat_index=repeat_index, - run_label=run_label, - ), + "command": command, } ) return { @@ -85,7 +121,7 @@ def _build_plan(config: dict[str, Any], configured_run_id: str) -> dict[str, Any "run_id": configured_run_id, "status": "planned", "strategy_ids": strategy_ids(config), - "domains": domains(config), + "domains": plan_domains, "cell_count": len(cells), "cells": cells, } @@ -94,6 +130,11 @@ def _build_plan(config: dict[str, Any], configured_run_id: str) -> dict[str, Any def _execute_cells(plan: dict[str, Any], repo: Path, out: Path) -> list[dict[str, Any]]: rows = [] for cell in plan["cells"]: + if not cell.get("executable"): + raise RuntimeError( + f"cell is not executable yet: {cell['run_label']} " + f"(strategy_id={cell['strategy_id']}, adapter_status={cell.get('adapter_status')})" + ) print(f"[tau2] running {cell['run_label']}") completed = subprocess.run( cell["command"], @@ -122,6 +163,11 @@ def main() -> int: parser = argparse.ArgumentParser(description="Plan or run TAU-2 benchmark cells.") parser.add_argument("--config", type=Path, default=Path(__file__).parents[1] / "config" / "baseline.yaml") parser.add_argument("--run-id", default=run_id()) + parser.add_argument("--domain", action="append", help="Run only this configured domain; may be repeated.") + parser.add_argument("--repeat-count", type=int, help="Override benchmark.repeat_count for smoke runs.") + parser.add_argument("--strategy-id", action="append", help="Run only this strategy id; may be repeated.") + parser.add_argument("--task-id", action="append", help="Run only this TAU-2 task id; may be repeated.") + parser.add_argument("--num-tasks", type=int, help="Run the first N tasks from the selected split.") parser.add_argument("--plan-only", action="store_true", help="Only write run_plan.json.") parser.add_argument("--execute", action="store_true", help="Execute planned cells.") args = 
parser.parse_args() @@ -132,7 +178,15 @@ def main() -> int: config = load_config(args.config) out = output_dir(config, args.run_id) out.mkdir(parents=True, exist_ok=True) - plan = _build_plan(config, args.run_id) + plan = _build_plan( + config, + args.run_id, + selected_domains=set(args.domain) if args.domain else None, + selected_strategy_ids=set(args.strategy_id) if args.strategy_id else None, + task_ids=args.task_id, + num_tasks=args.num_tasks, + repeat_count_override=args.repeat_count, + ) write_json(out / "run_plan.json", plan) write_json(out / "resolved_config.json", config) print(f"[tau2] wrote {out / 'run_plan.json'}") From b68e45922ab86b14c28263b4d2616f40b99164b5 Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Tue, 12 May 2026 23:25:48 +0800 Subject: [PATCH 03/17] benchmark: use litellm provider model default --- benchmark/tau2/README.md | 3 +++ benchmark/tau2/config/baseline.yaml | 6 +++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md index 344b3e6b4..5821a83ac 100644 --- a/benchmark/tau2/README.md +++ b/benchmark/tau2/README.md @@ -54,6 +54,9 @@ configured: benchmark/tau2/run_full_eval.sh --config benchmark/tau2/config/prewrite.yaml --execute ``` +When using Doubao through an OpenAI-compatible endpoint, set `OPENAI_API_KEY` +and `OPENAI_API_BASE` for LiteLLM before running upstream TAU-2. + The initial no-memory cells use upstream TAU-2 CLI flags only. OpenViking memory cells are kept in the same plan, but marked adapter-pending until the TAU-2 agent adapter is wired in this benchmark directory. 
diff --git a/benchmark/tau2/config/baseline.yaml b/benchmark/tau2/config/baseline.yaml index 81e1a2133..c1e23e9a9 100644 --- a/benchmark/tau2/config/baseline.yaml +++ b/benchmark/tau2/config/baseline.yaml @@ -18,9 +18,9 @@ paths: output_dir: benchmark/tau2/result model: - agent_llm: ${TAU2_AGENT_LLM:-doubao-seed-1-6-250615} - user_llm: ${TAU2_USER_LLM:-doubao-seed-1-6-250615} - evaluator_llm: ${TAU2_EVALUATOR_LLM:-doubao-seed-1-6-250615} + agent_llm: ${TAU2_AGENT_LLM:-openai/doubao-seed-2-0-pro-260215} + user_llm: ${TAU2_USER_LLM:-openai/doubao-seed-2-0-pro-260215} + evaluator_llm: ${TAU2_EVALUATOR_LLM:-openai/doubao-seed-2-0-pro-260215} temperature: 0.0 openviking: From 37e9b5039af9e8bff4648bf715762b5ad5741be2 Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Tue, 12 May 2026 23:35:47 +0800 Subject: [PATCH 04/17] benchmark: fold preflight into tau2 runner --- benchmark/tau2/README.md | 8 +-- benchmark/tau2/run_full_eval.sh | 20 +++++- benchmark/tau2/scripts/parity_check.py | 51 ---------------- benchmark/tau2/scripts/preflight.py | 84 -------------------------- benchmark/tau2/scripts/run_eval.py | 44 ++++++++++++++ benchmark/tau2/scripts/summarize.py | 50 --------------- 6 files changed, 65 insertions(+), 192 deletions(-) delete mode 100755 benchmark/tau2/scripts/parity_check.py delete mode 100755 benchmark/tau2/scripts/preflight.py delete mode 100755 benchmark/tau2/scripts/summarize.py diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md index 5821a83ac..561a69e63 100644 --- a/benchmark/tau2/README.md +++ b/benchmark/tau2/README.md @@ -18,10 +18,8 @@ benchmark/tau2/ │ ├── baseline.yaml │ └── prewrite.yaml ├── scripts/ -│ ├── preflight.py │ ├── run_eval.py -│ ├── summarize.py -│ └── parity_check.py +│ └── tau2_common.py └── run_full_eval.sh ``` @@ -32,10 +30,12 @@ Generated artifacts are written to `benchmark/tau2/result//`. 
Plan the default benchmark without running TAU-2: ```bash -python benchmark/tau2/scripts/preflight.py --config benchmark/tau2/config/baseline.yaml python benchmark/tau2/scripts/run_eval.py --config benchmark/tau2/config/baseline.yaml --plan-only ``` +Add `--preflight` or `--strict-preflight` when you want the runner to write a +small environment/config check next to the run plan. + Plan a one-cell upstream TAU-2 smoke: ```bash diff --git a/benchmark/tau2/run_full_eval.sh b/benchmark/tau2/run_full_eval.sh index d03df3f84..22936e3d3 100755 --- a/benchmark/tau2/run_full_eval.sh +++ b/benchmark/tau2/run_full_eval.sh @@ -6,6 +6,8 @@ REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" PYTHON_BIN="${PYTHON_BIN:-python3}" CONFIG="$SCRIPT_DIR/config/baseline.yaml" EXECUTE=false +PREFLIGHT=false +STRICT_PREFLIGHT=false RUN_ID="" RUN_EVAL_EXTRA=() @@ -23,6 +25,14 @@ while [[ $# -gt 0 ]]; do EXECUTE=true shift ;; + --preflight) + PREFLIGHT=true + shift + ;; + --strict-preflight) + STRICT_PREFLIGHT=true + shift + ;; --domain|--repeat-count|--strategy-id|--task-id|--num-tasks) RUN_EVAL_EXTRA+=("$1" "$2") shift 2 @@ -30,9 +40,9 @@ while [[ $# -gt 0 ]]; do --help|-h) cat <<'EOF' Usage: - benchmark/tau2/run_full_eval.sh [--config PATH] [--run-id ID] [--execute] + benchmark/tau2/run_full_eval.sh [--config PATH] [--run-id ID] [--execute] [--preflight] -Without --execute the script only writes preflight and run_plan artifacts. +Without --execute the script only writes run_plan artifacts. 
EOF exit 0 ;; @@ -49,7 +59,11 @@ if [[ -n "$RUN_ID" ]]; then fi cd "$REPO_ROOT" -"$PYTHON_BIN" "$SCRIPT_DIR/scripts/preflight.py" --config "$CONFIG" "${RUN_ARGS[@]}" +if [[ "$STRICT_PREFLIGHT" == true ]]; then + RUN_EVAL_EXTRA+=(--strict-preflight) +elif [[ "$PREFLIGHT" == true ]]; then + RUN_EVAL_EXTRA+=(--preflight) +fi if [[ "$EXECUTE" == true ]]; then "$PYTHON_BIN" "$SCRIPT_DIR/scripts/run_eval.py" --config "$CONFIG" "${RUN_ARGS[@]}" "${RUN_EVAL_EXTRA[@]}" --execute diff --git a/benchmark/tau2/scripts/parity_check.py b/benchmark/tau2/scripts/parity_check.py deleted file mode 100755 index d513715f0..000000000 --- a/benchmark/tau2/scripts/parity_check.py +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env python3 -from __future__ import annotations - -import argparse -import json -from pathlib import Path -from typing import Any - -from tau2_common import write_json - - -def _load_json(path: Path) -> Any: - return json.loads(path.read_text(encoding="utf-8")) - - -def main() -> int: - parser = argparse.ArgumentParser(description="Compare OpenViking TAU-2 artifacts against a harness reference.") - parser.add_argument("--ov-run-plan", type=Path, required=True) - parser.add_argument("--harness-run-plan", type=Path, required=True) - parser.add_argument("--output", type=Path, required=True) - args = parser.parse_args() - - ov_plan = _load_json(args.ov_run_plan) - harness_plan = _load_json(args.harness_run_plan) - - ov_cells = ov_plan.get("cells") or [] - harness_cells = harness_plan.get("cells") or harness_plan.get("treatments") or [] - report = { - "status": "ok" if len(ov_cells) == len(harness_cells) else "mismatch", - "ov_run_plan": str(args.ov_run_plan.resolve()), - "harness_run_plan": str(args.harness_run_plan.resolve()), - "ov_cell_count": len(ov_cells), - "harness_cell_count": len(harness_cells), - "checks": { - "cell_count_match": len(ov_cells) == len(harness_cells), - }, - "notes": [ - "Initial parity is intentionally structural.", - "Train payload, retrieval 
trace, and scoreboard parity should be added as each migration layer lands.", - ], - } - write_json(args.output, report) - if report["status"] != "ok": - print(f"[parity][WARN] wrote mismatch report: {args.output}") - return 1 - print(f"[parity][OK] wrote {args.output}") - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/benchmark/tau2/scripts/preflight.py b/benchmark/tau2/scripts/preflight.py deleted file mode 100755 index 32f78c3a3..000000000 --- a/benchmark/tau2/scripts/preflight.py +++ /dev/null @@ -1,84 +0,0 @@ -#!/usr/bin/env python3 -from __future__ import annotations - -import argparse -import importlib.util -import sys -from pathlib import Path -from typing import Any - -from tau2_common import domains, load_config, output_dir, run_id, split_file, strategy_ids, tau2_repo, write_json - - -def _check_import(module: str) -> dict[str, Any]: - spec = importlib.util.find_spec(module) - return {"module": module, "ok": spec is not None} - - -def _split_status(config: dict[str, Any]) -> list[dict[str, Any]]: - rows = [] - for domain in domains(config): - path = split_file(config, domain) - rows.append( - { - "domain": domain, - "path": str(path), - "exists": path.is_file(), - } - ) - return rows - - -def main() -> int: - parser = argparse.ArgumentParser(description="Preflight TAU-2 benchmark config.") - parser.add_argument("--config", type=Path, default=Path(__file__).parents[1] / "config" / "baseline.yaml") - parser.add_argument("--run-id", default=run_id()) - parser.add_argument("--strict", action="store_true", help="Require optional runtime imports and TAU-2 split files.") - args = parser.parse_args() - - config = load_config(args.config) - out = output_dir(config, args.run_id) - - errors: list[str] = [] - try: - strategy_ids(config) - except Exception as exc: - errors.append(str(exc)) - - split_rows = _split_status(config) - if args.strict: - for row in split_rows: - if not row["exists"]: - errors.append(f"missing split file for 
{row['domain']}: {row['path']}") - - import_rows = [_check_import("openviking"), _check_import("openviking_cli"), _check_import("tau2")] - if args.strict: - for row in import_rows: - if not row["ok"]: - errors.append(f"missing Python module: {row['module']}") - - report = { - "status": "failed" if errors else "ok", - "config": str(args.config.resolve()), - "run_id": args.run_id, - "tau2_repo": str(tau2_repo(config)), - "domains": domains(config), - "strategies": strategy_ids(config), - "strict": args.strict, - "imports": import_rows, - "split_files": split_rows, - "errors": errors, - } - write_json(out / "preflight.json", report) - - if errors: - for error in errors: - print(f"[preflight][ERROR] {error}", file=sys.stderr) - print(f"[preflight] wrote {out / 'preflight.json'}", file=sys.stderr) - return 1 - print(f"[preflight][OK] wrote {out / 'preflight.json'}") - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/benchmark/tau2/scripts/run_eval.py b/benchmark/tau2/scripts/run_eval.py index 3d6d517c0..2a4e9b812 100755 --- a/benchmark/tau2/scripts/run_eval.py +++ b/benchmark/tau2/scripts/run_eval.py @@ -2,6 +2,7 @@ from __future__ import annotations import argparse +import importlib.util import subprocess import sys from pathlib import Path @@ -159,6 +160,42 @@ def _execute_cells(plan: dict[str, Any], repo: Path, out: Path) -> list[dict[str return rows +def _preflight(config: dict[str, Any], out: Path, *, strict: bool) -> int: + errors: list[str] = [] + split_rows = [] + for domain in domains(config): + path = split_file(config, domain) + exists = path.is_file() + split_rows.append({"domain": domain, "path": str(path), "exists": exists}) + if strict and not exists: + errors.append(f"missing split file for {domain}: {path}") + + import_rows = [] + for module in ("openviking", "openviking_cli", "tau2"): + ok = importlib.util.find_spec(module) is not None + import_rows.append({"module": module, "ok": ok}) + if strict and not ok: + 
errors.append(f"missing Python module: {module}") + + report = { + "status": "failed" if errors else "ok", + "strict": strict, + "tau2_repo": str(tau2_repo(config)), + "domains": domains(config), + "strategies": strategy_ids(config), + "imports": import_rows, + "split_files": split_rows, + "errors": errors, + } + write_json(out / "preflight.json", report) + if errors: + for error in errors: + print(f"[preflight][ERROR] {error}", file=sys.stderr) + return 1 + print(f"[preflight][OK] wrote {out / 'preflight.json'}") + return 0 + + def main() -> int: parser = argparse.ArgumentParser(description="Plan or run TAU-2 benchmark cells.") parser.add_argument("--config", type=Path, default=Path(__file__).parents[1] / "config" / "baseline.yaml") @@ -168,6 +205,8 @@ def main() -> int: parser.add_argument("--strategy-id", action="append", help="Run only this strategy id; may be repeated.") parser.add_argument("--task-id", action="append", help="Run only this TAU-2 task id; may be repeated.") parser.add_argument("--num-tasks", type=int, help="Run the first N tasks from the selected split.") + parser.add_argument("--preflight", action="store_true", help="Write a lightweight environment/config preflight report.") + parser.add_argument("--strict-preflight", action="store_true", help="Fail if optional runtime imports or split files are missing.") parser.add_argument("--plan-only", action="store_true", help="Only write run_plan.json.") parser.add_argument("--execute", action="store_true", help="Execute planned cells.") args = parser.parse_args() @@ -178,6 +217,11 @@ def main() -> int: config = load_config(args.config) out = output_dir(config, args.run_id) out.mkdir(parents=True, exist_ok=True) + if args.preflight or args.strict_preflight: + preflight_status = _preflight(config, out, strict=args.strict_preflight) + if args.strict_preflight and preflight_status != 0: + return preflight_status + plan = _build_plan( config, args.run_id, diff --git a/benchmark/tau2/scripts/summarize.py 
b/benchmark/tau2/scripts/summarize.py deleted file mode 100755 index 5e07c03f2..000000000 --- a/benchmark/tau2/scripts/summarize.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python3 -from __future__ import annotations - -import argparse -import json -from pathlib import Path -from statistics import mean -from typing import Any - -from tau2_common import write_json - - -def _load_json(path: Path) -> Any: - return json.loads(path.read_text(encoding="utf-8")) - - -def main() -> int: - parser = argparse.ArgumentParser(description="Summarize TAU-2 cell result JSON files.") - parser.add_argument("--run-dir", type=Path, required=True) - args = parser.parse_args() - - run_dir = args.run_dir.expanduser().resolve() - rows = [] - for path in sorted((run_dir / "cell_results").glob("*.json")): - row = _load_json(path) - rows.append(row) - - returncodes = [row.get("returncode") for row in rows] - summary = { - "run_dir": str(run_dir), - "cell_count": len(rows), - "succeeded_cell_count": sum(1 for code in returncodes if code == 0), - "failed_cell_count": sum(1 for code in returncodes if code != 0), - "returncodes": returncodes, - "average_reward": None, - "notes": [ - "This summarizer only aggregates wrapper cell status in the initial PR.", - "TAU-2 reward parsing is added once the execution artifact shape is fixed.", - ], - } - rewards = [row.get("reward") for row in rows if isinstance(row.get("reward"), (int, float))] - if rewards: - summary["average_reward"] = mean(rewards) - write_json(run_dir / "summary.json", summary) - print(f"[tau2] wrote {run_dir / 'summary.json'}") - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) From 95ea695b8f14079dac9c2dc3dfcb24503a28fc2a Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Wed, 13 May 2026 00:03:50 +0800 Subject: [PATCH 05/17] benchmark: document tau2 dependency setup --- benchmark/tau2/.gitignore | 4 ++ benchmark/tau2/README.md | 54 +++++++++++++++ benchmark/tau2/config/baseline.yaml | 7 ++ 
benchmark/tau2/config/official.yaml | 7 ++ benchmark/tau2/scripts/run_eval.py | 42 +++++++++++- benchmark/tau2/scripts/setup_tau2_repo.sh | 82 +++++++++++++++++++++++ benchmark/tau2/scripts/tau2_common.py | 72 ++++++++++++++++++++ 7 files changed, 265 insertions(+), 3 deletions(-) create mode 100644 benchmark/tau2/config/official.yaml create mode 100755 benchmark/tau2/scripts/setup_tau2_repo.sh diff --git a/benchmark/tau2/.gitignore b/benchmark/tau2/.gitignore index 1cd791b52..2577e5885 100644 --- a/benchmark/tau2/.gitignore +++ b/benchmark/tau2/.gitignore @@ -1 +1,5 @@ result/ +.env.tau2 +.external/ +.venv-tau2/ +__pycache__/ diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md index 561a69e63..4b6fb8c10 100644 --- a/benchmark/tau2/README.md +++ b/benchmark/tau2/README.md @@ -16,9 +16,11 @@ Category rerank and other harness-only diagnostics are not migrated here yet. benchmark/tau2/ ├── config/ │ ├── baseline.yaml +│ ├── official.yaml │ └── prewrite.yaml ├── scripts/ │ ├── run_eval.py +│ ├── setup_tau2_repo.sh │ └── tau2_common.py └── run_full_eval.sh ``` @@ -27,6 +29,31 @@ Generated artifacts are written to `benchmark/tau2/result//`. ## Quick Start +This benchmark delegates task simulation and scoring to an external TAU-2 +checkout. 
Point the runner at that checkout and CLI explicitly when they are not +on the default path: + +```bash +export TAU2_REPO=/path/to/tau2-bench +export TAU2_CLI=/path/to/tau2 +``` + +For a local one-command setup, clone and install TAU-2 into ignored benchmark +directories: + +```bash +benchmark/tau2/scripts/setup_tau2_repo.sh +source benchmark/tau2/.env.tau2 +``` + +Use `TAU2_REF` or `--ref` when you need a TAU-2 branch that already contains the +confirmation-aware user simulator prompt: + +```bash +benchmark/tau2/scripts/setup_tau2_repo.sh --ref +source benchmark/tau2/.env.tau2 +``` + Plan the default benchmark without running TAU-2: ```bash @@ -36,6 +63,18 @@ python benchmark/tau2/scripts/run_eval.py --config benchmark/tau2/config/baselin Add `--preflight` or `--strict-preflight` when you want the runner to write a small environment/config check next to the run plan. +After setup, verify the local TAU-2 link and write a one-cell run plan: + +```bash +benchmark/tau2/run_full_eval.sh \ + --config benchmark/tau2/config/baseline.yaml \ + --strict-preflight \ + --domain retail \ + --strategy-id no_memory \ + --task-id 5 \ + --repeat-count 1 +``` + Plan a one-cell upstream TAU-2 smoke: ```bash @@ -61,6 +100,21 @@ The initial no-memory cells use upstream TAU-2 CLI flags only. OpenViking memory cells are kept in the same plan, but marked adapter-pending until the TAU-2 agent adapter is wired in this benchmark directory. +## User Simulator Policy + +The runner default is the official TAU-2 user simulator if +`eval.user_simulator_policy` is omitted. The bundled OpenViking memory benchmark +config sets `confirmation_aware`, because a memory benchmark should not treat +user confirmation as task completion before the backend write has happened. + +`confirmation_aware` does not monkey-patch TAU-2 from this directory. It requires +the configured `TAU2_REPO` to contain the corresponding upstream TAU-2 simulator +prompt fix. 
`--strict-preflight` fails fast when that prompt is not detected, so +the artifact cannot silently claim confirmation-aware semantics while running an +older official simulator. + +Use `config/official.yaml` when you need an official-user-simulator parity run. + ## Evidence Boundary Only completed `retail + airline` runs with the same config, same seeds/repeats, diff --git a/benchmark/tau2/config/baseline.yaml b/benchmark/tau2/config/baseline.yaml index c1e23e9a9..bb233417b 100644 --- a/benchmark/tau2/config/baseline.yaml +++ b/benchmark/tau2/config/baseline.yaml @@ -15,8 +15,15 @@ benchmark: paths: tau2_repo: ${TAU2_REPO:-data/external_benchmarks/tau2-bench} + tau2_cli: ${TAU2_CLI:-tau2} output_dir: benchmark/tau2/result +eval: + # The runner default is official if this field is omitted. The OpenViking + # memory benchmark config opts into the confirmation-aware TAU-2 prompt when + # the referenced TAU-2 checkout contains that upstream fix. + user_simulator_policy: confirmation_aware + model: agent_llm: ${TAU2_AGENT_LLM:-openai/doubao-seed-2-0-pro-260215} user_llm: ${TAU2_USER_LLM:-openai/doubao-seed-2-0-pro-260215} diff --git a/benchmark/tau2/config/official.yaml b/benchmark/tau2/config/official.yaml new file mode 100644 index 000000000..d10bee872 --- /dev/null +++ b/benchmark/tau2/config/official.yaml @@ -0,0 +1,7 @@ +extends: baseline.yaml + +benchmark: + name: tau2_openviking_official_user_simulator + +eval: + user_simulator_policy: official diff --git a/benchmark/tau2/scripts/run_eval.py b/benchmark/tau2/scripts/run_eval.py index 2a4e9b812..31fda233c 100755 --- a/benchmark/tau2/scripts/run_eval.py +++ b/benchmark/tau2/scripts/run_eval.py @@ -8,7 +8,20 @@ from pathlib import Path from typing import Any -from tau2_common import domains, load_config, output_dir, run_id, split_file, strategy_ids, tau2_repo, write_json +from tau2_common import ( + domains, + load_config, + output_dir, + run_id, + simulator_policy_report, + split_file, + strategy_ids, + tau2_cli, + 
tau2_context, + tau2_repo, + user_simulator_policy, + write_json, +) def _tau2_command( @@ -27,7 +40,7 @@ def _tau2_command( return None command = [ - "tau2", + tau2_cli(config), "run", "--domain", domain, @@ -76,6 +89,7 @@ def _build_plan( repeat_count_override: int | None, ) -> dict[str, Any]: repeat_count = repeat_count_override or int(config["benchmark"].get("repeat_count", 4)) + policy_report = simulator_policy_report(config) strategies = config.get("strategies") or [] if selected_strategy_ids: unknown = selected_strategy_ids - set(strategy_ids(config)) @@ -113,6 +127,8 @@ def _build_plan( "memory_backend": strategy.get("memory_backend"), "adapter_status": strategy.get("adapter_status", "ready"), "executable": command is not None, + "user_simulator_policy": user_simulator_policy(config), + "user_simulator_policy_supported": policy_report["supported"], "split_file": str(split_path), "command": command, } @@ -123,12 +139,20 @@ def _build_plan( "status": "planned", "strategy_ids": strategy_ids(config), "domains": plan_domains, + "tau2": tau2_context(config), + "simulator_policy": policy_report, "cell_count": len(cells), "cells": cells, } def _execute_cells(plan: dict[str, Any], repo: Path, out: Path) -> list[dict[str, Any]]: + policy_report = plan.get("simulator_policy") or {} + if not policy_report.get("supported", False): + raise RuntimeError( + "configured user simulator policy is not supported by this TAU-2 checkout: " + f"{policy_report}" + ) rows = [] for cell in plan["cells"]: if not cell.get("executable"): @@ -162,6 +186,17 @@ def _execute_cells(plan: dict[str, Any], repo: Path, out: Path) -> list[dict[str def _preflight(config: dict[str, Any], out: Path, *, strict: bool) -> int: errors: list[str] = [] + tau2_info = tau2_context(config) + policy_report = simulator_policy_report(config) + if strict and not tau2_info["tau2_repo_exists"]: + errors.append(f"missing TAU-2 repo: {tau2_info['tau2_repo']}") + if strict and not tau2_info["tau2_cli_resolved"]: + 
errors.append(f"missing TAU-2 CLI: {tau2_info['tau2_cli']}") + if strict and not policy_report["supported"]: + errors.append( + "configured confirmation-aware user simulator policy requires a TAU-2 " + f"checkout with the prompt fix: {policy_report['prompt_files']}" + ) split_rows = [] for domain in domains(config): path = split_file(config, domain) @@ -180,7 +215,8 @@ def _preflight(config: dict[str, Any], out: Path, *, strict: bool) -> int: report = { "status": "failed" if errors else "ok", "strict": strict, - "tau2_repo": str(tau2_repo(config)), + "tau2": tau2_info, + "simulator_policy": policy_report, "domains": domains(config), "strategies": strategy_ids(config), "imports": import_rows, diff --git a/benchmark/tau2/scripts/setup_tau2_repo.sh b/benchmark/tau2/scripts/setup_tau2_repo.sh new file mode 100755 index 000000000..3cee2655a --- /dev/null +++ b/benchmark/tau2/scripts/setup_tau2_repo.sh @@ -0,0 +1,82 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +TAU2_DIR="$(cd "$SCRIPT_DIR/.." && pwd)" +DEFAULT_REPO_DIR="$TAU2_DIR/.external/tau2-bench" +DEFAULT_VENV_DIR="$TAU2_DIR/.venv-tau2" + +REPO_URL="${TAU2_REPO_URL:-https://github.com/sierra-research/tau2-bench.git}" +REPO_DIR="${TAU2_REPO:-$DEFAULT_REPO_DIR}" +VENV_DIR="${TAU2_VENV:-$DEFAULT_VENV_DIR}" +REF="${TAU2_REF:-}" +INSTALL=true + +while [[ $# -gt 0 ]]; do + case "$1" in + --repo-url) + REPO_URL="$2" + shift 2 + ;; + --repo-dir) + REPO_DIR="$2" + shift 2 + ;; + --venv) + VENV_DIR="$2" + shift 2 + ;; + --ref) + REF="$2" + shift 2 + ;; + --no-install) + INSTALL=false + shift + ;; + --help|-h) + cat <<'EOF' +Usage: + benchmark/tau2/scripts/setup_tau2_repo.sh [--repo-url URL] [--repo-dir DIR] [--venv DIR] [--ref REF] [--no-install] + +Clones TAU-2 into a local ignored directory and optionally installs it into a +local virtualenv. The script writes benchmark/tau2/.env.tau2 with TAU2_REPO and +TAU2_CLI for the benchmark runner. 
+EOF + exit 0 + ;; + *) + echo "Unknown argument: $1" >&2 + exit 1 + ;; + esac +done + +mkdir -p "$(dirname "$REPO_DIR")" +if [[ ! -d "$REPO_DIR/.git" ]]; then + git clone "$REPO_URL" "$REPO_DIR" +else + git -C "$REPO_DIR" fetch --all --prune +fi + +if [[ -n "$REF" ]]; then + git -C "$REPO_DIR" checkout "$REF" +fi + +TAU2_CLI="tau2" +if [[ "$INSTALL" == true ]]; then + python3 -m venv "$VENV_DIR" + "$VENV_DIR/bin/python" -m pip install --upgrade pip + "$VENV_DIR/bin/python" -m pip install -e "$REPO_DIR" + TAU2_CLI="$VENV_DIR/bin/tau2" +fi + +cat > "$TAU2_DIR/.env.tau2" < Path: return resolve_path(raw) +def tau2_cli(config: dict[str, Any]) -> str: + return str(config.get("paths", {}).get("tau2_cli") or "tau2") + + +def _git_commit(path: Path) -> str | None: + if not path.exists(): + return None + completed = subprocess.run( + ["git", "-C", str(path), "rev-parse", "HEAD"], + text=True, + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + check=False, + ) + if completed.returncode != 0: + return None + return completed.stdout.strip() or None + + +def tau2_context(config: dict[str, Any]) -> dict[str, Any]: + repo = tau2_repo(config) + cli = tau2_cli(config) + return { + "tau2_repo": str(repo), + "tau2_repo_exists": repo.exists(), + "tau2_commit": _git_commit(repo), + "tau2_cli": cli, + "tau2_cli_resolved": shutil.which(cli), + } + + +def user_simulator_policy(config: dict[str, Any]) -> str: + policy = config.get("eval", {}).get("user_simulator_policy", "official") + policy = str(policy) + if policy not in {"official", "confirmation_aware"}: + raise ValueError( + "eval.user_simulator_policy must be 'official' or 'confirmation_aware'" + ) + return policy + + +def simulator_policy_report(config: dict[str, Any]) -> dict[str, Any]: + policy = user_simulator_policy(config) + repo = tau2_repo(config) + prompt_paths = [ + repo / "data" / "tau2" / "user_simulator" / "simulation_guidelines.md", + repo / "data" / "tau2" / "user_simulator" / "simulation_guidelines_tools.md", + 
] + prompt_text = "\n".join( + path.read_text(encoding="utf-8") for path in prompt_paths if path.is_file() + ) + confirmation_aware_prompt = ( + "do not emit" in prompt_text + and "###STOP###" in prompt_text + and "confirm" in prompt_text.lower() + ) + supported = policy == "official" or confirmation_aware_prompt + return { + "user_simulator_policy": policy, + "supported": supported, + "confirmation_aware_prompt_detected": confirmation_aware_prompt, + "prompt_files": [str(path) for path in prompt_paths], + "claim_boundary": ( + "official_tau2_user_simulator" + if policy == "official" + else "requires_tau2_confirmation_aware_user_simulator_prompt" + ), + } + + def split_file(config: dict[str, Any], domain: str) -> Path: return tau2_repo(config) / "data" / "tau2" / "domains" / domain / "split_tasks.json" From e59a4a052279af30fba19fd4c269e2552f00bc14 Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Wed, 13 May 2026 00:24:41 +0800 Subject: [PATCH 06/17] benchmark: simplify tau2 simulator patch --- benchmark/tau2/README.md | 22 +++----- benchmark/tau2/scripts/tau2_common.py | 79 ++++++++++++++++++++++----- 2 files changed, 73 insertions(+), 28 deletions(-) diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md index 4b6fb8c10..198e869c7 100644 --- a/benchmark/tau2/README.md +++ b/benchmark/tau2/README.md @@ -46,14 +46,6 @@ benchmark/tau2/scripts/setup_tau2_repo.sh source benchmark/tau2/.env.tau2 ``` -Use `TAU2_REF` or `--ref` when you need a TAU-2 branch that already contains the -confirmation-aware user simulator prompt: - -```bash -benchmark/tau2/scripts/setup_tau2_repo.sh --ref -source benchmark/tau2/.env.tau2 -``` - Plan the default benchmark without running TAU-2: ```bash @@ -107,13 +99,15 @@ The runner default is the official TAU-2 user simulator if config sets `confirmation_aware`, because a memory benchmark should not treat user confirmation as task completion before the backend write has happened. 
-`confirmation_aware` does not monkey-patch TAU-2 from this directory. It requires -the configured `TAU2_REPO` to contain the corresponding upstream TAU-2 simulator -prompt fix. `--strict-preflight` fails fast when that prompt is not detected, so -the artifact cannot silently claim confirmation-aware semantics while running an -older official simulator. +`confirmation_aware` applies a small idempotent prompt patch to the configured +TAU-2 checkout before planning or running. The patch appends the confirmation +boundary from [sierra-research/tau2-bench#297](https://github.com/sierra-research/tau2-bench/pull/297) +to the TAU-2 user simulator guidelines when it is not already present, and the +run artifacts record whether the patch was applied. -Use `config/official.yaml` when you need an official-user-simulator parity run. +Use `config/official.yaml` with a clean TAU-2 checkout when you need an +official-user-simulator parity run. If the checkout was already patched, the +artifact records that boundary instead of labeling the run pure official. ## Evidence Boundary diff --git a/benchmark/tau2/scripts/tau2_common.py b/benchmark/tau2/scripts/tau2_common.py index 0502e83c4..0281d4285 100755 --- a/benchmark/tau2/scripts/tau2_common.py +++ b/benchmark/tau2/scripts/tau2_common.py @@ -14,6 +14,21 @@ TAU2_DIR = Path(__file__).resolve().parents[1] REPO_ROOT = TAU2_DIR.parents[1] +CONFIRMATION_AWARE_UPSTREAM_PR = "https://github.com/sierra-research/tau2-bench/pull/297" +CONFIRMATION_AWARE_MARKER = "OpenViking TAU-2 confirmation-aware user simulator patch" +CONFIRMATION_AWARE_APPENDIX = f""" + +## {CONFIRMATION_AWARE_MARKER} + +Reference: {CONFIRMATION_AWARE_UPSTREAM_PR} + +- If the agent asks you to confirm, authorize, or approve a backend action, + reply with the requested confirmation but do not emit `###STOP###` in the + same turn. 
+- Emit `###STOP###` only after the agent clearly reports that the requested + backend action has been completed, or when the official transfer / + out-of-scope rules apply. +""" _ENV_PATTERN = re.compile(r"\$\{([^}:]+)(?::-([^}]*))?\}") @@ -146,6 +161,37 @@ def tau2_context(config: dict[str, Any]) -> dict[str, Any]: } +def _prompt_paths(repo: Path) -> list[Path]: + return [ + repo / "data" / "tau2" / "user_simulator" / "simulation_guidelines.md", + repo / "data" / "tau2" / "user_simulator" / "simulation_guidelines_tools.md", + ] + + +def _has_confirmation_aware_prompt(prompt_text: str) -> bool: + return ( + CONFIRMATION_AWARE_MARKER in prompt_text + or ( + "do not emit" in prompt_text + and "###STOP###" in prompt_text + and "confirm" in prompt_text.lower() + ) + ) + + +def _ensure_confirmation_aware_prompt(repo: Path) -> bool: + patched = False + for path in _prompt_paths(repo): + if not path.is_file(): + continue + text = path.read_text(encoding="utf-8") + if _has_confirmation_aware_prompt(text): + continue + path.write_text(text.rstrip() + CONFIRMATION_AWARE_APPENDIX + "\n", encoding="utf-8") + patched = True + return patched + + def user_simulator_policy(config: dict[str, Any]) -> str: policy = config.get("eval", {}).get("user_simulator_policy", "official") policy = str(policy) @@ -159,29 +205,34 @@ def user_simulator_policy(config: dict[str, Any]) -> str: def simulator_policy_report(config: dict[str, Any]) -> dict[str, Any]: policy = user_simulator_policy(config) repo = tau2_repo(config) - prompt_paths = [ - repo / "data" / "tau2" / "user_simulator" / "simulation_guidelines.md", - repo / "data" / "tau2" / "user_simulator" / "simulation_guidelines_tools.md", - ] + patch_applied = policy == "confirmation_aware" and _ensure_confirmation_aware_prompt(repo) + patch_mode = "direct_prompt_append" if patch_applied else "none" + if policy == "confirmation_aware": + if not patch_applied: + patch_mode = "upstream_or_existing_prompt" + + prompt_paths = _prompt_paths(repo) 
prompt_text = "\n".join( path.read_text(encoding="utf-8") for path in prompt_paths if path.is_file() ) - confirmation_aware_prompt = ( - "do not emit" in prompt_text - and "###STOP###" in prompt_text - and "confirm" in prompt_text.lower() - ) + confirmation_aware_prompt = _has_confirmation_aware_prompt(prompt_text) supported = policy == "official" or confirmation_aware_prompt + claim_boundary = "confirmation_aware_user_simulator_prompt" + if policy == "official": + claim_boundary = ( + "official_policy_with_confirmation_aware_checkout" + if confirmation_aware_prompt + else "official_tau2_user_simulator" + ) return { "user_simulator_policy": policy, "supported": supported, "confirmation_aware_prompt_detected": confirmation_aware_prompt, + "confirmation_aware_upstream_pr": CONFIRMATION_AWARE_UPSTREAM_PR, + "patch_applied": patch_applied, + "patch_mode": patch_mode, "prompt_files": [str(path) for path in prompt_paths], - "claim_boundary": ( - "official_tau2_user_simulator" - if policy == "official" - else "requires_tau2_confirmation_aware_user_simulator_prompt" - ), + "claim_boundary": claim_boundary, } From 32b9b42d371b90a9860e0dceb136b0c7c0a13bd3 Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Wed, 13 May 2026 00:30:43 +0800 Subject: [PATCH 07/17] benchmark: keep simulator patch prompt clean --- benchmark/tau2/README.md | 8 ++++---- benchmark/tau2/scripts/tau2_common.py | 16 ++++------------ 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md index 198e869c7..f8e4dc642 100644 --- a/benchmark/tau2/README.md +++ b/benchmark/tau2/README.md @@ -100,10 +100,10 @@ config sets `confirmation_aware`, because a memory benchmark should not treat user confirmation as task completion before the backend write has happened. `confirmation_aware` applies a small idempotent prompt patch to the configured -TAU-2 checkout before planning or running. 
The patch appends the confirmation -boundary from [sierra-research/tau2-bench#297](https://github.com/sierra-research/tau2-bench/pull/297) -to the TAU-2 user simulator guidelines when it is not already present, and the -run artifacts record whether the patch was applied. +TAU-2 checkout before planning or running. The patch appends only the behavioral +confirmation boundary to the TAU-2 user simulator guidelines; metadata such as +the upstream PR link is kept in run artifacts, not in the simulator prompt. +Reference: [sierra-research/tau2-bench#297](https://github.com/sierra-research/tau2-bench/pull/297). Use `config/official.yaml` with a clean TAU-2 checkout when you need an official-user-simulator parity run. If the checkout was already patched, the diff --git a/benchmark/tau2/scripts/tau2_common.py b/benchmark/tau2/scripts/tau2_common.py index 0281d4285..ae00e4c6c 100755 --- a/benchmark/tau2/scripts/tau2_common.py +++ b/benchmark/tau2/scripts/tau2_common.py @@ -15,12 +15,7 @@ TAU2_DIR = Path(__file__).resolve().parents[1] REPO_ROOT = TAU2_DIR.parents[1] CONFIRMATION_AWARE_UPSTREAM_PR = "https://github.com/sierra-research/tau2-bench/pull/297" -CONFIRMATION_AWARE_MARKER = "OpenViking TAU-2 confirmation-aware user simulator patch" -CONFIRMATION_AWARE_APPENDIX = f""" - -## {CONFIRMATION_AWARE_MARKER} - -Reference: {CONFIRMATION_AWARE_UPSTREAM_PR} +CONFIRMATION_AWARE_APPENDIX = """ - If the agent asks you to confirm, authorize, or approve a backend action, reply with the requested confirmation but do not emit `###STOP###` in the @@ -169,13 +164,10 @@ def _prompt_paths(repo: Path) -> list[Path]: def _has_confirmation_aware_prompt(prompt_text: str) -> bool: + normalized = " ".join(prompt_text.split()) return ( - CONFIRMATION_AWARE_MARKER in prompt_text - or ( - "do not emit" in prompt_text - and "###STOP###" in prompt_text - and "confirm" in prompt_text.lower() - ) + "reply with the requested confirmation" in normalized + and "do not emit `###STOP###` in the same turn" 
in normalized ) From 7ef274334d7a1ae4557c40671d2ba372dfa2d80a Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Wed, 13 May 2026 00:35:46 +0800 Subject: [PATCH 08/17] benchmark: clarify simulator patch config --- benchmark/tau2/config/baseline.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmark/tau2/config/baseline.yaml b/benchmark/tau2/config/baseline.yaml index bb233417b..602545ee4 100644 --- a/benchmark/tau2/config/baseline.yaml +++ b/benchmark/tau2/config/baseline.yaml @@ -20,8 +20,8 @@ paths: eval: # The runner default is official if this field is omitted. The OpenViking - # memory benchmark config opts into the confirmation-aware TAU-2 prompt when - # the referenced TAU-2 checkout contains that upstream fix. + # memory benchmark config opts into a confirmation-aware TAU-2 user simulator + # prompt; run_eval.py applies that small prompt patch idempotently when needed. user_simulator_policy: confirmation_aware model: From 00bd6add35d5ce31a5fb8047cbd45871b67c595f Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Wed, 13 May 2026 00:51:21 +0800 Subject: [PATCH 09/17] benchmark: clarify tau2 adapter boundary --- benchmark/tau2/README.md | 18 ++++++++++++++++++ benchmark/tau2/scripts/run_eval.py | 10 ++++++++++ 2 files changed, 28 insertions(+) diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md index f8e4dc642..143b32540 100644 --- a/benchmark/tau2/README.md +++ b/benchmark/tau2/README.md @@ -92,6 +92,24 @@ The initial no-memory cells use upstream TAU-2 CLI flags only. OpenViking memory cells are kept in the same plan, but marked adapter-pending until the TAU-2 agent adapter is wired in this benchmark directory. +## Memory Adapter Boundary + +The first PR keeps memory strategies visible in `run_plan.json` without +pretending they are executable through upstream TAU-2 flags. `no_memory` cells +can run immediately through the external TAU-2 CLI. 
OpenViking memory cells are +planned with corpus / strategy metadata and `adapter_status: pending`; the plan +also records `non_executable_reason` for those cells. + +The next adapter step should register a TAU-2 agent entry point that can: + +- train by writing TAU-2 training conversations into OpenViking sessions; +- evaluate by retrieving OpenViking memory at the configured decision node; +- emit enough artifact metadata to identify the OpenViking account, agent, + corpus, retrieval mode, and simulator policy used by each cell. + +Until that adapter exists, `--execute` is expected to fail fast if a selected +cell needs OpenViking memory. + ## User Simulator Policy The runner default is the official TAU-2 user simulator if diff --git a/benchmark/tau2/scripts/run_eval.py b/benchmark/tau2/scripts/run_eval.py index 31fda233c..67a229a5f 100755 --- a/benchmark/tau2/scripts/run_eval.py +++ b/benchmark/tau2/scripts/run_eval.py @@ -116,6 +116,12 @@ def _build_plan( task_ids=task_ids, num_tasks=num_tasks, ) + non_executable_reason = None + if command is None: + non_executable_reason = ( + "OpenViking memory strategy requires a TAU-2 agent adapter; " + "this benchmark scaffold only executes upstream TAU-2 no-memory cells." 
+ ) cells.append( { "domain": domain, @@ -131,8 +137,10 @@ def _build_plan( "user_simulator_policy_supported": policy_report["supported"], "split_file": str(split_path), "command": command, + "non_executable_reason": non_executable_reason, } ) + executable_cell_count = sum(1 for cell in cells if cell["executable"]) return { "schema_version": "openviking.tau2.run_plan.v0", "run_id": configured_run_id, @@ -142,6 +150,8 @@ def _build_plan( "tau2": tau2_context(config), "simulator_policy": policy_report, "cell_count": len(cells), + "executable_cell_count": executable_cell_count, + "pending_cell_count": len(cells) - executable_cell_count, "cells": cells, } From 85d536342adc9af02c30eb6227b775d365f9bcf1 Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Wed, 13 May 2026 02:37:49 +0800 Subject: [PATCH 10/17] benchmark: wire tau2 memory v2 eval --- benchmark/tau2/README.md | 55 ++- benchmark/tau2/config/baseline.yaml | 2 +- benchmark/tau2/run_full_eval.sh | 2 +- benchmark/tau2/scripts/run_eval.py | 222 +++++++++- benchmark/tau2/scripts/run_memory_v2_eval.py | 412 +++++++++++++++++++ 5 files changed, 668 insertions(+), 25 deletions(-) create mode 100644 benchmark/tau2/scripts/run_memory_v2_eval.py diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md index 143b32540..6b617cb48 100644 --- a/benchmark/tau2/README.md +++ b/benchmark/tau2/README.md @@ -4,11 +4,13 @@ This directory contains a small OpenViking-style entry point for TAU-2 memory evaluation. The first version is intentionally narrow: - no-memory control; -- fresh OpenViking memory baseline; +- fresh OpenViking Memory V2 experience-only baseline; - trajectory / procedure-view treatment; - optional pre-write recall. Category rerank and other harness-only diagnostics are not migrated here yet. +The Memory V2 baseline is wired end to end; trajectory / procedure-view remains +visible in the plan but adapter-pending. 
## Layout @@ -85,30 +87,49 @@ configured: benchmark/tau2/run_full_eval.sh --config benchmark/tau2/config/prewrite.yaml --execute ``` +Run the Memory V2 8-trial baseline (`retail + airline` x 4 repeats): + +```bash +benchmark/tau2/run_full_eval.sh \ + --config benchmark/tau2/config/baseline.yaml \ + --strategy-id memory_v2_experience_only \ + --execute +``` + +For a small E2E smoke, keep both the eval and train slices tiny: + +```bash +benchmark/tau2/run_full_eval.sh \ + --config benchmark/tau2/config/baseline.yaml \ + --domain retail \ + --strategy-id memory_v2_experience_only \ + --num-tasks 1 \ + --train-num-tasks 1 \ + --repeat-count 1 \ + --execute +``` + When using Doubao through an OpenAI-compatible endpoint, set `OPENAI_API_KEY` and `OPENAI_API_BASE` for LiteLLM before running upstream TAU-2. -The initial no-memory cells use upstream TAU-2 CLI flags only. OpenViking memory -cells are kept in the same plan, but marked adapter-pending until the TAU-2 -agent adapter is wired in this benchmark directory. +Start the OpenViking service before executing memory cells, and verify it with +`ov status`. For evidence runs, use a clean OpenViking workspace/config and set +`OPENVIKING_URL` explicitly so local custom memory templates do not pollute the +Memory V2 baseline. ## Memory Adapter Boundary -The first PR keeps memory strategies visible in `run_plan.json` without -pretending they are executable through upstream TAU-2 flags. `no_memory` cells -can run immediately through the external TAU-2 CLI. OpenViking memory cells are -planned with corpus / strategy metadata and `adapter_status: pending`; the plan -also records `non_executable_reason` for those cells. - -The next adapter step should register a TAU-2 agent entry point that can: +`no_memory` cells run through the external TAU-2 CLI. 
`memory_v2_experience_only` +cells run through a small TAU-2 agent adapter in this directory: - train by writing TAU-2 training conversations into OpenViking sessions; -- evaluate by retrieving OpenViking memory at the configured decision node; -- emit enough artifact metadata to identify the OpenViking account, agent, +- evaluate by retrieving OpenViking experience memory at the first user turn; +- emit artifact metadata to identify the OpenViking account, agent, corpus, retrieval mode, and simulator policy used by each cell. -Until that adapter exists, `--execute` is expected to fail fast if a selected -cell needs OpenViking memory. +The trajectory / procedure-view treatment is kept in the same plan but remains +`adapter_status: pending`; `--execute` fails fast if that strategy is selected +before its adapter is implemented. ## User Simulator Policy @@ -132,3 +153,7 @@ artifact records that boundary instead of labeling the run pure official. Only completed `retail + airline` runs with the same config, same seeds/repeats, and non-empty artifacts should be read as benchmark evidence. Partial runs, single-task probes, or missing OpenViking corpus identity are diagnostics. +Executed runs write per-cell JSON under `cell_results/` and a strategy/domain +aggregate under `scoreboard.json`. Memory training artifacts are shared by +domain and strategy under `memory_corpora/`, so repeated eval cells reuse the +same fresh corpus instead of rewriting it. 
diff --git a/benchmark/tau2/config/baseline.yaml b/benchmark/tau2/config/baseline.yaml index 602545ee4..08c0a6bdd 100644 --- a/benchmark/tau2/config/baseline.yaml +++ b/benchmark/tau2/config/baseline.yaml @@ -45,7 +45,7 @@ strategies: - id: memory_v2_experience_only label: OpenViking Memory V2 experience-only memory_backend: openviking - adapter_status: pending + adapter_status: ready train_required: true train_memory_mode: experience_only retrieval_mode: first_user diff --git a/benchmark/tau2/run_full_eval.sh b/benchmark/tau2/run_full_eval.sh index 22936e3d3..ca69a7a32 100755 --- a/benchmark/tau2/run_full_eval.sh +++ b/benchmark/tau2/run_full_eval.sh @@ -33,7 +33,7 @@ while [[ $# -gt 0 ]]; do STRICT_PREFLIGHT=true shift ;; - --domain|--repeat-count|--strategy-id|--task-id|--num-tasks) + --domain|--repeat-count|--strategy-id|--task-id|--num-tasks|--train-num-tasks) RUN_EVAL_EXTRA+=("$1" "$2") shift 2 ;; diff --git a/benchmark/tau2/scripts/run_eval.py b/benchmark/tau2/scripts/run_eval.py index 67a229a5f..8b6a81a4a 100755 --- a/benchmark/tau2/scripts/run_eval.py +++ b/benchmark/tau2/scripts/run_eval.py @@ -3,6 +3,7 @@ import argparse import importlib.util +import json import subprocess import sys from pathlib import Path @@ -24,18 +25,125 @@ ) +def _reward(sim: dict[str, Any]) -> float: + info = sim.get("reward_info") or {} + value = info.get("reward", sim.get("reward", 0.0)) + try: + return float(value) + except (TypeError, ValueError): + return 0.0 + + +def _db_match(sim: dict[str, Any]) -> bool | None: + info = sim.get("reward_info") or {} + db = info.get("db_check") or {} + if isinstance(db, dict): + if "score" in db: + return bool(db["score"]) + if "db_match" in db: + return bool(db["db_match"]) + return sim.get("db_match") + + +def _metrics_from_tau2_results(results_path: Path) -> dict[str, Any]: + data = json.loads(results_path.read_text(encoding="utf-8")) + sims = data.get("simulations") or [] + rewards = [_reward(sim) for sim in sims] + db_values = 
[_db_match(sim) for sim in sims] + db_known = [value for value in db_values if value is not None] + return { + "simulation_count": len(sims), + "avg_reward": sum(rewards) / len(rewards) if rewards else 0.0, + "db_match_rate": (sum(1 for value in db_known if value) / len(db_known)) if db_known else None, + } + + def _tau2_command( config: dict[str, Any], *, domain: str, strategy: dict[str, Any], + configured_run_id: str, run_label: str, task_ids: list[str] | None, num_tasks: int | None, + train_num_tasks: int | None, ) -> list[str] | None: benchmark = config["benchmark"] model = config["model"] + reasoning_effort = benchmark.get("reasoning_effort") + agent_llm_args = '{"temperature":0.0}' + user_llm_args = '{"temperature":0.0}' + if reasoning_effort: + agent_llm_args = f'{{"temperature":0.0,"reasoning_effort":"{reasoning_effort}"}}' + user_llm_args = f'{{"temperature":0.0,"reasoning_effort":"{reasoning_effort}"}}' + + if ( + strategy.get("memory_backend") == "openviking" + and strategy.get("train_memory_mode") == "experience_only" + ): + openviking = config["openviking"] + account = f"{openviking['account']}-{configured_run_id}-{domain}-{strategy['id']}" + agent_id = f"{openviking['agent_id']}-{domain}-{strategy['id']}" + user = f"tau2-{domain}-{strategy['id']}" + search_uri = f"viking://agent/{agent_id}/memories/experiences" + command = [ + sys.executable, + str(Path(__file__).with_name("run_memory_v2_eval.py")), + "--tau2-repo", + str(tau2_repo(config)), + "--run-dir", + str(output_dir(config, configured_run_id) / "memory_cells" / run_label), + "--corpus-dir", + str( + output_dir(config, configured_run_id) + / "memory_corpora" + / f"{domain}_{strategy['id']}" + ), + "--run-label", + run_label, + "--domain", + domain, + "--train-split-name", + str(benchmark.get("train_split_name", "train")), + "--eval-split-name", + str(benchmark.get("eval_split_name", "test")), + "--max-steps", + str(benchmark.get("max_steps", 200)), + "--max-concurrency", + 
str(benchmark.get("task_max_concurrency", 10)), + "--agent-llm", + str(model["agent_llm"]), + "--user-llm", + str(model["user_llm"]), + "--agent-llm-args", + agent_llm_args, + "--user-llm-args", + user_llm_args, + "--openviking-url", + str(openviking["url"]), + "--openviking-account", + account, + "--openviking-user", + user, + "--openviking-agent-id", + agent_id, + "--search-uri", + search_uri, + "--retrieval-top-k", + str(openviking.get("retrieval_top_k", 4)), + ] + if task_ids: + for task_id in task_ids: + command.extend(["--task-id", task_id]) + elif num_tasks is not None: + command.extend(["--num-tasks", str(num_tasks)]) + train_num_tasks = train_num_tasks if train_num_tasks is not None else strategy.get("train_num_tasks") + if train_num_tasks is not None: + command.extend(["--train-num-tasks", str(train_num_tasks)]) + return command + if strategy.get("memory_backend") != "none": return None @@ -64,10 +172,8 @@ def _tau2_command( run_label, ] - reasoning_effort = benchmark.get("reasoning_effort") - if reasoning_effort: - command.extend(["--agent-llm-args", f'{{"temperature":0.0,"reasoning_effort":"{reasoning_effort}"}}']) - command.extend(["--user-llm-args", f'{{"temperature":0.0,"reasoning_effort":"{reasoning_effort}"}}']) + command.extend(["--agent-llm-args", agent_llm_args]) + command.extend(["--user-llm-args", user_llm_args]) if task_ids: command.append("--task-ids") @@ -86,6 +192,7 @@ def _build_plan( selected_strategy_ids: set[str] | None, task_ids: list[str] | None, num_tasks: int | None, + train_num_tasks: int | None, repeat_count_override: int | None, ) -> dict[str, Any]: repeat_count = repeat_count_override or int(config["benchmark"].get("repeat_count", 4)) @@ -112,15 +219,17 @@ def _build_plan( config, domain=domain, strategy=strategy, + configured_run_id=configured_run_id, run_label=run_label, task_ids=task_ids, num_tasks=num_tasks, + train_num_tasks=train_num_tasks, ) non_executable_reason = None if command is None: non_executable_reason = ( - 
"OpenViking memory strategy requires a TAU-2 agent adapter; " - "this benchmark scaffold only executes upstream TAU-2 no-memory cells." + "This OpenViking memory strategy is planned but not wired to " + "the TAU-2 adapter in this PR." ) cells.append( { @@ -156,6 +265,90 @@ def _build_plan( } +def _cell_artifacts(cell: dict[str, Any], repo: Path, out: Path) -> dict[str, str]: + if cell["strategy_id"] == "memory_v2_experience_only": + run_dir = out / "memory_cells" / cell["run_label"] + corpus_dir = out / "memory_corpora" / f"{cell['domain']}_{cell['strategy_id']}" + return { + "summary": str(run_dir / f"{cell['run_label']}.summary.json"), + "results": str(run_dir / f"{cell['run_label']}.json"), + "retrieval_trace": str(run_dir / f"{cell['run_label']}.retrieval_trace.jsonl"), + "corpus_manifest": str(corpus_dir / "corpus_manifest.json"), + } + return { + "results": str(repo / "data" / "simulations" / f"{cell['run_label']}.json") + } + + +def _cell_metrics(cell: dict[str, Any], artifacts: dict[str, str]) -> dict[str, Any] | None: + if cell["strategy_id"] == "memory_v2_experience_only": + summary_path = Path(artifacts["summary"]) + if not summary_path.is_file(): + return None + summary = json.loads(summary_path.read_text(encoding="utf-8")) + return summary.get("metrics") + + results_path = Path(artifacts["results"]) + if not results_path.is_file(): + return None + return _metrics_from_tau2_results(results_path) + + +def _summarize(rows: list[dict[str, Any]]) -> dict[str, Any]: + def weighted(rows_for_group: list[dict[str, Any]]) -> dict[str, Any]: + metric_rows = [row for row in rows_for_group if row.get("metrics")] + sim_count = sum(int(row["metrics"].get("simulation_count") or 0) for row in metric_rows) + reward_sum = sum( + float(row["metrics"].get("avg_reward") or 0.0) + * int(row["metrics"].get("simulation_count") or 0) + for row in metric_rows + ) + db_weighted_rows = [ + row + for row in metric_rows + if row["metrics"].get("db_match_rate") is not None + and 
int(row["metrics"].get("simulation_count") or 0) > 0 + ] + db_weight = sum(int(row["metrics"].get("simulation_count") or 0) for row in db_weighted_rows) + db_sum = sum( + float(row["metrics"]["db_match_rate"]) + * int(row["metrics"].get("simulation_count") or 0) + for row in db_weighted_rows + ) + return { + "cell_count": len(rows_for_group), + "completed_cell_count": len(metric_rows), + "simulation_count": sim_count, + "avg_reward": reward_sum / sim_count if sim_count else None, + "db_match_rate": db_sum / db_weight if db_weight else None, + } + + by_strategy: dict[str, dict[str, Any]] = {} + for row in rows: + strategy_id = row["strategy_id"] + strategy_summary = by_strategy.setdefault( + strategy_id, + { + "strategy_id": strategy_id, + "domains": {}, + "task_weighted_total": {}, + }, + ) + strategy_summary["domains"].setdefault(row["domain"], []).append(row) + + for strategy_summary in by_strategy.values(): + all_rows = [] + for domain, domain_rows in list(strategy_summary["domains"].items()): + strategy_summary["domains"][domain] = weighted(domain_rows) + all_rows.extend(domain_rows) + strategy_summary["task_weighted_total"] = weighted(all_rows) + + return { + "schema_version": "openviking.tau2.scoreboard.v0", + "strategies": by_strategy, + } + + def _execute_cells(plan: dict[str, Any], repo: Path, out: Path) -> list[dict[str, Any]]: policy_report = plan.get("simulator_policy") or {} if not policy_report.get("supported", False): @@ -187,6 +380,8 @@ def _execute_cells(plan: dict[str, Any], repo: Path, out: Path) -> list[dict[str "stdout_tail": completed.stdout[-4000:], "stderr_tail": completed.stderr[-4000:], } + row["artifacts"] = _cell_artifacts(cell, repo, out) + row["metrics"] = _cell_metrics(cell, row["artifacts"]) rows.append(row) write_json(out / "cell_results" / f"{cell['run_label']}.json", row) if completed.returncode != 0: @@ -251,8 +446,17 @@ def main() -> int: parser.add_argument("--strategy-id", action="append", help="Run only this strategy id; may 
be repeated.") parser.add_argument("--task-id", action="append", help="Run only this TAU-2 task id; may be repeated.") parser.add_argument("--num-tasks", type=int, help="Run the first N tasks from the selected split.") - parser.add_argument("--preflight", action="store_true", help="Write a lightweight environment/config preflight report.") - parser.add_argument("--strict-preflight", action="store_true", help="Fail if optional runtime imports or split files are missing.") + parser.add_argument("--train-num-tasks", type=int, help="Train OpenViking memory on the first N train tasks.") + parser.add_argument( + "--preflight", + action="store_true", + help="Write a lightweight environment/config preflight report.", + ) + parser.add_argument( + "--strict-preflight", + action="store_true", + help="Fail if optional runtime imports or split files are missing.", + ) parser.add_argument("--plan-only", action="store_true", help="Only write run_plan.json.") parser.add_argument("--execute", action="store_true", help="Execute planned cells.") args = parser.parse_args() @@ -275,6 +479,7 @@ def main() -> int: selected_strategy_ids=set(args.strategy_id) if args.strategy_id else None, task_ids=args.task_id, num_tasks=args.num_tasks, + train_num_tasks=args.train_num_tasks, repeat_count_override=args.repeat_count, ) write_json(out / "run_plan.json", plan) @@ -287,6 +492,7 @@ def main() -> int: plan["status"] = "succeeded" plan["executed_cell_count"] = len(rows) write_json(out / "run_plan.json", plan) + write_json(out / "scoreboard.json", _summarize(rows)) except Exception as exc: plan["status"] = "failed" plan["error"] = str(exc) diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py new file mode 100644 index 000000000..ff0d6f32c --- /dev/null +++ b/benchmark/tau2/scripts/run_memory_v2_eval.py @@ -0,0 +1,412 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import json +import sys +import time +from 
pathlib import Path +from typing import Any + + +AGENT_NAME = "openviking_memory_agent" +REPO_ROOT = Path(__file__).resolve().parents[3] + + +def _json(text: str) -> dict[str, Any]: + return json.loads(text) if text else {} + + +def _write_json(path: Path, payload: Any) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(payload, ensure_ascii=False, indent=2, sort_keys=True) + "\n") + + +def _add_tau2_to_path(tau2_repo: Path) -> None: + src = tau2_repo / "src" + sys.path.insert(0, str(REPO_ROOT)) + sys.path.insert(0, str(src if src.is_dir() else tau2_repo)) + + +def _save_to_arg(path: Path) -> str: + # TAU-2 run_domain appends ".json" to save_to. Keep our artifact paths + # stable by passing the stem when callers hand us a JSON path. + return str(path.with_suffix("") if path.suffix == ".json" else path) + + +def _reward(sim: dict[str, Any]) -> float: + info = sim.get("reward_info") or {} + value = info.get("reward", sim.get("reward", 0.0)) + try: + return float(value) + except (TypeError, ValueError): + return 0.0 + + +def _db_match(sim: dict[str, Any]) -> bool | None: + info = sim.get("reward_info") or {} + db = info.get("db_check") or {} + if isinstance(db, dict): + if "score" in db: + return bool(db["score"]) + if "db_match" in db: + return bool(db["db_match"]) + return sim.get("db_match") + + +def _metrics(results_path: Path) -> dict[str, Any]: + data = json.loads(results_path.read_text()) + sims = data.get("simulations") or [] + rewards = [_reward(sim) for sim in sims] + db_values = [_db_match(sim) for sim in sims] + db_known = [value for value in db_values if value is not None] + return { + "simulation_count": len(sims), + "avg_reward": sum(rewards) / len(rewards) if rewards else 0.0, + "db_match_rate": (sum(1 for value in db_known if value) / len(db_known)) if db_known else None, + } + + +def _message_text(message: dict[str, Any]) -> tuple[str, str]: + role = str(message.get("role") or "assistant") + if role == "user": + 
return "user", str(message.get("content") or "") + if role == "tool": + return "assistant", f"Tool result: {message.get('content') or ''}" + calls = message.get("tool_calls") or [] + if calls: + rendered = [] + for call in calls: + name = call.get("name") or call.get("function", {}).get("name") or "unknown_tool" + arguments = call.get("arguments") or call.get("function", {}).get("arguments") or {} + rendered.append(f"{name}({json.dumps(arguments, ensure_ascii=False, sort_keys=True)})") + return "assistant", "Assistant tool call: " + "; ".join(rendered) + return "assistant", str(message.get("content") or "") + + +def _run_tau2( + *, + tau2_repo: Path, + domain: str, + split: str, + task_ids: list[str] | None, + num_tasks: int | None, + trials: int, + max_steps: int, + max_concurrency: int, + agent: str, + user: str, + agent_llm: str, + user_llm: str, + agent_llm_args: dict[str, Any], + user_llm_args: dict[str, Any], + seed: int, + save_to: Path, +): + _add_tau2_to_path(tau2_repo) + from tau2.data_model.simulation import RunConfig + from tau2.run import run_domain + + if save_to.exists(): + save_to.unlink() + return run_domain( + RunConfig( + domain=domain, + task_split_name=split, + task_ids=task_ids, + num_tasks=num_tasks, + agent=agent, + llm_agent=agent_llm, + llm_args_agent=agent_llm_args, + user=user, + llm_user=user_llm, + llm_args_user=user_llm_args, + num_trials=trials, + max_steps=max_steps, + save_to=_save_to_arg(save_to), + max_concurrency=max_concurrency, + seed=seed, + log_level="INFO", + ) + ) + + +def _client(args: argparse.Namespace): + import openviking as ov + + client = ov.SyncHTTPClient( + url=args.openviking_url, + api_key="", + user=args.openviking_user, + agent_id=args.openviking_agent_id, + account=args.openviking_account, + timeout=args.openviking_timeout, + extra_headers={}, + ) + client.initialize() + return client + + +def _wait_task(client: Any, task_id: str | None, timeout: int) -> dict[str, Any]: + if not task_id: + return {"status": 
"no_task"} + deadline = time.time() + timeout + last = None + while time.time() < deadline: + last = client.get_task(task_id) + status = (last or {}).get("status") + if status == "completed": + return last or {"status": status} + if status in {"failed", "cancelled"}: + raise RuntimeError(f"OpenViking task {task_id} {status}: {last}") + time.sleep(2) + raise TimeoutError(f"OpenViking task {task_id} did not finish within {timeout}s: {last}") + + +def _train(args: argparse.Namespace, train_results: Path, corpus_manifest: Path) -> dict[str, Any]: + if corpus_manifest.is_file() and not args.force_train: + return json.loads(corpus_manifest.read_text()) + + _run_tau2( + tau2_repo=args.tau2_repo, + domain=args.domain, + split=args.train_split_name, + task_ids=args.train_task_ids, + num_tasks=args.train_num_tasks, + trials=1, + max_steps=args.max_steps, + max_concurrency=args.max_concurrency, + agent=args.base_agent, + user=args.user, + agent_llm=args.agent_llm, + user_llm=args.user_llm, + agent_llm_args=args.agent_llm_args, + user_llm_args=args.user_llm_args, + seed=args.seed, + save_to=train_results, + ) + + data = json.loads(train_results.read_text()) + client = _client(args) + committed = [] + try: + for sim in data.get("simulations") or []: + session_id = f"tau2-{args.domain}-train-{sim.get('task_id')}-trial-{sim.get('trial', 0)}" + created = client.create_session(session_id=session_id) + sid = created.get("session_id", session_id) + for msg in sim.get("messages") or []: + role, text = _message_text(msg) + if not text.strip(): + continue + client.add_message( + sid, + role=role, + parts=[{"type": "text", "text": text}], + created_at=msg.get("timestamp"), + ) + result = client.commit_session(sid, telemetry=True) + task = _wait_task(client, result.get("task_id"), args.openviking_wait_timeout) + committed.append( + { + "session_id": sid, + "task_id": sim.get("task_id"), + "commit_status": result.get("status"), + "openviking_task_id": result.get("task_id"), + 
"openviking_task_status": task.get("status"), + } + ) + finally: + client.close() + + manifest = { + "domain": args.domain, + "train_results": str(train_results), + "openviking": { + "url": args.openviking_url, + "account": args.openviking_account, + "user": args.openviking_user, + "agent_id": args.openviking_agent_id, + "search_uri": args.search_uri, + }, + "committed_sessions": committed, + "committed_session_count": len(committed), + } + _write_json(corpus_manifest, manifest) + return manifest + + +def _register_memory_agent(args: argparse.Namespace, trace_path: Path) -> None: + _add_tau2_to_path(args.tau2_repo) + + from tau2.agent.llm_agent import LLMAgent, LLMAgentState + from tau2.data_model.message import MultiToolMessage, SystemMessage + from tau2.registry import registry + from tau2.utils.llm_utils import generate + + class OpenVikingMemoryAgent(LLMAgent): + def get_init_state(self, message_history=None): + state = super().get_init_state(message_history) + state.system_messages.append( + SystemMessage(role="system", content="") + ) + return state + + def _retrieve(self, query: str) -> tuple[str, list[dict[str, Any]]]: + client = _client(args) + rows: list[dict[str, Any]] = [] + try: + result = client.search(query=query, target_uri=args.search_uri, limit=args.retrieval_top_k) + memories = list(getattr(result, "memories", []) or []) + blocks = [] + for index, match in enumerate(memories[: args.retrieval_top_k], 1): + uri = getattr(match, "uri", "") + text = "" + try: + text = client.read(uri) + except Exception: + text = getattr(match, "abstract", "") or getattr(match, "overview", "") or "" + rows.append( + { + "uri": uri, + "score": getattr(match, "score", None), + "level": getattr(match, "level", None), + "text_chars": len(text), + } + ) + if text.strip(): + blocks.append(f"Memory {index} ({uri}):\n{text.strip()}") + return "\n\n".join(blocks), rows + finally: + client.close() + + def generate_next_message(self, message, state: LLMAgentState): + if 
isinstance(message, MultiToolMessage): + state.messages.extend(message.tool_messages) + else: + state.messages.append(message) + marker_index = next( + ( + i + for i, item in enumerate(state.system_messages) + if isinstance(item, SystemMessage) and item.content == "" + ), + None, + ) + role = getattr(message, "role", "") + role_value = getattr(role, "value", role) + if marker_index is not None and str(role_value) == "user": + query = str(getattr(message, "content", "") or "") + block, matches = self._retrieve(query) + prompt = ( + "No OpenViking memory matched this user request." + if not block + else "Use these OpenViking experience memories only when they match the current task:\n\n" + + block + ) + state.system_messages[marker_index] = SystemMessage(role="system", content=prompt) + with trace_path.open("a", encoding="utf-8") as handle: + handle.write( + json.dumps( + { + "query": query, + "match_count": len(matches), + "matches": matches, + }, + ensure_ascii=False, + sort_keys=True, + ) + + "\n" + ) + + assistant_message = generate( + model=self.llm, + tools=self.tools, + messages=state.system_messages + state.messages, + **self.llm_args, + ) + state.messages.append(assistant_message) + return assistant_message, state + + if AGENT_NAME not in registry.get_agents(): + registry.register_agent(OpenVikingMemoryAgent, AGENT_NAME) + + +def main() -> int: + parser = argparse.ArgumentParser(description="Run TAU-2 with OpenViking Memory V2.") + parser.add_argument("--tau2-repo", type=Path, required=True) + parser.add_argument("--run-dir", type=Path, required=True) + parser.add_argument("--corpus-dir", type=Path) + parser.add_argument("--run-label", required=True) + parser.add_argument("--domain", required=True) + parser.add_argument("--train-split-name", default="train") + parser.add_argument("--eval-split-name", default="test") + parser.add_argument("--task-id", dest="task_ids", action="append") + parser.add_argument("--num-tasks", type=int) + 
parser.add_argument("--train-task-id", dest="train_task_ids", action="append") + parser.add_argument("--train-num-tasks", type=int) + parser.add_argument("--max-steps", type=int, default=200) + parser.add_argument("--max-concurrency", type=int, default=10) + parser.add_argument("--seed", type=int, default=300) + parser.add_argument("--base-agent", default="llm_agent") + parser.add_argument("--user", default="user_simulator") + parser.add_argument("--agent-llm", required=True) + parser.add_argument("--user-llm", required=True) + parser.add_argument("--agent-llm-args", type=_json, default={}) + parser.add_argument("--user-llm-args", type=_json, default={}) + parser.add_argument("--openviking-url", required=True) + parser.add_argument("--openviking-account", required=True) + parser.add_argument("--openviking-user", required=True) + parser.add_argument("--openviking-agent-id", required=True) + parser.add_argument("--openviking-timeout", type=float, default=600.0) + parser.add_argument("--openviking-wait-timeout", type=int, default=600) + parser.add_argument("--search-uri", required=True) + parser.add_argument("--retrieval-top-k", type=int, default=4) + parser.add_argument("--force-train", action="store_true") + args = parser.parse_args() + + args.tau2_repo = args.tau2_repo.resolve() + args.run_dir.mkdir(parents=True, exist_ok=True) + corpus_dir = args.corpus_dir or args.run_dir + corpus_dir.mkdir(parents=True, exist_ok=True) + train_results = corpus_dir / "train_results.json" + corpus_manifest = corpus_dir / "corpus_manifest.json" + eval_results = args.run_dir / f"{args.run_label}.json" + trace_path = args.run_dir / f"{args.run_label}.retrieval_trace.jsonl" + summary_path = args.run_dir / f"{args.run_label}.summary.json" + + corpus = _train(args, train_results, corpus_manifest) + _register_memory_agent(args, trace_path) + _run_tau2( + tau2_repo=args.tau2_repo, + domain=args.domain, + split=args.eval_split_name, + task_ids=args.task_ids, + num_tasks=args.num_tasks, + 
trials=1, + max_steps=args.max_steps, + max_concurrency=args.max_concurrency, + agent=AGENT_NAME, + user=args.user, + agent_llm=args.agent_llm, + user_llm=args.user_llm, + agent_llm_args=args.agent_llm_args, + user_llm_args=args.user_llm_args, + seed=args.seed, + save_to=eval_results, + ) + summary = { + "run_label": args.run_label, + "domain": args.domain, + "strategy_id": "memory_v2_experience_only", + "corpus": corpus, + "eval_results": str(eval_results), + "retrieval_trace": str(trace_path), + "metrics": _metrics(eval_results), + } + _write_json(summary_path, summary) + print(json.dumps(summary, ensure_ascii=False, sort_keys=True)) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) From 90f040aa589619b37c1a396fd77559ffe30d3d7c Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Wed, 13 May 2026 05:18:16 +0800 Subject: [PATCH 11/17] benchmark: harden tau2 memory agent tool calls --- benchmark/tau2/scripts/run_memory_v2_eval.py | 44 ++++++++++++++++---- 1 file changed, 37 insertions(+), 7 deletions(-) diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py index ff0d6f32c..1b8832fc0 100644 --- a/benchmark/tau2/scripts/run_memory_v2_eval.py +++ b/benchmark/tau2/scripts/run_memory_v2_eval.py @@ -239,7 +239,7 @@ def _register_memory_agent(args: argparse.Namespace, trace_path: Path) -> None: _add_tau2_to_path(args.tau2_repo) from tau2.agent.llm_agent import LLMAgent, LLMAgentState - from tau2.data_model.message import MultiToolMessage, SystemMessage + from tau2.data_model.message import AssistantMessage, MultiToolMessage, SystemMessage from tau2.registry import registry from tau2.utils.llm_utils import generate @@ -279,6 +279,41 @@ def _retrieve(self, query: str) -> tuple[str, list[dict[str, Any]]]: finally: client.close() + def _generate(self, messages): + try: + return generate( + model=self.llm, + tools=self.tools, + messages=messages, + **self.llm_args, + ) + except json.JSONDecodeError: + 
retry_messages = messages + [ + SystemMessage( + role="system", + content=( + "Retry the last assistant step once. If you call a tool, " + "the tool arguments must be syntactically valid JSON." + ), + ) + ] + try: + return generate( + model=self.llm, + tools=self.tools, + messages=retry_messages, + **self.llm_args, + ) + except json.JSONDecodeError as exc: + return AssistantMessage( + role="assistant", + content="I need to continue with the available task information.", + raw_data={ + "openviking_memory_agent_error": "invalid_tool_call_json", + "error": str(exc), + }, + ) + def generate_next_message(self, message, state: LLMAgentState): if isinstance(message, MultiToolMessage): state.messages.extend(message.tool_messages) @@ -318,12 +353,7 @@ def generate_next_message(self, message, state: LLMAgentState): + "\n" ) - assistant_message = generate( - model=self.llm, - tools=self.tools, - messages=state.system_messages + state.messages, - **self.llm_args, - ) + assistant_message = self._generate(state.system_messages + state.messages) state.messages.append(assistant_message) return assistant_message, state From b52e65b790eb6e02e2d98e15a0db6ab6602bdd7c Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Wed, 13 May 2026 10:07:59 +0800 Subject: [PATCH 12/17] benchmark: tolerate empty tau2 assistant responses --- benchmark/tau2/scripts/run_memory_v2_eval.py | 57 ++++++++++++++------ 1 file changed, 41 insertions(+), 16 deletions(-) diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py index 1b8832fc0..6dfe5b6f6 100644 --- a/benchmark/tau2/scripts/run_memory_v2_eval.py +++ b/benchmark/tau2/scripts/run_memory_v2_eval.py @@ -280,13 +280,20 @@ def _retrieve(self, query: str) -> tuple[str, list[dict[str, Any]]]: client.close() def _generate(self, messages): + def _is_empty_assistant(response) -> bool: + content = str(getattr(response, "content", "") or "") + tool_calls = getattr(response, "tool_calls", None) or [] + return not 
content.strip() and not tool_calls + try: - return generate( + response = generate( model=self.llm, tools=self.tools, messages=messages, **self.llm_args, ) + if not _is_empty_assistant(response): + return response except json.JSONDecodeError: retry_messages = messages + [ SystemMessage( @@ -297,22 +304,40 @@ def _generate(self, messages): ), ) ] - try: - return generate( - model=self.llm, - tools=self.tools, - messages=retry_messages, - **self.llm_args, - ) - except json.JSONDecodeError as exc: - return AssistantMessage( - role="assistant", - content="I need to continue with the available task information.", - raw_data={ - "openviking_memory_agent_error": "invalid_tool_call_json", - "error": str(exc), - }, + else: + retry_messages = messages + [ + SystemMessage( + role="system", + content=( + "Retry the last assistant step once. Return either a useful " + "natural language response or a valid tool call; do not return " + "an empty assistant message." + ), ) + ] + try: + response = generate( + model=self.llm, + tools=self.tools, + messages=retry_messages, + **self.llm_args, + ) + if not _is_empty_assistant(response): + return response + return AssistantMessage( + role="assistant", + content="I need to continue with the available task information.", + raw_data={"openviking_memory_agent_error": "empty_assistant_message"}, + ) + except json.JSONDecodeError as exc: + return AssistantMessage( + role="assistant", + content="I need to continue with the available task information.", + raw_data={ + "openviking_memory_agent_error": "invalid_tool_call_json", + "error": str(exc), + }, + ) def generate_next_message(self, message, state: LLMAgentState): if isinstance(message, MultiToolMessage): From 1c84468a2e34937508cf96fd38260c644b2cbae4 Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Wed, 13 May 2026 10:37:42 +0800 Subject: [PATCH 13/17] benchmark: normalize tau2 llm environment --- benchmark/tau2/scripts/run_eval.py | 8 ++++++ benchmark/tau2/scripts/run_memory_v2_eval.py | 
3 +++ benchmark/tau2/scripts/tau2_common.py | 28 ++++++++++++++++++++ 3 files changed, 39 insertions(+) diff --git a/benchmark/tau2/scripts/run_eval.py b/benchmark/tau2/scripts/run_eval.py index 8b6a81a4a..aaab01cbd 100755 --- a/benchmark/tau2/scripts/run_eval.py +++ b/benchmark/tau2/scripts/run_eval.py @@ -13,6 +13,7 @@ domains, load_config, output_dir, + normalize_litellm_env, run_id, simulator_policy_report, split_file, @@ -391,12 +392,17 @@ def _execute_cells(plan: dict[str, Any], repo: Path, out: Path) -> list[dict[str def _preflight(config: dict[str, Any], out: Path, *, strict: bool) -> int: errors: list[str] = [] + llm_env = normalize_litellm_env() tau2_info = tau2_context(config) policy_report = simulator_policy_report(config) if strict and not tau2_info["tau2_repo_exists"]: errors.append(f"missing TAU-2 repo: {tau2_info['tau2_repo']}") if strict and not tau2_info["tau2_cli_resolved"]: errors.append(f"missing TAU-2 CLI: {tau2_info['tau2_cli']}") + if strict and not llm_env["has_api_key"]: + errors.append("missing LLM API key: set OPENAI_API_KEY or ARK_API_KEY") + if strict and not llm_env["has_base_url"]: + errors.append("missing OpenAI-compatible base URL: set OPENAI_API_BASE, OPENAI_BASE_URL, or ARK_BASE_URL") if strict and not policy_report["supported"]: errors.append( "configured confirmation-aware user simulator policy requires a TAU-2 " @@ -421,6 +427,7 @@ def _preflight(config: dict[str, Any], out: Path, *, strict: bool) -> int: "status": "failed" if errors else "ok", "strict": strict, "tau2": tau2_info, + "llm_env": llm_env, "simulator_policy": policy_report, "domains": domains(config), "strategies": strategy_ids(config), @@ -460,6 +467,7 @@ def main() -> int: parser.add_argument("--plan-only", action="store_true", help="Only write run_plan.json.") parser.add_argument("--execute", action="store_true", help="Execute planned cells.") args = parser.parse_args() + normalize_litellm_env() if args.plan_only and args.execute: raise SystemExit("--plan-only 
and --execute are mutually exclusive") diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py index 6dfe5b6f6..052c6afd9 100644 --- a/benchmark/tau2/scripts/run_memory_v2_eval.py +++ b/benchmark/tau2/scripts/run_memory_v2_eval.py @@ -8,6 +8,8 @@ from pathlib import Path from typing import Any +from tau2_common import normalize_litellm_env + AGENT_NAME = "openviking_memory_agent" REPO_ROOT = Path(__file__).resolve().parents[3] @@ -418,6 +420,7 @@ def main() -> int: parser.add_argument("--retrieval-top-k", type=int, default=4) parser.add_argument("--force-train", action="store_true") args = parser.parse_args() + normalize_litellm_env() args.tau2_repo = args.tau2_repo.resolve() args.run_dir.mkdir(parents=True, exist_ok=True) diff --git a/benchmark/tau2/scripts/tau2_common.py b/benchmark/tau2/scripts/tau2_common.py index ae00e4c6c..15f504cf3 100755 --- a/benchmark/tau2/scripts/tau2_common.py +++ b/benchmark/tau2/scripts/tau2_common.py @@ -33,6 +33,34 @@ def run_id() -> str: return datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ") +def normalize_litellm_env() -> dict[str, Any]: + aliases = [] + if not os.environ.get("OPENAI_API_KEY") and os.environ.get("ARK_API_KEY"): + os.environ["OPENAI_API_KEY"] = os.environ["ARK_API_KEY"] + aliases.append("OPENAI_API_KEY<-ARK_API_KEY") + ark_base = os.environ.get("ARK_BASE_URL") + openai_base = os.environ.get("OPENAI_API_BASE") or os.environ.get("OPENAI_BASE_URL") + if not openai_base and ark_base: + os.environ["OPENAI_API_BASE"] = ark_base + os.environ["OPENAI_BASE_URL"] = ark_base + aliases.append("OPENAI_API_BASE<-ARK_BASE_URL") + elif os.environ.get("OPENAI_API_BASE") and not os.environ.get("OPENAI_BASE_URL"): + os.environ["OPENAI_BASE_URL"] = os.environ["OPENAI_API_BASE"] + aliases.append("OPENAI_BASE_URL<-OPENAI_API_BASE") + elif os.environ.get("OPENAI_BASE_URL") and not os.environ.get("OPENAI_API_BASE"): + os.environ["OPENAI_API_BASE"] = os.environ["OPENAI_BASE_URL"] + 
aliases.append("OPENAI_API_BASE<-OPENAI_BASE_URL") + return { + "aliases": aliases, + "has_api_key": bool(os.environ.get("OPENAI_API_KEY") or os.environ.get("ARK_API_KEY")), + "has_base_url": bool( + os.environ.get("OPENAI_API_BASE") + or os.environ.get("OPENAI_BASE_URL") + or os.environ.get("ARK_BASE_URL") + ), + } + + def render_env(value: Any) -> Any: if isinstance(value, str): def replace(match: re.Match[str]) -> str: From a6b753537e6d284d35df8a1705172c275c93b58f Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Wed, 13 May 2026 12:18:01 +0800 Subject: [PATCH 14/17] benchmark: add tau2 memory prewrite strategy --- benchmark/tau2/README.md | 37 +++---- benchmark/tau2/config/baseline.yaml | 25 ++--- benchmark/tau2/config/prewrite.yaml | 15 +-- benchmark/tau2/scripts/run_eval.py | 24 +++-- benchmark/tau2/scripts/run_memory_v2_eval.py | 105 ++++++++++++++++--- 5 files changed, 131 insertions(+), 75 deletions(-) diff --git a/benchmark/tau2/README.md b/benchmark/tau2/README.md index 6b617cb48..7ebdb0807 100644 --- a/benchmark/tau2/README.md +++ b/benchmark/tau2/README.md @@ -3,14 +3,11 @@ This directory contains a small OpenViking-style entry point for TAU-2 memory evaluation. The first version is intentionally narrow: -- no-memory control; - fresh OpenViking Memory V2 experience-only baseline; -- trajectory / procedure-view treatment; -- optional pre-write recall. +- Memory V2 pre-write recall treatment. -Category rerank and other harness-only diagnostics are not migrated here yet. -The Memory V2 baseline is wired end to end; trajectory / procedure-view remains -visible in the plan but adapter-pending. +Trajectory / procedure-view prompts, category rerank, and other harness-only +diagnostics are intentionally left out of this first PR. 
## Layout @@ -64,35 +61,27 @@ benchmark/tau2/run_full_eval.sh \ --config benchmark/tau2/config/baseline.yaml \ --strict-preflight \ --domain retail \ - --strategy-id no_memory \ + --strategy-id memory_v2_experience_only \ --task-id 5 \ --repeat-count 1 ``` -Plan a one-cell upstream TAU-2 smoke: +Plan a one-cell Memory V2 pre-write smoke: ```bash benchmark/tau2/run_full_eval.sh \ --config benchmark/tau2/config/baseline.yaml \ --domain retail \ - --strategy-id no_memory \ + --strategy-id memory_v2_prewrite \ --num-tasks 1 \ --repeat-count 1 ``` -Run with execution enabled after TAU-2, model credentials, and OpenViking are -configured: - -```bash -benchmark/tau2/run_full_eval.sh --config benchmark/tau2/config/prewrite.yaml --execute -``` - -Run the Memory V2 8-trial baseline (`retail + airline` x 4 repeats): +Run the Memory V2 8-trial matrix (`retail + airline` x 2 strategies x 8 repeats): ```bash benchmark/tau2/run_full_eval.sh \ --config benchmark/tau2/config/baseline.yaml \ - --strategy-id memory_v2_experience_only \ --execute ``` @@ -117,20 +106,18 @@ Start the OpenViking service before executing memory cells, and verify it with `OPENVIKING_URL` explicitly so local custom memory templates do not pollute the Memory V2 baseline. -## Memory Adapter Boundary +## Memory Adapter -`no_memory` cells run through the external TAU-2 CLI. `memory_v2_experience_only` -cells run through a small TAU-2 agent adapter in this directory: +`memory_v2_experience_only` and `memory_v2_prewrite` cells run through a small +TAU-2 agent adapter in this directory: - train by writing TAU-2 training conversations into OpenViking sessions; - evaluate by retrieving OpenViking experience memory at the first user turn; +- for pre-write recall, retrieve again before write-like tool calls and + regenerate that step with the matched memories; - emit artifact metadata to identify the OpenViking account, agent, corpus, retrieval mode, and simulator policy used by each cell. 
-The trajectory / procedure-view treatment is kept in the same plan but remains -`adapter_status: pending`; `--execute` fails fast if that strategy is selected -before its adapter is implemented. - ## User Simulator Policy The runner default is the official TAU-2 user simulator if diff --git a/benchmark/tau2/config/baseline.yaml b/benchmark/tau2/config/baseline.yaml index 08c0a6bdd..2dc8a9d2c 100644 --- a/benchmark/tau2/config/baseline.yaml +++ b/benchmark/tau2/config/baseline.yaml @@ -5,9 +5,8 @@ benchmark: - airline train_split_name: train eval_split_name: test - repeat_count: 4 + repeat_count: 8 task_max_concurrency: 10 - strategy_concurrency: 4 max_steps: 200 agent: llm_agent user: user_simulator @@ -27,7 +26,6 @@ eval: model: agent_llm: ${TAU2_AGENT_LLM:-openai/doubao-seed-2-0-pro-260215} user_llm: ${TAU2_USER_LLM:-openai/doubao-seed-2-0-pro-260215} - evaluator_llm: ${TAU2_EVALUATOR_LLM:-openai/doubao-seed-2-0-pro-260215} temperature: 0.0 openviking: @@ -38,26 +36,17 @@ openviking: replay_write_policy: read_only strategies: - - id: no_memory - label: No memory - memory_backend: none - train_required: false - id: memory_v2_experience_only label: OpenViking Memory V2 experience-only memory_backend: openviking - adapter_status: ready train_required: true + corpus_id: memory_v2_experience_only train_memory_mode: experience_only retrieval_mode: first_user - - id: trajectory_procedure_view - label: OpenViking trajectory procedure view + - id: memory_v2_prewrite + label: OpenViking Memory V2 pre-write recall memory_backend: openviking - adapter_status: pending train_required: true - train_memory_mode: trajectory_procedure_view - operation_mode: add_only - retrieval_mode: first_user - -features: - prewrite_recall: - enabled: false + corpus_id: memory_v2_experience_only + train_memory_mode: experience_only + retrieval_mode: prewrite diff --git a/benchmark/tau2/config/prewrite.yaml b/benchmark/tau2/config/prewrite.yaml index 965f09963..e8b12d9cf 100644 --- 
a/benchmark/tau2/config/prewrite.yaml +++ b/benchmark/tau2/config/prewrite.yaml @@ -3,10 +3,11 @@ extends: baseline.yaml benchmark: name: tau2_openviking_prewrite -features: - prewrite_recall: - enabled: true - decision_nodes: - - before_write_tool_call - max_memories: 4 - evidence_boundary: runtime_retrieval_trace_required +strategies: + - id: memory_v2_prewrite + label: OpenViking Memory V2 pre-write recall + memory_backend: openviking + train_required: true + corpus_id: memory_v2_experience_only + train_memory_mode: experience_only + retrieval_mode: prewrite diff --git a/benchmark/tau2/scripts/run_eval.py b/benchmark/tau2/scripts/run_eval.py index aaab01cbd..57a6069e8 100755 --- a/benchmark/tau2/scripts/run_eval.py +++ b/benchmark/tau2/scripts/run_eval.py @@ -85,9 +85,10 @@ def _tau2_command( and strategy.get("train_memory_mode") == "experience_only" ): openviking = config["openviking"] - account = f"{openviking['account']}-{configured_run_id}-{domain}-{strategy['id']}" - agent_id = f"{openviking['agent_id']}-{domain}-{strategy['id']}" - user = f"tau2-{domain}-{strategy['id']}" + corpus_id = str(strategy.get("corpus_id") or strategy["id"]) + account = f"{openviking['account']}-{configured_run_id}-{domain}-{corpus_id}" + agent_id = f"{openviking['agent_id']}-{domain}-{corpus_id}" + user = f"tau2-{domain}-{corpus_id}" search_uri = f"viking://agent/{agent_id}/memories/experiences" command = [ sys.executable, @@ -100,10 +101,12 @@ def _tau2_command( str( output_dir(config, configured_run_id) / "memory_corpora" - / f"{domain}_{strategy['id']}" + / f"{domain}_{corpus_id}" ), "--run-label", run_label, + "--strategy-id", + strategy["id"], "--domain", domain, "--train-split-name", @@ -134,6 +137,8 @@ def _tau2_command( search_uri, "--retrieval-top-k", str(openviking.get("retrieval_top_k", 4)), + "--retrieval-mode", + str(strategy.get("retrieval_mode", "first_user")), ] if task_ids: for task_id in task_ids: @@ -196,7 +201,7 @@ def _build_plan( train_num_tasks: int | None, 
repeat_count_override: int | None, ) -> dict[str, Any]: - repeat_count = repeat_count_override or int(config["benchmark"].get("repeat_count", 4)) + repeat_count = repeat_count_override or int(config["benchmark"].get("repeat_count", 8)) policy_report = simulator_policy_report(config) strategies = config.get("strategies") or [] if selected_strategy_ids: @@ -241,6 +246,8 @@ def _build_plan( "run_label": run_label, "train_required": bool(strategy.get("train_required")), "memory_backend": strategy.get("memory_backend"), + "corpus_id": strategy.get("corpus_id", strategy["id"]), + "retrieval_mode": strategy.get("retrieval_mode"), "adapter_status": strategy.get("adapter_status", "ready"), "executable": command is not None, "user_simulator_policy": user_simulator_policy(config), @@ -267,9 +274,10 @@ def _build_plan( def _cell_artifacts(cell: dict[str, Any], repo: Path, out: Path) -> dict[str, str]: - if cell["strategy_id"] == "memory_v2_experience_only": + if cell.get("memory_backend") == "openviking": run_dir = out / "memory_cells" / cell["run_label"] - corpus_dir = out / "memory_corpora" / f"{cell['domain']}_{cell['strategy_id']}" + corpus_id = str(cell.get("corpus_id") or cell["strategy_id"]) + corpus_dir = out / "memory_corpora" / f"{cell['domain']}_{corpus_id}" return { "summary": str(run_dir / f"{cell['run_label']}.summary.json"), "results": str(run_dir / f"{cell['run_label']}.json"), @@ -282,7 +290,7 @@ def _cell_artifacts(cell: dict[str, Any], repo: Path, out: Path) -> dict[str, st def _cell_metrics(cell: dict[str, Any], artifacts: dict[str, str]) -> dict[str, Any] | None: - if cell["strategy_id"] == "memory_v2_experience_only": + if cell.get("memory_backend") == "openviking": summary_path = Path(artifacts["summary"]) if not summary_path.is_file(): return None diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py index 052c6afd9..a1220a7e6 100644 --- a/benchmark/tau2/scripts/run_memory_v2_eval.py +++ 
b/benchmark/tau2/scripts/run_memory_v2_eval.py @@ -13,6 +13,15 @@ AGENT_NAME = "openviking_memory_agent" REPO_ROOT = Path(__file__).resolve().parents[3] +READ_TOOL_PREFIXES = ( + "get_", + "find_", + "list_", + "search_", + "calculate", + "think", + "transfer_", +) def _json(text: str) -> dict[str, Any]: @@ -69,6 +78,32 @@ def _metrics(results_path: Path) -> dict[str, Any]: } +def _is_write_tool_call(tool_call: Any) -> bool: + name = str(getattr(tool_call, "name", "") or "") + return bool(name) and not name.startswith(READ_TOOL_PREFIXES) + + +def _tool_call_query(tool_calls: list[Any], state_messages: list[Any]) -> str: + rendered = [] + for call in tool_calls: + rendered.append( + f"{getattr(call, 'name', 'unknown_tool')}(" + f"{json.dumps(getattr(call, 'arguments', {}) or {}, ensure_ascii=False, sort_keys=True)}" + ")" + ) + recent_user = [ + str(getattr(message, "content", "") or "") + for message in state_messages[-8:] + if str(getattr(message, "role", "")) == "user" and str(getattr(message, "content", "") or "").strip() + ] + return ( + "Before executing write-like tool call(s): " + + "; ".join(rendered) + + "\nRecent user context: " + + " | ".join(recent_user[-3:]) + ) + + def _message_text(message: dict[str, Any]) -> tuple[str, str]: role = str(message.get("role") or "assistant") if role == "user": @@ -248,9 +283,10 @@ def _register_memory_agent(args: argparse.Namespace, trace_path: Path) -> None: class OpenVikingMemoryAgent(LLMAgent): def get_init_state(self, message_history=None): state = super().get_init_state(message_history) - state.system_messages.append( - SystemMessage(role="system", content="") - ) + if args.retrieval_mode == "first_user": + state.system_messages.append( + SystemMessage(role="system", content="") + ) return state def _retrieve(self, query: str) -> tuple[str, list[dict[str, Any]]]: @@ -281,6 +317,10 @@ def _retrieve(self, query: str) -> tuple[str, list[dict[str, Any]]]: finally: client.close() + def _trace(self, event: dict[str, 
Any]) -> None: + with trace_path.open("a", encoding="utf-8") as handle: + handle.write(json.dumps(event, ensure_ascii=False, sort_keys=True) + "\n") + def _generate(self, messages): def _is_empty_assistant(response) -> bool: content = str(getattr(response, "content", "") or "") @@ -366,21 +406,48 @@ def generate_next_message(self, message, state: LLMAgentState): + block ) state.system_messages[marker_index] = SystemMessage(role="system", content=prompt) - with trace_path.open("a", encoding="utf-8") as handle: - handle.write( - json.dumps( - { - "query": query, - "match_count": len(matches), - "matches": matches, - }, - ensure_ascii=False, - sort_keys=True, - ) - + "\n" - ) + self._trace( + { + "decision_node": "first_user", + "query": query, + "match_count": len(matches), + "matches": matches, + } + ) assistant_message = self._generate(state.system_messages + state.messages) + if args.retrieval_mode == "prewrite": + tool_calls = list(getattr(assistant_message, "tool_calls", None) or []) + write_calls = [call for call in tool_calls if _is_write_tool_call(call)] + if write_calls: + query = _tool_call_query(write_calls, state.messages) + block, matches = self._retrieve(query) + self._trace( + { + "decision_node": "before_write_tool_call", + "query": query, + "match_count": len(matches), + "matches": matches, + "tool_calls": [ + { + "name": getattr(call, "name", ""), + "arguments": getattr(call, "arguments", {}) or {}, + } + for call in write_calls + ], + } + ) + if block: + prompt = ( + "Before executing the pending write-like tool call, use these " + "OpenViking experience memories only when they match the current task:\n\n" + + block + ) + assistant_message = self._generate( + state.system_messages + + state.messages + + [SystemMessage(role="system", content=prompt)] + ) state.messages.append(assistant_message) return assistant_message, state @@ -394,6 +461,7 @@ def main() -> int: parser.add_argument("--run-dir", type=Path, required=True) 
parser.add_argument("--corpus-dir", type=Path) parser.add_argument("--run-label", required=True) + parser.add_argument("--strategy-id", default="memory_v2_experience_only") parser.add_argument("--domain", required=True) parser.add_argument("--train-split-name", default="train") parser.add_argument("--eval-split-name", default="test") @@ -418,6 +486,7 @@ def main() -> int: parser.add_argument("--openviking-wait-timeout", type=int, default=600) parser.add_argument("--search-uri", required=True) parser.add_argument("--retrieval-top-k", type=int, default=4) + parser.add_argument("--retrieval-mode", choices=["first_user", "prewrite"], default="first_user") parser.add_argument("--force-train", action="store_true") args = parser.parse_args() normalize_litellm_env() @@ -433,6 +502,7 @@ def main() -> int: summary_path = args.run_dir / f"{args.run_label}.summary.json" corpus = _train(args, train_results, corpus_manifest) + trace_path.touch() _register_memory_agent(args, trace_path) _run_tau2( tau2_repo=args.tau2_repo, @@ -455,7 +525,8 @@ def main() -> int: summary = { "run_label": args.run_label, "domain": args.domain, - "strategy_id": "memory_v2_experience_only", + "strategy_id": args.strategy_id, + "retrieval_mode": args.retrieval_mode, "corpus": corpus, "eval_results": str(eval_results), "retrieval_trace": str(trace_path), From d44b07cd674d8b04da3ff7fabff73257d234cad3 Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Wed, 13 May 2026 13:00:18 +0800 Subject: [PATCH 15/17] benchmark: support current tau2 runner api --- benchmark/tau2/scripts/run_memory_v2_eval.py | 36 ++++++++++++++++---- 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py index a1220a7e6..8de3927e5 100644 --- a/benchmark/tau2/scripts/run_memory_v2_eval.py +++ b/benchmark/tau2/scripts/run_memory_v2_eval.py @@ -3,6 +3,7 @@ import argparse import json +import shutil import sys import time from pathlib import 
Path @@ -40,11 +41,16 @@ def _add_tau2_to_path(tau2_repo: Path) -> None: def _save_to_arg(path: Path) -> str: - # TAU-2 run_domain appends ".json" to save_to. Keep our artifact paths - # stable by passing the stem when callers hand us a JSON path. + # Some TAU-2 versions append ".json"; newer versions treat save_to as a + # run directory and write results.json under it. return str(path.with_suffix("") if path.suffix == ".json" else path) +def _compat_results_path(path: Path) -> Path: + run_dir = path.with_suffix("") if path.suffix == ".json" else path + return run_dir / "results.json" + + def _reward(sim: dict[str, Any]) -> float: info = sim.get("reward_info") or {} value = info.get("reward", sim.get("reward", 0.0)) @@ -141,13 +147,17 @@ def _run_tau2( save_to: Path, ): _add_tau2_to_path(tau2_repo) - from tau2.data_model.simulation import RunConfig + from tau2.data_model.simulation import RunConfig, TextRunConfig from tau2.run import run_domain + compat_results = _compat_results_path(save_to) if save_to.exists(): save_to.unlink() - return run_domain( - RunConfig( + if compat_results.parent.is_dir(): + shutil.rmtree(compat_results.parent) + config_cls = TextRunConfig if getattr(RunConfig, "__origin__", None) is not None else RunConfig + result = run_domain( + config_cls( domain=domain, task_split_name=split, task_ids=task_ids, @@ -166,6 +176,9 @@ def _run_tau2( log_level="INFO", ) ) + if not save_to.exists() and compat_results.exists(): + shutil.copyfile(compat_results, save_to) + return result def _client(args: argparse.Namespace): @@ -452,7 +465,18 @@ def generate_next_message(self, message, state: LLMAgentState): return assistant_message, state if AGENT_NAME not in registry.get_agents(): - registry.register_agent(OpenVikingMemoryAgent, AGENT_NAME) + def create_openviking_memory_agent(tools, domain_policy, **kwargs): + return OpenVikingMemoryAgent( + tools=tools, + domain_policy=domain_policy, + llm=kwargs.get("llm"), + llm_args=kwargs.get("llm_args"), + ) + + if 
hasattr(registry, "register_agent"): + registry.register_agent(OpenVikingMemoryAgent, AGENT_NAME) + else: + registry.register_agent_factory(create_openviking_memory_agent, AGENT_NAME) def main() -> int: From 581594ac780613143eb114a92e1d11a71eb7efd4 Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Wed, 13 May 2026 14:30:03 +0800 Subject: [PATCH 16/17] benchmark: align tau2 memory prewrite parity --- benchmark/tau2/config/baseline.yaml | 3 +- benchmark/tau2/config/prewrite.yaml | 2 +- benchmark/tau2/scripts/run_eval.py | 9 ++ benchmark/tau2/scripts/run_memory_v2_eval.py | 124 +++++++++++++++---- 4 files changed, 113 insertions(+), 25 deletions(-) diff --git a/benchmark/tau2/config/baseline.yaml b/benchmark/tau2/config/baseline.yaml index 2dc8a9d2c..4c4a5060e 100644 --- a/benchmark/tau2/config/baseline.yaml +++ b/benchmark/tau2/config/baseline.yaml @@ -8,6 +8,7 @@ benchmark: repeat_count: 8 task_max_concurrency: 10 max_steps: 200 + seed: 300 agent: llm_agent user: user_simulator reasoning_effort: high @@ -49,4 +50,4 @@ strategies: train_required: true corpus_id: memory_v2_experience_only train_memory_mode: experience_only - retrieval_mode: prewrite + retrieval_mode: first_user_prewrite diff --git a/benchmark/tau2/config/prewrite.yaml b/benchmark/tau2/config/prewrite.yaml index e8b12d9cf..834963b41 100644 --- a/benchmark/tau2/config/prewrite.yaml +++ b/benchmark/tau2/config/prewrite.yaml @@ -10,4 +10,4 @@ strategies: train_required: true corpus_id: memory_v2_experience_only train_memory_mode: experience_only - retrieval_mode: prewrite + retrieval_mode: first_user_prewrite diff --git a/benchmark/tau2/scripts/run_eval.py b/benchmark/tau2/scripts/run_eval.py index 57a6069e8..5458ba61a 100755 --- a/benchmark/tau2/scripts/run_eval.py +++ b/benchmark/tau2/scripts/run_eval.py @@ -69,6 +69,7 @@ def _tau2_command( task_ids: list[str] | None, num_tasks: int | None, train_num_tasks: int | None, + seed: int, ) -> list[str] | None: benchmark = config["benchmark"] model = 
config["model"] @@ -139,6 +140,8 @@ def _tau2_command( str(openviking.get("retrieval_top_k", 4)), "--retrieval-mode", str(strategy.get("retrieval_mode", "first_user")), + "--seed", + str(seed), ] if task_ids: for task_id in task_ids: @@ -176,6 +179,8 @@ def _tau2_command( str(model["user_llm"]), "--save-to", run_label, + "--seed", + str(seed), ] command.extend(["--agent-llm-args", agent_llm_args]) @@ -202,6 +207,7 @@ def _build_plan( repeat_count_override: int | None, ) -> dict[str, Any]: repeat_count = repeat_count_override or int(config["benchmark"].get("repeat_count", 8)) + base_seed = int(config["benchmark"].get("seed", 300)) policy_report = simulator_policy_report(config) strategies = config.get("strategies") or [] if selected_strategy_ids: @@ -220,6 +226,7 @@ def _build_plan( split_path = split_file(config, domain) for strategy in strategies: for repeat_index in range(repeat_count): + seed = base_seed + repeat_index run_label = f"{configured_run_id}_{domain}_{strategy['id']}_r{repeat_index + 1}" command = _tau2_command( config, @@ -230,6 +237,7 @@ def _build_plan( task_ids=task_ids, num_tasks=num_tasks, train_num_tasks=train_num_tasks, + seed=seed, ) non_executable_reason = None if command is None: @@ -243,6 +251,7 @@ def _build_plan( "strategy_id": strategy["id"], "strategy_label": strategy.get("label", strategy["id"]), "repeat_index": repeat_index + 1, + "seed": seed, "run_label": run_label, "train_required": bool(strategy.get("train_required")), "memory_backend": strategy.get("memory_backend"), diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py index 8de3927e5..818802041 100644 --- a/benchmark/tau2/scripts/run_memory_v2_eval.py +++ b/benchmark/tau2/scripts/run_memory_v2_eval.py @@ -14,14 +14,20 @@ AGENT_NAME = "openviking_memory_agent" REPO_ROOT = Path(__file__).resolve().parents[3] -READ_TOOL_PREFIXES = ( - "get_", - "find_", - "list_", - "search_", - "calculate", - "think", - "transfer_", 
+WRITE_TOOL_PREFIXES = ( + "toggle_", + "enable_", + "disable_", + "set_", + "reset_", + "update_", + "modify_", + "cancel_", + "book_", + "exchange_", + "return_", + "grant_", + "reboot_", ) @@ -84,17 +90,29 @@ def _metrics(results_path: Path) -> dict[str, Any]: } +def _tool_call_name(tool_call: Any) -> str: + if isinstance(tool_call, dict): + return str(tool_call.get("name") or tool_call.get("function", {}).get("name") or "") + return str(getattr(tool_call, "name", "") or "") + + +def _tool_call_arguments(tool_call: Any) -> Any: + if isinstance(tool_call, dict): + return tool_call.get("arguments") or tool_call.get("function", {}).get("arguments") or {} + return getattr(tool_call, "arguments", {}) or {} + + def _is_write_tool_call(tool_call: Any) -> bool: - name = str(getattr(tool_call, "name", "") or "") - return bool(name) and not name.startswith(READ_TOOL_PREFIXES) + name = _tool_call_name(tool_call) + return bool(name) and name.startswith(WRITE_TOOL_PREFIXES) def _tool_call_query(tool_calls: list[Any], state_messages: list[Any]) -> str: rendered = [] for call in tool_calls: rendered.append( - f"{getattr(call, 'name', 'unknown_tool')}(" - f"{json.dumps(getattr(call, 'arguments', {}) or {}, ensure_ascii=False, sort_keys=True)}" + f"{_tool_call_name(call) or 'unknown_tool'}(" + f"{json.dumps(_tool_call_arguments(call), ensure_ascii=False, sort_keys=True)}" ")" ) recent_user = [ @@ -102,12 +120,18 @@ def _tool_call_query(tool_calls: list[Any], state_messages: list[Any]) -> str: for message in state_messages[-8:] if str(getattr(message, "role", "")) == "user" and str(getattr(message, "content", "") or "").strip() ] - return ( - "Before executing write-like tool call(s): " - + "; ".join(rendered) - + "\nRecent user context: " - + " | ".join(recent_user[-3:]) - ) + recent_observations = [ + str(getattr(message, "content", "") or "")[:600] + for message in state_messages[-12:] + if str(getattr(message, "role", "")) == "tool" and str(getattr(message, "content", "") or 
"").strip() + ] + parts = [ + "Before executing write-like tool call(s): " + "; ".join(rendered), + "Recent user context: " + " | ".join(recent_user[-3:]), + ] + if recent_observations: + parts.append("Recent tool observations: " + " | ".join(recent_observations[-4:])) + return "\n".join(parts) def _message_text(message: dict[str, Any]) -> tuple[str, str]: @@ -213,6 +237,37 @@ def _wait_task(client: Any, task_id: str | None, timeout: int) -> dict[str, Any] raise TimeoutError(f"OpenViking task {task_id} did not finish within {timeout}s: {last}") +def _probe_corpus(args: argparse.Namespace, client: Any) -> dict[str, Any]: + result = client.search( + query=f"{args.domain} customer service order reservation booking cancellation exchange return update", + target_uri=args.search_uri, + limit=args.retrieval_top_k, + ) + memories = list(getattr(result, "memories", []) or []) + reads = [] + for match in memories[: args.retrieval_top_k]: + uri = getattr(match, "uri", "") + text = "" + try: + text = client.read(uri) + except Exception: + text = getattr(match, "abstract", "") or getattr(match, "overview", "") or "" + reads.append( + { + "uri": uri, + "score": getattr(match, "score", None), + "text_chars": len(text), + "non_empty": bool(str(text).strip()), + } + ) + return { + "query": f"{args.domain} customer service order reservation booking cancellation exchange return update", + "match_count": len(memories), + "read_non_empty_count": sum(1 for row in reads if row["non_empty"]), + "matches": reads, + } + + def _train(args: argparse.Namespace, train_results: Path, corpus_manifest: Path) -> dict[str, Any]: if corpus_manifest.is_file() and not args.force_train: return json.loads(corpus_manifest.read_text()) @@ -268,6 +323,12 @@ def _train(args: argparse.Namespace, train_results: Path, corpus_manifest: Path) finally: client.close() + client = _client(args) + try: + corpus_probe = _probe_corpus(args, client) + finally: + client.close() + manifest = { "domain": args.domain, 
"train_results": str(train_results), @@ -280,6 +341,7 @@ def _train(args: argparse.Namespace, train_results: Path, corpus_manifest: Path) }, "committed_sessions": committed, "committed_session_count": len(committed), + "corpus_probe": corpus_probe, } _write_json(corpus_manifest, manifest) return manifest @@ -296,7 +358,7 @@ def _register_memory_agent(args: argparse.Namespace, trace_path: Path) -> None: class OpenVikingMemoryAgent(LLMAgent): def get_init_state(self, message_history=None): state = super().get_init_state(message_history) - if args.retrieval_mode == "first_user": + if args.retrieval_mode in {"first_user", "first_user_prewrite"}: state.system_messages.append( SystemMessage(role="system", content="") ) @@ -334,6 +396,15 @@ def _trace(self, event: dict[str, Any]) -> None: with trace_path.open("a", encoding="utf-8") as handle: handle.write(json.dumps(event, ensure_ascii=False, sort_keys=True) + "\n") + @staticmethod + def _trace_injection_fields(block: str, matches: list[dict[str, Any]]) -> dict[str, Any]: + injected_count = sum(1 for row in matches if int(row.get("text_chars") or 0) > 0) + return { + "injected": bool(block.strip()), + "injected_count": injected_count if block.strip() else 0, + "retrieval_action_taken": "retrieve_and_inject" if block.strip() else "retrieve_no_injection", + } + def _generate(self, messages): def _is_empty_assistant(response) -> bool: content = str(getattr(response, "content", "") or "") @@ -425,11 +496,12 @@ def generate_next_message(self, message, state: LLMAgentState): "query": query, "match_count": len(matches), "matches": matches, + **self._trace_injection_fields(block, matches), } ) assistant_message = self._generate(state.system_messages + state.messages) - if args.retrieval_mode == "prewrite": + if args.retrieval_mode in {"prewrite", "first_user_prewrite"}: tool_calls = list(getattr(assistant_message, "tool_calls", None) or []) write_calls = [call for call in tool_calls if _is_write_tool_call(call)] if write_calls: 
@@ -441,10 +513,11 @@ def generate_next_message(self, message, state: LLMAgentState): "query": query, "match_count": len(matches), "matches": matches, + **self._trace_injection_fields(block, matches), "tool_calls": [ { - "name": getattr(call, "name", ""), - "arguments": getattr(call, "arguments", {}) or {}, + "name": _tool_call_name(call), + "arguments": _tool_call_arguments(call), } for call in write_calls ], @@ -510,7 +583,11 @@ def main() -> int: parser.add_argument("--openviking-wait-timeout", type=int, default=600) parser.add_argument("--search-uri", required=True) parser.add_argument("--retrieval-top-k", type=int, default=4) - parser.add_argument("--retrieval-mode", choices=["first_user", "prewrite"], default="first_user") + parser.add_argument( + "--retrieval-mode", + choices=["first_user", "prewrite", "first_user_prewrite"], + default="first_user", + ) parser.add_argument("--force-train", action="store_true") args = parser.parse_args() normalize_litellm_env() @@ -551,6 +628,7 @@ def main() -> int: "domain": args.domain, "strategy_id": args.strategy_id, "retrieval_mode": args.retrieval_mode, + "seed": args.seed, "corpus": corpus, "eval_results": str(eval_results), "retrieval_trace": str(trace_path), From 14c43918877ef13a59a39e08fa22495e5f0a40ac Mon Sep 17 00:00:00 2001 From: huangruiteng Date: Wed, 13 May 2026 14:53:39 +0800 Subject: [PATCH 17/17] benchmark: make tau2 eval traces safer --- benchmark/tau2/scripts/run_memory_v2_eval.py | 54 ++++++++++---------- benchmark/tau2/scripts/tau2_common.py | 8 +++ 2 files changed, 36 insertions(+), 26 deletions(-) diff --git a/benchmark/tau2/scripts/run_memory_v2_eval.py b/benchmark/tau2/scripts/run_memory_v2_eval.py index 818802041..de5ef5441 100644 --- a/benchmark/tau2/scripts/run_memory_v2_eval.py +++ b/benchmark/tau2/scripts/run_memory_v2_eval.py @@ -237,6 +237,14 @@ def _wait_task(client: Any, task_id: str | None, timeout: int) -> dict[str, Any] raise TimeoutError(f"OpenViking task {task_id} did not finish within 
{timeout}s: {last}") +def _read_memory_text(client: Any, match: Any) -> tuple[str, str | None]: + try: + return client.read(getattr(match, "uri", "")), None + except Exception as exc: + fallback = getattr(match, "abstract", "") or getattr(match, "overview", "") or "" + return fallback, f"{type(exc).__name__}: {exc}" + + def _probe_corpus(args: argparse.Namespace, client: Any) -> dict[str, Any]: result = client.search( query=f"{args.domain} customer service order reservation booking cancellation exchange return update", @@ -247,19 +255,16 @@ def _probe_corpus(args: argparse.Namespace, client: Any) -> dict[str, Any]: reads = [] for match in memories[: args.retrieval_top_k]: uri = getattr(match, "uri", "") - text = "" - try: - text = client.read(uri) - except Exception: - text = getattr(match, "abstract", "") or getattr(match, "overview", "") or "" - reads.append( - { - "uri": uri, - "score": getattr(match, "score", None), - "text_chars": len(text), - "non_empty": bool(str(text).strip()), - } - ) + text, read_error = _read_memory_text(client, match) + row = { + "uri": uri, + "score": getattr(match, "score", None), + "text_chars": len(text), + "non_empty": bool(str(text).strip()), + } + if read_error: + row["read_error"] = read_error + reads.append(row) return { "query": f"{args.domain} customer service order reservation booking cancellation exchange return update", "match_count": len(memories), @@ -373,19 +378,16 @@ def _retrieve(self, query: str) -> tuple[str, list[dict[str, Any]]]: blocks = [] for index, match in enumerate(memories[: args.retrieval_top_k], 1): uri = getattr(match, "uri", "") - text = "" - try: - text = client.read(uri) - except Exception: - text = getattr(match, "abstract", "") or getattr(match, "overview", "") or "" - rows.append( - { - "uri": uri, - "score": getattr(match, "score", None), - "level": getattr(match, "level", None), - "text_chars": len(text), - } - ) + text, read_error = _read_memory_text(client, match) + row = { + "uri": uri, + 
"score": getattr(match, "score", None), + "level": getattr(match, "level", None), + "text_chars": len(text), + } + if read_error: + row["read_error"] = read_error + rows.append(row) if text.strip(): blocks.append(f"Memory {index} ({uri}):\n{text.strip()}") return "\n\n".join(blocks), rows diff --git a/benchmark/tau2/scripts/tau2_common.py b/benchmark/tau2/scripts/tau2_common.py index 15f504cf3..a8b5ce201 100755 --- a/benchmark/tau2/scripts/tau2_common.py +++ b/benchmark/tau2/scripts/tau2_common.py @@ -207,6 +207,9 @@ def _ensure_confirmation_aware_prompt(repo: Path) -> bool: text = path.read_text(encoding="utf-8") if _has_confirmation_aware_prompt(text): continue + backup = path.with_suffix(path.suffix + ".openviking.bak") + if not backup.exists(): + backup.write_text(text, encoding="utf-8") path.write_text(text.rstrip() + CONFIRMATION_AWARE_APPENDIX + "\n", encoding="utf-8") patched = True return patched @@ -252,6 +255,11 @@ def simulator_policy_report(config: dict[str, Any]) -> dict[str, Any]: "patch_applied": patch_applied, "patch_mode": patch_mode, "prompt_files": [str(path) for path in prompt_paths], + "backup_files": [ + str(path.with_suffix(path.suffix + ".openviking.bak")) + for path in prompt_paths + if path.with_suffix(path.suffix + ".openviking.bak").exists() + ], "claim_boundary": claim_boundary, }