diff --git a/bench/fixtures/enterpriseops-gym.json b/bench/fixtures/enterpriseops-gym.json new file mode 100644 index 0000000..34145fd --- /dev/null +++ b/bench/fixtures/enterpriseops-gym.json @@ -0,0 +1,103 @@ +[ + { + "task_id": "task_20251212_172511_458_e6427839_47076c15", + "domain": "itsm", + "system_prompt": "# ITSM Assistant Policy\n\nRole: ITSM Assistant (IT Service Management). Operate exclusively on confirmed user roles, verified record relationships, and ITIL/database integrity rules. Never assume or fabricate IDs, responses, or outcomes — rely solely on verified API results.", + "user_prompt": "An update came in from the caller regarding the printer connectivity issue for incident INC0000003. The user stated that several urgent client documents are still pending, and the inability to print is causing significant delays. Although the submission deadline has been extended by one business day, the service impact remains significant. Therefore increase the impact of the incident to high and update the necessary details.", + "selected_tools": [ + "find_sla_definitions", + "find_incident_slas", + "find_incident_by_number", + "get_user", + "update_incident", + "link_new_incident_sla" + ], + "restricted_tools": [], + "mcp_endpoint": "/mcp", + "number_of_runs": 1, + "reset_database_between_runs": true, + "gym_servers_config": [ + { + "mcp_server_name": "gym-itsm-mcp", + "mcp_server_url": "http://localhost:8006", + "seed_database_file": "Domain Wise DBs and Task-DB Mappings/itsm/dbs/db_1765301900121_3mwjj54xy.sql", + "context": { + "x-itsm-user-token": "admin_token_marcus_2024_secure" + }, + "user_info": { + "user_id": "USER_001", + "name": "Marcus Thompson", + "email": "marcus.thompson@techcorp.com" + } + } + ], + "verifiers": [ + { + "verifier_type": "database_state", + "name": "Verify if the priority of the incident is set correctly.", + "description": "Verify if the priority of the incident is set correctly.", + "gym_name": "gym-itsm-mcp", + "validation_config": { + "query": "SELECT COUNT(*) FROM incident WHERE incident_id = 'INC_003' AND impact = 'high' AND priority = 'high';", + "expected_value": 1, + "comparison_type": "equals" + } + }, + { + "verifier_type": "database_state", + "name": "Verify if the correct high-priority SLA is linked to the incident.", + "description": "Verify if the correct high-priority SLA is linked to the incident.", + "gym_name": "gym-itsm-mcp", + "validation_config": { + "query": "SELECT COUNT(*) FROM incident_sla WHERE incident_id = 'INC_003' AND sla_def_id = 'SLA_002';", + "expected_value": 1, + "comparison_type": "equals" + } + } + ] + }, + { + "task_id": "task_20251117_165528_648_bca89e7d_3e81ece9", + "domain": "calendar", + "system_prompt": "# Calendar Assistant Policy\n\nRole: Calendar Assistant. Create, update, and manage calendars and events strictly for the authenticated user. Never operate on calendars outside the user's access scope.", + "user_prompt": "Create a secondary (non-primary) calendar named 'Search Algorithm Beta' for bob_developer so the team can track the beta milestones separately from the main work calendar.", + "selected_tools": [ + "list_calendars", + "create_calendar", + "get_calendar", + "list_events" + ], + "restricted_tools": [], + "mcp_endpoint": "/mcp", + "number_of_runs": 1, + "reset_database_between_runs": true, + "gym_servers_config": [ + { + "mcp_server_name": "gym-calendar", + "mcp_server_url": "http://localhost:8003", + "seed_database_file": "Domain Wise DBs and Task-DB Mappings/calendar/dbs/db_1762868439331_kf914hbmw.sql", + "context": { + "x-access-token": "bob_developer_calendar_token" + }, + "user_info": { + "user_id": "bob_developer", + "name": "Bob Developer", + "email": "bob.developer@techcorp.com" + } + } + ], + "verifiers": [ + { + "verifier_type": "database_state", + "name": "Calendar Creation", + "description": "Verify secondary calendar created", + "gym_name": "gym-calendar", + "validation_config": { + "query": "SELECT COUNT(*) AS count FROM calendars WHERE summary = 'Search Algorithm Beta' AND is_primary = 0 AND user_id = 'bob_developer';", + "expected_value": 1, + "comparison_type": "equals" + } + } + ] + } +] diff --git a/bench/scripts/enterpriseops_gym_judge.py b/bench/scripts/enterpriseops_gym_judge.py new file mode 100644 index 0000000..e8f9efd --- /dev/null +++ b/bench/scripts/enterpriseops_gym_judge.py @@ -0,0 +1,198 @@ +# EnterpriseOps-Gym judge driver. One subcommand the TS adapter shells out to: +# judge --task-json PATH (agent tool-call transcript on stdin) +# -> {"success":bool,"passes":int,"total":int,"verifiers":[{name,passed}...]} +# +# This is the benchmark's OWN evaluation contract (ServiceNow/EnterpriseOps-Gym +# benchmark/verifier.py): tasks are scored on FINAL DATABASE STATE, not action +# sequences. The agent's tool-call transcript is replayed against the live, +# freshly-seeded gym MCP server (one HTTP POST per call to the server's /mcp +# endpoint); then each database_state verifier's SQL is executed via the gym +# server's /api/sql-runner endpoint and compared to expected_value under +# comparison_type (equals / greater_than / less_than / contains) — exactly the +# VerifierEngine._compare_values semantics. Per-task success = ALL verifiers pass +# (overall_success_rate); the fraction passing is the verifier_level_pass_rate. +# +# JSON is emitted as the LAST stdout line. Fail loud: an unreachable gym server, +# a non-database_state verifier we cannot run deterministically, or a malformed +# transcript prints {"error": "..."} and exits nonzero — never a fabricated score. + +import argparse +import json +import sys +import urllib.error +import urllib.request + + +def fail(msg: str) -> None: + print(json.dumps({"error": msg})) + sys.exit(1) + + +def post_json(url: str, headers: dict, payload: dict, timeout: float = 30.0) -> dict: + body = json.dumps(payload).encode("utf-8") + req = urllib.request.Request(url, data=body, method="POST") + req.add_header("Content-Type", "application/json") + for k, v in headers.items(): + req.add_header(k, str(v)) + with urllib.request.urlopen(req, timeout=timeout) as resp: + raw = resp.read().decode("utf-8") + return json.loads(raw) if raw.strip() else {} + + +def server_for_gym(servers: list, gym_name: str) -> dict: + # The verifier names its target gym (gym_name); match it to the configured + # server. A single-gym task falls back to the only configured server. + by_name = {s.get("mcp_server_name"): s for s in servers} + if gym_name in by_name: + return by_name[gym_name] + if len(servers) == 1: + return servers[0] + fail(f"verifier gym_name {gym_name!r} not in gym_servers_config ({sorted(by_name)})") + + +def replay_transcript(servers: list, transcript: list) -> None: + # Replay each agent tool call against the live gym MCP server so the final DB + # state reflects the agent's actions. The transcript is a list of + # {"tool":..., "arguments":{...}, "gym_name"?:...} entries (the worker's + # ordered tool calls). The server applies each call to its database; the + # verifiers then read the resulting state. An empty transcript leaves the + # seeded state untouched (the no-op-agent baseline → verifiers fail closed). + if not transcript: + return + for i, call in enumerate(transcript): + tool = call.get("tool") + if not isinstance(tool, str) or not tool: + fail(f"transcript[{i}] has no string 'tool' field: {call!r}") + gym_name = call.get("gym_name") + server = server_for_gym(servers, gym_name) if gym_name else servers[0] + url = server["mcp_server_url"].rstrip("/") + "/mcp" + headers = dict(server.get("context") or {}) + payload = {"tool": tool, "arguments": call.get("arguments") or {}} + try: + post_json(url, headers, payload) + except urllib.error.URLError as e: + fail(f"gym server unreachable at {url} replaying tool {tool!r}: {e}") + except Exception as e: # noqa: BLE001 + fail(f"tool call {tool!r} failed against {url}: {e}") + + +def run_sql(server: dict, query: str) -> object: + url = server["mcp_server_url"].rstrip("/") + "/api/sql-runner" + headers = dict(server.get("context") or {}) + try: + out = post_json(url, headers, {"query": query}) + except urllib.error.URLError as e: + fail(f"gym server unreachable at {url}: {e}") + except Exception as e: # noqa: BLE001 + fail(f"sql-runner POST to {url} failed: {e}") + # The sql-runner returns the scalar/first-cell result. Accept the common + # shapes (a bare value, {"result": v}, or rows -> first cell) without faking. + if isinstance(out, dict): + if "result" in out: + return out["result"] + if "rows" in out and out["rows"]: + first = out["rows"][0] + if isinstance(first, dict) and first: + return next(iter(first.values())) + if isinstance(first, list) and first: + return first[0] + if "error" in out: + fail(f"sql-runner error for query {query!r}: {out['error']}") + return out + + +def compare(actual: object, expected: object, comparison_type: str) -> bool: + # Mirrors VerifierEngine._compare_values: equals / greater_than / less_than / + # contains. Numeric comparisons coerce both sides to float; equals coerces to + # match the SQL COUNT(*) integer against the JSON expected_value. + if comparison_type == "equals": + try: + return float(actual) == float(expected) + except (TypeError, ValueError): + return str(actual) == str(expected) + if comparison_type == "greater_than": + return float(actual) > float(expected) + if comparison_type == "less_than": + return float(actual) < float(expected) + if comparison_type == "contains": + return str(expected) in str(actual) + fail(f"unsupported comparison_type {comparison_type!r}") + + +def cmd_judge(args) -> None: + raw_transcript = sys.stdin.read() + try: + task = json.load(open(args.task_json, encoding="utf-8")) + except Exception as e: # noqa: BLE001 + fail(f"reading task json {args.task_json} failed: {e}") + + servers = task.get("gym_servers_config") + if isinstance(servers, str): + servers = json.loads(servers) + if not isinstance(servers, list) or not servers: + fail("task has no gym_servers_config list") + + verifiers = task.get("verifiers") + if isinstance(verifiers, str): + verifiers = json.loads(verifiers) + if not isinstance(verifiers, list) or not verifiers: + fail("task has no verifiers list") + + # The transcript is JSON: either a list of tool-call objects or {"calls":[...]}. + transcript: list = [] + if raw_transcript.strip(): + try: + parsed = json.loads(raw_transcript) + except Exception as e: # noqa: BLE001 + fail(f"transcript is not valid JSON: {e}") + transcript = parsed.get("calls", []) if isinstance(parsed, dict) else parsed + if not isinstance(transcript, list): + fail("transcript must be a JSON list of tool calls (or {\"calls\":[...]})") + + replay_transcript(servers, transcript) + + results = [] + passes = 0 + for v in verifiers: + vtype = v.get("verifier_type") + if vtype != "database_state": + # Only the SQL state-checker is deterministic+deployable here; a + # response_check / tool_execution verifier needs the live agent loop, + # not this state replay. Fail loud rather than skip-and-inflate. + fail(f"verifier {v.get('name')!r} has non-deterministic verifier_type {vtype!r}") + cfg = v.get("validation_config") or {} + query = cfg.get("query") + if not isinstance(query, str) or not query.strip(): + fail(f"verifier {v.get('name')!r} has no SQL query") + server = server_for_gym(servers, v.get("gym_name")) + actual = run_sql(server, query) + ok = compare(actual, cfg.get("expected_value"), cfg.get("comparison_type", "equals")) + if ok: + passes += 1 + results.append({"name": v.get("name"), "passed": bool(ok)}) + + total = len(verifiers) + print( + json.dumps( + { + "success": passes == total, + "passes": passes, + "total": total, + "verifiers": results, + } + ) + ) + + +def main() -> None: + ap = argparse.ArgumentParser(description="EnterpriseOps-Gym judge driver") + sub = ap.add_subparsers(dest="cmd", required=True) + p = sub.add_parser("judge") + p.add_argument("--task-json", required=True) + args = ap.parse_args() + if args.cmd == "judge": + cmd_judge(args) + + +if __name__ == "__main__": + main() diff --git a/bench/src/adapters.ts b/bench/src/adapters.ts index 16abfdf..1b7c4fc 100644 --- a/bench/src/adapters.ts +++ b/bench/src/adapters.ts @@ -11,6 +11,7 @@ import { createCadBenchAdapter } from './benchmarks/cadbench' import { createCadDesignAdapter } from './benchmarks/cad-design' import { createCadGenBenchAdapter } from './benchmarks/cadgenbench' import { createCommit0Adapter } from './benchmarks/commit0' +import { createEnterpriseOpsGymAdapter } from './benchmarks/enterpriseops-gym' import { createFinsearchcompAdapter } from './benchmarks/finsearchcomp' import { createFramesAdapter } from './benchmarks/frames' import { createHotpotqaAdapter } from './benchmarks/hotpotqa' @@ -31,6 +32,7 @@ export const ADAPTERS: Record BenchmarkAdapter> = { commit0: createCommit0Adapter, programbench: createProgrambenchAdapter, appworld: createAppWorldAdapter, + 'enterpriseops-gym': createEnterpriseOpsGymAdapter, 'cad-design': createCadDesignAdapter, cadbench: createCadBenchAdapter, cadgenbench: createCadGenBenchAdapter, diff --git a/bench/src/benchmarks/enterpriseops-gym.test.mts b/bench/src/benchmarks/enterpriseops-gym.test.mts new file mode 100644 index 0000000..6ce7b68 --- /dev/null +++ b/bench/src/benchmarks/enterpriseops-gym.test.mts @@ -0,0 +1,77 @@ +/** + * Offline EnterpriseOps-Gym adapter test. The judge needs a live, freshly-seeded + * gym MCP server (Docker), not installed in CI, so this exercises the parts that + * run offline (fixtures loadTasks, the transcript OutputAdapter, goldArtifact) and + * asserts the judge FAILS LOUD with the documented docker fix when no server is + * reachable — never a fake score. Run: + * EOPS_FIXTURES=1 npx tsx --test src/benchmarks/enterpriseops-gym.test.mts + */ +import assert from 'node:assert/strict' +import { test } from 'node:test' +import { createEnterpriseOpsGymAdapter, enterpriseOpsTranscriptOutput } from './enterpriseops-gym' + +process.env.EOPS_FIXTURES = '1' + +type Events = Parameters[0] +const stream = (text: string): Events => [{ data: { finalText: text } }] as unknown as Events + +const itsmId = 'task_20251212_172511_458_e6427839_47076c15' + +test('loadTasks (fixtures) yields enterprise tasks with tool-list + SQL-verifier metadata', async () => { + const a = createEnterpriseOpsGymAdapter() + const tasks = await a.loadTasks({ ids: [itsmId] }) + assert.equal(tasks.length, 1) + const t = tasks[0] + assert.equal(t.id, itsmId) + assert.equal(t.split, 'itsm') + assert.match(t.prompt, /```json/) + assert.match(t.prompt, /update_incident/) + const md = t.metadata as Record + assert.equal(md.taskId, itsmId) + assert.equal(md.domain, 'itsm') + assert.ok(Array.isArray(md.servers) && (md.servers as unknown[]).length === 1) + const verifiers = md.verifiers as Array<{ verifier_type: string; validation_config: { query: string } }> + assert.equal(verifiers.length, 2) + assert.equal(verifiers[0].verifier_type, 'database_state') + assert.match(verifiers[0].validation_config.query, /SELECT COUNT/) +}) + +test('loadTasks scopes by domain split and limit', async () => { + const a = createEnterpriseOpsGymAdapter() + const cal = await a.loadTasks({ split: 'calendar' }) + assert.equal(cal.length, 1) + assert.equal(cal[0].split, 'calendar') + const capped = await a.loadTasks({ limit: 1 }) + assert.equal(capped.length, 1) +}) + +test('transcript OutputAdapter: last fenced ```json wins; fence-less falls back to trimmed text', () => { + const fenced = enterpriseOpsTranscriptOutput.parse( + stream('preamble\n```json\n{"calls":[{"tool":"update_incident","arguments":{}}]}\n```\n'), + ) + assert.equal(fenced, '{"calls":[{"tool":"update_incident","arguments":{}}]}') + const last = enterpriseOpsTranscriptOutput.parse(stream('```json\nFIRST\n```\nmid\n```json\nSECOND\n```')) + assert.equal(last, 'SECOND') + const raw = enterpriseOpsTranscriptOutput.parse(stream(' {"calls":[]} ')) + assert.equal(raw, '{"calls":[]}') +}) + +test('goldArtifact is undefined — oracle is the seeded DB state, documented, not a fabricated transcript', async () => { + const a = createEnterpriseOpsGymAdapter() + const [t] = await a.loadTasks({ ids: [itsmId] }) + assert.equal(await a.goldArtifact(t), undefined) +}) + +test('judge FAILS LOUD with the docker fix when no gym server is reachable (no fake score)', async () => { + const a = createEnterpriseOpsGymAdapter() + const [t] = await a.loadTasks({ ids: [itsmId] }) + // Point the metadata at a definitely-dead port so the SQL-runner POST is refused. + const md = t.metadata as Record + const servers = md.servers as Array<{ mcp_server_url: string }> + servers[0].mcp_server_url = 'http://127.0.0.1:1' + await assert.rejects(a.judge(t, '{"calls":[]}'), (e: Error) => { + assert.match(e.message, /enterpriseops-gym judge failed/) + assert.match(e.message, /docker pull shivakrishnareddyma225\/enterpriseops-gym-mcp-itsm/) + return true + }) +}) diff --git a/bench/src/benchmarks/enterpriseops-gym.ts b/bench/src/benchmarks/enterpriseops-gym.ts new file mode 100644 index 0000000..51f56f8 --- /dev/null +++ b/bench/src/benchmarks/enterpriseops-gym.ts @@ -0,0 +1,332 @@ +/** + * EnterpriseOps-Gym adapter (ServiceNow-AI/EnterpriseOps-Gym, Apache-2.0) — + * stateful agentic planning + tool use in enterprise settings. Each record is an + * enterprise-ops task (Customer Service, HR, ITSM, Calendar, Email, Drive, Teams, + * Hybrid) handed to the agent with a domain `system_prompt`, a `selected_tools` + * allow-list, and one or more containerized gym MCP servers (`gym_servers_config`). + * Worker artifact = the ordered tool-call transcript the agent would issue against + * those servers, emitted as a single fenced ```json block of + * `{ "calls": [ { "tool": ..., "arguments": {...}, "gym_name"?: ... } ] }`. + * + * Judge = the benchmark's OWN deterministic state-checker. The driver replays the + * transcript against a freshly-seeded gym server (mutating its database), then runs + * each task's `database_state` verifier — an SQL SELECT executed via the gym + * server's /api/sql-runner endpoint, compared to `expected_value` under + * `comparison_type` (equals/greater_than/less_than/contains). GRADED: score = + * (verifiers passing) / (total verifiers) = the bench's verifier_level_pass_rate; + * binary `resolved` = ALL verifiers pass = the bench's overall_success_rate. Fully + * deterministic — no LLM judge. + * + * loadTasks enumerates the real suite from the HF rows server (config = tool-set + * MODE oracle|plus_5_tools|plus_10_tools|plus_15_tools; split = DOMAIN); a committed + * sample (bench/fixtures/enterpriseops-gym.json) loads offline. + * + * Requires for a LIVE judge run: a Docker daemon with the domain gym images + * (`docker pull shivakrishnareddyma225/enterpriseops-gym-mcp-:latest`) up + * on the ports in `gym_servers_config`, seeded from gym_dbs.zip. preflight + judge + * fail loud with the exact pull/run/unzip step when a server is unreachable — never + * a fabricated score. No portable gold artifact ships, so goldArtifact is undefined. + */ + +import { mkdir, readFile, writeFile } from 'node:fs/promises' +import { join } from 'node:path' +import type { OutputAdapter } from '@tangle-network/agent-runtime/loops' +import { benchRoot, runVenvScriptStdin } from './_harness' +import type { BenchmarkAdapter, BenchScore, BenchTask, LoadOptions } from './types' + +const FIXTURES = join(benchRoot, 'fixtures', 'enterpriseops-gym.json') +const JUDGE = join(benchRoot, 'scripts', 'enterpriseops_gym_judge.py') + +const DATASET = 'ServiceNow-AI/EnterpriseOps-Gym' +/** Tool-set mode = HF config; oracle ships exact tools, plus_N adds N distractors. */ +const DEFAULT_MODE = 'oracle' +/** Domain = HF split. */ +const DEFAULT_DOMAIN = 'itsm' + +const rowsApi = (mode: string, domain: string) => + `https://datasets-server.huggingface.co/rows?dataset=${encodeURIComponent(DATASET)}&config=${encodeURIComponent(mode)}&split=${encodeURIComponent(domain)}` + +interface GymServerConfig { + mcp_server_name: string + mcp_server_url: string + seed_database_file: string + context?: Record + user_info?: Record +} + +interface Verifier { + verifier_type: string + name: string + description?: string + gym_name: string + validation_config: { query: string; expected_value: unknown; comparison_type: string } +} + +interface EopsRow { + task_id: string + domain: string + system_prompt: string + user_prompt: string + selected_tools: string[] + restricted_tools: string[] + mcp_endpoint: string + number_of_runs: number + reset_database_between_runs: boolean + /** HF parquet stores these as JSON strings; fixtures store them as parsed arrays. */ + gym_servers_config: string | GymServerConfig[] + verifiers: string | Verifier[] +} + +interface EopsMeta { + taskId: string + domain: string + mode: string + selectedTools: string[] + servers: GymServerConfig[] + verifiers: Verifier[] +} + +/** Worker transcript = the last fenced ```json block, else the raw text. */ +export const enterpriseOpsTranscriptOutput: OutputAdapter = { + parse(events) { + let text = '' + for (const ev of events) { + const d = (ev as { data?: Record })?.data + const t = d?.finalText ?? d?.text ?? d?.result + if (typeof t === 'string' && t.length > 0) text = t + } + const fences = [...text.matchAll(/```(?:json)?\s*\n([\s\S]*?)```/g)] + return (fences.at(-1)?.[1] ?? text).trim() + }, +} + +function asArray(v: string | T[]): T[] { + return typeof v === 'string' ? (JSON.parse(v) as T[]) : v +} + +function workerContract(tools: string[]): string { + return [ + '', + `You have exactly these tools available (call NO others): ${tools.join(', ')}.`, + 'Plan the full sequence of tool calls that brings the enterprise database to the required final state, honoring every policy in the role above.', + 'Emit your COMPLETE plan as the LAST thing in your reply, in a single fenced ```json block, as an object:', + '{ "calls": [ { "tool": "", "arguments": { ... } } ] }', + 'Include one entry per tool call in execution order. Nothing after the closing fence.', + ].join('\n') +} + +function rowToTask(row: EopsRow, mode: string): BenchTask { + const servers = asArray(row.gym_servers_config) + const verifiers = asArray(row.verifiers) + const meta: EopsMeta = { + taskId: row.task_id, + domain: row.domain, + mode, + selectedTools: row.selected_tools, + servers, + verifiers, + } + return { + id: row.task_id, + split: row.domain, + prompt: [row.system_prompt, '', row.user_prompt, workerContract(row.selected_tools)].join('\n'), + metadata: meta as unknown as Record, + } +} + +function readMeta(task: BenchTask): EopsMeta { + const md = task.metadata + if ( + !md || + typeof md.taskId !== 'string' || + !Array.isArray(md.servers) || + !Array.isArray(md.verifiers) || + (md.verifiers as unknown[]).length === 0 + ) { + throw new Error(`enterpriseops-gym task ${task.id} missing metadata — loadTasks did not populate it`) + } + return md as unknown as EopsMeta +} + +function selectRows(rows: EopsRow[], mode: string, opts: LoadOptions): BenchTask[] { + let tasks = rows.map((r) => rowToTask(r, mode)) + if (opts.ids) { + const want = new Set(opts.ids) + tasks = tasks.filter((t) => want.has(t.id)) + } else if (opts.limit !== undefined) { + tasks = tasks.slice(0, opts.limit) + } + return tasks +} + +async function loadFixtures(mode: string, opts: LoadOptions): Promise { + const rows = JSON.parse(await readFile(FIXTURES, 'utf8')) as EopsRow[] + console.warn( + `[enterpriseops-gym] EOPS_FIXTURES=1 — loading ${rows.length} committed sample rows from ${FIXTURES} (no HF fetch)`, + ) + const domain = opts.split + const scoped = domain ? rows.filter((r) => r.domain === domain) : rows + return selectRows(scoped, mode, opts) +} + +/** Pull real rows from the HF rows server (paged) for one mode/domain. Throws loud on a non-OK response. */ +async function fetchRows(mode: string, domain: string, opts: LoadOptions): Promise { + const target = opts.ids ? opts.ids.length * 4 : (opts.limit ?? 16) + const rows: EopsRow[] = [] + const want = opts.ids ? new Set(opts.ids) : null + const page = 100 + const base = rowsApi(mode, domain) + for (let offset = 0; offset < 1024 && rows.length < target; offset += page) { + const res = await fetch(`${base}&offset=${offset}&length=${page}`) + if (!res.ok) { + throw new Error(`enterpriseops-gym rows HTTP ${res.status} (offset ${offset}): ${(await res.text()).slice(0, 200)}`) + } + const body = (await res.json()) as { rows?: Array<{ row: EopsRow }> } + const got = body.rows ?? [] + if (got.length === 0) break + for (const r of got) { + if (want && !want.has(r.row.task_id)) continue + rows.push(r.row) + } + if (got.length < page) break + } + if (rows.length === 0) throw new Error(`enterpriseops-gym: no rows matched ${JSON.stringify(opts)} for ${mode}/${domain}`) + return rows +} + +/** + * Run the benchmark's own state-checker for one task over the worker's transcript. + * The driver replays the tool calls against the live gym server, runs each + * database_state verifier's SQL via /api/sql-runner, and reports {passes,total}. + * Score = passes/total (verifier_level_pass_rate); resolved = all pass + * (overall_success_rate). This is the expensive Docker-backed boundary — delegated + * to the python driver, not reimplemented. The transcript is piped on stdin via the + * shared stdin-aware runner (execFile's `input` is not honored async and hangs the + * reader). + */ +async function runJudge(meta: EopsMeta, artifact: string): Promise { + // Stage the full task record (servers + verifiers) so the driver has the live + // server URLs/contexts and the SQL it must run. + const taskJsonPath = join(benchRoot, '.eops-task-cache', `${meta.taskId}.json`) + await writeTaskCache(taskJsonPath, meta) + let stdout: string + try { + stdout = await runVenvScriptStdin(JUDGE, ['judge', '--task-json', taskJsonPath], artifact, { cwd: benchRoot }) + } catch (err) { + const e = err as { message?: string } + throw new Error( + `enterpriseops-gym judge failed for ${meta.taskId}: ${(e.message || String(err)).slice(0, 1500)}\n` + + `Fix: ensure the ${meta.domain} gym server is running — ` + + `docker pull shivakrishnareddyma225/enterpriseops-gym-mcp-${meta.domain}:latest ; ` + + `docker run -d -p : shivakrishnareddyma225/enterpriseops-gym-mcp-${meta.domain}:latest ; ` + + `seed it from gym_dbs.zip (unzip into the gym's data dir).`, + ) + } + const report = JSON.parse(stdout.trim().split('\n').at(-1) ?? '{}') as { + success?: boolean + passes?: number + total?: number + error?: string + } + if (report.error) throw new Error(`enterpriseops-gym judge error for ${meta.taskId}: ${report.error}`) + if (typeof report.passes !== 'number' || typeof report.total !== 'number') { + throw new Error(`enterpriseops-gym judge returned no {passes,total}: ${stdout.slice(0, 400)}`) + } + const score = report.total > 0 ? report.passes / report.total : 0 + return { + resolved: report.success === true && report.total > 0 && report.passes === report.total, + score, + detail: JSON.stringify({ taskId: meta.taskId, domain: meta.domain, passes: report.passes, total: report.total }), + } +} + +async function writeTaskCache(path: string, meta: EopsMeta): Promise { + await mkdir(join(path, '..'), { recursive: true }) + await writeFile( + path, + JSON.stringify({ task_id: meta.taskId, domain: meta.domain, gym_servers_config: meta.servers, verifiers: meta.verifiers }), + ) +} + +/** Ping the configured gym servers' SQL endpoint; throw loud with the docker fix if any is unreachable. */ +async function probeServers(servers: GymServerConfig[], domain: string): Promise { + for (const s of servers) { + const url = `${s.mcp_server_url.replace(/\/$/, '')}/api/sql-runner` + try { + // A HEAD/empty POST just proves reachability; a real query runs in judge. + const res = await fetch(url, { method: 'POST', headers: { 'content-type': 'application/json' }, body: '{}' }) + // Any HTTP response (even an error status) proves the server is up. Only a + // transport failure (refused/ENOTFOUND) means the container is not running. + void res.status + } catch (err) { + throw new Error( + `enterpriseops-gym preflight: ${s.mcp_server_name} unreachable at ${url}: ${err instanceof Error ? err.message : err}\n` + + `Fix: docker pull shivakrishnareddyma225/enterpriseops-gym-mcp-${domain}:latest ; ` + + `docker run -d -p : shivakrishnareddyma225/enterpriseops-gym-mcp-${domain}:latest ; ` + + `seed from gym_dbs.zip. Set EOPS_FIXTURES=1 to load sample tasks offline (judge still needs a live server).`, + ) + } + } +} + +export function createEnterpriseOpsGymAdapter(): BenchmarkAdapter { + const fixturesMode = process.env.EOPS_FIXTURES === '1' + const mode = process.env.EOPS_MODE ?? DEFAULT_MODE + + return { + name: 'enterpriseops-gym', + output: enterpriseOpsTranscriptOutput, + + async preflight() { + // Fixtures mode proves only that the sample file is readable; a live judge + // still requires running gym servers (and fails loud there if absent). + if (fixturesMode) { + await readFile(FIXTURES, 'utf8').catch((err) => { + throw new Error(`EOPS_FIXTURES=1 but ${FIXTURES} unreadable: ${err instanceof Error ? err.message : err}`) + }) + return + } + // Live mode: probe the gym servers for the default domain (the suite's tasks + // each carry their own server config; preflight verifies reachability up + // front so a batch fails fast with the docker fix rather than mid-run). + const sample = await loadFixtures(mode, { split: DEFAULT_DOMAIN, limit: 1 }).catch(() => []) + const servers = sample[0] ? readMeta(sample[0]).servers : [] + if (servers.length === 0) { + throw new Error( + `enterpriseops-gym preflight: no gym_servers_config to probe. ` + + `Set EOPS_FIXTURES=1 to load offline, or ensure the dataset rows carry gym_servers_config.`, + ) + } + await probeServers(servers, DEFAULT_DOMAIN) + }, + + async loadTasks(opts: LoadOptions = {}) { + const domain = opts.split ?? DEFAULT_DOMAIN + if (fixturesMode) return loadFixtures(mode, opts) + let rows: EopsRow[] + try { + rows = await fetchRows(mode, domain, opts) + } catch (err) { + console.warn( + `[enterpriseops-gym] live rows fetch failed (${err instanceof Error ? err.message : err}); falling back to committed sample at ${FIXTURES}`, + ) + return loadFixtures(mode, opts) + } + return selectRows(rows, mode, opts) + }, + + async goldArtifact() { + // The benchmark ships no portable per-task oracle transcript — the reference + // is the seeded final DB state the verifiers check, not a tool-call script. + // Judge correctness is proven by replaying a real solve against the live gym + // server, not by a synthetic gold transcript. Returns undefined (documented, + // not faked). + return undefined + }, + + async judge(task: BenchTask, artifact: string): Promise { + const meta = readMeta(task) + return runJudge(meta, artifact) + }, + } +}