Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 103 additions & 0 deletions bench/fixtures/enterpriseops-gym.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
[
{
"task_id": "task_20251212_172511_458_e6427839_47076c15",
"domain": "itsm",
"system_prompt": "# ITSM Assistant Policy\n\nRole: ITSM Assistant (IT Service Management). Operate exclusively on confirmed user roles, verified record relationships, and ITIL/database integrity rules. Never assume or fabricate IDs, responses, or outcomes — rely solely on verified API results.",
"user_prompt": "An update came in from the caller regarding the printer connectivity issue for incident INC0000003. The user stated that several urgent client documents are still pending, and the inability to print is causing significant delays. Although the submission deadline has been extended by one business day, the service impact remains significant. Therefore increase the impact of the incident to high and update the necessary details.",
"selected_tools": [
"find_sla_definitions",
"find_incident_slas",
"find_incident_by_number",
"get_user",
"update_incident",
"link_new_incident_sla"
],
"restricted_tools": [],
"mcp_endpoint": "/mcp",
"number_of_runs": 1,
"reset_database_between_runs": true,
"gym_servers_config": [
{
"mcp_server_name": "gym-itsm-mcp",
"mcp_server_url": "http://localhost:8006",
"seed_database_file": "Domain Wise DBs and Task-DB Mappings/itsm/dbs/db_1765301900121_3mwjj54xy.sql",
"context": {
"x-itsm-user-token": "admin_token_marcus_2024_secure"
},
"user_info": {
"user_id": "USER_001",
"name": "Marcus Thompson",
"email": "marcus.thompson@techcorp.com"
}
}
],
"verifiers": [
{
"verifier_type": "database_state",
"name": "Verify if the priority of the incident is set correctly.",
"description": "Verify if the priority of the incident is set correctly.",
"gym_name": "gym-itsm-mcp",
"validation_config": {
"query": "SELECT COUNT(*) FROM incident WHERE incident_id = 'INC_003' AND impact = 'high' AND priority = 'high';",
"expected_value": 1,
"comparison_type": "equals"
}
},
{
"verifier_type": "database_state",
"name": "Verify if the correct high-priority SLA is linked to the incident.",
"description": "Verify if the correct high-priority SLA is linked to the incident.",
"gym_name": "gym-itsm-mcp",
"validation_config": {
"query": "SELECT COUNT(*) FROM incident_sla WHERE incident_id = 'INC_003' AND sla_def_id = 'SLA_002';",
"expected_value": 1,
"comparison_type": "equals"
}
}
]
},
{
"task_id": "task_20251117_165528_648_bca89e7d_3e81ece9",
"domain": "calendar",
"system_prompt": "# Calendar Assistant Policy\n\nRole: Calendar Assistant. Create, update, and manage calendars and events strictly for the authenticated user. Never operate on calendars outside the user's access scope.",
"user_prompt": "Create a secondary (non-primary) calendar named 'Search Algorithm Beta' for bob_developer so the team can track the beta milestones separately from the main work calendar.",
"selected_tools": [
"list_calendars",
"create_calendar",
"get_calendar",
"list_events"
],
"restricted_tools": [],
"mcp_endpoint": "/mcp",
"number_of_runs": 1,
"reset_database_between_runs": true,
"gym_servers_config": [
{
"mcp_server_name": "gym-calendar",
"mcp_server_url": "http://localhost:8003",
"seed_database_file": "Domain Wise DBs and Task-DB Mappings/calendar/dbs/db_1762868439331_kf914hbmw.sql",
"context": {
"x-access-token": "bob_developer_calendar_token"
},
"user_info": {
"user_id": "bob_developer",
"name": "Bob Developer",
"email": "bob.developer@techcorp.com"
}
}
],
"verifiers": [
{
"verifier_type": "database_state",
"name": "Calendar Creation",
"description": "Verify secondary calendar created",
"gym_name": "gym-calendar",
"validation_config": {
"query": "SELECT COUNT(*) AS count FROM calendars WHERE summary = 'Search Algorithm Beta' AND is_primary = 0 AND user_id = 'bob_developer';",
"expected_value": 1,
"comparison_type": "equals"
}
}
]
}
]
198 changes: 198 additions & 0 deletions bench/scripts/enterpriseops_gym_judge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
# EnterpriseOps-Gym judge driver. One subcommand the TS adapter shells out to:
# judge --task-json PATH (agent tool-call transcript on stdin)
# -> {"success":bool,"passes":int,"total":int,"verifiers":[{name,passed}...]}
#
# This is the benchmark's OWN evaluation contract (ServiceNow/EnterpriseOps-Gym
# benchmark/verifier.py): tasks are scored on FINAL DATABASE STATE, not action
# sequences. The agent's tool-call transcript is replayed against the live,
# freshly-seeded gym MCP server (one HTTP POST per call to the server's /mcp
# endpoint); then each database_state verifier's SQL is executed via the gym
# server's /api/sql-runner endpoint and compared to expected_value under
# comparison_type (equals / greater_than / less_than / contains) — exactly the
# VerifierEngine._compare_values semantics. Per-task success = ALL verifiers pass
# (overall_success_rate); the fraction passing is the verifier_level_pass_rate.
#
# JSON is emitted as the LAST stdout line. Fail loud: an unreachable gym server,
# a non-database_state verifier we cannot run deterministically, or a malformed
# transcript prints {"error": "..."} and exits nonzero — never a fabricated score.

import argparse
import json
import sys
import urllib.error
import urllib.request


def fail(msg: str) -> None:
    """Report a fatal error and terminate the process.

    Prints ``{"error": msg}`` as a single JSON line on stdout, then exits with
    status 1. This function never returns to its caller.
    """
    error_line = json.dumps({"error": msg})
    print(error_line)
    raise SystemExit(1)


def post_json(url: str, headers: dict, payload: dict, timeout: float = 30.0) -> dict:
    """POST ``payload`` as a JSON body to ``url`` and decode the JSON reply.

    Args:
        url: Absolute URL to POST to.
        headers: Extra request headers; each value is stringified before use.
        payload: Object serialized with ``json.dumps`` as the request body.
        timeout: Timeout in seconds handed to ``urlopen``.

    Returns:
        The decoded JSON response, or ``{}`` when the response body is empty
        or whitespace-only.

    Raises:
        Whatever ``urllib.request`` / ``json.loads`` raise; nothing is caught
        here.
    """
    encoded = json.dumps(payload).encode("utf-8")
    request = urllib.request.Request(url, data=encoded, method="POST")
    request.add_header("Content-Type", "application/json")
    for name, value in headers.items():
        request.add_header(name, str(value))
    with urllib.request.urlopen(request, timeout=timeout) as response:
        text = response.read().decode("utf-8")
        if not text.strip():
            return {}
        return json.loads(text)


def server_for_gym(servers: list, gym_name: str) -> dict:
    """Pick the configured gym server that ``gym_name`` refers to.

    Args:
        servers: The task's ``gym_servers_config`` entries (dicts).
        gym_name: The gym a verifier or tool call names as its target.

    Returns:
        The entry whose ``mcp_server_name`` equals ``gym_name``. When there is
        no such entry but exactly one server is configured, that single server
        is returned (single-gym task fallback).

    Exits via ``fail`` when no server matches and the fallback does not apply.
    """
    by_name = {s.get("mcp_server_name"): s for s in servers}
    if gym_name in by_name:
        return by_name[gym_name]
    if len(servers) == 1:
        return servers[0]
    # Stringify before sorting: an entry without "mcp_server_name" is keyed by
    # None, and sorting None together with str raises TypeError, which would
    # replace the intended JSON error with a traceback.
    known = sorted(str(name) for name in by_name)
    fail(f"verifier gym_name {gym_name!r} not in gym_servers_config ({known})")


def replay_transcript(servers: list, transcript: list) -> None:
    """Replay the agent's tool calls against the live gym MCP server(s).

    Each transcript entry is an object of the form
    ``{"tool": ..., "arguments": {...}, "gym_name"?: ...}``. Every entry is
    sent, in order, as one HTTP POST to the target server's ``/mcp`` endpoint,
    with the server's ``context`` mapping as request headers. An entry without
    ``gym_name`` targets the first configured server. The server response is
    not inspected.

    An empty (or missing) transcript sends nothing, leaving the seeded state
    untouched.

    Exits via ``fail`` when an entry is malformed or a POST fails.
    """
    if not transcript:
        return
    for i, call in enumerate(transcript):
        # A non-object entry is a malformed transcript: report it as a JSON
        # error rather than crashing on ``.get`` with AttributeError.
        if not isinstance(call, dict):
            fail(f"transcript[{i}] is not a tool-call object: {call!r}")
        tool = call.get("tool")
        if not isinstance(tool, str) or not tool:
            fail(f"transcript[{i}] has no string 'tool' field: {call!r}")
        gym_name = call.get("gym_name")
        server = server_for_gym(servers, gym_name) if gym_name else servers[0]
        url = server["mcp_server_url"].rstrip("/") + "/mcp"
        headers = dict(server.get("context") or {})
        payload = {"tool": tool, "arguments": call.get("arguments") or {}}
        try:
            post_json(url, headers, payload)
        except urllib.error.HTTPError as e:
            # HTTPError subclasses URLError, so it must be caught first: an
            # HTTP error status means the server WAS reached, so "unreachable"
            # would be a misleading diagnosis.
            fail(f"tool call {tool!r} failed against {url}: {e}")
        except urllib.error.URLError as e:
            fail(f"gym server unreachable at {url} replaying tool {tool!r}: {e}")
        except Exception as e:  # noqa: BLE001
            fail(f"tool call {tool!r} failed against {url}: {e}")


def run_sql(server: dict, query: str) -> object:
    """Run ``query`` through the gym server's ``/api/sql-runner`` endpoint.

    Args:
        server: A ``gym_servers_config`` entry; ``mcp_server_url`` is the base
            URL and ``context`` (if any) supplies the request headers.
        query: SQL text sent as ``{"query": query}``.

    Returns:
        The scalar the verifier compares against. Accepted response shapes:
        ``{"result": v}`` -> ``v``; ``{"rows": [...]}`` -> first cell of the
        first row (first value of a dict row, first item of a list row); any
        other response is returned unchanged.

    Exits via ``fail`` when the POST fails or the response reports an error.
    """
    url = server["mcp_server_url"].rstrip("/") + "/api/sql-runner"
    headers = dict(server.get("context") or {})
    try:
        out = post_json(url, headers, {"query": query})
    except urllib.error.HTTPError as e:
        # HTTPError subclasses URLError; catch it first so an HTTP error status
        # from a reachable server is not reported as "unreachable".
        fail(f"sql-runner POST to {url} failed: {e}")
    except urllib.error.URLError as e:
        fail(f"gym server unreachable at {url}: {e}")
    except Exception as e:  # noqa: BLE001
        fail(f"sql-runner POST to {url} failed: {e}")
    if isinstance(out, dict):
        # A reported error wins over any result/rows key that accompanies it;
        # otherwise e.g. {"result": null, "error": "..."} would be scored as a
        # value instead of failing loud.
        if out.get("error"):
            fail(f"sql-runner error for query {query!r}: {out['error']}")
        if "result" in out:
            return out["result"]
        rows = out.get("rows")
        if rows:
            first = rows[0]
            if isinstance(first, dict) and first:
                return next(iter(first.values()))
            if isinstance(first, list) and first:
                return first[0]
        # Retained from the original: an "error" key with no usable value
        # (even a falsy one) is still treated as an error response.
        if "error" in out:
            fail(f"sql-runner error for query {query!r}: {out['error']}")
    return out


def compare(actual: object, expected: object, comparison_type: str) -> bool:
    """Evaluate one verifier comparison between the SQL result and its target.

    Supported ``comparison_type`` values:
        * ``equals``: numeric equality after coercing both sides to float;
          when either side is not float-coercible, string equality instead.
        * ``greater_than`` / ``less_than``: float comparison of both sides
          (coercion errors propagate to the caller).
        * ``contains``: ``str(expected)`` is a substring of ``str(actual)``.

    Any other value exits via ``fail``.
    """
    if comparison_type == "contains":
        haystack = str(actual)
        return str(expected) in haystack

    if comparison_type in ("greater_than", "less_than"):
        lhs = float(actual)
        rhs = float(expected)
        if comparison_type == "greater_than":
            return lhs > rhs
        return lhs < rhs

    if comparison_type == "equals":
        try:
            lhs, rhs = float(actual), float(expected)
        except (TypeError, ValueError):
            # Non-numeric operands: compare their string forms instead.
            return str(actual) == str(expected)
        return lhs == rhs

    fail(f"unsupported comparison_type {comparison_type!r}")


def _load_embedded_json(value: object, field: str) -> object:
    """Decode a task field that may arrive as a JSON-encoded string.

    Non-string values are returned unchanged. A string that is not valid JSON
    exits via ``fail`` instead of raising.
    """
    if not isinstance(value, str):
        return value
    try:
        return json.loads(value)
    except ValueError as e:  # json.JSONDecodeError is a ValueError
        fail(f"task field {field!r} is not valid JSON: {e}")


def cmd_judge(args) -> None:
    """Score one task: replay the transcript, then run its SQL verifiers.

    Reads the agent's tool-call transcript from stdin (a JSON list of calls or
    ``{"calls": [...]}``) and the task definition from ``args.task_json``.
    All verifiers are validated first, then the transcript is replayed, then
    each verifier's query is executed and compared. The result is printed as a
    single JSON line::

        {"success": bool, "passes": int, "total": int,
         "verifiers": [{"name": ..., "passed": bool}, ...]}

    ``success`` is true only when every verifier passes. Any malformed input,
    unsupported verifier type, or server failure exits via ``fail``.
    """
    raw_transcript = sys.stdin.read()
    try:
        # Context manager so the task file handle is closed deterministically.
        with open(args.task_json, encoding="utf-8") as fh:
            task = json.load(fh)
    except Exception as e:  # noqa: BLE001
        fail(f"reading task json {args.task_json} failed: {e}")
    if not isinstance(task, dict):
        fail(f"task json {args.task_json} is not a JSON object")

    servers = _load_embedded_json(task.get("gym_servers_config"), "gym_servers_config")
    if not isinstance(servers, list) or not servers:
        fail("task has no gym_servers_config list")

    verifiers = _load_embedded_json(task.get("verifiers"), "verifiers")
    if not isinstance(verifiers, list) or not verifiers:
        fail("task has no verifiers list")

    # The transcript is JSON: either a list of tool-call objects or {"calls":[...]}.
    transcript: list = []
    if raw_transcript.strip():
        try:
            parsed = json.loads(raw_transcript)
        except Exception as e:  # noqa: BLE001
            fail(f"transcript is not valid JSON: {e}")
        transcript = parsed.get("calls", []) if isinstance(parsed, dict) else parsed
        if not isinstance(transcript, list):
            fail("transcript must be a JSON list of tool calls (or {\"calls\":[...]})")

    # Validate every verifier BEFORE replaying, so a task that cannot be scored
    # is rejected without first mutating the live server's database.
    checks = []
    for v in verifiers:
        if not isinstance(v, dict):
            fail(f"verifier entry is not a JSON object: {v!r}")
        vtype = v.get("verifier_type")
        if vtype != "database_state":
            # Only the SQL state-checker is deterministic+deployable here; a
            # response_check / tool_execution verifier needs the live agent loop,
            # not this state replay. Fail loud rather than skip-and-inflate.
            fail(f"verifier {v.get('name')!r} has non-deterministic verifier_type {vtype!r}")
        cfg = v.get("validation_config")
        if not isinstance(cfg, dict):
            cfg = {}
        query = cfg.get("query")
        if not isinstance(query, str) or not query.strip():
            fail(f"verifier {v.get('name')!r} has no SQL query")
        server = server_for_gym(servers, v.get("gym_name"))
        checks.append((v.get("name"), server, query, cfg))

    replay_transcript(servers, transcript)

    results = []
    passes = 0
    for name, server, query, cfg in checks:
        actual = run_sql(server, query)
        ok = compare(actual, cfg.get("expected_value"), cfg.get("comparison_type", "equals"))
        if ok:
            passes += 1
        results.append({"name": name, "passed": bool(ok)})

    total = len(verifiers)
    print(
        json.dumps(
            {
                "success": passes == total,
                "passes": passes,
                "total": total,
                "verifiers": results,
            }
        )
    )


def main() -> None:
    """Parse the command line and dispatch to the requested subcommand.

    The only subcommand is ``judge``, which requires ``--task-json PATH``.
    """
    parser = argparse.ArgumentParser(description="EnterpriseOps-Gym judge driver")
    subcommands = parser.add_subparsers(dest="cmd", required=True)
    judge_parser = subcommands.add_parser("judge")
    judge_parser.add_argument("--task-json", required=True)
    parsed = parser.parse_args()
    if parsed.cmd != "judge":
        return
    cmd_judge(parsed)


if __name__ == "__main__":
    main()
2 changes: 2 additions & 0 deletions bench/src/adapters.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import { createCadBenchAdapter } from './benchmarks/cadbench'
import { createCadDesignAdapter } from './benchmarks/cad-design'
import { createCadGenBenchAdapter } from './benchmarks/cadgenbench'
import { createCommit0Adapter } from './benchmarks/commit0'
import { createEnterpriseOpsGymAdapter } from './benchmarks/enterpriseops-gym'
import { createFinsearchcompAdapter } from './benchmarks/finsearchcomp'
import { createFramesAdapter } from './benchmarks/frames'
import { createHotpotqaAdapter } from './benchmarks/hotpotqa'
Expand All @@ -31,6 +32,7 @@ export const ADAPTERS: Record<string, () => BenchmarkAdapter> = {
commit0: createCommit0Adapter,
programbench: createProgrambenchAdapter,
appworld: createAppWorldAdapter,
'enterpriseops-gym': createEnterpriseOpsGymAdapter,
'cad-design': createCadDesignAdapter,
cadbench: createCadBenchAdapter,
cadgenbench: createCadGenBenchAdapter,
Expand Down
77 changes: 77 additions & 0 deletions bench/src/benchmarks/enterpriseops-gym.test.mts
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
/**
* Offline EnterpriseOps-Gym adapter test. The judge needs a live, freshly-seeded
* gym MCP server (Docker), not installed in CI, so this exercises the parts that
* run offline (fixtures loadTasks, the transcript OutputAdapter, goldArtifact) and
* asserts the judge FAILS LOUD with the documented docker fix when no server is
* reachable — never a fake score. Run:
* EOPS_FIXTURES=1 npx tsx --test src/benchmarks/enterpriseops-gym.test.mts
*/
import assert from 'node:assert/strict'
import { test } from 'node:test'
import { createEnterpriseOpsGymAdapter, enterpriseOpsTranscriptOutput } from './enterpriseops-gym'

process.env.EOPS_FIXTURES = '1'

// Event-list type accepted by the transcript OutputAdapter's parse().
type Events = Parameters<typeof enterpriseOpsTranscriptOutput.parse>[0]

// Build a one-event stream whose only payload is `data.finalText = text`.
const stream = (text: string): Events => {
  const events = [{ data: { finalText: text } }]
  return events as unknown as Events
}

const itsmId = 'task_20251212_172511_458_e6427839_47076c15'

test('loadTasks (fixtures) yields enterprise tasks with tool-list + SQL-verifier metadata', async () => {
  // Load only the ITSM fixture task, selected by id.
  const adapter = createEnterpriseOpsGymAdapter()
  const loaded = await adapter.loadTasks({ ids: [itsmId] })
  assert.equal(loaded.length, 1)

  // Task-level fields: id, domain split, and a prompt carrying the tool list.
  const task = loaded[0]
  assert.equal(task.id, itsmId)
  assert.equal(task.split, 'itsm')
  assert.match(task.prompt, /```json/)
  assert.match(task.prompt, /update_incident/)

  // Metadata: identifiers, exactly one gym server, and both SQL verifiers.
  const metadata = task.metadata as Record<string, unknown>
  assert.equal(metadata.taskId, itsmId)
  assert.equal(metadata.domain, 'itsm')
  assert.ok(Array.isArray(metadata.servers) && (metadata.servers as unknown[]).length === 1)

  const sqlVerifiers = metadata.verifiers as Array<{ verifier_type: string; validation_config: { query: string } }>
  assert.equal(sqlVerifiers.length, 2)
  const firstVerifier = sqlVerifiers[0]
  assert.equal(firstVerifier.verifier_type, 'database_state')
  assert.match(firstVerifier.validation_config.query, /SELECT COUNT/)
})

test('loadTasks scopes by domain split and limit', async () => {
  const adapter = createEnterpriseOpsGymAdapter()

  // Filtering by split returns only the calendar fixture.
  const calendarTasks = await adapter.loadTasks({ split: 'calendar' })
  assert.equal(calendarTasks.length, 1)
  assert.equal(calendarTasks[0].split, 'calendar')

  // A limit of 1 caps the result at a single task.
  const limited = await adapter.loadTasks({ limit: 1 })
  assert.equal(limited.length, 1)
})

test('transcript OutputAdapter: last fenced ```json wins; fence-less falls back to trimmed text', () => {
  // Single fenced block after a preamble: the fence body is returned.
  const fencedInput = stream('preamble\n```json\n{"calls":[{"tool":"update_incident","arguments":{}}]}\n```\n')
  const fenced = enterpriseOpsTranscriptOutput.parse(fencedInput)
  assert.equal(fenced, '{"calls":[{"tool":"update_incident","arguments":{}}]}')

  // Two fenced blocks: the last one is returned.
  const twoFences = stream('```json\nFIRST\n```\nmid\n```json\nSECOND\n```')
  const last = enterpriseOpsTranscriptOutput.parse(twoFences)
  assert.equal(last, 'SECOND')

  // No fence at all: the trimmed text is returned.
  const unfenced = stream(' {"calls":[]} ')
  const raw = enterpriseOpsTranscriptOutput.parse(unfenced)
  assert.equal(raw, '{"calls":[]}')
})

test('goldArtifact is undefined — oracle is the seeded DB state, documented, not a fabricated transcript', async () => {
  const adapter = createEnterpriseOpsGymAdapter()
  const loaded = await adapter.loadTasks({ ids: [itsmId] })
  const [task] = loaded
  // The adapter provides no gold artifact for this task.
  const gold = await adapter.goldArtifact(task)
  assert.equal(gold, undefined)
})

test('judge FAILS LOUD with the docker fix when no gym server is reachable (no fake score)', async () => {
  const adapter = createEnterpriseOpsGymAdapter()
  const [task] = await adapter.loadTasks({ ids: [itsmId] })

  // Point the metadata at a definitely-dead port so the SQL-runner POST is refused.
  const metadata = task.metadata as Record<string, unknown>
  const gymServers = metadata.servers as Array<{ mcp_server_url: string }>
  gymServers[0].mcp_server_url = 'http://127.0.0.1:1'

  // The rejection must name the judge failure and include the docker pull hint.
  const hasDockerFix = (err: Error) => {
    assert.match(err.message, /enterpriseops-gym judge failed/)
    assert.match(err.message, /docker pull shivakrishnareddyma225\/enterpriseops-gym-mcp-itsm/)
    return true
  }
  await assert.rejects(adapter.judge(task, '{"calls":[]}'), hasDockerFix)
})
Loading
Loading