Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 29 additions & 2 deletions bench/src/commit0-gate.mts
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ import { Sandbox } from '@tangle-network/sandbox'
import { createCommit0Adapter } from './benchmarks/commit0'
import type { BenchTask } from './benchmarks/types'
import { type AttemptRecord, appendRunRecord, type RunRecord } from './corpus'
import { type BenchRuntimeHookEvent, createRuntimeHookRecorder } from './runtime-hook-recorder'

function must(name: string): string {
const v = process.env[name]
Expand Down Expand Up @@ -64,6 +65,7 @@ interface Shot {
wallMs: number
/** measured count of stream events from the rollout (0 if it errored before streaming) */
events: number
runtimeEvents?: BenchRuntimeHookEvent[]
}

/** Build the rollout prompt: clone the stub, implement the source, write the diff to
Expand Down Expand Up @@ -149,8 +151,19 @@ async function runShot(task: BenchTask, attempt: number, cfg: ShotCfg): Promise<
},
}
let run: SandboxRun<RolloutDeliverable> | undefined
const runtime = createRuntimeHookRecorder()
try {
run = await openSandboxRun(client, { agentRun, signal: controller.signal }, commit0Deliverable)
run = await openSandboxRun(
client,
{
agentRun,
signal: controller.signal,
hooks: runtime.hooks,
runId: `commit0:${task.id}:${attempt}`,
scenarioId: task.id,
},
commit0Deliverable,
)
const turn = await run.start(rolloutPrompt(meta))
const ok = turn.out.diff.trim().length > 0
return {
Expand All @@ -159,12 +172,22 @@ async function runShot(task: BenchTask, attempt: number, cfg: ShotCfg): Promise<
diff: turn.out.diff,
ok,
events: turn.events.length,
runtimeEvents: runtime.events,
wallMs: Date.now() - startedAt,
...(ok ? {} : { detail: `empty patch${turn.readError ? ` (read failed: ${turn.readError.slice(0, 120)})` : ''}${turn.out.lastErr ? `; lastError=${turn.out.lastErr}` : ''}` }),
}
} catch (err) {
const msg = err instanceof Error ? err.message : String(err)
return { task, attempt, diff: '', ok: false, events: 0, wallMs: Date.now() - startedAt, detail: `rollout error: ${msg.slice(0, 200)}` }
return {
task,
attempt,
diff: '',
ok: false,
events: 0,
runtimeEvents: runtime.events,
wallMs: Date.now() - startedAt,
detail: `rollout error: ${msg.slice(0, 200)}`,
}
} finally {
if (timer) clearTimeout(timer)
if (run) await run.close()
Expand Down Expand Up @@ -312,6 +335,9 @@ async function main(): Promise<void> {
for (const task of tasks) {
let built = false
const attempts: AttemptRecord[] = []
const runtimeEvents = shots
.filter((x) => x.task.id === task.id)
.flatMap((x) => x.runtimeEvents ?? [])
for (let i = 0; i < k; i += 1) {
const s = shots.find((x) => x.task.id === task.id && x.attempt === i)
let sc: { score: number; resolved: boolean } | undefined
Expand Down Expand Up @@ -350,6 +376,7 @@ async function main(): Promise<void> {
resolved: attempts.some((a) => a.valid === true),
attempts,
infraError: false,
...(runtimeEvents.length > 0 ? { runtimeEvents } : {}),
}
await appendRunRecord(corpusPath, record) // incremental: partial progress survives a crash
}
Expand Down
7 changes: 7 additions & 0 deletions bench/src/corpus.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import { dirname } from 'node:path'
import { hashContent, type RunSplitTag, validateRunRecord } from '@tangle-network/agent-eval'
import type { CorpusRecord } from '@tangle-network/agent-eval/rl'
import type { Iteration } from '@tangle-network/agent-runtime/loops'
import type { BenchRuntimeHookEvent } from './runtime-hook-recorder'

/** One attempt within a condition-run: the prompt/steer sent, the output, the
* verdict, the measured economics, and a bounded trace summary.
Expand Down Expand Up @@ -67,6 +68,8 @@ export interface RunRecord {
seed?: number
splitTag?: RunSplitTag
commitSha?: string
/** Passive runtime hook evidence captured during the run. Optional and bounded by producers. */
runtimeEvents?: BenchRuntimeHookEvent[]
}

const TRACE_TAIL_MAX = 600
Expand Down Expand Up @@ -116,6 +119,7 @@ export function buildRunRecord<Task, Output>(args: {
seed?: number
splitTag?: RunSplitTag
commitSha?: string
runtimeEvents?: BenchRuntimeHookEvent[]
}): RunRecord {
const attempts = args.iterations.map(summarizeAttempt)
return {
Expand All @@ -131,6 +135,9 @@ export function buildRunRecord<Task, Output>(args: {
...(args.seed !== undefined ? { seed: args.seed } : {}),
...(args.splitTag !== undefined ? { splitTag: args.splitTag } : {}),
...(args.commitSha !== undefined ? { commitSha: args.commitSha } : {}),
...(args.runtimeEvents !== undefined && args.runtimeEvents.length > 0
? { runtimeEvents: args.runtimeEvents }
: {}),
}
}

Expand Down
5 changes: 4 additions & 1 deletion bench/src/experiment.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import {
import type { BenchmarkAdapter, BenchTask } from './benchmarks/types'
import { appendRunRecord, buildRunRecord } from './corpus'
import { routerChatWithUsage } from './router-client'
import { createRuntimeHookRecorder } from './runtime-hook-recorder'
import { runPool } from './run-pool'
import { runSteeringExperiment } from './steering-experiment'

Expand Down Expand Up @@ -292,13 +293,14 @@ export async function runExperiment(cfg: ExperimentConfig): Promise<ExperimentRe
return { valid: v.resolved === true, score: v.score }
},
}
const runtime = createRuntimeHookRecorder()
const result = await runLoop<string, string, 'continue' | 'done'>({
driver: createDynamicDriver<string, string>({ planner, maxIterations: rounds }),
agentRun: cfg.agentRun,
output,
validator,
task: task.prompt,
ctx: { sandboxClient: cfg.sandboxClient },
ctx: { sandboxClient: cfg.sandboxClient, hooks: runtime.hooks },
maxIterations: rounds,
})
const iter0 = result.iterations[0]
Expand All @@ -319,6 +321,7 @@ export async function runExperiment(cfg: ExperimentConfig): Promise<ExperimentRe
resolved,
infraError,
...(cfg.now ? { now: cfg.now } : {}),
runtimeEvents: runtime.events,
}),
).catch((err) =>
console.error(
Expand Down
38 changes: 38 additions & 0 deletions bench/src/runtime-hook-recorder.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
/**
 * Phase tag carried by every recorded runtime hook event.
 *
 * NOTE(review): presumably 'before' / 'after' / 'error' bracket a hooked call and
 * 'event' marks a standalone emission — confirm against the runtime that emits them.
 */
export type BenchRuntimeHookPhase = 'before' | 'after' | 'error' | 'event'

/**
 * Shape of a single runtime hook event as stored by the bench recorder.
 *
 * `Payload` types the optional `payload` field and defaults to `unknown`, so
 * consumers must narrow it before use unless they supply a concrete type.
 */
export interface BenchRuntimeHookEvent<Payload = unknown> {
/** Identifier of this event. */
id: string
/** Identifier of the run the event belongs to. */
runId: string
/** Optional scenario identifier. NOTE(review): bench callers appear to pass the task id here — confirm. */
scenarioId?: string
/** Name of the thing the hook fired on. NOTE(review): exact vocabulary is defined by the emitter — confirm. */
target: string
/** Which phase of the hooked operation this event reports. */
phase: BenchRuntimeHookPhase
/** Event time. NOTE(review): presumably epoch milliseconds — confirm with the emitter. */
timestamp: number
/** Optional step index supplied by the emitter. */
stepIndex?: number
/** Optional id of a related parent event. NOTE(review): presumably another event's `id` — confirm. */
parentId?: string
/** Optional event-specific data, typed by `Payload`. */
payload?: Payload
/** Optional free-form key/value metadata. */
metadata?: Record<string, unknown>
}

/**
 * Hook callbacks that can be handed to a runtime so it can report events.
 */
export interface BenchRuntimeHooks {
/**
 * Optional callback invoked with one event and a context object that may carry
 * an `AbortSignal`. May complete synchronously or return a promise.
 */
onEvent?: (
event: BenchRuntimeHookEvent,
context: { signal?: AbortSignal },
) => void | Promise<void>
}

/**
 * A hooks object paired with the array of events collected through it.
 *
 * The properties are `readonly` (they cannot be reassigned), but the `events`
 * array itself is a plain mutable array.
 */
export interface RuntimeHookRecorder {
/** Events collected so far. */
readonly events: BenchRuntimeHookEvent[]
/** Hooks to pass to the runtime whose events should be collected. */
readonly hooks: BenchRuntimeHooks
}

/**
 * Builds a fresh, passive recorder for runtime hook events.
 *
 * Each call owns its own backing array. The returned `hooks.onEvent` appends
 * every event it receives to that array, in arrival order and by reference (no
 * copying, filtering, or size limit), and the same array is exposed as `events`,
 * so a reference taken early keeps observing later events.
 *
 * @returns The recorder: `events` (the live array) and `hooks` (to pass to the runtime).
 */
export function createRuntimeHookRecorder(): RuntimeHookRecorder {
  const recorded: BenchRuntimeHookEvent[] = []
  const recorder: RuntimeHookRecorder = {
    events: recorded,
    hooks: {
      // The context argument (abort signal) is deliberately unused: recording is passive.
      onEvent(event) {
        recorded.push(event)
      },
    },
  }
  return recorder
}
16 changes: 15 additions & 1 deletion bench/src/worker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import {
} from '@tangle-network/agent-runtime/loops'
import { Sandbox } from '@tangle-network/sandbox'
import type { BenchTask } from './benchmarks/types'
import { type BenchRuntimeHookEvent, createRuntimeHookRecorder } from './runtime-hook-recorder'

export interface WorkerConfig {
sandboxBaseUrl: string
Expand All @@ -28,6 +29,7 @@ export interface ShotResult {
patch: string
ok: boolean
detail?: string
runtimeEvents?: BenchRuntimeHookEvent[]
}

const PATCH_PATH = '/tmp/solution.patch'
Expand Down Expand Up @@ -97,7 +99,18 @@ export async function solveShot(
},
},
}
const run = await openSandboxRun(client, { agentRun, signal: controller.signal }, swePatchDeliverable)
const runtime = createRuntimeHookRecorder()
const run = await openSandboxRun(
client,
{
agentRun,
signal: controller.signal,
hooks: runtime.hooks,
runId: `swe-bench:${task.id}`,
scenarioId: task.id,
},
swePatchDeliverable,
)
try {
const turn = await run.start(prompt)
const empty = turn.out.patch.trim().length === 0
Expand All @@ -107,6 +120,7 @@ export async function solveShot(
detail: empty
? `empty patch${turn.readError ? ` (patch read failed: ${turn.readError.slice(0, 120)})` : ''}${turn.out.lastErr ? `; lastError=${turn.out.lastErr}` : ''}`
: undefined,
runtimeEvents: runtime.events,
}
} finally {
if (timer) clearTimeout(timer)
Expand Down
Loading
Loading