tangle-network · drewstone · Jun 6, 2026 · Jun 6, 2026
diff --git a/bench/src/commit0-gate.mts b/bench/src/commit0-gate.mts
@@ -37,6 +37,7 @@ import { Sandbox } from '@tangle-network/sandbox'
 import { createCommit0Adapter } from './benchmarks/commit0'
 import type { BenchTask } from './benchmarks/types'
 import { type AttemptRecord, appendRunRecord, type RunRecord } from './corpus'
+import { type BenchRuntimeHookEvent, createRuntimeHookRecorder } from './runtime-hook-recorder'
 
 function must(name: string): string {
   const v = process.env[name]
@@ -64,6 +65,7 @@ interface Shot {
   wallMs: number
   /** measured count of stream events from the rollout (0 if it errored before streaming) */
   events: number
+  runtimeEvents?: BenchRuntimeHookEvent[]
 }
 
 /** Build the rollout prompt: clone the stub, implement the source, write the diff to
@@ -149,8 +151,19 @@ async function runShot(task: BenchTask, attempt: number, cfg: ShotCfg): Promise<
     },
   }
   let run: SandboxRun<RolloutDeliverable> | undefined
+  const runtime = createRuntimeHookRecorder()
   try {
-    run = await openSandboxRun(client, { agentRun, signal: controller.signal }, commit0Deliverable)
+    run = await openSandboxRun(
+      client,
+      {
+        agentRun,
+        signal: controller.signal,
+        hooks: runtime.hooks,
+        runId: `commit0:${task.id}:${attempt}`,
+        scenarioId: task.id,
+      },
+      commit0Deliverable,
+    )
     const turn = await run.start(rolloutPrompt(meta))
     const ok = turn.out.diff.trim().length > 0
     return {
@@ -159,12 +172,22 @@ async function runShot(task: BenchTask, attempt: number, cfg: ShotCfg): Promise<
       diff: turn.out.diff,
       ok,
       events: turn.events.length,
+      runtimeEvents: runtime.events,
       wallMs: Date.now() - startedAt,
       ...(ok ? {} : { detail: `empty patch${turn.readError ? ` (read failed: ${turn.readError.slice(0, 120)})` : ''}${turn.out.lastErr ? `; lastError=${turn.out.lastErr}` : ''}` }),
     }
   } catch (err) {
     const msg = err instanceof Error ? err.message : String(err)
-    return { task, attempt, diff: '', ok: false, events: 0, wallMs: Date.now() - startedAt, detail: `rollout error: ${msg.slice(0, 200)}` }
+    return {
+      task,
+      attempt,
+      diff: '',
+      ok: false,
+      events: 0,
+      runtimeEvents: runtime.events,
+      wallMs: Date.now() - startedAt,
+      detail: `rollout error: ${msg.slice(0, 200)}`,
+    }
   } finally {
     if (timer) clearTimeout(timer)
     if (run) await run.close()
@@ -312,6 +335,9 @@ async function main(): Promise<void> {
   for (const task of tasks) {
     let built = false
     const attempts: AttemptRecord[] = []
+    const runtimeEvents = shots
+      .filter((x) => x.task.id === task.id)
+      .flatMap((x) => x.runtimeEvents ?? [])
     for (let i = 0; i < k; i += 1) {
       const s = shots.find((x) => x.task.id === task.id && x.attempt === i)
       let sc: { score: number; resolved: boolean } | undefined
@@ -350,6 +376,7 @@ async function main(): Promise<void> {
       resolved: attempts.some((a) => a.valid === true),
       attempts,
       infraError: false,
+      ...(runtimeEvents.length > 0 ? { runtimeEvents } : {}),
     }
     await appendRunRecord(corpusPath, record) // incremental: partial progress survives a crash
   }

diff --git a/bench/src/corpus.ts b/bench/src/corpus.ts
@@ -17,6 +17,7 @@ import { dirname } from 'node:path'
 import { hashContent, type RunSplitTag, validateRunRecord } from '@tangle-network/agent-eval'
 import type { CorpusRecord } from '@tangle-network/agent-eval/rl'
 import type { Iteration } from '@tangle-network/agent-runtime/loops'
+import type { BenchRuntimeHookEvent } from './runtime-hook-recorder'
 
 /** One attempt within a condition-run: the prompt/steer sent, the output, the
  *  verdict, the measured economics, and a bounded trace summary.
@@ -67,6 +68,8 @@ export interface RunRecord {
   seed?: number
   splitTag?: RunSplitTag
   commitSha?: string
+  /** Passive runtime hook evidence captured during the run. Optional and bounded by producers. */
+  runtimeEvents?: BenchRuntimeHookEvent[]
 }
 
 const TRACE_TAIL_MAX = 600
@@ -116,6 +119,7 @@ export function buildRunRecord<Task, Output>(args: {
   seed?: number
   splitTag?: RunSplitTag
   commitSha?: string
+  runtimeEvents?: BenchRuntimeHookEvent[]
 }): RunRecord {
   const attempts = args.iterations.map(summarizeAttempt)
   return {
@@ -131,6 +135,9 @@ export function buildRunRecord<Task, Output>(args: {
     ...(args.seed !== undefined ? { seed: args.seed } : {}),
     ...(args.splitTag !== undefined ? { splitTag: args.splitTag } : {}),
     ...(args.commitSha !== undefined ? { commitSha: args.commitSha } : {}),
+    ...(args.runtimeEvents !== undefined && args.runtimeEvents.length > 0
+      ? { runtimeEvents: args.runtimeEvents }
+      : {}),
   }
 }
 

diff --git a/bench/src/experiment.ts b/bench/src/experiment.ts
@@ -34,6 +34,7 @@ import {
 import type { BenchmarkAdapter, BenchTask } from './benchmarks/types'
 import { appendRunRecord, buildRunRecord } from './corpus'
 import { routerChatWithUsage } from './router-client'
+import { createRuntimeHookRecorder } from './runtime-hook-recorder'
 import { runPool } from './run-pool'
 import { runSteeringExperiment } from './steering-experiment'
 
@@ -292,13 +293,14 @@ export async function runExperiment(cfg: ExperimentConfig): Promise<ExperimentRe
         return { valid: v.resolved === true, score: v.score }
       },
     }
+    const runtime = createRuntimeHookRecorder()
     const result = await runLoop<string, string, 'continue' | 'done'>({
       driver: createDynamicDriver<string, string>({ planner, maxIterations: rounds }),
       agentRun: cfg.agentRun,
       output,
       validator,
       task: task.prompt,
-      ctx: { sandboxClient: cfg.sandboxClient },
+      ctx: { sandboxClient: cfg.sandboxClient, hooks: runtime.hooks },
       maxIterations: rounds,
     })
     const iter0 = result.iterations[0]
@@ -319,6 +321,7 @@ export async function runExperiment(cfg: ExperimentConfig): Promise<ExperimentRe
           resolved,
           infraError,
           ...(cfg.now ? { now: cfg.now } : {}),
+          runtimeEvents: runtime.events,
         }),
       ).catch((err) =>
         console.error(

diff --git a/bench/src/runtime-hook-recorder.ts b/bench/src/runtime-hook-recorder.ts
@@ -0,0 +1,38 @@
+export type BenchRuntimeHookPhase = 'before' | 'after' | 'error' | 'event' // allowed values of BenchRuntimeHookEvent.phase
+
+export interface BenchRuntimeHookEvent<Payload = unknown> { // shape of one runtime hook event; Payload types the `payload` field
+  id: string // NOTE(review): presumably unique per event — confirm with the emitter
+  runId: string
+  scenarioId?: string
+  target: string
+  phase: BenchRuntimeHookPhase // one of 'before' | 'after' | 'error' | 'event'
+  timestamp: number // NOTE(review): unit is not visible here (epoch ms?) — confirm
+  stepIndex?: number
+  parentId?: string // NOTE(review): presumably the id of a parent event — confirm
+  payload?: Payload // optional; typed by the Payload parameter (unknown by default)
+  metadata?: Record<string, unknown>
+}
+
+export interface BenchRuntimeHooks { // hooks object; its single callback, onEvent, is optional
+  onEvent?: ( // may complete synchronously or return a Promise
+    event: BenchRuntimeHookEvent,
+    context: { signal?: AbortSignal }, // the signal itself is optional
+  ) => void | Promise<void>
+}
+
+export interface RuntimeHookRecorder { // return shape of createRuntimeHookRecorder
+  readonly events: BenchRuntimeHookEvent[] // the property is readonly; the array itself is still mutable
+  readonly hooks: BenchRuntimeHooks // createRuntimeHookRecorder wires onEvent to push into `events`
+}
+
+/** Builds a recorder whose `hooks.onEvent` appends each received event to `events`.
+ *  `events` is the live array the hook pushes into, not a snapshot. */
+export function createRuntimeHookRecorder(): RuntimeHookRecorder {
+  const recorded: BenchRuntimeHookEvent[] = []
+  const hooks: BenchRuntimeHooks = {
+    onEvent(event) {
+      recorded.push(event)
+    },
+  }
+  return { events: recorded, hooks }
+}
diff --git a/bench/src/worker.ts b/bench/src/worker.ts
@@ -13,6 +13,7 @@ import {
 } from '@tangle-network/agent-runtime/loops'
 import { Sandbox } from '@tangle-network/sandbox'
 import type { BenchTask } from './benchmarks/types'
+import { type BenchRuntimeHookEvent, createRuntimeHookRecorder } from './runtime-hook-recorder'
 
 export interface WorkerConfig {
   sandboxBaseUrl: string
@@ -28,6 +29,7 @@ export interface ShotResult {
   patch: string
   ok: boolean
   detail?: string
+  runtimeEvents?: BenchRuntimeHookEvent[]
 }
 
 const PATCH_PATH = '/tmp/solution.patch'
@@ -97,7 +99,18 @@ export async function solveShot(
       },
     },
   }
-  const run = await openSandboxRun(client, { agentRun, signal: controller.signal }, swePatchDeliverable)
+  const runtime = createRuntimeHookRecorder()
+  const run = await openSandboxRun(
+    client,
+    {
+      agentRun,
+      signal: controller.signal,
+      hooks: runtime.hooks,
+      runId: `swe-bench:${task.id}`,
+      scenarioId: task.id,
+    },
+    swePatchDeliverable,
+  )
   try {
     const turn = await run.start(prompt)
     const empty = turn.out.patch.trim().length === 0
@@ -107,6 +120,7 @@ export async function solveShot(
       detail: empty
         ? `empty patch${turn.readError ? ` (patch read failed: ${turn.readError.slice(0, 120)})` : ''}${turn.out.lastErr ? `; lastError=${turn.out.lastErr}` : ''}`
         : undefined,
+      runtimeEvents: runtime.events,
     }
   } finally {
     if (timer) clearTimeout(timer)