tangle-network · drewstone · Jun 6, 2026 · Jun 6, 2026 · Jun 6, 2026 · Jun 6, 2026
diff --git a/bench/src/aec-gate.mts b/bench/src/aec-gate.mts
@@ -20,9 +20,10 @@
 
 import { resolveAdapter } from './adapters'
 import type { BenchmarkAdapter, BenchTask } from './benchmarks/types'
-import { type AttemptRecord, appendRunRecord, type RunRecord } from './corpus'
+import { type AttemptRecord, appendRunRecord, buildRunRecordFromAttempts } from './corpus'
 import { composeStrategies } from './directives'
 import { type RouterConfig, routerChatWithUsage } from './router-client'
+import { pool } from './stats.mts'
 
 function must(name: string): string {
   const v = process.env[name]
@@ -50,23 +51,6 @@ interface AttemptOutcome {
   infraError?: boolean
 }
 
-/** Bounded-concurrency pool: run `fn` over `items`, at most `limit` in flight. */
-async function pool<T, R>(items: T[], limit: number, fn: (item: T, idx: number) => Promise<R>): Promise<R[]> {
-  const results: R[] = new Array(items.length)
-  let next = 0
-  async function worker(): Promise<void> {
-    for (;;) {
-      const idx = next
-      next += 1
-      if (idx >= items.length) return
-      results[idx] = await fn(items[idx] as T, idx)
-    }
-  }
-  const workers = Array.from({ length: Math.max(1, Math.min(limit, items.length)) }, () => worker())
-  await Promise.all(workers)
-  return results
-}
-
 async function runAttempt(
   cfg: RouterConfig,
   adapter: BenchmarkAdapter,
@@ -156,20 +140,17 @@ async function runArm(
     const task = tasks[t] as BenchTask
     const taskOutcomes = outcomes.slice(t * k, t * k + k)
     const attempts = taskOutcomes.map((o, i) => toAttemptRecord(o, i))
-    const record: RunRecord = {
-      ts: new Date().toISOString(),
+    const record = buildRunRecordFromAttempts(attempts, {
       benchmark: adapter.name,
       instanceId: task.id,
       condition: arm.condition,
       model: cfg.model,
-      blindResolved: attempts[0]?.valid === true,
       // k-attempt outcome = any usable attempt resolved (the oracle@k ceiling for
       // this run; the deployable selector is scored separately by corpus-replay).
       resolved: taskOutcomes.some((o) => o.resolved),
-      attempts,
       // a task whose every attempt infra-errored is itself infra-errored.
       infraError: taskOutcomes.length > 0 && taskOutcomes.every((o) => o.infraError),
-    }
+    })
     await appendRunRecord(corpusPath, record)
   }
 

diff --git a/bench/src/clbench-codebase-gate.mts b/bench/src/clbench-codebase-gate.mts
@@ -34,8 +34,9 @@ import { promisify } from 'node:util'
 import { acquireSandbox } from '@tangle-network/agent-runtime/loops'
 import { Sandbox } from '@tangle-network/sandbox'
 import { composeStrategies } from './directives'
-import { type AttemptRecord, appendRunRecord, type RunRecord } from './corpus'
+import { type AttemptRecord, appendRunRecord, buildRunRecordFromAttempts } from './corpus'
 import { verifierGroundedSelect } from './selector'
+import { type PairedLift, pairedLift, pool } from './stats.mts'
 
 const execFileAsync = promisify(execFile)
 const PATCH_PATH = '/tmp/solution.patch'
@@ -176,59 +177,6 @@ async function judgePatch(inst: Instance, patch: string, clbenchDir: string): Pr
   }
 }
 
-async function pool<T, R>(items: T[], limit: number, fn: (item: T, idx: number) => Promise<R>): Promise<R[]> {
-  const results: R[] = new Array(items.length)
-  let next = 0
-  async function worker(): Promise<void> {
-    for (;;) {
-      const idx = next
-      next += 1
-      if (idx >= items.length) return
-      results[idx] = await fn(items[idx] as T, idx)
-    }
-  }
-  await Promise.all(Array.from({ length: Math.max(1, Math.min(limit, items.length)) }, () => worker()))
-  return results
-}
-
-function makeRng(seed: number): () => number {
-  let s = seed | 0
-  return () => {
-    s = (s + 0x6d2b79f5) | 0
-    let t = Math.imul(s ^ (s >>> 15), 1 | s)
-    t = (t + Math.imul(t ^ (t >>> 7), 61 | t)) ^ t
-    return ((t ^ (t >>> 14)) >>> 0) / 4294967296
-  }
-}
-
-interface PairedLift {
-  point: number
-  low: number
-  high: number
-  pairs: number
-  discordant: number
-}
-
-function pairedLift(baseline: number[], treatment: number[], bootstrapN = 10000): PairedLift {
-  if (baseline.length !== treatment.length) throw new Error('pairedLift: misaligned arms')
-  const n = baseline.length
-  if (n === 0) throw new Error('pairedLift: no pairs')
-  const deltas = baseline.map((b, i) => (treatment[i] as number) - b)
-  const mean = (a: number[]) => a.reduce((s, x) => s + x, 0) / a.length
-  const point = mean(deltas)
-  const discordant = deltas.filter((d) => Math.abs(d) > 1e-9).length
-  const rng = makeRng(0x9e3779b9)
-  const rint = (m: number) => Math.floor(rng() * m)
-  const boots: number[] = []
-  for (let b = 0; b < bootstrapN; b += 1) {
-    let acc = 0
-    for (let j = 0; j < n; j += 1) acc += deltas[rint(n)] as number
-    boots.push(acc / n)
-  }
-  boots.sort((x, y) => x - y)
-  return { point, low: boots[Math.floor(0.025 * bootstrapN)] ?? Number.NaN, high: boots[Math.floor(0.975 * bootstrapN)] ?? Number.NaN, pairs: n, discordant }
-}
-
 const pct = (x: number) => `${(x * 100).toFixed(1)}%`
 const pp = (x: number) => `${x >= 0 ? '+' : ''}${(x * 100).toFixed(1)}pp`
 
@@ -342,17 +290,18 @@ async function main(): Promise<void> {
       traceTail: (grp.rPatch[round] ?? '').slice(-600),
     }))
     const validPasses = grp.random.filter((p): p is number => p !== null && p !== undefined)
-    const record: RunRecord = {
-      ts: new Date().toISOString(),
+    const record = buildRunRecordFromAttempts(attempts, {
       benchmark: 'clbench-codebase',
       instanceId: inst.instanceId,
       condition: `random@${k}`,
       model,
+      // blindResolved is true only when the FIRST non-null score is a full pass
+      // (=== 1); the helper's default (attempts[0].valid) would also count a
+      // partial-credit first shot. Pass it explicitly to keep the recorded value.
       blindResolved: validPasses[0] === 1,
       resolved: validPasses.some((p) => p > 0),
-      attempts,
       infraError: validPasses.length === 0,
-    }
+    })
     await appendRunRecord(corpusPath, record)
   }
   console.log(`\n=== wrote ${instances.length} task(s) → ${corpusPath} · gate: tsx src/corpus-replay.mts ${corpusPath} --selector=verifier ===`)

diff --git a/bench/src/clbench-context-gate.mts b/bench/src/clbench-context-gate.mts
@@ -35,9 +35,10 @@
 import { execFileSync } from 'node:child_process'
 import { existsSync, readFileSync } from 'node:fs'
 import { composeStrategies } from './directives'
-import { type AttemptRecord, appendRunRecord, type RunRecord } from './corpus'
+import { type AttemptRecord, appendRunRecord, buildRunRecordFromAttempts } from './corpus'
 import { type RouterConfig, routerChatWithUsage } from './router-client'
 import { selfConsistencySelect, verifierGroundedSelect } from './selector'
+import { type PairedLift, pairedLift, pool } from './stats.mts'
 
 const datasetUrl = 'https://huggingface.co/datasets/tencent/CL-bench/resolve/main/CL-bench.jsonl'
 
@@ -168,67 +169,6 @@ async function judgeRubrics(cfg: RouterConfig, task: CtxTask, output: string): P
   }
 }
 
-async function pool<T, R>(items: T[], limit: number, fn: (item: T, idx: number) => Promise<R>): Promise<R[]> {
-  const results: R[] = new Array(items.length)
-  let next = 0
-  async function worker(): Promise<void> {
-    for (;;) {
-      const idx = next
-      next += 1
-      if (idx >= items.length) return
-      results[idx] = await fn(items[idx] as T, idx)
-    }
-  }
-  await Promise.all(Array.from({ length: Math.max(1, Math.min(limit, items.length)) }, () => worker()))
-  return results
-}
-
-function makeRng(seed: number): () => number {
-  let s = seed | 0
-  return () => {
-    s = (s + 0x6d2b79f5) | 0
-    let t = Math.imul(s ^ (s >>> 15), 1 | s)
-    t = (t + Math.imul(t ^ (t >>> 7), 61 | t)) ^ t
-    return ((t ^ (t >>> 14)) >>> 0) / 4294967296
-  }
-}
-
-interface PairedLift {
-  point: number
-  low: number
-  high: number
-  pairs: number
-  discordant: number
-}
-
-/** Paired lift = mean over tasks of (treatment − baseline) with a 95% bootstrap CI.
- *  Works on continuous per-task values (rubric fractions) as well as {0,1}. */
-function pairedLift(baseline: number[], treatment: number[], bootstrapN = 10000): PairedLift {
-  if (baseline.length !== treatment.length) throw new Error('pairedLift: misaligned arms')
-  const n = baseline.length
-  if (n === 0) throw new Error('pairedLift: no pairs')
-  const deltas = baseline.map((b, i) => (treatment[i] as number) - b)
-  const mean = (a: number[]) => a.reduce((s, x) => s + x, 0) / a.length
-  const point = mean(deltas)
-  const discordant = deltas.filter((d) => Math.abs(d) > 1e-9).length
-  const rng = makeRng(0x9e3779b9)
-  const rint = (m: number) => Math.floor(rng() * m)
-  const boots: number[] = []
-  for (let b = 0; b < bootstrapN; b += 1) {
-    let acc = 0
-    for (let j = 0; j < n; j += 1) acc += deltas[rint(n)] as number
-    boots.push(acc / n)
-  }
-  boots.sort((x, y) => x - y)
-  return {
-    point,
-    low: boots[Math.floor(0.025 * bootstrapN)] ?? Number.NaN,
-    high: boots[Math.floor(0.975 * bootstrapN)] ?? Number.NaN,
-    pairs: n,
-    discordant,
-  }
-}
-
 const pct = (x: number) => `${(x * 100).toFixed(1)}%`
 const pp = (x: number) => `${x >= 0 ? '+' : ''}${(x * 100).toFixed(1)}pp`
 
@@ -347,17 +287,13 @@ async function main(): Promise<void> {
       eventTypes: { 'router.chat': 1 },
       traceTail: s.output.slice(-600),
     }))
-    const record: RunRecord = {
-      ts: new Date().toISOString(),
+    const record = buildRunRecordFromAttempts(attempts, {
       benchmark: 'clbench-context',
       instanceId: task.id,
       condition: `random@${k}`,
       model,
-      blindResolved: (grp.random[0] as Shot).verdict.allPass,
-      resolved: grp.random.some((s) => s.verdict.allPass),
-      attempts,
       infraError: false,
-    }
+    })
     await appendRunRecord(corpusPath, record)
   }
   console.log(`\n=== wrote ${tasks.length} task(s) → ${corpusPath} · gate: tsx src/corpus-replay.mts ${corpusPath} --selector=verifier ===`)

diff --git a/bench/src/commit0-gate.mts b/bench/src/commit0-gate.mts
@@ -36,8 +36,9 @@ import {
 import { Sandbox } from '@tangle-network/sandbox'
 import { createCommit0Adapter } from './benchmarks/commit0'
 import type { BenchTask } from './benchmarks/types'
-import { type AttemptRecord, appendRunRecord, type RunRecord } from './corpus'
+import { type AttemptRecord, appendRunRecord, buildRunRecordFromAttempts } from './corpus'
 import { type BenchRuntimeHookEvent, createRuntimeHookRecorder } from './runtime-hook-recorder'
+import { pool } from './stats.mts'
 
 function must(name: string): string {
   const v = process.env[name]
@@ -96,6 +97,10 @@ interface ShotCfg {
   routerBaseUrl: string
   routerKey: string
   model: string
+  /** in-box opencode provider. `openai-compat` (default) is the generic passthrough,
+   *  so router-served cheap models resolve in-box; `openai` only accepts its
+   *  registered model names (e.g. gpt-4.1). Override via WORKER_PROVIDER. */
+  provider: string
   timeoutMs: number
   /** local-backend: the opencode CLI binary (cli-bridge fallback when the sandbox is down). */
   opencodeBin: string
@@ -146,7 +151,7 @@ async function runShot(task: BenchTask, attempt: number, cfg: ShotCfg): Promise<
       env: { OPENAI_API_KEY: cfg.routerKey, OPENAI_BASE_URL: cfg.routerBaseUrl },
       backend: {
         type: 'opencode',
-        model: { provider: 'openai', model: cfg.model, baseUrl: cfg.routerBaseUrl, apiKey: cfg.routerKey },
+        model: { provider: cfg.provider, model: cfg.model, baseUrl: cfg.routerBaseUrl, apiKey: cfg.routerKey },
       },
     },
   }
@@ -267,22 +272,6 @@ async function runShotLocal(task: BenchTask, attempt: number, cfg: ShotCfg): Pro
   }
 }
 
-/** Bounded-concurrency pool. */
-async function pool<T, R>(items: T[], limit: number, fn: (item: T, idx: number) => Promise<R>): Promise<R[]> {
-  const results = new Array<R>(items.length)
-  let next = 0
-  async function worker(): Promise<void> {
-    for (;;) {
-      const idx = next
-      next += 1
-      if (idx >= items.length) return
-      results[idx] = await fn(items[idx] as T, idx)
-    }
-  }
-  await Promise.all(Array.from({ length: Math.max(1, Math.min(limit, items.length)) }, () => worker()))
-  return results
-}
-
 async function main(): Promise<void> {
   // BACKEND=local → cli-bridge fallback (local opencode, no remote sandbox); needs a
   // sandbox-down workaround. Default 'sandbox' (the remote gateway). Local uses opencode's
@@ -297,6 +286,9 @@ async function main(): Promise<void> {
   const routerKey = needsRouterKey ? must('TANGLE_API_KEY') : (process.env.TANGLE_API_KEY ?? '')
   const sandboxBaseUrl = process.env.SANDBOX_BASE_URL ?? 'https://sandbox.tangle.tools'
   const opencodeBin = process.env.OPENCODE_BIN ?? join(process.env.HOME ?? '', '.local/bin/opencode')
+  // openai-compat = generic passthrough so cheap router models resolve in-box;
+  // `openai` rejects non-registered model names. Override via WORKER_PROVIDER.
+  const provider = process.env.WORKER_PROVIDER ?? 'openai-compat'
   const concurrency = Number(process.env.CONCURRENCY ?? 3)
   // No tight cap on the agentic rollout — it runs until the agent finishes (the clone→
   // implement→pytest-iterate loop genuinely takes a while). 0 = untimed. Only set
@@ -317,7 +309,7 @@ async function main(): Promise<void> {
   const units = tasks.flatMap((task) => Array.from({ length: k }, (_, attempt) => ({ task, attempt })))
   const where = backend === 'local' ? 'local opencode (cli-bridge)' : `in-box (${PATCH_PATH})`
   console.log(`\n▶ phase 1: ${units.length} rollouts (conc=${concurrency}) via ${where}`)
-  const cfg: ShotCfg = { sandboxBaseUrl, sandboxKey: routerKey, routerBaseUrl, routerKey, model, timeoutMs, opencodeBin }
+  const cfg: ShotCfg = { sandboxBaseUrl, sandboxKey: routerKey, routerBaseUrl, routerKey, model, provider, timeoutMs, opencodeBin }
   const runRollout = backend === 'local' ? runShotLocal : runShot
   const shots = await pool(units, concurrency, async (u) => {
     const s = await runRollout(u.task, u.attempt, cfg)
@@ -366,18 +358,14 @@ async function main(): Promise<void> {
       })
     }
     if (attempts.some((a) => a.score !== undefined)) scoredTasks += 1
-    const record: RunRecord = {
-      ts: new Date().toISOString(),
+    const record = buildRunRecordFromAttempts(attempts, {
       benchmark: adapter.name,
       instanceId: task.id,
       condition: `random@${k}`,
       model,
-      blindResolved: attempts[0]?.valid === true,
-      resolved: attempts.some((a) => a.valid === true),
-      attempts,
       infraError: false,
-      ...(runtimeEvents.length > 0 ? { runtimeEvents } : {}),
-    }
+      runtimeEvents,
+    })
     await appendRunRecord(corpusPath, record) // incremental: partial progress survives a crash
   }
 

diff --git a/bench/src/corpus.test.mts b/bench/src/corpus.test.mts
@@ -1,6 +1,6 @@
 import assert from 'node:assert/strict'
 import { isRunRecord } from '@tangle-network/agent-eval'
-import { type AttemptRecord, benchRecordToCorpusRecords, type RunRecord } from './corpus'
+import { type AttemptRecord, benchRecordToCorpusRecords, buildRunRecordFromAttempts, type RunRecord } from './corpus'
 
 const measuredAttempt = (round: number, output: string, valid: boolean): AttemptRecord => ({
   round,
@@ -103,4 +103,62 @@ const baseRec = (attempts: AttemptRecord[], over: Partial<RunRecord> = {}): RunR
   assert.equal(records[0]?.outcome.searchScore, undefined, 'no searchScore on a holdout record')
 }
 
+// --- buildRunRecordFromAttempts: default derivations from the attempts ---
+{
+  const rec = buildRunRecordFromAttempts(
+    [measuredAttempt(0, 'a', false), measuredAttempt(1, 'b', true)],
+    { benchmark: 'aec-bench', instanceId: 'i9', condition: 'random@2', model: 'gpt-5', now: () => new Date('2026-06-06T00:00:00.000Z') },
+  )
+  assert.equal(rec.ts, '2026-06-06T00:00:00.000Z', 'now() seam stamps ts')
+  assert.equal(rec.blindResolved, false, 'blindResolved = attempts[0].valid === true')
+  assert.equal(rec.resolved, true, 'resolved = any attempt valid')
+  assert.equal(rec.infraError, false, 'scored+valid attempts ⇒ not infra')
+  assert.equal(rec.attempts.length, 2)
+}
+
+// --- no scored + no valid attempt ⇒ derived infraError ---
+{
+  const bare: AttemptRecord = { round: 0, prompt: 'q', output: '', eventCount: 0, eventTypes: {} }
+  const rec = buildRunRecordFromAttempts([bare], { benchmark: 'aec-bench', instanceId: 'i', condition: 'random@1', model: 'gpt-5' })
+  assert.equal(rec.infraError, true, 'no scored + no valid ⇒ infraError true')
+  assert.equal(rec.blindResolved, false)
+  assert.equal(rec.resolved, false)
+}
+
+// --- explicit overrides preserve a gate's bespoke recorded values ---
+{
+  const partial: AttemptRecord = { round: 0, prompt: 'q', output: 'x', valid: true, score: 0.5, costUsd: 0.01, tokensIn: 1, tokensOut: 1, wallMs: 1, eventCount: 1, eventTypes: {} }
+  const rec = buildRunRecordFromAttempts([partial], {
+    benchmark: 'clbench-codebase',
+    instanceId: 'i',
+    condition: 'random@1',
+    model: 'gpt-5',
+    // a partial-credit (score 0.5) first shot is valid but NOT a full blind-resolve.
+    blindResolved: false,
+    infraError: false,
+  })
+  assert.equal(rec.blindResolved, false, 'override beats the attempts[0].valid default')
+  assert.equal(rec.resolved, true, 'resolved still derives from valid when not overridden')
+  assert.equal(rec.infraError, false)
+}
+
+// --- overrides beat a derivation that would otherwise flip the value ---
+// An unscored, non-valid attempt derives infraError=true / resolved=false by
+// default. A caller that passes `infraError: false` or its own `resolved` for
+// such a task must get exactly the value it passed, not the derived one.
+{
+  const bare: AttemptRecord = { round: 0, prompt: 'q', output: '', eventCount: 0, eventTypes: {} }
+  const rec = buildRunRecordFromAttempts([bare], {
+    benchmark: 'commit0',
+    instanceId: 'i',
+    condition: 'random@1',
+    model: 'gpt-5',
+    resolved: true,
+    infraError: false,
+  })
+  assert.equal(rec.resolved, true, 'explicit resolved beats the any-valid default')
+  assert.equal(rec.infraError, false, 'explicit infraError:false beats the derived true')
+  assert.equal(rec.blindResolved, false, 'blindResolved still derives from attempts[0].valid')
+}
+
 console.log('corpus.test.mts: all assertions passed')