Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 4 additions & 23 deletions bench/src/aec-gate.mts
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,10 @@

import { resolveAdapter } from './adapters'
import type { BenchmarkAdapter, BenchTask } from './benchmarks/types'
import { type AttemptRecord, appendRunRecord, type RunRecord } from './corpus'
import { type AttemptRecord, appendRunRecord, buildRunRecordFromAttempts } from './corpus'
import { composeStrategies } from './directives'
import { type RouterConfig, routerChatWithUsage } from './router-client'
import { pool } from './stats.mts'

function must(name: string): string {
const v = process.env[name]
Expand Down Expand Up @@ -50,23 +51,6 @@ interface AttemptOutcome {
infraError?: boolean
}

/**
 * Bounded-concurrency pool: run `fn` over `items`, at most `limit` in flight.
 *
 * Results are returned in input order (slot `i` holds `fn(items[i], i)`),
 * independent of completion order. The returned promise rejects as soon as any
 * `fn` call rejects; workers that are still running are not signalled to stop.
 *
 * @param items inputs to process
 * @param limit maximum number of concurrent `fn` calls; values below 1 — and NaN —
 *   are treated as 1, values above `items.length` are clamped to it
 * @param fn async mapper, called with the item and its index
 * @returns the mapped results, aligned with `items`
 */
async function pool<T, R>(items: T[], limit: number, fn: (item: T, idx: number) => Promise<R>): Promise<R[]> {
  const results: R[] = new Array(items.length)
  // Math.min/Math.max propagate NaN, and Array.from({ length: NaN }) is empty:
  // without this guard a NaN limit starts zero workers and the call "succeeds"
  // with an array of holes, never invoking `fn`.
  const requested = Number.isNaN(limit) ? 1 : limit
  const width = Math.max(1, Math.min(requested, items.length))
  let next = 0
  async function worker(): Promise<void> {
    for (;;) {
      // claim the next index synchronously so no two workers take the same item
      const idx = next
      next += 1
      if (idx >= items.length) return
      results[idx] = await fn(items[idx] as T, idx)
    }
  }
  const workers = Array.from({ length: width }, () => worker())
  await Promise.all(workers)
  return results
}

async function runAttempt(
cfg: RouterConfig,
adapter: BenchmarkAdapter,
Expand Down Expand Up @@ -156,20 +140,17 @@ async function runArm(
const task = tasks[t] as BenchTask
const taskOutcomes = outcomes.slice(t * k, t * k + k)
const attempts = taskOutcomes.map((o, i) => toAttemptRecord(o, i))
const record: RunRecord = {
ts: new Date().toISOString(),
const record = buildRunRecordFromAttempts(attempts, {
benchmark: adapter.name,
instanceId: task.id,
condition: arm.condition,
model: cfg.model,
blindResolved: attempts[0]?.valid === true,
// k-attempt outcome = any usable attempt resolved (the oracle@k ceiling for
// this run; the deployable selector is scored separately by corpus-replay).
resolved: taskOutcomes.some((o) => o.resolved),
attempts,
// a task whose every attempt infra-errored is itself infra-errored.
infraError: taskOutcomes.length > 0 && taskOutcomes.every((o) => o.infraError),
}
})
await appendRunRecord(corpusPath, record)
}

Expand Down
65 changes: 7 additions & 58 deletions bench/src/clbench-codebase-gate.mts
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,9 @@ import { promisify } from 'node:util'
import { acquireSandbox } from '@tangle-network/agent-runtime/loops'
import { Sandbox } from '@tangle-network/sandbox'
import { composeStrategies } from './directives'
import { type AttemptRecord, appendRunRecord, type RunRecord } from './corpus'
import { type AttemptRecord, appendRunRecord, buildRunRecordFromAttempts } from './corpus'
import { verifierGroundedSelect } from './selector'
import { type PairedLift, pairedLift, pool } from './stats.mts'

const execFileAsync = promisify(execFile)
const PATCH_PATH = '/tmp/solution.patch'
Expand Down Expand Up @@ -176,59 +177,6 @@ async function judgePatch(inst: Instance, patch: string, clbenchDir: string): Pr
}
}

/**
 * Map `fn` over `items` with at most `limit` calls in flight at once.
 *
 * Output order matches input order. A rejection from any `fn` call rejects the
 * returned promise immediately; the other workers are not cancelled.
 *
 * @param items inputs to process
 * @param limit concurrency cap; NaN or anything below 1 falls back to 1, and the
 *   cap never exceeds `items.length`
 * @param fn async mapper receiving the item and its index
 * @returns results aligned index-for-index with `items`
 */
async function pool<T, R>(items: T[], limit: number, fn: (item: T, idx: number) => Promise<R>): Promise<R[]> {
  const results: R[] = new Array(items.length)
  // NaN survives Math.min/Math.max and turns into a zero-length worker list,
  // which would silently skip every item — degrade to serial execution instead.
  const cap = Number.isNaN(limit) ? 1 : limit
  const workerCount = Math.max(1, Math.min(cap, items.length))
  let next = 0
  async function worker(): Promise<void> {
    for (;;) {
      // take-and-advance happens with no await in between, so indices are unique
      const idx = next
      next += 1
      if (idx >= items.length) return
      results[idx] = await fn(items[idx] as T, idx)
    }
  }
  await Promise.all(Array.from({ length: workerCount }, () => worker()))
  return results
}

/**
 * Build a deterministic pseudo-random generator from a 32-bit seed.
 *
 * @param seed starting state; coerced to a signed 32-bit integer
 * @returns a function yielding the next value in [0, 1) on each call; the same
 *   seed always produces the same sequence
 */
function makeRng(seed: number): () => number {
  let state = seed | 0
  return function next(): number {
    // advance the 32-bit state by a fixed odd increment (wraps via `| 0`)
    state = (state + 0x6d2b79f5) | 0
    const mixed = Math.imul(state ^ (state >>> 15), state | 1)
    const scrambled = (mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61)) ^ mixed
    // reinterpret as unsigned 32-bit and scale by 2^32 into [0, 1)
    return ((scrambled ^ (scrambled >>> 14)) >>> 0) / 2 ** 32
  }
}

/** Result of `pairedLift`: a paired treatment-vs-baseline effect with a bootstrap interval. */
interface PairedLift {
// mean of the per-pair deltas (treatment − baseline)
point: number
// lower bound: the sorted bootstrap mean at index floor(0.025 · bootstrapN)
low: number
// upper bound: the sorted bootstrap mean at index floor(0.975 · bootstrapN)
high: number
// number of aligned (baseline, treatment) pairs
pairs: number
// number of pairs whose delta differs from zero by more than 1e-9
discordant: number
}

/**
 * Paired lift: the mean per-pair difference (treatment − baseline) plus a
 * percentile-bootstrap interval over resampled pairs.
 *
 * The resampling RNG uses a fixed seed, so identical inputs give identical output.
 *
 * @param baseline per-task baseline values
 * @param treatment per-task treatment values, aligned index-for-index with `baseline`
 * @param bootstrapN number of bootstrap resamples (default 10000)
 * @returns point estimate, interval bounds (NaN when the percentile index has no
 *   sample), pair count, and the count of non-zero deltas
 * @throws Error when the arms differ in length or are empty
 */
function pairedLift(baseline: number[], treatment: number[], bootstrapN = 10000): PairedLift {
  const pairs = baseline.length
  if (pairs !== treatment.length) throw new Error('pairedLift: misaligned arms')
  if (pairs === 0) throw new Error('pairedLift: no pairs')

  const deltas = treatment.map((t, i) => t - (baseline[i] as number))

  let total = 0
  let discordant = 0
  for (const d of deltas) {
    total += d
    if (Math.abs(d) > 1e-9) discordant += 1
  }

  // resample `pairs` deltas with replacement, `bootstrapN` times, recording each mean
  const rng = makeRng(0x9e3779b9)
  const resampledMeans: number[] = []
  for (let rep = 0; rep < bootstrapN; rep += 1) {
    let sum = 0
    for (let draw = 0; draw < pairs; draw += 1) {
      sum += deltas[Math.floor(rng() * pairs)] as number
    }
    resampledMeans.push(sum / pairs)
  }
  resampledMeans.sort((a, b) => a - b)

  const percentile = (q: number): number => resampledMeans[Math.floor(q * bootstrapN)] ?? Number.NaN
  return {
    point: total / pairs,
    low: percentile(0.025),
    high: percentile(0.975),
    pairs,
    discordant,
  }
}

/** Format a fraction as a percentage with one decimal place (0.5 → "50.0%"). */
const pct = (x: number) => {
  const scaled = x * 100
  return scaled.toFixed(1) + '%'
}
/** Format a fractional delta as signed percentage points (0.123 → "+12.3pp"). */
const pp = (x: number) => {
  let sign = ''
  if (x >= 0) sign = '+'
  return sign + (x * 100).toFixed(1) + 'pp'
}

Expand Down Expand Up @@ -342,17 +290,18 @@ async function main(): Promise<void> {
traceTail: (grp.rPatch[round] ?? '').slice(-600),
}))
const validPasses = grp.random.filter((p): p is number => p !== null && p !== undefined)
const record: RunRecord = {
ts: new Date().toISOString(),
const record = buildRunRecordFromAttempts(attempts, {
benchmark: 'clbench-codebase',
instanceId: inst.instanceId,
condition: `random@${k}`,
model,
// blindResolved reads off the FIRST non-null score (== full pass), not
// attempts[0].valid — a partial-credit first shot is valid but not "blind
// resolved". Pass it explicitly so the helper preserves the exact value.
blindResolved: validPasses[0] === 1,
resolved: validPasses.some((p) => p > 0),
attempts,
infraError: validPasses.length === 0,
}
})
await appendRunRecord(corpusPath, record)
}
console.log(`\n=== wrote ${instances.length} task(s) → ${corpusPath} · gate: tsx src/corpus-replay.mts ${corpusPath} --selector=verifier ===`)
Expand Down
72 changes: 4 additions & 68 deletions bench/src/clbench-context-gate.mts
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,10 @@
import { execFileSync } from 'node:child_process'
import { existsSync, readFileSync } from 'node:fs'
import { composeStrategies } from './directives'
import { type AttemptRecord, appendRunRecord, type RunRecord } from './corpus'
import { type AttemptRecord, appendRunRecord, buildRunRecordFromAttempts } from './corpus'
import { type RouterConfig, routerChatWithUsage } from './router-client'
import { selfConsistencySelect, verifierGroundedSelect } from './selector'
import { type PairedLift, pairedLift, pool } from './stats.mts'

const datasetUrl = 'https://huggingface.co/datasets/tencent/CL-bench/resolve/main/CL-bench.jsonl'

Expand Down Expand Up @@ -168,67 +169,6 @@ async function judgeRubrics(cfg: RouterConfig, task: CtxTask, output: string): P
}
}

/**
 * Run `fn` over every item with bounded concurrency.
 *
 * At most `limit` calls are in flight; results keep the order of `items`. If any
 * call rejects, the returned promise rejects with that error while the remaining
 * workers are left to run on.
 *
 * @param items inputs to process
 * @param limit concurrency cap; clamped to the range [1, items.length], with NaN
 *   treated as 1
 * @param fn async mapper receiving the item and its index
 * @returns one result per item, in input order
 */
async function pool<T, R>(items: T[], limit: number, fn: (item: T, idx: number) => Promise<R>): Promise<R[]> {
  const results: R[] = new Array(items.length)
  // Guard against NaN: Math.max(1, NaN) is NaN and Array.from({ length: NaN })
  // creates no workers, so nothing would run and the result would be all holes.
  const safeLimit = Number.isNaN(limit) ? 1 : limit
  const lanes = Math.max(1, Math.min(safeLimit, items.length))
  let next = 0
  async function worker(): Promise<void> {
    for (;;) {
      // index is claimed before the first await, so each item is handled once
      const idx = next
      next += 1
      if (idx >= items.length) return
      results[idx] = await fn(items[idx] as T, idx)
    }
  }
  await Promise.all(Array.from({ length: lanes }, () => worker()))
  return results
}

/**
 * Seeded pseudo-random number generator.
 *
 * @param seed initial state, truncated to a signed 32-bit integer
 * @returns a closure that returns the next number in [0, 1) each time it is
 *   called; sequences are fully determined by the seed
 */
function makeRng(seed: number): () => number {
  let cursor = seed | 0
  const step = (): number => {
    // fixed-increment state update, kept in int32 range by `| 0`
    cursor = (cursor + 0x6d2b79f5) | 0
    const first = Math.imul(cursor ^ (cursor >>> 15), cursor | 1)
    const second = (first + Math.imul(first ^ (first >>> 7), first | 61)) ^ first
    const asUint32 = (second ^ (second >>> 14)) >>> 0
    return asUint32 / 4294967296
  }
  return step
}

/** Shape returned by `pairedLift`: point estimate, bootstrap bounds, and pair counts. */
interface PairedLift {
// mean over pairs of (treatment − baseline)
point: number
// sorted bootstrap mean taken at index floor(0.025 · bootstrapN)
low: number
// sorted bootstrap mean taken at index floor(0.975 · bootstrapN)
high: number
// how many (baseline, treatment) pairs were compared
pairs: number
// how many pairs have |delta| greater than 1e-9
discordant: number
}

/**
 * Paired lift = mean over tasks of (treatment − baseline) with a 95% bootstrap CI.
 * Works on continuous per-task values (rubric fractions) as well as {0,1}.
 *
 * Resampling is driven by a fixed-seed RNG, so repeated calls on the same inputs
 * return the same interval.
 *
 * @param baseline per-task baseline values
 * @param treatment per-task treatment values, index-aligned with `baseline`
 * @param bootstrapN number of bootstrap resamples (default 10000)
 * @returns the point estimate, the interval bounds (NaN if the percentile index
 *   falls outside the resample list), the pair count, and the non-zero-delta count
 * @throws Error if the arms have different lengths or contain no pairs
 */
function pairedLift(baseline: number[], treatment: number[], bootstrapN = 10000): PairedLift {
  if (baseline.length !== treatment.length) throw new Error('pairedLift: misaligned arms')
  const count = baseline.length
  if (count === 0) throw new Error('pairedLift: no pairs')

  const diffs = baseline.map((base, i) => (treatment[i] as number) - base)

  let running = 0
  for (const diff of diffs) running += diff
  const point = running / diffs.length

  const discordant = diffs.reduce((hits, diff) => (Math.abs(diff) > 1e-9 ? hits + 1 : hits), 0)

  // each replicate averages `count` deltas drawn with replacement
  const nextUniform = makeRng(0x9e3779b9)
  const replicateMeans: number[] = []
  let replicate = 0
  while (replicate < bootstrapN) {
    let acc = 0
    for (let k = 0; k < count; k += 1) {
      const pick = Math.floor(nextUniform() * count)
      acc += diffs[pick] as number
    }
    replicateMeans.push(acc / count)
    replicate += 1
  }
  replicateMeans.sort((lhs, rhs) => lhs - rhs)

  const lowIdx = Math.floor(0.025 * bootstrapN)
  const highIdx = Math.floor(0.975 * bootstrapN)
  return {
    point,
    low: replicateMeans[lowIdx] ?? Number.NaN,
    high: replicateMeans[highIdx] ?? Number.NaN,
    pairs: count,
    discordant,
  }
}

/** Render a fraction as a one-decimal percentage string, e.g. 0.25 → "25.0%". */
const pct = (x: number) => [(x * 100).toFixed(1), '%'].join('')
/** Render a fractional delta as signed percentage points, e.g. -0.25 → "-25.0pp". */
const pp = (x: number) => {
  const magnitude = (x * 100).toFixed(1)
  if (x >= 0) return '+' + magnitude + 'pp'
  return magnitude + 'pp'
}

Expand Down Expand Up @@ -347,17 +287,13 @@ async function main(): Promise<void> {
eventTypes: { 'router.chat': 1 },
traceTail: s.output.slice(-600),
}))
const record: RunRecord = {
ts: new Date().toISOString(),
const record = buildRunRecordFromAttempts(attempts, {
benchmark: 'clbench-context',
instanceId: task.id,
condition: `random@${k}`,
model,
blindResolved: (grp.random[0] as Shot).verdict.allPass,
resolved: grp.random.some((s) => s.verdict.allPass),
attempts,
infraError: false,
}
})
await appendRunRecord(corpusPath, record)
}
console.log(`\n=== wrote ${tasks.length} task(s) → ${corpusPath} · gate: tsx src/corpus-replay.mts ${corpusPath} --selector=verifier ===`)
Expand Down
40 changes: 14 additions & 26 deletions bench/src/commit0-gate.mts
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,9 @@ import {
import { Sandbox } from '@tangle-network/sandbox'
import { createCommit0Adapter } from './benchmarks/commit0'
import type { BenchTask } from './benchmarks/types'
import { type AttemptRecord, appendRunRecord, type RunRecord } from './corpus'
import { type AttemptRecord, appendRunRecord, buildRunRecordFromAttempts } from './corpus'
import { type BenchRuntimeHookEvent, createRuntimeHookRecorder } from './runtime-hook-recorder'
import { pool } from './stats.mts'

function must(name: string): string {
const v = process.env[name]
Expand Down Expand Up @@ -96,6 +97,10 @@ interface ShotCfg {
routerBaseUrl: string
routerKey: string
model: string
/** in-box opencode provider. `openai-compat` (default) is the generic passthrough,
* so router-served cheap models resolve in-box; `openai` only accepts its
* registered model names (e.g. gpt-4.1). Override via WORKER_PROVIDER. */
provider: string
timeoutMs: number
/** local-backend: the opencode CLI binary (cli-bridge fallback when the sandbox is down). */
opencodeBin: string
Expand Down Expand Up @@ -146,7 +151,7 @@ async function runShot(task: BenchTask, attempt: number, cfg: ShotCfg): Promise<
env: { OPENAI_API_KEY: cfg.routerKey, OPENAI_BASE_URL: cfg.routerBaseUrl },
backend: {
type: 'opencode',
model: { provider: 'openai', model: cfg.model, baseUrl: cfg.routerBaseUrl, apiKey: cfg.routerKey },
model: { provider: cfg.provider, model: cfg.model, baseUrl: cfg.routerBaseUrl, apiKey: cfg.routerKey },
},
},
}
Expand Down Expand Up @@ -267,22 +272,6 @@ async function runShotLocal(task: BenchTask, attempt: number, cfg: ShotCfg): Pro
}
}

/**
 * Bounded-concurrency pool.
 *
 * Calls `fn(item, idx)` for every item, keeping at most `limit` calls in flight,
 * and returns the results in input order. The returned promise rejects on the
 * first rejected call; workers still in progress are not stopped.
 *
 * @param items inputs to process
 * @param limit concurrency cap, clamped to [1, items.length]; NaN is treated as 1
 * @param fn async mapper receiving the item and its index
 * @returns results aligned with `items`
 */
async function pool<T, R>(items: T[], limit: number, fn: (item: T, idx: number) => Promise<R>): Promise<R[]> {
  const results = new Array<R>(items.length)
  // A NaN limit would pass through Math.min/Math.max unchanged and make
  // Array.from build zero workers — every item skipped, no error raised.
  // Fall back to one worker so the work still runs.
  const effectiveLimit = Number.isNaN(limit) ? 1 : limit
  const workerTotal = Math.max(1, Math.min(effectiveLimit, items.length))
  let next = 0
  async function worker(): Promise<void> {
    for (;;) {
      // claim the slot before awaiting so concurrent workers never collide
      const idx = next
      next += 1
      if (idx >= items.length) return
      results[idx] = await fn(items[idx] as T, idx)
    }
  }
  await Promise.all(Array.from({ length: workerTotal }, () => worker()))
  return results
}

async function main(): Promise<void> {
// BACKEND=local → cli-bridge fallback (local opencode, no remote sandbox); needs a
// sandbox-down workaround. Default 'sandbox' (the remote gateway). Local uses opencode's
Expand All @@ -297,6 +286,9 @@ async function main(): Promise<void> {
const routerKey = needsRouterKey ? must('TANGLE_API_KEY') : (process.env.TANGLE_API_KEY ?? '')
const sandboxBaseUrl = process.env.SANDBOX_BASE_URL ?? 'https://sandbox.tangle.tools'
const opencodeBin = process.env.OPENCODE_BIN ?? join(process.env.HOME ?? '', '.local/bin/opencode')
// openai-compat = generic passthrough so cheap router models resolve in-box;
// `openai` rejects non-registered model names. Override via WORKER_PROVIDER.
const provider = process.env.WORKER_PROVIDER ?? 'openai-compat'
const concurrency = Number(process.env.CONCURRENCY ?? 3)
// No tight cap on the agentic rollout — it runs until the agent finishes (the clone→
// implement→pytest-iterate loop genuinely takes a while). 0 = untimed. Only set
Expand All @@ -317,7 +309,7 @@ async function main(): Promise<void> {
const units = tasks.flatMap((task) => Array.from({ length: k }, (_, attempt) => ({ task, attempt })))
const where = backend === 'local' ? 'local opencode (cli-bridge)' : `in-box (${PATCH_PATH})`
console.log(`\n▶ phase 1: ${units.length} rollouts (conc=${concurrency}) via ${where}`)
const cfg: ShotCfg = { sandboxBaseUrl, sandboxKey: routerKey, routerBaseUrl, routerKey, model, timeoutMs, opencodeBin }
const cfg: ShotCfg = { sandboxBaseUrl, sandboxKey: routerKey, routerBaseUrl, routerKey, model, provider, timeoutMs, opencodeBin }
const runRollout = backend === 'local' ? runShotLocal : runShot
const shots = await pool(units, concurrency, async (u) => {
const s = await runRollout(u.task, u.attempt, cfg)
Expand Down Expand Up @@ -366,18 +358,14 @@ async function main(): Promise<void> {
})
}
if (attempts.some((a) => a.score !== undefined)) scoredTasks += 1
const record: RunRecord = {
ts: new Date().toISOString(),
const record = buildRunRecordFromAttempts(attempts, {
benchmark: adapter.name,
instanceId: task.id,
condition: `random@${k}`,
model,
blindResolved: attempts[0]?.valid === true,
resolved: attempts.some((a) => a.valid === true),
attempts,
infraError: false,
...(runtimeEvents.length > 0 ? { runtimeEvents } : {}),
}
runtimeEvents,
})
await appendRunRecord(corpusPath, record) // incremental: partial progress survives a crash
}

Expand Down
41 changes: 40 additions & 1 deletion bench/src/corpus.test.mts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import assert from 'node:assert/strict'
import { isRunRecord } from '@tangle-network/agent-eval'
import { type AttemptRecord, benchRecordToCorpusRecords, type RunRecord } from './corpus'
import { type AttemptRecord, benchRecordToCorpusRecords, buildRunRecordFromAttempts, type RunRecord } from './corpus'

const measuredAttempt = (round: number, output: string, valid: boolean): AttemptRecord => ({
round,
Expand Down Expand Up @@ -103,4 +103,43 @@ const baseRec = (attempts: AttemptRecord[], over: Partial<RunRecord> = {}): RunR
assert.equal(records[0]?.outcome.searchScore, undefined, 'no searchScore on a holdout record')
}

// --- buildRunRecordFromAttempts: default derivations from the attempts ---
{
  // pin the clock through the `now` option so the asserted timestamp is exact
  const fixedClock = () => new Date('2026-06-06T00:00:00.000Z')
  const missThenHit = [measuredAttempt(0, 'a', false), measuredAttempt(1, 'b', true)]
  const built = buildRunRecordFromAttempts(missThenHit, {
    benchmark: 'aec-bench',
    instanceId: 'i9',
    condition: 'random@2',
    model: 'gpt-5',
    now: fixedClock,
  })
  assert.equal(built.ts, '2026-06-06T00:00:00.000Z', 'now() seam stamps ts')
  assert.equal(built.blindResolved, false, 'blindResolved = attempts[0].valid === true')
  assert.equal(built.resolved, true, 'resolved = any attempt valid')
  assert.equal(built.infraError, false, 'scored+valid attempts ⇒ not infra')
  assert.equal(built.attempts.length, 2)
}

// --- no scored + no valid attempt ⇒ derived infraError ---
{
  // an attempt carrying neither `valid` nor `score`
  const unscored: AttemptRecord = {
    round: 0,
    prompt: 'q',
    output: '',
    eventCount: 0,
    eventTypes: {},
  }
  const built = buildRunRecordFromAttempts([unscored], {
    benchmark: 'aec-bench',
    instanceId: 'i',
    condition: 'random@1',
    model: 'gpt-5',
  })
  assert.equal(built.infraError, true, 'no scored + no valid ⇒ infraError true')
  assert.equal(built.blindResolved, false)
  assert.equal(built.resolved, false)
}

// --- explicit overrides preserve a gate's bespoke recorded values ---
{
  const halfCredit: AttemptRecord = {
    round: 0,
    prompt: 'q',
    output: 'x',
    valid: true,
    score: 0.5,
    costUsd: 0.01,
    tokensIn: 1,
    tokensOut: 1,
    wallMs: 1,
    eventCount: 1,
    eventTypes: {},
  }
  const built = buildRunRecordFromAttempts([halfCredit], {
    benchmark: 'clbench-codebase',
    instanceId: 'i',
    condition: 'random@1',
    model: 'gpt-5',
    // a partial-credit (score 0.5) first shot is valid but NOT a full blind-resolve.
    blindResolved: false,
    infraError: false,
  })
  assert.equal(built.blindResolved, false, 'override beats the attempts[0].valid default')
  assert.equal(built.resolved, true, 'resolved still derives from valid when not overridden')
  assert.equal(built.infraError, false)
}

console.log('corpus.test.mts: all assertions passed')
Loading
Loading