Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 23 additions & 10 deletions bench/src/clbench-context-gate.mts
Original file line number Diff line number Diff line change
Expand Up @@ -65,23 +65,28 @@ interface CtxTask {
* Fail loud on a malformed record — a silently-short task set would poison the gate. */
function loadCtxTasks(limit: number, offset: number): CtxTask[] {
const need = offset + limit
// Fetch a 2-line buffer past `need`: on CL-bench's huge multi-KB records, `head`
// closing the pipe can emit a TRUNCATED final line (SIGPIPE mid-write) → invalid
// JSON. Fetching need+2 and parsing only the first `need` complete lines makes the
// truncated tail land in the discarded buffer.
const fetchN = need + 2
let raw: string
const cached = process.env.CLBENCH_CTX_FILE
if (cached) {
if (!existsSync(cached)) throw new Error(`CLBENCH_CTX_FILE not found: ${cached}`)
raw = execFileSync('bash', ['-c', `head -n ${need} ${JSON.stringify(cached)}`], { maxBuffer: 1 << 30 }).toString('utf8')
raw = execFileSync('bash', ['-c', `head -n ${fetchN} ${JSON.stringify(cached)}`], { maxBuffer: 1 << 30 }).toString('utf8')
} else {
// -fsSL: fail on HTTP error, follow redirects (HF resolve 302s to the CDN). `head`
// closing the pipe after `need` lines gives curl a benign SIGPIPE (exit 23) on a
// multi-hundred-MB file — suppress curl's stderr so it isn't mistaken for a fault;
// a real fetch failure surfaces as 0 parsed tasks below.
raw = execFileSync('bash', ['-c', `curl -fsSL ${JSON.stringify(datasetUrl)} 2>/dev/null | head -n ${need}`], {
// -fsSL: fail on HTTP error, follow redirects (HF resolve 302s to the CDN). curl's
// SIGPIPE (exit 23) when head closes is benign — suppress its stderr; a real fetch
// failure surfaces as 0 parsed tasks below.
raw = execFileSync('bash', ['-c', `curl -fsSL ${JSON.stringify(datasetUrl)} 2>/dev/null | head -n ${fetchN}`], {
maxBuffer: 1 << 30,
}).toString('utf8')
}
const tasks: CtxTask[] = []
for (const line of raw.split('\n')) {
if (line.trim() === '') continue
// Only the first `need` lines are guaranteed complete (the +2 absorbs head's tail).
const lines = raw.split('\n').filter((l) => l.trim() !== '').slice(0, need)
for (const line of lines) {
const d = JSON.parse(line) as {
messages?: ChatMessage[]
rubrics?: unknown[]
Expand Down Expand Up @@ -151,8 +156,16 @@ function parseJudge(reply: string, rubricCount: number): RubricVerdict {
/**
 * Grade one model output against a task's rubrics using the router-backed judge.
 *
 * Blank (whitespace-only) output short-circuits to a zero verdict without calling
 * the judge. Otherwise the rubrics are rendered as a 1-based numbered list, sent as
 * a single user message at temperature 0, and the reply is handed to `parseJudge`
 * together with the rubric count.
 *
 * @param cfg - Router configuration forwarded to `routerChatWithUsage`.
 * @param task - Task whose `rubrics` are judged.
 * @param output - Model output to grade.
 * @returns The parsed verdict, or `{ fraction: 0, allPass: false, graded: 0 }` when
 *   the output is blank or when the judge call / reply parsing throws. A judge
 *   failure is reported through that zero verdict rather than a rejection.
 */
async function judgeRubrics(cfg: RouterConfig, task: CtxTask, output: string): Promise<RubricVerdict> {
  if (!output.trim()) return { fraction: 0, allPass: false, graded: 0 }
  const rubricsText = task.rubrics.map((r, i) => `${i + 1}. ${r}`).join('\n')
  // Fault-isolate the judge: a transient router failure (after retries) or an
  // unparseable judge reply scores this attempt 0 (eval.py's convention). It must
  // NOT throw — one bad grade would otherwise crash the whole N×K×2 run. graded=0
  // marks it as judge-failed so it's distinguishable from a real 0/N rubric pass.
  // The router call lives ONLY inside this try: an unguarded call ahead of it would
  // both bypass the isolation and make this block unreachable.
  try {
    const res = await routerChatWithUsage(cfg, [{ role: 'user', content: judgePrompt(rubricsText, output) }], { temperature: 0 })
    // Non-string content is treated as an empty reply and left to parseJudge.
    return parseJudge(typeof res.content === 'string' ? res.content : '', task.rubrics.length)
  } catch {
    return { fraction: 0, allPass: false, graded: 0 }
  }
}

async function pool<T, R>(items: T[], limit: number, fn: (item: T, idx: number) => Promise<R>): Promise<R[]> {
Expand Down
7 changes: 5 additions & 2 deletions bench/src/router-client.ts
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,11 @@ export async function routerChatWithUsage(
}
// Non-retryable (auth/quota/malformed) fails loud immediately; retryable
// statuses back off and continue until the loop's attempt bound, then the
// post-loop throw is the honest "exhausted retries" terminal.
if (![429, 500, 502, 503, 504].includes(status)) throw new Error(lastErr)
// post-loop throw is the honest "exhausted retries" terminal. 408/425 + the
// Cloudflare-origin family (520/522/524) are transient under heavy parallel
// load — a fleet of concurrent gate runs hits 524 ("origin timeout") and must
// retry, not crash the whole run.
if (![408, 425, 429, 500, 502, 503, 504, 520, 522, 524].includes(status)) throw new Error(lastErr)
if (attempt < 4) await new Promise((r) => setTimeout(r, 800 * 2 ** attempt))
}
throw new Error(`${lastErr} (exhausted retries)`)
Expand Down
Loading