Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -827,8 +827,10 @@ export type {
} from './llm-client'
export {
assertLlmRoute,
backoffMs,
callLlm,
callLlmJson,
isTransientLlmError,
LlmCallError,
LlmClient,
LlmRouteAssertionError,
Expand Down
41 changes: 9 additions & 32 deletions src/judge-retry.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
* with `mode: 'exclude-failed'` drops the trial.
*/

import { backoffMs, isTransientLlmError } from './llm-client'

/** Retry policy for judge LLM calls. */
export interface JudgeRetryPolicy {
/** Max attempts per model. Default 3 (one initial + two retries). */
Expand All @@ -28,10 +30,11 @@ export interface JudgeRetryPolicy {
/** Exponential backoff function, default `attempt → min(500 * 2^attempt, 16_000)`. */
backoffMs?: (attempt: number) => number
/**
* Predicate deciding whether an error should trigger a retry. Default
* retries on: AbortError, TimeoutError, `fetch failed`, `ECONNRESET`,
* `[This operation was aborted]`, and any LlmCallError with status in
* {429, 502, 503, 504}. JSON-parse errors are NOT retriable (the model
* Predicate deciding whether an error should trigger a retry. Defaults to
* `isTransientLlmError` — the package-wide classifier shared with
* `callLlm` — which retries aborts/timeouts, network faults, HTTP/2
* transport faults, and any `LlmCallError` with status in {429,502,503,504}.
* JSON-parse and schema-rejection errors are NOT retriable (the model
* needs prompt adjustment, not another shot).
*/
isRetryable?: (err: unknown) => boolean
Expand All @@ -55,32 +58,6 @@ export interface JudgeRetryOutcome<T> {

const DEFAULT_MAX_ATTEMPTS = 3
const DEFAULT_TIMEOUT_MS = 90_000
const DEFAULT_BACKOFF = (attempt: number) => Math.min(500 * 2 ** attempt, 16_000)

const ABORT_PATTERNS = [
/AbortError/i,
/TimeoutError/i,
/fetch failed/i,
/ECONNRESET/i,
/ETIMEDOUT/i,
/EAI_AGAIN/i,
/this operation was aborted/i,
/stream.*ended.*unexpectedly/i,
/socket hang up/i,
]

const RETRYABLE_HTTP_STATUS = new Set([429, 502, 503, 504])

function defaultIsRetryable(err: unknown): boolean {
if (err instanceof Error) {
if (ABORT_PATTERNS.some((p) => p.test(err.message) || p.test(err.name))) return true
// LlmCallError exposes `status` as a numeric property; check without
// hard-importing to avoid a circular dep.
const status = (err as unknown as { status?: number }).status
if (typeof status === 'number' && RETRYABLE_HTTP_STATUS.has(status)) return true
}
return false
}

function sleep(ms: number): Promise<void> {
return new Promise((resolve) => setTimeout(resolve, ms))
Expand All @@ -103,8 +80,8 @@ export async function withJudgeRetry<T>(
): Promise<JudgeRetryOutcome<T>> {
const maxAttempts = policy.maxAttempts ?? DEFAULT_MAX_ATTEMPTS
const timeoutMs = policy.timeoutMs ?? DEFAULT_TIMEOUT_MS
const backoff = policy.backoffMs ?? DEFAULT_BACKOFF
const isRetryable = policy.isRetryable ?? defaultIsRetryable
const backoff = policy.backoffMs ?? backoffMs
const isRetryable = policy.isRetryable ?? isTransientLlmError
const models =
policy.models && policy.models.length > 0 ? policy.models : [undefined as unknown as string]

Expand Down
65 changes: 65 additions & 0 deletions src/llm-client.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import {
callLlm,
callLlmJson,
extractJsonPayload,
isTransientLlmError,
LlmCallError,
LlmClient,
stripFencedJson,
Expand Down Expand Up @@ -235,6 +236,70 @@ describe('llm-client — retry semantics', () => {
const r = await callLlm({ model: 'm', messages: [] }, { fetch, maxRetries: 3 })
expect(r.content).toBe('recovered')
})

it('retries an HTTP/2 transport fault instead of crashing', async () => {
// Regression: undici raises `terminated` / NGHTTP2_INTERNAL_ERROR for an
// HTTP/2 connection that drops mid-response. The old classifier only
// matched `fetch failed|ECONNRESET|ETIMEDOUT|EAI_AGAIN`, so this escaped
// the retry loop and surfaced as an uncaught rejection.
let call = 0
const fetch: typeof globalThis.fetch = (async (_url: string) => {
call++
if (call === 1) {
const cause = new Error('NGHTTP2_INTERNAL_ERROR')
throw new TypeError('terminated', { cause })
}
return mkOkResponse({ choices: [{ message: { content: 'recovered' } }], usage: {} })
}) as unknown as typeof globalThis.fetch
const r = await callLlm({ model: 'm', messages: [] }, { fetch, maxRetries: 3 })
expect(r.content).toBe('recovered')
expect(call).toBe(2)
})
})

describe('llm-client — isTransientLlmError classification', () => {
it('classifies HTTP/2 + undici transport faults as transient', () => {
expect(isTransientLlmError(new Error('terminated'))).toBe(true)
expect(isTransientLlmError(new Error('NGHTTP2_INTERNAL_ERROR'))).toBe(true)
expect(isTransientLlmError(new Error('other side closed'))).toBe(true)
expect(isTransientLlmError(new Error('socket hang up'))).toBe(true)
const coded = Object.assign(new Error('connection lost'), { code: 'UND_ERR_SOCKET' })
expect(isTransientLlmError(coded)).toBe(true)
})

it('follows the undici cause chain to the real socket fault', () => {
const wrapped = new TypeError('fetch failed', {
cause: new Error('terminated', { cause: new Error('NGHTTP2_INTERNAL_ERROR') }),
})
expect(isTransientLlmError(wrapped)).toBe(true)
})

it('classifies network + abort faults as transient', () => {
expect(isTransientLlmError(new Error('ECONNRESET'))).toBe(true)
const abort = new Error('aborted')
abort.name = 'AbortError'
expect(isTransientLlmError(abort)).toBe(true)
})

it('treats LlmCallError as transient only on retriable status', () => {
expect(isTransientLlmError(new LlmCallError('rate limited', 429, '', 'm'))).toBe(true)
expect(isTransientLlmError(new LlmCallError('bad gateway', 503, '', 'm'))).toBe(true)
expect(isTransientLlmError(new LlmCallError('bad request', 400, '', 'm'))).toBe(false)
expect(isTransientLlmError(new LlmCallError('unauthorized', 401, '', 'm'))).toBe(false)
})

it('does NOT retry deterministic failures', () => {
expect(isTransientLlmError(new SyntaxError('Unexpected token < in JSON'))).toBe(false)
expect(isTransientLlmError(new Error('response_format json_schema not supported'))).toBe(false)
expect(isTransientLlmError('not an error')).toBe(false)
expect(isTransientLlmError(undefined)).toBe(false)
})

it('terminates on a self-referential cause chain', () => {
const err = new Error('boom') as Error & { cause?: unknown }
err.cause = err
expect(isTransientLlmError(err)).toBe(false)
})
})

describe('llm-client — callLlmJson + schema degrade', () => {
Expand Down
66 changes: 56 additions & 10 deletions src/llm-client.ts
Original file line number Diff line number Diff line change
Expand Up @@ -135,14 +135,60 @@ const DEFAULT_MAX_RETRIES = 3

const RETRYABLE_STATUS = new Set([429, 502, 503, 504])

function isRetryableError(err: unknown): boolean {
/**
* Transient transport/network error signatures, matched against an error's
* name, message, and `code`. Covers fetch/undici network failures, aborts
* and timeouts, and — critically — HTTP/2 transport faults a keep-alive
* connection raises mid-response: `terminated`, `NGHTTP2_INTERNAL_ERROR`,
* `UND_ERR_*`, `other side closed`. Those last ones carry no clean HTTP
* status; unrecognised, they escape the retry loop and surface as an
* uncaught rejection.
*/
const TRANSIENT_ERROR_PATTERNS: readonly RegExp[] = [
/AbortError/i,
/TimeoutError/i,
/this operation was aborted/i,
/fetch failed/i,
/ECONNRESET/i,
/ETIMEDOUT/i,
/EAI_AGAIN/i,
/socket hang up/i,
/stream.*ended.*unexpectedly/i,
/terminated/i,
/other side closed/i,
/NGHTTP2/i,
/UND_ERR/i,
]

/**
* True when an error is a transient transport/network fault worth retrying,
* as opposed to a deterministic failure (4xx schema reject, JSON parse) that
* a retry cannot fix. Inspects `LlmCallError.status`, then the error's
* name/message/code, then recurses into `error.cause` — undici nests the
* real socket fault one or more levels under `.cause`.
*
* This is THE retry classifier for the package: `callLlm` and
* `withJudgeRetry` both route through it, so a connection-class error is
* treated identically whether it surfaces in the HTTP client or a
* TCloud-backed judge.
*/
export function isTransientLlmError(err: unknown): boolean {
return classifyTransient(err, 0)
}

function classifyTransient(err: unknown, depth: number): boolean {
if (err instanceof LlmCallError) return RETRYABLE_STATUS.has(err.status)
if (err instanceof Error) {
return (
err.name === 'AbortError' ||
err.name === 'TimeoutError' ||
/fetch failed|ECONNRESET|ETIMEDOUT|EAI_AGAIN/i.test(err.message)
)
if (!(err instanceof Error)) return false
// Foreign errors (e.g. a TCloud judge SDK error) can carry a numeric HTTP
// status without being an LlmCallError — a retryable status is decisive.
const status = (err as { status?: unknown }).status
if (typeof status === 'number' && RETRYABLE_STATUS.has(status)) return true
const code = (err as { code?: unknown }).code
const haystack = `${err.name}\n${err.message}\n${typeof code === 'string' ? code : ''}`
if (TRANSIENT_ERROR_PATTERNS.some((p) => p.test(haystack))) return true
const cause = (err as { cause?: unknown }).cause
if (depth < 4 && cause instanceof Error && cause !== err) {
return classifyTransient(cause, depth + 1)
}
return false
}
Expand All @@ -157,8 +203,8 @@ function parseRetryAfter(headers: Headers): number | null {
return null
}

function backoffMs(attempt: number): number {
// 500ms, 1s, 2s, 4s, ...
/** Exponential backoff: 500ms, 1s, 2s, 4s, ... capped at 16s. Attempt is 0-indexed. */
export function backoffMs(attempt: number): number {
return Math.min(500 * 2 ** attempt, 16_000)
}

Expand Down Expand Up @@ -478,7 +524,7 @@ export async function callLlm(
redactedFields: [],
})
}
if (attempt < maxRetries - 1 && isRetryableError(err)) {
if (attempt < maxRetries - 1 && isTransientLlmError(err)) {
await sleep(backoffMs(attempt))
continue
}
Expand Down
20 changes: 20 additions & 0 deletions tests/judge-retry.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -100,4 +100,24 @@ describe('withJudgeRetry — substrate guard against silent-zero judge corruptio
expect(out.succeeded).toBe(true)
expect(out.attempts).toBe(2)
})

it('retries an HTTP/2 transport fault via the shared classifier', async () => {
// Regression: judge-retry and llm-client carried separate retry-pattern
// lists; neither matched undici HTTP/2 faults (`terminated`,
// NGHTTP2_INTERNAL_ERROR). A TCloud-backed judge hitting one would fail
// the trial as a silent non-retry. Both now route through
// isTransientLlmError.
let calls = 0
const fn = async () => {
calls += 1
if (calls === 1) {
throw new TypeError('terminated', { cause: new Error('NGHTTP2_INTERNAL_ERROR') })
}
return { score: 0.81 }
}
const out = await withJudgeRetry(fn, { maxAttempts: 3, backoffMs: () => 1 })
expect(out.succeeded).toBe(true)
expect(out.attempts).toBe(2)
expect(out.value).toEqual({ score: 0.81 })
})
})
Loading