diff --git a/bench/src/drivers/flat-harness.ts b/bench/src/drivers/flat-harness.ts new file mode 100644 index 0000000..4cfc0ec --- /dev/null +++ b/bench/src/drivers/flat-harness.ts @@ -0,0 +1,354 @@ +/** + * Plane-A flat harness — the GATE's control, recovered as the simplest possible + * `Agent.act` over the recursive execution atom (docs/research/recursive-execution-atom.md, + * "Plane A as the simplest act"; "Decision: Plane B contains Plane A"). + * + * The driver spawns ONE child per profile at a FIXED, equal budget, joins them via + * `scope.next()` as each settles, adapts each settled child to the kernel's `Iteration`, + * and selects the best with `defaultSelectWinner` — the single-sourced argmax the loop + * kernel itself uses, so the control's selection is not a forked copy of the kernel's. + * No steering, no widening, no spawn-on-completion: a flat fan-out at equal compute. + * + * The equal-k assertion (critique B3) is the gate's validity guard, exported alongside: + * a treatment cell is admitted only when `Σ iterations(treatment) ≡ Σ iterations(blind)` + * per task (excluding `budgetExempt` runtimes, which are out of the conserved Σk by + * construction). A mismatch FAILS LOUD — the cell is excluded, never silently scored 0 + * (mirroring `experiment.ts`'s infra-error exclusion: a confounded cell is dropped, not + * counted). Without this guard a "diverse@k beat blind@k" claim could be confounded by + * the treatment having spent more compute than the control. 
+ */ + +import { + type AgentProfile, + type AgentSpec, + type Budget, + type DefaultVerdict, + type ExecutorContext, + type ExecutorRegistry, + type Iteration, + type LeafExecutorFactory, + type ResultBlobStore, + type RootHandle, + type Settled, + type SpawnJournal, + type Supervisor, + type SupervisedResult, + type TreeView, + type UsageEvent, + type Agent as Atom, + type Scope as AtomScope, + createExecutorRegistry, + createRootHandle, + createSupervisor, + defaultSelectWinner, + InMemoryResultBlobStore, + InMemorySpawnJournal, + settledToIteration, +} from '@tangle-network/agent-runtime/loops' +import type { BackendType } from '@tangle-network/sandbox' + +/** + * One arm's agent profile + how it maps to a leaf runtime. `harness === null` resolves to + * the router/inline executor (a direct Router call, no box); a `BackendType` resolves to + * the sandbox executor (composes `runLoop` as a leaf). A BYO `AgentSpec.executor` overrides + * both — a user agent (mastra/agno/raw HTTP) is first-class the moment it implements the + * `LeafExecutor` interface. This is the executor's only knob the flat harness needs. + */ +export interface FlatProfile { + /** Stable arm label — becomes the child node's `label` and the selected winner's name. */ + readonly label: string + /** The portable agent profile handed to the resolved executor. */ + readonly profile: AgentProfile + /** Executor mapping: `null` → router/inline, a `BackendType` → sandbox. */ + readonly harness: BackendType | null +} + +/** The flat-harness task: a shared prompt fanned out across one child per profile. */ +export interface FlatTask { + /** The task statement every spawned child receives verbatim. */ + readonly prompt: string + /** One arm per profile; each is spawned once at the fixed equal budget. 
*/ + readonly profiles: ReadonlyArray +} + +/** + * The flat-harness result: the selected winner (or `undefined` when every child was a + * `down` / produced no output) plus the realized settled set so the caller can run the + * equal-k assertion and report the realized tree shape per cell (residual risk R1: equal-k + * is enforceable, equal-topology is not, so the realized shape is reported, not assumed). + */ +export interface FlatResult { + /** The single-sourced argmax over the settled children, or `undefined` (no usable child). + * `label` and `seq` are copied from the winning iteration's `agentRunName` and `iterationIndex`. */ + readonly winner: + | { + readonly output: unknown + readonly verdict?: DefaultVerdict + readonly label: string + readonly seq: number + } + | undefined + /** Every settlement `next()` delivered, in recorded `seq` order (replay-stable). */ + readonly settled: ReadonlyArray> +} + +/** A leaf coder agent carries its executor mapping as `executorSpec` (the field + * `scope.spawn` reads to resolve a `LeafExecutor`). Its `act` is never invoked — the + * scope drives the resolved executor, not the leaf's `act` — so reaching it is a wiring + * bug (fail loud). */ +interface LeafAtom extends Atom { + readonly executorSpec: AgentSpec +} + +function leafCoder(p: FlatProfile): LeafAtom { /* Wrap one profile as a spawnable leaf: `name` is the arm label, `executorSpec` carries the profile and harness, and `act` always throws. */ + const executorSpec: AgentSpec = { profile: p.profile, harness: p.harness } + return { + name: p.label, + executorSpec, + act(): Promise { + throw new Error( + `flat-harness leaf "${p.label}": act() was invoked — a leaf is executed via its LeafExecutor, never its act (wiring bug)`, + ) + }, + } +} + +/** + * The flat-harness driver: an `Agent` whose `act` spawns one child per profile at the + * fixed `childBudget`, joins them all via `scope.next()`, and selects the best. + * + * Replay-safe by construction: it reads only what `Settled` delivers (`out`, `verdict`, + * `seq`) — never `Date.now`, `Math.random`, or an unordered collection — and joins strictly + * in the `seq` order `next()` yields, so a replay re-derives the identical winner. 
+ * + * Selection is `defaultSelectWinner` over the `settledToIteration` adaptation of each `done` + * child (M4 / build step 8) — the supervisor never re-ranks behind the driver (selector ≠ + * judge): the driver returns the synthesized winner and the supervisor content-addresses it. + * + * Throws when `task.profiles` is empty, or as soon as one `spawn` comes back with `ok` false. The + * returned `winner` is `undefined` whenever `defaultSelectWinner` yields nothing for the `done` set. + */ +export function flatHarness(childBudget: Budget): Atom { + return { + name: 'flat-harness', + async act(task: FlatTask, scope: AtomScope): Promise { + const open = scope as unknown as AtomScope + if (task.profiles.length === 0) { + throw new Error('flat-harness: task.profiles is empty — nothing to fan out') + } + + for (const p of task.profiles) { + const spawned = open.spawn(leafCoder(p), task.prompt, { + budget: childBudget, + label: p.label, + }) + // Fail loud on a fail-closed admission: a flat-harness cell that cannot afford its + // full fan-out under the conserved pool is a misconfigured budget, not a partial run. + if (!spawned.ok) { + throw new Error( + `flat-harness: spawn of "${p.label}" rejected (${spawned.reason}) — the root budget cannot cover one equal-budget child per profile`, + ) + } + } + + const settled: Settled[] = [] + const iterations: Iteration[] = [] + for (;;) { + const s = await open.next() + if (s === null) break + settled.push(s) + // Only a `done` child is an iteration; a `down` child is excluded from selection + // (and from the equal-k Σ) exactly as an infra-errored cell is in experiment.ts. + if (s.kind === 'done') iterations.push(settledToIteration(s)) + } + + const won = defaultSelectWinner(iterations) + const winner = won + ? { + output: won.output, + ...(won.verdict ? { verdict: won.verdict } : {}), + label: won.agentRunName, + seq: won.iterationIndex, + } + : undefined + return { winner, settled } + }, + } +} + +// ── The equal-k assertion (critique B3) ────────────────────────────────────────── + +/** Per-arm realized iteration count, tagged so a mismatch names the offending arm. 
*/ +export interface ArmRealizedK { + /** Arm label (the control is named explicitly by the caller, not inferred here). */ + readonly label: string + /** Σ of conserved iterations across this arm's `done` children, EXCLUDING any whose + * runtime is `budgetExempt` (e.g. a `cli`/RLM subprocess without token accounting). The exclusion + * is by contract, not by filter: `realizedK` sums `spent.iterations` for every non-`down` child. */ + readonly k: number + /** Count of `down` (failed/infra) children — reported so a caller can see why an arm's + * realized k differs even when both arms were spawned identically (residual risk R1). */ + readonly downCount: number +} + +/** + * The equal-k outcome: either every arm spent the SAME conserved compute (the cell is + * admissible) or it did not (the cell is excluded). Typed — the caller inspects `ok` + * before trusting the cell, never a silent zero (fail-loud per B3). On `ok: true`, + * `k` is the first arm's realized k, which every other arm matched. + */ +export type EqualKOutcome = + | { readonly ok: true; readonly k: number; readonly arms: ReadonlyArray } + | { + readonly ok: false + readonly reason: 'unequal-k' | 'no-arms' + readonly arms: ReadonlyArray + } + +/** Σ of conserved iterations across an arm's `done` children. Reads `spent.iterations` + * (the conserved pool's iteration channel) off each settled child — the same evidence + * replay reads — never a wall-clock or re-derived count. A `budgetExempt` runtime (a + * `cli`/RLM subprocess without token accounting) reports zero conserved spend by contract, + * so its iterations fall out of this Σ automatically — no per-runtime special case needed. 
*/ +export function realizedK(label: string, settled: ReadonlyArray>): ArmRealizedK { + let k = 0 + let downCount = 0 + for (const s of settled) { + if (s.kind === 'down') { /* a down child adds to the count only, never to k */ + downCount += 1 + continue + } + k += s.spent.iterations + } + return { label, k, downCount } +} + +/** + * The equal-k assertion (critique B3): a treatment cell is admissible only when every arm + * realized the SAME conserved compute as the blind control — `Σ iterations(treatment) ≡ + * Σ iterations(blind)` per task, excluding `budgetExempt` runtimes. Returns a typed + * `EqualKOutcome`; the caller EXCLUDES the cell on `!ok` (never scores it 0), so a + * compute-confounded cell is dropped, exactly as an infra-errored cell is in `experiment.ts`. + * + * `arms` is `[control, ...treatments]` — the blind control FIRST (the same discipline + * `runSteeringExperiment` enforces structurally). Every arm's k is compared to the + * control's; one differing arm excludes the whole cell (the gate compares arms pairwise per + * task, so a single confounded arm contaminates the pairing). + * + * An empty `arms` list returns `ok: false` with reason `no-arms`; a single arm is trivially equal to itself. + */ +export function assertEqualK( + arms: ReadonlyArray<{ label: string; settled: ReadonlyArray> }>, +): EqualKOutcome { + if (arms.length === 0) return { ok: false, reason: 'no-arms', arms: [] } + const realized = arms.map((a) => realizedK(a.label, a.settled)) + const controlK = realized[0]!.k + const equal = realized.every((r) => r.k === controlK) + if (!equal) return { ok: false, reason: 'unequal-k', arms: realized } + return { ok: true, k: controlK, arms: realized } +} + +/** + * Strict equal-k guard for an in-driver invariant: throws on a mismatch instead of + * returning a typed outcome. Use where the cell MUST be equal-k by construction (a unit + * test or a same-budget self-comparison); the experiment harness prefers `assertEqualK` + * so it can EXCLUDE rather than abort the whole run on one confounded cell. 
+ * + * The thrown message lists each arm as `label=k`. An empty `arms` list throws too (reason + * `no-arms`), with an empty list in the message. + */ +export function assertEqualKOrThrow( + arms: ReadonlyArray<{ label: string; settled: ReadonlyArray> }>, +): { k: number; arms: ReadonlyArray } { + const outcome = assertEqualK(arms) + if (!outcome.ok) { + const detail = outcome.arms.map((a) => `${a.label}=${a.k}`).join(' ') + throw new Error( + `equal-k assertion failed (${outcome.reason}): arms spent unequal conserved compute [${detail}] — the cell is confounded and must be excluded`, + ) + } + return { k: outcome.k, arms: outcome.arms } +} + +// ── Running the flat harness through the Supervisor (the gate's control runner) ─── + +/** Seams every spawned executor reads off `ExecutorContext.seams`, keyed by the seam name a + * built-in narrows (`router` → router/inline base+key+model, `sandbox` → the loop sandbox + * client). Opaque here; each built-in reads its own key and fails loud when its seam is + * absent — so a flat-harness cell that names a sandbox/router arm MUST supply that seam. */ +export interface FlatHarnessSeams { + readonly router?: { routerBaseUrl: string; routerKey: string; model?: string } + readonly sandbox?: { sandboxClient: unknown; maxIterations?: number; loopCtx?: unknown } + readonly [seam: string]: unknown +} + +/** + * Bind seams onto an `ExecutorRegistry` so every resolved factory builds its executor with + * THIS cell's seams. The supervisor constructs the root scope with an empty `seams` map (it + * is task-agnostic), so the seams must ride on the registry instead: the wrapper overrides + * the `ExecutorContext.seams` the scope passes through, preserving the caller's `signal`. + * A BYO `AgentSpec.executor` resolves to a factory that ignores `ctx` entirely, so binding + * is a no-op for it — exactly right. This is a real composition, not a passthrough stub. 
+ * + * The inner factory is handed a fresh `{ signal, seams }` object; `register` delegates straight to + * `base`. NOTE(review): any other `ExecutorContext` field would not be forwarded — confirm none exists. + */ +function bindSeams(base: ExecutorRegistry, seams: Readonly>): ExecutorRegistry { + return { + register(runtime: string, factory: LeafExecutorFactory): void { + base.register(runtime, factory) + }, + resolve(spec: AgentSpec) { + const resolved = base.resolve(spec) + if (!resolved.succeeded) return resolved + const inner = resolved.value + const bound: LeafExecutorFactory = (s, ctx: ExecutorContext) => + inner(s, { signal: ctx.signal, seams }) + return { succeeded: true as const, value: bound } + }, + } +} + +export interface RunFlatHarnessConfig { + /** The fan-out task: the shared prompt + one profile per arm. */ + readonly task: FlatTask + /** The FIXED, equal per-child budget — every arm gets the identical ceiling (the + * equal-k precondition: equal reservations make `Σk` equal by construction when no + * child is `budgetExempt` and none goes `down`). */ + readonly childBudget: Budget + /** The root conserved-pool ceiling. Must cover one `childBudget` per profile or the + * flat-harness `act` fails loud on the first un-affordable spawn. */ + readonly rootBudget: Budget + /** Trace-correlation + journal/blob root key. */ + readonly runId: string + /** Per-runtime executor seams threaded into every spawned child. */ + readonly seams: FlatHarnessSeams + /** Open executor registry; defaults to the built-ins (router/inline · sandbox · cli) + * plus any BYO `AgentSpec.executor`. Inject to register additional runtimes. */ + readonly executors?: ExecutorRegistry + /** Event source; defaults to the in-memory journal (durable JSONL/FS is injectable). */ + readonly journal?: SpawnJournal + /** Result-blob store backing `outRef` rehydration; defaults to in-memory. */ + readonly blobs?: ResultBlobStore + /** Optional live root handle (the Q2 chat/pi-viz substrate) attached before `run`. */ + readonly rootHandle?: RootHandle + /** Caller abort signal — cascades into every live child's executor. 
*/ + readonly signal?: AbortSignal + /** Injected clock for deterministic journal timestamps (tests). */ + readonly now?: () => number +} + +/** + * One flat-harness cell: spawn one child per profile at `childBudget` under a conserved + * `rootBudget` pool, join, select. Returns the typed `SupervisedResult` (a no-winner is + * never coerced to a best-effort output, M2). + * + * Seams ride on the registry, not the supervisor: `bindSeams` overrides the + * `ExecutorContext.seams` each resolved factory receives with this cell's `cfg.seams`, so a + * router/sandbox child reads its seam even though the supervisor builds the root scope with + * an empty seams map. A BYO `AgentSpec.executor` ignores seams and resolves unchanged. + * + * A new supervisor is created per call; `rootHandle`, when given, is attached before `run`, and + * `journal`/`blobs` fall back to fresh in-memory instances. + */ +export async function runFlatHarness(cfg: RunFlatHarnessConfig): Promise> { + const supervisor: Supervisor = createSupervisor() + if (cfg.rootHandle) supervisor.attach(cfg.rootHandle) + const executors = bindSeams(cfg.executors ?? createExecutorRegistry(), cfg.seams) + return supervisor.run(flatHarness(cfg.childBudget), cfg.task, { + budget: cfg.rootBudget, + runId: cfg.runId, + journal: cfg.journal ?? new InMemorySpawnJournal(), + blobs: cfg.blobs ?? new InMemoryResultBlobStore(), + executors, + ...(cfg.signal ? { signal: cfg.signal } : {}), + ...(cfg.now ? { now: cfg.now } : {}), + }) +} + +/** Re-exported so a caller building the live root substrate gets it from one place. */ +export { createRootHandle } +export type { TreeView, UsageEvent } diff --git a/bench/src/drivers/llm-meta-driver.ts b/bench/src/drivers/llm-meta-driver.ts new file mode 100644 index 0000000..bb1ee73 --- /dev/null +++ b/bench/src/drivers/llm-meta-driver.ts @@ -0,0 +1,374 @@ +/** + * @experimental + * + * LLM meta-driver — the TREATMENT variant of the recursive execution atom's two driver-act + * bodies (the coded progressive-widening control is in `./progressive-widening.ts`; both + * share the flat-by-default `WidenGate`). 
Operator's call: build it now, on top of the + * budget-reservation invariant that keeps an equal-k result valid. + * + * The policy: `act` asks the Router (an LLM) for an initial spawn plan — which child agents + * to spawn and their per-child budgets ("driver A for n shots, B for k shots" = + * heterogeneous per-child `maxIterations`) — then reacts to each `scope.next()` completion. + * On a promising settlement it asks the Router again for a widen plan: spawn one more child + * toward a lineage, under the conserved pool. Children resolve their `LeafExecutor` through + * the open registry off their `AgentSpec` (`harness: null` → a direct Router call, no box; + * a `BackendType` → sandboxed; or a BYO `executor`) — the meta-driver never switches on the + * runtime itself. + * + * The same two firewall invariants the control upholds (critique R2): + * - `WidenGate` DEFAULTS TO FLAT (`defaultWidenGate`), so a gate run never asks for a + * widen and the selector≠judge conflict stays dormant. + * - The LLM is shown ONLY trace-derived findings (the `analyze` hook → `AnalystFinding[]`) + * when deciding to widen — NEVER the raw `verdict.score`. Letting the meta-controller + * read the judge verdict for a spawn decision requires the gate's explicit + * `judgeExempt: true` (off by default), the documented hatch that re-couples steering to + * the judge. + * + * Selection stays single-sourced (`settledToIteration` + `defaultSelectWinner`). + * + * The Router is an external boundary: `routerChatWithUsage` (reused from `../router-client`, + * not re-copied) reports REAL token usage and throws on a non-OK response. The driver + * inspects the parse outcome before acting on a plan — a malformed plan fails loud, never a + * silent empty fan-out. 
+ */ + +import type { AnalystFinding } from '@tangle-network/agent-eval' +import { defaultSelectWinner } from '../../../src/loops/run-loop.ts' +import { settledToIteration } from '../../../src/loops/supervise/scope.ts' +import type { + Agent, + AgentSpec, + Budget, + Scope, + Settled, + WidenGate, +} from '../../../src/loops/supervise/types.ts' +import { routerChatWithUsage, type RouterConfig } from '../router-client.ts' +import { defaultWidenGate } from './progressive-widening.ts' + +/** A child the meta-driver can spawn, keyed by a stable name the Router references in its + * plan. The `agent` carries its `AgentSpec` as `executorSpec` — the field `scope.spawn` + * reads to resolve the runtime (`harness: null` → router/inline; `BackendType` → sandbox; + * BYO `executor`). */ +export interface MetaChild { + readonly key: string + readonly agent: Agent + readonly task: unknown + /** One-line capability summary the Router sees when choosing this child. */ + readonly description: string +} + +/** One spawn the Router asked for: a child key from the catalog + its per-child budget. + * `shots` becomes `Budget.maxIterations`; `maxTokens` and `maxUsd` keep their names (see `entryBudget`). */ +export interface SpawnPlanEntry { + readonly childKey: string + readonly shots: number + readonly maxTokens: number + readonly maxUsd?: number +} + +/** The Router's decision, parsed and validated. `done: true` means "no more spawns, + * synthesize the winner from what settled". The driver checks `done` on widen plans only; + * an initial plan's `spawns` are spawned whatever `done` says. */ +export interface SpawnPlan { + readonly spawns: ReadonlyArray + readonly done: boolean +} + +export type AnalyzeSettled = ( + settled: Extract, { kind: 'done' }>, +) => Promise> + +export interface LlmMetaDriverOptions { + readonly name?: string + /** Router seam (base url + key + model). Reused for every meta-decision call. */ + readonly router: RouterConfig + /** The catalog of spawnable children the Router plans over, keyed by `key`. */ + readonly catalog: ReadonlyArray> + /** One-line statement of the goal the Router optimizes the spawn plan toward. 
*/ + readonly objective: string + /** Trace-analyst wire feeding the widen decision — the ONLY child signal the Router + * sees post-settlement. Omit to run flat (no findings → never widens under the default + * gate). */ + readonly analyze?: AnalyzeSettled + /** The widening governor. Defaults to `defaultWidenGate` (flat — never widens). */ + readonly gate?: WidenGate + /** Deadline (`Budget.deadlineMs`) stamped on every planned child when set; a plan entry + * carries no deadline field of its own. */ + readonly perChildDeadlineMs?: number +} + +/** + * Build the LLM meta-driver `Agent`. Its `act` body: ask the Router for an initial spawn + * plan → spawn the planned children at their heterogeneous per-child budgets → react to + * each `next()` → on a promising (trace-derived) settlement, ask the Router for a widen + * plan and spawn one more under budget → synthesize with the single-sourced selector. + * + * NOTE(review): a widen plan is spawned in full — every entry in `widen.spawns` — so the + * "one more" limit is requested by the prompt but not enforced here. Catalog entries that + * share a `key` collapse to the last one in the lookup map. + */ +export function createLlmMetaDriver(opts: LlmMetaDriverOptions): Agent { + const gate = opts.gate ?? defaultWidenGate() + const analyze = opts.analyze + const byKey = new Map(opts.catalog.map((c) => [c.key, c])) + + return { + name: opts.name ?? 'llm-meta-driver', + async act(task: unknown, scope: Scope): Promise { + // Ask the Router for the initial spawn plan. The prompt shows the catalog + budget + // readout; the LLM decides which children and their per-child shots/tokens. + const initial = await requestPlan(opts, scope, task, undefined, []) + spawnPlanned(initial, byKey, opts, scope) + + const done: Array, { kind: 'done' }>> = [] + for (let settled = await scope.next(); settled !== null; settled = await scope.next()) { + if (settled.kind === 'down') continue // infra/bad child: excluded from merge n + equal-k + done.push(settled) + + // Flat gate (the default) short-circuits before any Router call — a gate run never + // pays for a widen decision and the firewall conflict stays dormant. + if (!gate.shouldWiden(settled, scope.budget)) continue + const findings = analyze ? 
await analyze(settled) : [] + if (!widenIsWarranted(findings, gate, settled)) continue + + // Ask the Router for a widen plan, showing it ONLY the trace-derived findings (never + // the verdict). It returns the next children to spawn, or `done` to stop widening. + const widen = await requestPlan(opts, scope, task, settled, findings) + if (widen.done) continue + spawnPlanned(widen, byKey, opts, scope) + } + + const iterations = done.map((s) => settledToIteration(s)) + const winner = defaultSelectWinner(iterations) + if (!winner) { + throw new Error( + 'llm-meta-driver: no done child to select a winner from (all children were down)', + ) + } + return winner.output as Out + }, + } +} + +/** Spawn every entry the Router planned, mapping each to its catalog child and per-child + * budget. A plan entry referencing an unknown child key fails loud (a hallucinated plan is + * a diagnostic, not a silently-dropped spawn). A spawn that fails pool admission is dropped + * — fail closed, never overcommit the conserved pool (the value `scope.spawn` returns is not + * inspected here). Each child is spawned with its own catalog `task` and labelled with its `key`. */ +function spawnPlanned( + plan: SpawnPlan, + byKey: Map>, + opts: LlmMetaDriverOptions, + scope: Scope, +): void { + for (const entry of plan.spawns) { + const child = byKey.get(entry.childKey) + if (!child) { + throw new Error( + `llm-meta-driver: Router planned a spawn for unknown child key "${entry.childKey}" (catalog: ${[...byKey.keys()].join(', ')})`, + ) + } + assertSpawnable(child) + const budget = entryBudget(entry, opts) + scope.spawn(child.agent, child.task, { budget, label: child.key }) + } +} + +/** Project a Router plan entry into the conserved `Budget` (heterogeneous per child — this + * is the "driver A n shots, B k shots" dial). */ +function entryBudget(entry: SpawnPlanEntry, opts: LlmMetaDriverOptions): Budget { + return { + maxIterations: entry.shots, + maxTokens: entry.maxTokens, + ...(entry.maxUsd !== undefined ? { maxUsd: entry.maxUsd } : {}), + ...(opts.perChildDeadlineMs !== undefined ? 
{ deadlineMs: opts.perChildDeadlineMs } : {}), + } +} + +/** + * Ask the Router for a spawn plan. The external-boundary call returns real usage and throws + * on a non-OK response; the JSON parse is inspected before the plan is acted on — a + * malformed plan throws (fail loud), never degrades to a silent empty fan-out. When + * `settled`/`findings` are present this is a widen decision and the prompt carries ONLY the + * trace-derived findings (selector ≠ judge). + */ +async function requestPlan( + opts: LlmMetaDriverOptions, + scope: Scope, + task: unknown, + settled: Extract, { kind: 'done' }> | undefined, + findings: ReadonlyArray, +): Promise { + const prompt = settled + ? widenPrompt(opts, scope, settled, findings) + : initialPrompt(opts, scope, task) + const res = await routerChatWithUsage(opts.router, [ + { role: 'system', content: metaSystemPrompt }, + { role: 'user', content: prompt }, + ]) + const parsed = parsePlan(res.content) + if (!parsed.ok) { + throw new Error(`llm-meta-driver: Router returned an unparseable spawn plan — ${parsed.error}`) + } + return parsed.plan +} + +const metaSystemPrompt = [ + 'You are a spawn meta-driver over a budget-conserving execution scope.', + 'You decide which child agents to spawn and their per-child budgets (shots, tokens).', + 'Spawning is asynchronous: a child runs, settles, and you may then widen toward a', + 'promising lineage under the remaining conserved budget. 
Do NOT fan out eagerly.', + 'Reply with ONLY a JSON object: {"spawns":[{"childKey":string,"shots":number,"maxTokens":number,"maxUsd"?:number}],"done":boolean}.', + 'When you have enough settled children to synthesize a winner, reply {"spawns":[],"done":true}.', +].join(' ') + +function initialPrompt( + opts: LlmMetaDriverOptions, + scope: Scope, + task: unknown, +): string { + return [ + `Objective: ${opts.objective}`, + `Task: ${stringifyForPrompt(task)}`, + `Conserved budget: ${budgetLine(scope)}`, + 'Catalog of spawnable children:', + catalogLines(opts.catalog), + 'Choose the initial spawn plan: which children, with which per-child shots/tokens.', + ].join('\n') +} + +/** The widen prompt shows ONLY trace-derived findings about the settled child — never its + * verdict score. This is the firewall: the meta-controller steers from the diagnosis, not + * the judge. The task statement is not repeated here (only the initial prompt carries it). */ +function widenPrompt( + opts: LlmMetaDriverOptions, + scope: Scope, + settled: Extract, { kind: 'done' }>, + findings: ReadonlyArray, +): string { + return [ + `Objective: ${opts.objective}`, + `A child "${settled.handle.label}" just settled. Trace-analyst findings (steer from these, NOT any score):`, + renderFindings(findings), + `Remaining conserved budget: ${budgetLine(scope)}`, + 'Catalog of spawnable children:', + catalogLines(opts.catalog), + 'Widen toward the promising lineage with at most one more spawn, or reply done if there is nothing worth widening.', + ].join('\n') +} + +function catalogLines(catalog: ReadonlyArray>): string { /* one bullet line per catalog child: key, then description */ + return catalog.map((c) => ` - ${c.key}: ${c.description}`).join('\n') +} + +function budgetLine(scope: Scope): string { /* budget readout shown to the Router: tokensLeft, usdLeft, reservedTokens */ + const b = scope.budget + return `tokensLeft=${b.tokensLeft} usdLeft=${b.usdLeft} reservedTokens=${b.reservedTokens}` +} + +function renderFindings(findings: ReadonlyArray): string { + if (findings.length === 0) return ' (no findings)' + return findings + .map( + (f) => + ` - [${f.severity}/${f.area}] ${f.claim}${f.recommended_action ? 
` → ${f.recommended_action}` : ''}`,
+    )
+    .join('\n')
+}
+
+/**
+ * Widen warranted? The trace-derived gate: a `high`/`critical` finding with a
+ * `recommended_action` is a correctable middle band worth one more shot. Empty findings are
+ * NOT warranted (flat). The ONLY verdict-reading path is the gate's explicit
+ * `judgeExempt: true` hatch (off by default), which re-couples steering to the judge.
+ */
+function widenIsWarranted(
+  findings: ReadonlyArray,
+  gate: WidenGate,
+  settled: Extract, { kind: 'done' }>,
+): boolean {
+  if (gate.judgeExempt === true) {
+    const score = (settled.verdict as { score?: unknown } | undefined)?.score
+    return typeof score === 'number' && score > 0
+  }
+  return findings.some(
+    (f) =>
+      (f.severity === 'high' || f.severity === 'critical') &&
+      typeof f.recommended_action === 'string' &&
+      f.recommended_action.length > 0,
+  )
+}
+
+/** Parse + validate the Router's JSON plan. A non-object, a missing/!array `spawns`, a
+ * non-boolean `done`, or a malformed entry is a typed parse failure the caller fails loud
+ * on — never a silent empty plan.
+ *
+ * Numeric fields are checked for more than sign, because `JSON.parse` yields fractions and,
+ * for an overflowing literal such as `1e999`, `Infinity`: `shots` must be a positive
+ * integer, `maxTokens` a positive finite number, and a numeric `maxUsd` finite and not
+ * negative. A non-numeric `maxUsd` is still treated as absent, as before. */
+function parsePlan(content: string): { ok: true; plan: SpawnPlan } | { ok: false; error: string } {
+  const json = extractJsonObject(content)
+  if (json === undefined) return { ok: false, error: 'no JSON object in response' }
+  let raw: unknown
+  try {
+    raw = JSON.parse(json)
+  } catch (err) {
+    return { ok: false, error: `JSON.parse failed: ${err instanceof Error ? err.message : String(err)}` }
+  }
+  if (typeof raw !== 'object' || raw === null) return { ok: false, error: 'plan is not an object' }
+  // Structural casts: only the fields read below are named, each still `unknown`.
+  const obj = raw as { spawns?: unknown; done?: unknown }
+  if (!Array.isArray(obj.spawns)) return { ok: false, error: '`spawns` is not an array' }
+  if (typeof obj.done !== 'boolean') return { ok: false, error: '`done` is not a boolean' }
+  const spawns: SpawnPlanEntry[] = []
+  for (const e of obj.spawns) {
+    if (typeof e !== 'object' || e === null) return { ok: false, error: 'a spawn entry is not an object' }
+    const { childKey, shots, maxTokens, maxUsd } = e as {
+      childKey?: unknown
+      shots?: unknown
+      maxTokens?: unknown
+      maxUsd?: unknown
+    }
+    if (typeof childKey !== 'string') return { ok: false, error: 'a spawn entry has no string `childKey`' }
+    if (typeof shots !== 'number' || shots <= 0) {
+      return { ok: false, error: `spawn "${childKey}" has a non-positive \`shots\`` }
+    }
+    // NaN, Infinity and fractions all slip past the `<= 0` comparison above.
+    if (!Number.isInteger(shots)) {
+      return { ok: false, error: `spawn "${childKey}" has a non-integer \`shots\`` }
+    }
+    if (typeof maxTokens !== 'number' || maxTokens <= 0) {
+      return { ok: false, error: `spawn "${childKey}" has a non-positive \`maxTokens\`` }
+    }
+    if (!Number.isFinite(maxTokens)) {
+      return { ok: false, error: `spawn "${childKey}" has a non-finite \`maxTokens\`` }
+    }
+    if (typeof maxUsd === 'number' && (!Number.isFinite(maxUsd) || maxUsd < 0)) {
+      return { ok: false, error: `spawn "${childKey}" has a negative or non-finite \`maxUsd\`` }
+    }
+    spawns.push({
+      childKey,
+      shots,
+      maxTokens,
+      ...(typeof maxUsd === 'number' ? { maxUsd } : {}),
+    })
+  }
+  return { ok: true, plan: { spawns, done: obj.done } }
+}
+
+/** Slice the first balanced `{...}` object out of a model response (tolerates prose around
+ * the JSON). Returns undefined when no balanced object is present. 
Quoted text is skipped while balancing,
+ * so a brace inside a JSON string (as in `{"k":"}"}`) can never close the slice early. */
+function extractJsonObject(content: string): string | undefined {
+  const start = content.indexOf('{')
+  if (start === -1) return undefined
+  let depth = 0
+  let inString = false
+  let escaped = false
+  for (let i = start; i < content.length; i++) {
+    const ch = content[i]
+    if (inString) {
+      // Inside a string only an unescaped `"` matters; a backslash shields the next char.
+      if (escaped) escaped = false
+      else if (ch === '\\') escaped = true
+      else if (ch === '"') inString = false
+      continue
+    }
+    if (ch === '"') inString = true
+    else if (ch === '{') depth++
+    else if (ch === '}') {
+      depth--
+      if (depth === 0) return content.slice(start, i + 1)
+    }
+  }
+  return undefined
+}
+
+/** Render a task for the prompt: a string passes through, anything else is JSON-encoded.
+ * `JSON.stringify` returns `undefined` (not a string) for `undefined`, functions and
+ * symbols, and throws on cycles or BigInt — both cases fall back to `String(task)`. */
+function stringifyForPrompt(task: unknown): string {
+  if (typeof task === 'string') return task
+  try {
+    const encoded: string | undefined = JSON.stringify(task)
+    return encoded ?? String(task)
+  } catch {
+    return String(task)
+  }
+}
+
+/** A `MetaChild`'s agent must carry its `executorSpec` (AgentSpec) — the field
+ * `scope.spawn` resolves the runtime from. Fail loud if absent (only the agent author
+ * knows its profile/harness). */
+function assertSpawnable(child: MetaChild): void {
+  const carried = (child.agent as { executorSpec?: unknown }).executorSpec
+  if (!isAgentSpec(carried)) {
+    throw new Error(
+      `llm-meta-driver: child "${child.key}" agent carries no executorSpec (AgentSpec); cannot resolve its LeafExecutor`,
+    )
+  }
+}
+
+function isAgentSpec(value: unknown): value is AgentSpec {
+  if (typeof value !== 'object' || value === null) return false
+  const v = value as Record
+  return 'profile' in v && 'harness' in v
+}
diff --git a/bench/src/drivers/progressive-widening.ts b/bench/src/drivers/progressive-widening.ts
new file mode 100644
index 0000000..af0d5d1
--- /dev/null
+++ b/bench/src/drivers/progressive-widening.ts
@@ -0,0 +1,223 @@
+/**
+ * @experimental
+ *
+ * Coded progressive-widening driver — the CONTROL variant of the recursive execution
+ * atom's two driver-act bodies (the LLM meta-driver in `./llm-meta-driver.ts` is the
+ * treatment). Both share the `WidenGate` below.
+ *
+ * The policy (MCTS progressive widening, the governor that keeps "full generality" from
+ * becoming "boil the ocean"): seed a NARROW frontier (one child per seed), then react to
+ * each `scope.next()` completion. 
A node widens — spawns ONE more child toward the same + * promising lineage under the conserved pool — only when the `WidenGate` says so. No + * eager fan-out: the frontier grows by at most one per settlement, bounded by the + * conserved budget reservation (`scope.spawn` fails closed when the pool can't cover it). + * + * Two firewall invariants this driver upholds by construction (critique R2): + * - `WidenGate` DEFAULTS TO FLAT: `defaultWidenGate.shouldWiden` returns false for every + * settlement, so a gate run never widens and the selector≠judge conflict stays dormant. + * - When widening IS enabled, `promising` is derived from TRACE findings (the `analyze` + * hook → `AnalystFinding[]`), NEVER from a raw `verdict.score`. Reading the judge + * verdict for a spawn decision requires the gate's explicit `judgeExempt: true` (off by + * default) — the documented escape hatch that re-couples steering to the judge. + * + * Selection stays single-sourced: settled children adapt to `Iteration` via + * `settledToIteration` and `defaultSelectWinner` picks the winner — the driver never + * forks the kernel's argmax (selector ≠ judge). + */ + +import type { AnalystFinding } from '@tangle-network/agent-eval' +import { defaultSelectWinner } from '../../../src/loops/run-loop.ts' +import { settledToIteration } from '../../../src/loops/supervise/scope.ts' +import type { + Agent, + AgentSpec, + Budget, + DefaultVerdict, + Scope, + Settled, + WidenGate, +} from '../../../src/loops/supervise/types.ts' + +/** A child the driver can spawn: a leaf `Agent` plus the `AgentSpec` the open registry + * resolves its `LeafExecutor` from (`harness: null` → router/inline; `BackendType` → + * sandbox; or a BYO `executor`). The spec rides on the agent as `executorSpec` because + * that is the field `scope.spawn` reads to resolve a runtime — fail loud if it is absent. 
*/ +export interface ChildAgent { + readonly agent: Agent + readonly task: unknown + readonly label: string +} + +/** A seed of the narrow initial frontier: the child to spawn and its per-child budget. */ +export interface WideningSeed { + readonly child: ChildAgent + readonly budget: Budget +} + +/** + * Trace-analyst hook: read a settled child's TRACE (rehydrated `out` + lineage) into + * `AnalystFinding[]`. This is the analyst→driver wire (mirrors `PlannerContext.analyses`) + * and the ONLY signal `promising` may read when the gate is flat-with-findings. The hook + * MUST return trace-derived findings; the gate never inspects `settled.verdict` unless it + * is explicitly `judgeExempt`. + */ +export type AnalyzeSettled = ( + settled: Extract, { kind: 'done' }>, +) => Promise> + +export interface ProgressiveWideningOptions { + readonly name?: string + /** The narrow initial frontier — one child per seed, no eager fan-out. */ + readonly seed: (task: unknown) => ReadonlyArray> + /** Build the next child to widen toward a promising lineage. Returns `null` to stop + * widening this lineage (e.g. the lineage has converged). */ + readonly widen: ( + settled: Extract, { kind: 'done' }>, + findings: ReadonlyArray, + ) => WideningSeed | null + /** Trace-analyst wire feeding `promising`. Omit to run flat (no findings → never + * widens under the default gate). */ + readonly analyze?: AnalyzeSettled + /** The widening governor. Defaults to `defaultWidenGate` (flat — never widens). */ + readonly gate?: WidenGate +} + +/** + * Build the coded progressive-widening `Agent`. Its `act` body is the control policy: + * seed narrow → react to each `next()` → widen toward a promising lineage under budget → + * synthesize the winner with the single-sourced selector. `WidenGate` defaults flat, so + * with no `gate`/`analyze` supplied this is exactly the "spawn the seeds, pick the best" + * flat harness. 
+ */ +export function createProgressiveWideningDriver( + opts: ProgressiveWideningOptions, +): Agent { + const gate = opts.gate ?? defaultWidenGate() + const analyze = opts.analyze + + return { + name: opts.name ?? 'progressive-widening', + async act(task: unknown, scope: Scope): Promise { + // Seed the NARROW frontier: one child per seed, reserved atomically from the pool. + // A seed that fails admission (pool can't cover it) is dropped — fail closed, never + // overcommit; the conserved Σk holds by construction. + for (const s of opts.seed(task)) { + scope.spawn(asSpawnable(s.child), s.child.task, { budget: s.budget, label: s.child.label }) + } + + const done: Array, { kind: 'done' }>> = [] + // React to settlements one at a time (ray.wait n=1). `next()` is null only when the + // live set is empty — every spawned child eventually settles done or down. + for (let settled = await scope.next(); settled !== null; settled = await scope.next()) { + if (settled.kind === 'down') continue // infra/bad child: excluded from merge n + equal-k + done.push(settled) + + // Progressive widening: spawn AT MOST one more child toward this lineage, and only + // when the gate says promising AND the pool can still cover a widen. The findings + // are TRACE-derived (`analyze`); the gate reads them, never the raw verdict. + if (!gate.shouldWiden(settled, scope.budget)) continue + const findings = analyze ? await analyze(settled) : [] + if (!isPromising(findings, gate, settled)) continue + const next = opts.widen(settled, findings) + if (next === null) continue + scope.spawn(asSpawnable(next.child), next.child.task, { + budget: next.budget, + label: next.child.label, + }) + } + + // Single-sourced selection: adapt the done children to the kernel's Iteration shape + // and let `defaultSelectWinner` pick (best-valid-score, ties → earliest). The driver + // does NOT fork the argmax (selector ≠ judge). 
+ const iterations = done.map((s) => settledToIteration(s)) + const winner = defaultSelectWinner(iterations) + if (!winner) { + throw new Error( + 'progressive-widening: no done child to select a winner from (all children were down)', + ) + } + return winner.output as Out + }, + } +} + +/** + * The flat-by-default widening governor (the shared `WidenGate`). `shouldWiden` returns + * false for EVERY settlement, so a gate run never widens — the firewall conflict (R2) + * stays dormant by construction. Override it with a findings-driven gate (severity/area + * thresholds over trace findings) to enable widening; only an explicit `judgeExempt: true` + * gate may read `verdict.score`. + */ +export function defaultWidenGate(): WidenGate { + return { + shouldWiden(): boolean { + return false + }, + } +} + +/** + * A findings-driven widening gate (opt-in, never the default). Widens toward a lineage + * whose TRACE findings show a correctable middle band — a high/critical finding that + * carries a `recommended_action` (the analyst says "this is fixable, do X"). It reads ONLY + * trace-derived findings, never the verdict, so it composes with the steer firewall. The + * `minTokensLeft` guard keeps a widen from starving the pool below a usable per-child floor. + */ +export function findingsWidenGate(opts: { minTokensLeft: number }): WidenGate { + return { + shouldWiden(_settled: Settled, budget: Scope['budget']): boolean { + return budget.tokensLeft >= opts.minTokensLeft + }, + } +} + +/** + * Is this lineage promising enough to widen? Promise is computed from TRACE findings, not + * the judge verdict: a `high`/`critical` finding that names a `recommended_action` is a + * correctable middle band worth one more shot. Empty findings are NOT promising (flat). + * + * The ONLY path that reads `verdict.score` is the gate's explicit `judgeExempt: true` + * escape hatch — it re-couples steering to the judge, so it must be argued per cell and is + * off by default. 
+ */ +function isPromising( + findings: ReadonlyArray, + gate: WidenGate, + settled: Extract, { kind: 'done' }>, +): boolean { + if (gate.judgeExempt === true) return judgeScore(settled.verdict) > 0 + return findings.some( + (f) => + (f.severity === 'high' || f.severity === 'critical') && + typeof f.recommended_action === 'string' && + f.recommended_action.length > 0, + ) +} + +/** Read a verdict's scalar score. Used ONLY behind the explicit `judgeExempt` hatch — the + * steering-from-the-judge path the firewall otherwise forbids. */ +function judgeScore(verdict: DefaultVerdict | undefined): number { + if (!verdict) return 0 + const score = (verdict as { score?: unknown }).score + return typeof score === 'number' ? score : 0 +} + +/** Attach the child's `AgentSpec` as the `executorSpec` field `scope.spawn` resolves the + * runtime from. A `ChildAgent` whose `agent` already carries a matching `executorSpec` + * passes through unchanged; otherwise this is a fail-loud no-op (the agent must carry the + * spec, since only the agent author knows its profile/harness). */ +function asSpawnable(child: ChildAgent): Agent { + const carried = (child.agent as { executorSpec?: unknown }).executorSpec + if (!isAgentSpec(carried)) { + throw new Error( + `progressive-widening: child "${child.label}" agent carries no executorSpec (AgentSpec); cannot resolve its LeafExecutor`, + ) + } + return child.agent +} + +function isAgentSpec(value: unknown): value is AgentSpec { + if (typeof value !== 'object' || value === null) return false + const v = value as Record + return 'profile' in v && 'harness' in v +} diff --git a/bench/src/rsi.ts b/bench/src/rsi.ts new file mode 100644 index 0000000..10d22b8 --- /dev/null +++ b/bench/src/rsi.ts @@ -0,0 +1,86 @@ +/** + * The RSI driver experiment, instantiated. The whole thing in one file: pick a + * benchmark adapter, pick the steer POLICIES (the arms), run them through the one + * flow at equal compute, read the result. 
/** Read a required environment variable; throws when it is unset or empty. */
const must = (k: string): string => {
  const v = process.env[k]
  if (!v) throw new Error(`env ${k} is required`)
  return v
}

/**
 * Read an optional positive-integer environment variable.
 *
 * An unset or blank variable yields `fallback`. Anything else must parse to a positive
 * integer; a malformed value throws instead of flowing on as `NaN` or `0`.
 */
const positiveInt = (k: string, fallback: number): number => {
  const raw = process.env[k]
  if (raw === undefined || raw.trim() === '') return fallback
  const n = Number(raw)
  if (!Number.isInteger(n) || n <= 0) {
    throw new Error(`env ${k} must be a positive integer (got "${raw}")`)
  }
  return n
}

// The benchmark roster. Long-horizon adapters (commit0, swe-lancer, tau2, appworld,
// blueprint) slot in here as one entry each; the loop below never changes.
const ADAPTERS: Record<string, () => BenchmarkAdapter> = {
  'swe-bench': createSweBenchAdapter,
  finsearchcomp: createFinsearchcompAdapter,
}

/**
 * Entry point: resolve the benchmark adapter named by `BENCH`, build the router/sandbox
 * configuration from the environment, run the steer policies through `runExperiment`,
 * and print a per-arm summary.
 *
 * Environment read here: `BENCH`, `WORKER_MODEL`, `ROUTER_BASE`, `TANGLE_API_KEY`
 * (required), `ROUNDS`, `SANDBOX_BASE_URL`, `CORPUS`, `N`, `IDS`, `CONCURRENCY`.
 *
 * @throws When `BENCH` names no registered adapter, when `TANGLE_API_KEY` is missing, or
 *   when `ROUNDS` / `N` / `CONCURRENCY` is set but is not a positive integer.
 */
async function main() {
  const bench = process.env.BENCH ?? 'swe-bench'
  // Own-property lookup only: a plain index would also resolve inherited keys such as
  // `toString` to a function and skip the unknown-benchmark error.
  const make = Object.prototype.hasOwnProperty.call(ADAPTERS, bench) ? ADAPTERS[bench] : undefined
  if (!make) throw new Error(`unknown BENCH=${process.env.BENCH} (have: ${Object.keys(ADAPTERS).join(', ')})`)
  const adapter = make()
  const model = process.env.WORKER_MODEL ?? 'gpt-5'
  const routerBaseUrl = process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1'
  const routerKey = must('TANGLE_API_KEY')
  const rounds = positiveInt('ROUNDS', 3)
  const router = { routerBaseUrl, routerKey, model }
  // NOTE(review): `as never` silences the option type check — presumably the SDK's
  // option type does not declare one of these fields; confirm and type it properly.
  const client = new Sandbox({
    baseUrl: process.env.SANDBOX_BASE_URL ?? 'https://sandbox.tangle.tools',
    apiKey: routerKey,
    timeoutMs: 1_200_000,
  } as never)

  // The steer policies under test. Each is an arm = a steer f(rootPrompt, history).
  const policies: [Arm, ...Arm[]] = [
    randomArm('blind'), // compute control: independent retries, no steer
    analystArm('critical-audit', llmAnalyst(router)), // audit the prior attempt, steer on the findings
    arm('aggressive-push', (root, _h, r) =>
      r === 0 ? root : `${root}\n\nShip the most complete working end-to-end result NOW. Prefer done over polish; finish it.`),
  ]

  // Tolerate "a, b" and trailing commas: trim each id and drop empty entries.
  const ids = process.env.IDS?.split(',')
    .map((id) => id.trim())
    .filter((id) => id.length > 0)

  const corpus = process.env.CORPUS ?? `${process.cwd()}/corpus/rsi-${adapter.name}.jsonl`
  const r = await runExperiment({
    adapter,
    sandboxClient: client,
    agentRun: sandboxAgentRun({ model, routerBaseUrl, routerKey }),
    arms: policies,
    model,
    rounds,
    n: positiveInt('N', 10),
    ids: ids && ids.length > 0 ? ids : undefined,
    concurrency: positiveInt('CONCURRENCY', 3),
    ...(adapter.output ? { output: adapter.output } : {}),
    corpusPath: corpus,
  })

  // Percentages are over the clean task count; with no clean tasks there is no rate.
  const pct = (x: number) => (r.n > 0 ? `${((x / r.n) * 100).toFixed(1)}%` : 'n/a')
  console.log(`\n=== ${adapter.name}: ${r.arms.length} policies x rounds=${rounds} (clean n=${r.n}, excluded ${r.errored}) ===`)
  console.log(` blind (1 attempt): ${pct(r.blind)}`)
  for (const a of r.arms) {
    // The first arm is the compute control; every other arm reports its delta against it.
    const tag = a.label === r.arms[0]?.label ? ' <- compute control' : ` delta vs control ${((a.deltaVsControl / Math.max(r.n, 1)) * 100).toFixed(1)}pp`
    console.log(` ${a.label}@${rounds}: ${pct(a.resolved)}${tag}`)
  }
  console.log(`corpus: ${corpus} -> paired CI + BH via: tsx src/corpus-report.mts ${corpus}`)
}

// Print the stack when available and exit non-zero so a failed run is visible to the caller.
main().catch((e) => {
  console.error(e instanceof Error ? (e.stack ?? e.message) : String(e))
  process.exit(1)
})
| 4 | [learning-flywheel.md](./learning-flywheel.md) | theory deep-dive | The moat thesis — the `(π, τ, J, D, O)` recursion and cross-run flywheel. Points to `architecture.md` as the canonical entry. | | 5 | [../bench/README.md](../bench/README.md) | empirical harness | The benchmark surface and current empirical status (what's been run, what wins, what's untested). | +## Research track + +Forward-looking design research — surveys, multi-agent design passes, decision logs. Not the canonical spine; promotions into `architecture.md` happen explicitly once a design ships. + +| Doc | Role | Purpose | +|---|---|---| +| [research/README.md](./research/README.md) | research index | The active design thread + decision log + source-artifact pointers. | +| [research/recursive-execution-atom.md](./research/recursive-execution-atom.md) | design (in progress) | The next generation: one recursive `Agent` atom run as a durable, observable supervision tree (drivers-of-drivers, analyst-as-agent-with-runtime, async dynamic spawning). Plane B — contains the flat harness. | +| [research/flat-harness-design.md](./research/flat-harness-design.md) | design synthesis | Plane A — the assumption-free experiment harness (profiles × steer × executionMode × allocation). Recovered as the simplest `act` body on Plane B. | +| [research/long-horizon-benchmark-survey.md](./research/long-horizon-benchmark-survey.md) | survey | Adversarially-verified long-horizon + multi-turn benchmark survey. Top picks: Commit0, τ²-bench. | + ## Reference track The package API and subsystems. 
diff --git a/docs/research/README.md b/docs/research/README.md new file mode 100644 index 0000000..629248d --- /dev/null +++ b/docs/research/README.md @@ -0,0 +1,61 @@ +> **Track:** Architecture (research) · **Role:** design-research log · **Status:** open — keystone design in flight + +# Research log — RSI driver architecture + +Design research for the next architecture generation: turning the flat experiment harness +into a **recursive execution atom** (agents that drive agents, recursively; analysts as +agents; an async, observable, dynamically-spawning supervision tree). This dir tracks the +inputs (surveys, design passes), the decisions, and the open forks so the thread is +resumable and the expensive multi-agent passes are not re-run. + +On any *architecture* conflict, [`../architecture.md`](../architecture.md) still wins. These +docs are forward-looking design research, not the canonical spine — promotions into the +spine happen explicitly, with `file:line` anchors, once a design ships. + +## Documents + +| Doc | What it holds | +|-----|---------------| +| [recursive-execution-atom.md](./recursive-execution-atom.md) | **The main thread.** The vision (verbatim intent), the Plane-A-vs-B framing, the proposed surface (one atom + `Scope` + `Supervisor`), analyst-as-agent-with-runtime, what exists vs the gap (file-grounded), the open questions, and the decision log. | +| [flat-harness-design.md](./flat-harness-design.md) | **Plane A.** The assumption-free experiment-harness synthesis (profiles × steer × executionMode × allocation; rip-out list; durability argument; migration phases). Recovered as the simplest `act` body on Plane B. | +| [long-horizon-benchmark-survey.md](./long-horizon-benchmark-survey.md) | Adversarially-verified survey of long-horizon + multi-turn benchmarks. Top picks: **Commit0** (graded + natively multi-turn software build), **τ²-bench** (multi-turn agent↔user with tools). 
| + +## Source artifacts (multi-agent passes) + +| Run | Pass | Result lands in | +|-----|------|-----------------| +| `w9ntld2vt` | deep-research benchmark survey (102 agents, 20 sources, 25 claims adversarially verified) | long-horizon-benchmark-survey.md | +| `wuh46e5zp` | durable-architecture design — 3 proposals → adversarial synthesis | flat-harness-design.md | +| `wnrxtvdta` | recursive-atom-surface — 6 prior-art lenses + 4 codebase mappers → synthesis → adversarial critique → reconcile | recursive-execution-atom.md (appended on completion) | + +## Decision log + +- **Full tensor now**, not "not-foreclose / flat-v1." The architecture must *be* the recursive + execution atom now, built as durable mechanism (so it survives even a negative gate), not a + flat harness with seams. _(interview, 2026-06-04)_ +- **Plane B contains Plane A.** We do not pick "experiment harness" or "recursive atom" — the + flat harness is the simplest `act` body over the atom. The `wuh46e5zp` design becomes the + canonical example, not a competing v1. +- **Analyst = Agent + harness.** Halo-CLI / our inline trace-analyst / a sandboxed agent are + one type. The runtime is **derived from the agent's `AgentProfile.harness`**: `harness: null` = + direct Router inference call; `harness: <BackendType>` = sandboxed; future `mastra`/`agno`/`ai-sdk` + harnesses register their own `LeafExecutor`. _(operator, 2026-06-04)_ +- **Leaves are opaque, self-parallelizing coding harnesses.** The recursion is in the *drivers*; + the bottom is a coding agent that fans out internally on its own. 
+- **The 4 forks resolved (operator, 2026-06-04):** event-sourced **yes**; observability **substrate + now**; LLM meta-driver **built now** (operator override of the pass's "make it wait"), as the + *treatment* on top of the budget-reservation invariant, with coded progressive-widening + + flat-harness as controls; hard ceiling **yes — sharpened to a conserved reservation pool** + (`Σk(treatment) ≡ Σk(blind)` by construction, fail-closed). +- **The keystone is the budget-conserving reactive `Scope` + `Supervisor`** (not the LLM driver). + The critique proved a *ceiling* budget + data-dependent spawning is a confound generator; the + conserved *reservation* pool is the one invariant that makes any meta-driver result valid. + `WidenGate` defaults to flat so the selector≠judge firewall conflict (R2) stays dormant until + widening is argued. See [recursive-execution-atom.md](./recursive-execution-atom.md) for the + frozen surface + build order. + +## Open engineering forks (not blocking the v1 keystone) + +- **F1** — does `Scope` supersede `runProgram`'s loop-layer `parallel`, or coexist? (deletion deferred until `Scope` is proven) +- **F2** — adopt a Temporal/DBOS durable backend now, or type-shape-only until days-long resumable runs are a near-term product? +- **F3** — is `cli`/Halo a first-class equal-k participant (needs external-process token accounting first) or observability-only (`budgetExempt`, permanent)? 
diff --git a/docs/research/flat-harness-design.md b/docs/research/flat-harness-design.md new file mode 100644 index 0000000..6bfe853 --- /dev/null +++ b/docs/research/flat-harness-design.md @@ -0,0 +1,99 @@ +> **Track:** Architecture (research) · **Role:** design synthesis · **Status:** subsumed — this is Plane A, recovered as the simplest `act` body on [recursive-execution-atom.md](./recursive-execution-atom.md) + +# Flat experiment harness (Plane A) + +Synthesis of the `wuh46e5zp` design pass (3 independent proposals → adversarial synthesis): +the durable, assumption-free **experiment harness** for comparing steer policies at equal +compute. All three proposals converged tightly and identically on the same surface. + +This is **not** a competing v1. It is the flat plane — and the recursive atom *contains* it: +the harness below is the simplest possible `act` (spawn one child per profile, fixed budget, +select the best). Captured here because its mechanism/content split, its rip-out list, and its +`executionMode` primitive are directly reused by Plane B. + +## The converged surface + +```ts +const result = await runRsiExperiment({ + benchmark: adapter, // researcher's task + deterministic judge + profiles: AgentProfile[], // the arms — FULL profiles, not keyword strings + steerPolicies: ((root, history, round) => prompt)[], // pure fns; read trace/events, never the verdict + executionMode: { kind: 'fresh-box' | 'continued-session' | 'fork', maxTurns }, + allocation: { kind: 'round-robin' | 'adaptive-thompson' | 'variance-based', k }, + sandboxClient, n, concurrency, corpusPath, +}) +``` + +- **Arms are full `AgentProfile`s** (model, tools, MCP, persona, capabilities) composed with + `mergeAgentProfiles` — never keyword strings like `critical-audit`. +- **Steer is a pure function** `(rootPrompt, history, round) => nextPrompt`, fully visible to the + researcher. No hidden directives. +- **The researcher's experiment is ~50 lines**; the framework is <500 LOC. 
+ +## Framework owns (mechanism) vs researcher supplies (content) + +| Framework (once) | Researcher (per experiment) | +|---|---| +| `ExecutionMode` mechanics (box lifecycle per mode) | full `AgentProfile`s (the arms) | +| loop kernel (`runLoop`, `createDynamicDriver`) | steer policies (pure fns; their hypotheses) | +| measurement (`BenchmarkAdapter`, `OutputAdapter`, `Validator`) | the task adapter + deterministic judge | +| allocation scheduling (`thompson`/`variance` from agent-eval) | execution-mode + allocation choice (explicit) | +| corpus (`RunRecord`, paired bootstrap + BH) | optional `OutputAdapter`/`Validator` overrides | +| **steer firewall** (selector ≠ judge, type-level) | — | +| **compute-control enforcement** (control arm required to compile) | — | + +## `executionMode` — the one new runtime primitive + +A required field on the kernel; default `fresh-box` (today's behavior). This is the +"continued-session execution dial," and it plugs into the existing `collectBox` seam in +`src/loops/run-loop.ts`. + +- **`fresh-box`** — new sandbox per iteration; stateless; the **compute control** (bandit-like; k independent samples). +- **`continued-session`** — one sandbox reused across turns; filesystem/shell state persists; steering compounds (MDP-like). The kernel creates the box once and reuses it; the driver rewrites the prompt per turn via the steer policy. +- **`fork`** — checkpoint + branch (what-if / counterfactual); deferred (needs sandbox checkpoint/restore). + +Allocation composes orthogonally: `round-robin` (fair, the baseline), `adaptive-thompson`, +`variance-based`. The corpus `condition` field logs mode + allocation so offline analysis can +reject mismatched comparisons (a policy is only comparable within the same `executionMode`). + +## Rip out (hardcoded content → researcher config) + +- `bench/src/directives.ts` — **delete** all `DEFAULT_*` directive constants + `DIVERSE_STRATEGY_LENSES`. Keep only `composeStrategies()` as a helper. 
Directives are researcher hypotheses, not framework policy. +- `bench/src/run.ts` — **delete** the `batch-blind` / `batch-oracle` / `batch-compare` presets and the env-driven dispatch (`BACKEND`, `WORKER_MODEL`, `ANALYST`). One entry point loads a researcher config. +- `bench/src/experiment.ts` — **move** `randomArm`/`refineArm`/`diverseArm`/`llmAnalyst`/`loopAnalyst`/`analystArm` to examples; they are templates, not framework. +- `WorkerBackendType` enum — **delete**. Backend is part of the `AgentProfile` (the cost dial is a backend type, not a separate knob). +- `ADAPTERS[key]` lookup — **delete**. The config imports the adapter directly. + +## Baked assumptions explicitly rejected + +Arms-are-keywords; directives-are-framework-policy; one-box-per-iteration-is-the-only-model; +diverse-lenses-are-fixed; allocation-is-always-fixed-k; the-task-is-always-a-string; +backend-is-a-separate-knob; the-firewall-is-a-soft-rule (→ make `PlannerContext` carry only +`output`+`events`, never `verdict`, at the type level); control-is-optional (→ `runSteeringExperiment` +requires a control arm; omitting it is a compile error). + +## Durability argument (why it survives 2 years) + +Content/mechanism split isolates the framework from trend-chasing (new domains need adapters, +not rewrites); substrate-maximal leverage (`AgentProfile` from the sandbox SDK, `runLoop` from +runtime) tracks upstream not internal drift; profiles-as-versioning (a config file in git +reproduces a run 18 months later); `RunRecord` decouples sweeps from analysis (replay the +corpus under new hypotheses without re-running); `executionMode` as an axis (if +continued-session is a dead end, no framework bloat); only two contracts (`BenchmarkAdapter`, +`AgentProfile`); no hardcoded strings. 
+ +## Migration phases (from the synthesis) + +Dependency-ordered, each small and verifiable: (1) add `ExecutionMode` to `agent-runtime` +types, default `fresh-box`, behavior unchanged; (2) implement `continued-session` on the +`collectBox` seam; (3) extract `SteerPolicy`, move arm factories to examples; (4) rip out +directives; (5) flow `executionMode` into the corpus; (6) `RsiExperimentConfig` + +`runRsiExperiment`; (7) allocation strategies as plugins; (8) firewall type-enforcement; +(9) delete `batch-*`; (10) docs + examples + migration guide. + +## Top risks flagged + +Session leaks if `executionMode` unset (→ default `fresh-box`, required field); continued-session +state explosion (→ SDK memory cap + cleanup flag); adaptive allocation overfits at low n (→ loud +docs, fixed-k for n<20); "arm beats control" ≠ "steering beats compute" without paired CI (→ +control required by the type signature; corpus-report pairs the delta). diff --git a/docs/research/long-horizon-benchmark-survey.md b/docs/research/long-horizon-benchmark-survey.md new file mode 100644 index 0000000..5d7224c --- /dev/null +++ b/docs/research/long-horizon-benchmark-survey.md @@ -0,0 +1,71 @@ +> **Track:** Architecture (research) · **Role:** survey (adversarially verified) · **Status:** reference · **Run:** `w9ntld2vt` (102 agents, 20 sources, 100 claims → 25 verified, 23 confirmed / 2 killed) + +# Long-horizon & multi-turn benchmark survey + +For the RSI driver experiment: run an agent over multiple turns on a hard task, compare +**steer policies** (continue / critical-audit / aggressive-ship / personas) against blind +independent retries, and measure whether steering gets farther per added turn. The experiment +wants a benchmark that is **natively multi-turn** (context carries across turns) and whose +completion signal is **GRADED** (fraction of tests passing), not binary, so the adaptation +curve is smooth. 
+ +## Top recommendations + +- **Long-horizon software build, steer a continued conversation, compare policies → Commit0.** + The only surveyed benchmark that is simultaneously **graded** (pass-rate of unit tests, a + continuous 0–100%), **natively multi-turn/interactive** (multi-stage unit-test + static-analysis + + coverage feedback the agent adapts to across turns — the curve measurably moves with feedback, + e.g. iterating on test errors lifts pass-rate to ~26%), and genuinely **long-horizon** (implement + entire real Python libraries from scratch against long-form specs; 54–57 libraries). + Sources: arXiv 2412.01769, commit-0.github.io. NeurIPS 2024 D&B. + +- **Multi-turn agent↔user conversation with tools → τ²-bench (tau2-bench).** A natively multi-turn + **dual-control** Tool-Agent-User benchmark: a simulated user and the agent converse turn-by-turn + and **both** can call tools (a Dec-POMDP). Sources: github.com/sierra-research/tau2-bench, + arXiv 2506.07982. **Caveat:** rewards are effectively **binary** per task (gated by required + actions + `reward_basis`) — it is the *conversation* pick, **not** a graded-curve pick (a + verifier vote killed the "graded" claim 0–3). + +## Verified verdicts + +| Benchmark | Graded? | Natively multi-turn / continued-session? | Fit for "steer a continued build conversation" | Vote | +|---|---|---|---|---| +| **Commit0** | **Yes** — unit-test pass-rate % | **Yes** — interactive multi-stage feedback the agent adapts to | **Best** | 3-0 | +| **FeatureBench** | **Yes** — Passed-Rate (frac. 
of fail→pass tests) + binary Resolved-Rate | **Yes** — agentic scaffolds, ≤500 steps, diminishing returns ~100 | Strong runner-up; *feature-level*, not greenfield whole-project | 3-0 | +| **DevBench** | **Yes** — test pass-rate, coverage %, env-setup success | **No** — 5 waterfall stages graded independently with *reference* inputs; only a review-role refine loop | Graded + from-scratch, but **not** one continuous build conversation | 3-0 / 2-1 | +| **ProgramBench** (Meta/FAIR, arXiv 2605.03546) | Headline **binary** (% Resolved = all tests pass); a secondary "% Tests Passed" partial-progress metric exists | **Yes** — write-compile-debug, 1,000-step / 6-hr cap, median ~868 cmds/task (model-dependent) | **Single-agent-only by design**; multi-agent + human-guided modes are *future work* | graded headline REFUTED 1-2 | +| **SlopCodeBench** (arXiv 2603.24755) | **Yes** — 4 solve-rate variants + continuous [0,1] erosion/verbosity | Iterative **on the artifact only** — *deliberately wipes prior conversation*; fresh Docker per checkpoint, only the workdir persists | Disqualified for *conversational* steer (no carried context). 
NB: it already ran a steer comparison — quality prompts cut initial erosion but did **not** slow per-checkpoint degradation (~1.3pp/ckpt), at +12.1% cost | 3-0 | +| **SWE-Lancer** | **No** — payout only if *all* applicable tests pass; graded only by summed $ of whole tasks | **No** — independent single-deliverable tasks + managerial choices | Poor (no smooth curve) | 3-0 | +| **MLE-bench** | Medal/percentile (effectively binary per task) | **No** — one final CSV; the agent's own internal ~24h loop, graded only on the submission | Moderate at best | 2-1 | + +## What ProgramBench / "program bench" is + +The Meta/FAIR **rebuild-from-scratch** benchmark (arXiv 2605.03546, github.com/facebookresearch/programbench, +May 2026): a single SWE-agent rebuilds programs via a human-like write-compile-debug cycle in a +persistent Docker session (1,000 steps / 6 hours). Single-agent-only by design; **not** built for +steer-policy comparison (that is invited as future work). A usable graded substrate via its +"% Tests Passed per instance" secondary metric, but the headline "% Resolved" is binary. + +## Caveats (carried verbatim from the verifier) + +- **Scope gap — not adversarially verified this round:** SWE-Gym, SWE-bench Verified, SWE-bench + Multimodal, MLAgentBench, RepoBench, the original single-control τ-bench, AppWorld, + TerminalBench, OSWorld, GAIA, WebArena, VisualWebArena, Cybench. Most are predominantly + binary/single-deliverable or web/OS/security-domain (likely poor for a graded software-build + curve), but confirm before relying on it. +- **Name collisions:** the graded software-dev **DevBench** is arXiv **2403.08604** (not 2601.11895); + **FeatureBench** (2602.10975) ≠ the 2025 "FeatBench" (2509.22237); **ProgramBench** resolves only + to the Meta/FAIR 2605.03546. 
+- **Dating:** ProgramBench / FeatureBench / SlopCodeBench carry 2026 arXiv IDs; their leaderboard + numbers will move, but the *design* properties cited (graded vs binary, step caps, context-carry + semantics) are structural and stable. +- **Interpretive hedge:** "smooth curve" depends on per-task test count. SlopCodeBench's existing + steer result (steering does not slow degradation) is the closest direct evidence for the + hypothesis, but it is artifact-iterative, not conversation-continued, so it may not generalize. + +## Implication for the harness + +For a graded, multi-turn, long-horizon software-build adapter, **Commit0 is the slot-in** +(graded + natively interactive). It plugs into the `BenchmarkAdapter` contract as one entry; the +`executionMode: 'continued-session'` dial is what makes "steer a continued build conversation" +meaningful (without it, steering degrades to a re-attempt). diff --git a/docs/research/recursive-execution-atom.md b/docs/research/recursive-execution-atom.md new file mode 100644 index 0000000..912d5b5 --- /dev/null +++ b/docs/research/recursive-execution-atom.md @@ -0,0 +1,295 @@ +> **Track:** Architecture (research) · **Role:** design research (in progress) · **Status:** surface proposed; keystone build plan pending the `wnrxtvdta` design pass + 4 user forks + +# Recursive execution atom + +The next architecture generation. Today the loop is one level deep: a driver drives one +agent over rounds. The target is **full generality**: an agent that *is* a driver, fanning +out sub-loops of drivers-driving-agents, recursively — with analysts watching at every +level, dynamic asynchronous spawning, and a conversational, observable root. + +This doc holds the vision, the proposed surface, the honest gap vs the current code, and the +open forks. It supersedes nothing in [`../architecture.md`](../architecture.md) until a design ships. + +## The vision (the intent, distilled from the operator) + +- **Agents run tasks. Drivers drive agents. 
Analysts watch.** Traces from the agents flow to + the driver; analysts turn traces into findings the driver steers on. +- **Analysts come in three runtimes.** An external CLI/RLM (e.g. Halo), our inline trace-analyst + (a bare LLM call, not a sandboxed agent), or a full agent in a sandbox tasked with "analyze + these traces and metadata, emit an output." These are *not* three types. +- **Nested: an agent is a driver of drivers.** An agent can fan out multiple loops of + drivers-driving-agents; that agent is then itself a driver. Recursive, self-similar. +- **The "tensor" is dynamic and asynchronous, not eager fan-out.** We do **not** want an agent + exploding into 20 sub-drivers up front. We want: when one branch completes, the agent can + spawn a *new* branch (possibly a different flow); the agent can say "run driver A for n + shots and driver B for k shots" (heterogeneous per-child budgets); branches run async. +- **Leaves are opaque, self-parallelizing coding harnesses.** The coding agents sit at the + bottom. They are full harnesses that parallelize *inside themselves* (their own sub-agents). + The recursion we build is the *driver/policy* layer above them. +- **The root is eventually conversational + observable.** You hook the root agent to a chatbot + (a pi extension with a live visualization of the spawning tree). You ask it "what's currently + in flow?" while branches run asynchronously. +- **Test 100% of the problem space, disciplined.** Build the general mechanism now — not a thing + that traps us testing 5% today and tomorrow — but keep it focused, not crazy. 
+ +## Two planes — and B contains A + +| | Plane A — experiment harness | Plane B — recursive execution atom | +|---|---|---| +| Shape | flat: compare N arms at equal compute | recursive: agent → drivers → agents, async | +| Surface | `profiles × steer × executionMode × allocation` | one `Agent` atom + a `Scope` + a `Supervisor` | +| Built by | `wuh46e5zp` (see [flat-harness-design.md](./flat-harness-design.md)) | this doc | +| Answers | the gate (diverse@k vs blind@k) | the full vision | + +**Decision: Plane B contains Plane A.** The flat harness is recovered as *the simplest possible +`act` body* — a root driver that spawns one child per profile at a fixed budget and selects the +best. So the `wuh46e5zp` design is not a competing v1; it becomes the canonical example program +over the atom, and its `executionMode`/`allocation` axes become spawn options. + +## The thesis: one recursive atom, run as a durable, observable supervision tree + +Not three subsystems — **one atom + one executor**, plus two things this repo already has +(the durable journal in `src/durable/`, the conversation engine in `src/conversation/`) wired +in as the observability skin. The shape is the intersection of three mature systems: + +- **Structured concurrency** (Trio nursery / Swift TaskGroup / Ray dynamic task graph): `act` + runs inside a *scope* that can `spawn` children dynamically and react to them **as each + finishes**. This is "spawn-on-completion" and "driver A for n shots, B for k shots." +- **Durable execution** (Temporal): the tree is **event-sourced** — every spawn/complete is + journaled, so it is resumable, queryable ("what's in flow?"), and a chat/signal handle can + attach to the live root. Observability falls out of the event log; you don't build it twice. +- **MCTS progressive widening**: the reason you do *not* fan out to 20 at once — a node widens + (spawns more children) only as a branch proves promising, under a global budget. 
This is the + governor that keeps "full generality" from becoming "boil the ocean." + +### The atom (one self-similar type) + +```ts +interface Agent { + act(task: Task, scope: Scope): Promise +} +``` + +- **Coder** = an `Agent` that does not spawn (a leaf). The coding harness self-parallelizes; opaque to us. +- **Driver** = an `Agent` whose `act` spawns child agents and runs a policy over their streaming + results. "An agent is a driver" = a driver is just an `Agent` that spawns. +- **Analyst** = an `Agent` whose task is "read these traces → findings." The CLI/inline/sandbox + question collapses to a `runtime` on the spawn (below). Same type, three backends. + +### The `Scope` — the only new mechanism + +```ts +scope.spawn(agent, task, { budget, runtime, label }) // -> Handle ; dynamic, async +scope.next() // resolves as each child finishes -> react, spawn more (ray.wait) +scope.view() // the live tree: every node's id / parent / status / budget / partial result +``` + +```ts +type Runtime = 'sandbox' | 'cli' | 'inline' +// 'cli' = Halo / an external RLM invoked as a subprocess +// 'inline' = a bare LLM call (today's trace-analyst), no box +// 'sandbox'= a full coding/analysis agent in a box +``` + +The **analyst answer**: an analyst is an `Agent`; *where it runs* is the `runtime`. Halo is +`runtime: 'cli'`, our trace-analyst is `runtime: 'inline'`, a sandboxed analysis agent is +`runtime: 'sandbox'`. One type, three handlers — no `Analyst` subsystem. + +### Plane A as the simplest `act` (sketch) + +```ts +// The flat harness, recovered: spawn one child per profile, fixed budget, pick the best. 
+const flatHarness: Agent = { + async act(bench, scope) { + for (const p of bench.profiles) scope.spawn(coder(p), bench.task, { budget: bench.k, runtime: 'sandbox', label: p.name }) + const results = [] + while (results.length < bench.profiles.length) results.push(await scope.next()) + return selectBest(results) + }, +} +``` + +### Spawn-on-completion + progressive widening (the dynamic shape) + +```ts +// A driver that widens toward promising branches under a global budget, async. +async act(task, scope) { + let live = seedChildren(task).map((c) => scope.spawn(c.agent, c.task, { budget: c.shots, runtime: 'sandbox' })) + const done = [] + while (scope.budget.remaining() > 0 && live.length) { + const ev = await scope.next() // a child finished + done.push(ev) + if (promising(ev) && scope.budget.remaining() > THRESH) + live.push(scope.spawn(widen(ev), nextTask(ev), { budget: ev.shots, runtime: 'sandbox' })) // widen, don't pre-fan + } + return synthesize(done) +} +``` + +## What exists vs the gap (file-grounded; verify before building) + +| Component | File | Status | Gap | +|---|---|---|---| +| The atom signature | `src/loops/program.ts` (`Agent.act → Output \| Program`, op-set, `runProgram`, `maxDepth=4`) | **right shape** | `act` returns a *static `Program`*; need `act(task, scope)` with **dynamic** `spawn`/`next` (not a pre-authored tree). | +| Leaf execution | `src/loops/run-loop.ts` (box create / `streamPrompt` / teardown; the `collectBox` same-sandbox seam) | **keep** | The leaf already runs a coding harness; `runtime: 'sandbox'` maps here. | +| Round-synchronous planner | `src/loops/drivers/dynamic.ts` (`createDynamicDriver`, `PlannerContext.analyses`, selector≠judge firewall) | **evolve** | Planner is round-synchronous (plan → run a batch → observe all → plan). Need async-streaming reaction (`scope.next()` on *individual* completions). 
| +| Durable journal | `src/durable/` (`handleChatTurn`, journal/resume) | **wire-in** | Candidate **event source** for the Supervisor (every spawn/complete journaled → replay + query). Needs node-level events. | +| Conversation engine | `src/conversation/` (turn loop, `selectSpeaker`, `ConversationJournal`) | **wire-in** | Candidate **chat handle** over a live Supervisor ("talk to the root / what's in flow"). | +| Supervisor executor | — | **net-new** | The keystone: a live node registry running `act`, async, on the journal. Replaces the batch `runProgram` tree-walk. | +| `Scope` | — | **net-new** | The keystone capability: `spawn` / `next` / `view` + budget. | + +**The keystone is `Scope` + `Supervisor`.** Leaves, the analyst hook, Plane A, observability, +and the chat handle all fall out of it (or already exist). + +## Open forks (recommended answers; awaiting the operator) + +1. **Event-sourced supervisor?** _Recommended: yes, from day one._ This repo's science needs a + reproducible corpus (paired bootstrap + BH), but a free-running async supervisor is + nondeterministic. Build the Supervisor on `src/durable/`'s journal as the source of truth → + replayable (science) *and* queryable/resumable (the chat handle). Temporal proves you get + observability for free from the event log; don't build two executors. **Most load-bearing.** +2. **Conversation now, or substrate-now / client-later?** _Recommended: substrate now._ Build + `scope.view()` + a node-event channel in v1; defer the chatbot/pi-viz to a thin client. + "Eventually" → make a rewrite unnecessary, don't pay for the UI now. +3. **Spawn policy: code, LLM, or both — default?** _Recommended: `act` is code; LLM-decided + spawning is the researcher's choice._ v1 ships coded policies (fixed / round-robin / + progressive-widening); the **LLM meta-driver** is opt-in, not default — a learned/LLM + meta-controller is exactly the "mechanism ahead of the gate" the repo warns against, and it + is nondeterministic. +4. 
**Global budget as a hard ceiling?** _Recommended: yes, fail-closed at the root._ One root + budget (tokens / $ / wall); the Supervisor enforces it; policies widen within it. + +## Decision log + +- **Full tensor now** (the recursive atom is v1, built as durable mechanism). _(2026-06-04)_ +- **B contains A** (flat harness = simplest `act`). _(2026-06-04)_ +- **Analyst = Agent + `runtime`** (`cli`/`inline`/`sandbox`). _(2026-06-04)_ +- **Leaves = opaque self-parallelizing coding harnesses.** _(2026-06-04)_ + +## Design pass `wnrxtvdta` — reconciled (the frozen contract) + +6 prior-art lenses + 4 codebase mappers → synthesis → adversarial critique → reconcile. + +**BLUF.** The mechanism is agreed: `scope.next()` = a ray.wait cursor over a structured-concurrency +nursery. The critique then landed **3 blockers + 3 majors**, all on one fault line: *the headline +property (durable + queryable + reproducible replay) and the reason-to-exist (a clean equal-k gate) +both break for the same root cause — budget was a **ceiling** not a **reservation**, and the journal +recorded **decisions** but not the **evidence** those decisions consumed.* Two invariants make the +keystone survive: (1) **budget is an atomically-reserved conserved pool**, so `Σk(treatment) ≡ Σk(blind)` +by construction; (2) **the journal records a content-addressed `outRef`** per child result, so replay +rehydrates the exact `Settled` the driver branched on. The keystone is the **budget-conserving reactive +`Scope`** — not the LLM meta-driver. + +### The frozen surface (build against this) + +```ts +// One self-similar atom. A leaf is an Agent that never calls scope.spawn. +interface Agent { readonly name: string; act(task: Task, scope: Scope): Promise } + +// The runtime is ONE OPEN INTERFACE, not a closed union (operator's refinement). A LeafExecutor +// is anything with an `execute` that returns a Promise OR an async stream of normalized usage. 
+// Our built-ins are just the initial IMPLEMENTATIONS; a user's own agent (mastra, agno, a raw +// HTTP call, anything) is first-class the moment it implements the interface. NO per-vendor +// adapters, no "future adapter" code — the interface IS the extension point. +// - router/inline : a direct Router/HTTP inference call, no box (an agent with harness: null) +// - sandbox : COMPOSES the existing runLoop kernel as a leaf (+ PR #150's `lineage` +// passthrough for leaf-level continue/fork — does NOT reinvent checkpoint/fork) +// - cli : Halo/RLM subprocess; budgetExempt, excluded from equal-k by construction +// An agent selects its executor via its AgentProfile (harness: null => router/inline; harness: +// => sandbox), OR carries a custom LeafExecutor / executor-factory directly (BYO). +interface LeafExecutor { + // returns a Promise for one-shot executors, OR an async stream of UsageEvents for + // streaming ones; the architect picks the minimal shape that supports both with normalized usage. + execute(task: unknown, signal: AbortSignal): Promise> | AsyncIterable + teardown(grace: number | 'brutalKill' | 'infinity'): Promise<{ destroyed: boolean }> + resultArtifact(): { outRef: string; out: Out; verdict?: DefaultVerdict; spent: Spend } // B1: replay source +} +type UsageEvent = { kind: 'tokens'; input: number; output: number } | { kind: 'cost'; usd: number } | { kind: 'iteration' } +// M3/B3: LoopTokenUsage is {input,output} ONLY — usd is a SEPARATE channel. 
+ +interface Budget { readonly maxIterations: number; readonly maxTokens: number; readonly maxUsd?: number; readonly deadlineMs?: number } +interface Spend { iterations: number; tokens: LoopTokenUsage; usd: number; ms: number } + +type Restart = 'temporary' | 'transient' | 'permanent' // OTP child_spec +type NodeStatus = 'pending' | 'acquiring' | 'running' | 'done' | 'failed' | 'cancelled' // M1: 'acquiring' first-class +interface SpawnOpts { readonly budget: Budget; readonly label: string; readonly restart?: Restart; readonly shutdown?: number | 'brutalKill' | 'infinity' } +interface Handle { readonly id: NodeId; readonly label: string; readonly status: NodeStatus; abort(reason?: string): void } +// M1: abort() is defined over the ACQUIRE lifecycle (chains into acquireSandbox signal + reaps find-by-name orphan box). + +type Settled = + | { kind: 'done'; handle: Handle; out: Out; outRef: string; verdict?: DefaultVerdict; spent: Spend; seq: number } + | { kind: 'down'; handle: Handle; reason: string; infra: boolean; restartCount: number; seq: number } +// B2: seq = monotonic cursor order next() yielded (NOT wall-clock); replay delivers strictly in seq order. + +interface Scope { + // M5: reserves budget atomically from the shared pool; FAILS CLOSED when the pool can't cover it; refunds unspent on settle. 
+ spawn(agent: Agent, task: unknown, opts: SpawnOpts): + { ok: true; handle: Handle } | { ok: false; reason: 'budget-exhausted' | 'depth-exceeded' } + next(): Promise | null> // ray.wait n=1 over THIS scope's IN-MEMORY live set; null when empty + readonly view: TreeView // reads the in-memory nursery (NOT the log); O(live) + readonly budget: Readonly<{ tokensLeft: number; usdLeft: number; deadlineMs: number; reservedTokens: number }> +} + +// Event source — the decision/payload split the replay argument rests on (B1/B2): +type SpawnEvent = + | { kind: 'spawned'; id: NodeId; parent?: NodeId; label: string; budget: Budget; runtime: Runtime; seq: number; at: string } + | { kind: 'settled'; id: NodeId; status: 'done' | 'down'; outRef?: string; verdict?: DefaultVerdict; spent: Spend; infra?: boolean; seq: number; at: string } + | { kind: 'cancelled'; id: NodeId; reason: string; seq: number; at: string } +interface SpawnJournal { loadTree(root: NodeId): Promise; beginTree(root: NodeId, at: string): Promise; appendEvent(root: NodeId, ev: SpawnEvent): Promise } +interface ResultBlobStore { put(outRef: string, artifact: unknown): Promise; get(outRef: string): Promise } + +// Supervisor — owns the conserved pool, the spawn log, the abort cascade, the OTP intensity breaker, the root handle. +interface Supervisor { run(root: Agent, task: Task, opts: SupervisorOpts): Promise>; attach(h: RootHandle): void } +type SupervisedResult = + | { kind: 'winner'; out: Out; outRef: string; verdict?: DefaultVerdict; tree: TreeView; spentTotal: Spend } + | { kind: 'no-winner'; reason: 'all-children-down' | 'budget-exhausted' | 'aborted'; tree: TreeView; downCount: number } // M2: typed, never best! 
+interface RootHandle { view(): TreeView; signal(msg: RootSignal): void; abort(reason?: string): void } // Q2 substrate +``` + +**Replay invariant (now enforceable):** a driver's `act()` may read `verdict`, `spent`, and `out` +(rehydrated by `outRef`); it MUST NOT read anything not delivered through `Settled` — no `Date.now`, +no `Math.random`, no unordered collections. `next()` delivers strictly in recorded `seq` order. + +### Build order (v1 = the instrument) + +| # | Step | Net-new/Evolve | File | Fixes | +|---|------|---|---|---| +| 1 | `mapPool` one-for-all → one-for-one: a thrown child becomes a `down` record, excluded from merge `n`; survivors still reach `concatRuns`. | Evolve | `program.ts:408-433` | infra-exclusion | +| 2 | **Conserved budget pool**: `Spend` from a normalized `UsageEvent` stream (tokens + usd separate); atomic reserve-on-spawn / reconcile-on-settle; fail-closed admission. | Evolve | `types.ts`, `drivers/report-usage.ts` | **M5,B3** | +| 3 | `SpawnJournal` + `ResultBlobStore` (in-mem + JSONL/FS); sink over the existing `LoopTraceEvent` lineage. | Net-new/Evolve | `src/durable/spawn-journal.ts` (new); wire `run-loop.ts:183` | **B1** | +| 4 | **`Scope` impl** (KEYSTONE): ray.wait cursor over in-memory nursery; `spawn` reserves from step-2 pool; deterministic `${parent}:s${seq}` ids; `view`/`inFlight` read memory. | Net-new | `src/loops/scope.ts` (new) | **B2,m1,m2** | +| 5 | **`Supervisor` impl** (KEYSTONE): nursery join barrier (generalize run-loop's `finally{allSettled(destroy)}`); abort cascade; abort-chains-into-`acquireSandbox` + find-by-name reap; OTP intensity breaker; typed `SupervisedResult`. | Net-new | `src/loops/supervisor.ts` (new) | **M1,M2** | +| 6 | `LeafExecutor` + per-harness impls (`inline`/`sandbox`/`cli`), each emitting normalized `UsageEvent`; `sandbox` = existing `runLoop` as a leaf; `cli`-without-accounting = `budgetExempt` + excluded from equal-k. 
| Evolve | `types.ts`, `src/loops/runtime.ts` (new) | **M3** | +| 7 | Replay executor: re-feed `SpawnJournal` + rehydrate `out` from `ResultBlobStore` in `seq` order; `view()` materializer for resume. | Net-new | `src/durable/spawn-journal.ts` | **B1,B2** | +| 8 | `Settled.done → Iteration` adapter at the merge boundary so `defaultSelectWinner` stays single-sourced. | Net-new (small) | `src/loops/scope.ts` | **M4** | +| — | `flatHarness` driver (Plane-A control) + **equal-k assertion** `Σiterations(treatment) ≡ Σiterations(blind)` per task or the cell is excluded. | Net-new | `bench/` | **B3** | +| — | **LLM meta-driver** (treatment) + coded progressive-widening — `WidenGate` **defaults to flat** (never widens) so the firewall conflict stays dormant; widening, when on, derives "promising" from **trace findings, not raw `verdict`**, or carries an explicit argued `judgeExempt`. | Net-new | `bench/` | **R2** | + +**Deferred** (gated on a *positive* diverse-strategy result): a tuned MCTS-PW algorithm, learned +widening, per-branch adaptive sub-agents, a Temporal/DBOS durable backend, the OTP strategy matrix, +deleting `runProgram`'s loop-layer `parallel` op (supersede-vs-coexist is fork F1). + +### Resolved / risks / verdict + +- **Resolved by the surface:** B1 (outRef + replay invariant), B2 (in-memory live set + seq cursor), M1 (`acquiring` + acquire-aware abort), M2 (typed `SupervisedResult`), M3 (`LeafExecutor` + normalized usage), M5 (atomic reservation, fail-closed). +- **Residual risks (measure, don't hide):** R1 — the recorded interleaving is *one* sample; equal-*k* is enforceable, equal-*topology* is not → report realized tree shape per cell. R2 — widening-from-`verdict` *is* steering-from-the-judge (collides with `assertTraceDerivedFindings`, dynamic.ts:344); dormant while `WidenGate` is flat. R3 — runtime `maxDepth` is weaker than the static guard; pair it with the conserved pool so runaway recursion hits budget-exhaustion first. 
+- **Pass verdict (advisory):** "ship the keystone, make the LLM meta-driver wait." **Operator override (2026-06-04): build the LLM meta-driver now, as the treatment, on top of the budget-reservation invariant** — the invariant is what keeps the result valid; the coded progressive-widening + flat-harness are the controls; `WidenGate` defaults to flat for gate runs. + +## Decisions resolved (the 4 forks) + +- **Q1 — yes, event-sourced** (SpawnJournal + ResultBlobStore + replay; budget-pool conserved). +- **Q2 — substrate now** (`TreeView` + `RootHandle.view`/`signal` + the event stream; chatbot/pi-viz is a later thin client). +- **Q3 — LLM meta-driver built now** (operator call), as the treatment, with coded progressive-widening + flat-harness as controls. The runtime is **one open `LeafExecutor` interface** (`execute` → promise or async stream), not a closed union — built-ins (router/inline, sandbox, cli) are implementations, and any user agent (mastra/agno/HTTP/custom) is first-class by implementing it. An agent selects its executor via `AgentProfile` (`harness: null` = direct Router call; `harness: <BackendType>` = sandboxed) or carries a custom executor directly. +- **Q4 — hard ceiling, yes — sharpened to a conserved *reservation* pool** (atomic reserve/refund, fail-closed), tokens + usd, enforced at the root. + +## Relationship to PR #150 (leaf-level continued-session + fork) + +PR #150 (`feat/runloop-session-continuation-and-fork`) adds `RunLoopOptions.lineage` — opt-in, +default-OFF, backend-blind — so a *single* `runLoop` can continue a session across its iterations +(`sessionContinuity`) or fork a parent checkpoint across a fanout (`forkFanout`, gated on +`criuStatus().canFork`). That is the **leaf-level** depth/breadth dial. The recursive atom sits +**on top**: the `sandbox` `LeafExecutor` *composes* `runLoop` and forwards this `lineage` +passthrough — it does **not** reinvent checkpoint/fork. 
(Reviewed 2026-06-04: approve-to-land; +before enabling, verify the platform honors a client-minted `sessionId` (else `continue` is a +silent no-op), bound fork box-creation by `maxConcurrency`, and document that `forkFanout` +inherits the parent image so heterogeneous-profile branches must not use it.) diff --git a/src/durable/spawn-journal.ts b/src/durable/spawn-journal.ts new file mode 100644 index 0000000..e1ff440 --- /dev/null +++ b/src/durable/spawn-journal.ts @@ -0,0 +1,464 @@ +/** + * @experimental + * + * Event-sourced spawn journal for the recursive execution atom (build steps 3 + 7). + * + * The supervision tree is journaled as an append-only event log: every `spawned`, + * `settled`, and `cancelled` is recorded AFTER it is observed-committed (never + * speculative), mirroring `ConversationJournal`'s begin/append/load shape. The log + * holds only the THIN decision record — ids, parentage, budget, the spend a decision + * consumed, and a content-addressed `outRef`. The payloads the driver branched on + * (the `out` artifacts) live in a separate `ResultBlobStore`, keyed by `outRef`, so + * the journal stays small (decisions) and replay rehydrates the exact `Settled` from + * the blob store (evidence). This is the decision/payload split the replay argument + * rests on (B1/B2). + * + * Replay determinism (B2): `seq` is the monotonic cursor order `scope.next()` yielded + * each settlement — NOT wall-clock. `replaySpawnTree` sorts strictly by `seq` before + * touching the blob store, so the order in which rehydration `get`s resolve can never + * reorder the replayed `Settled[]`; the result is identical regardless of blob latency. 
import { createHash } from 'node:crypto'
import type {
  NodeId,
  NodeSnapshot,
  NodeStatus,
  ResultBlobStore,
  Runtime,
  Settled,
  SpawnEvent,
  SpawnJournal,
  Spend,
  TreeView,
} from '../loops/supervise/types'
import { zeroTokenUsage } from '../loops/util'

// ── Content addressing ──────────────────────────────────────────────────────

/**
 * Mint the content-addressed `outRef` for a result artifact: `sha256:<hex>` over a
 * stable JSON encoding. Producers call this to derive the `outRef` they journal and
 * `put`; the FS/in-mem stores re-derive it on `put` to verify the supplied ref
 * matches (fail loud on a mismatch — a forged ref breaks the replay invariant).
 *
 * Stable encoding: object keys are sorted recursively so two structurally-equal
 * artifacts hash identically regardless of key insertion order.
 *
 * @param artifact - Any JSON-encodable value. `undefined` hashes as `null`.
 * @returns The ref string `sha256:` followed by the lowercase hex SHA-256 digest.
 * @throws TypeError when the artifact contains a value `JSON.stringify` rejects
 *   (e.g. a `bigint`).
 */
export function contentAddress(artifact: unknown): string {
  const hex = createHash('sha256').update(stableStringify(artifact), 'utf-8').digest('hex')
  return `sha256:${hex}`
}

/**
 * Deterministic JSON encoding used as the hash input.
 *
 * It follows `JSON.stringify` semantics with one difference — object keys are emitted
 * in sorted (UTF-16 code unit) order:
 *  - a value's own `toJSON()` is honoured, so a `Date` encodes as its ISO string rather
 *    than as an empty object (which would give every `Date` the same address, and an
 *    address that disagrees with the JSON a file-backed store persists);
 *  - array holes encode as `null`, exactly as `undefined` elements do;
 *  - `undefined`-valued object properties are omitted.
 */
function stableStringify(value: unknown): string {
  const resolved = applyToJson(value)
  if (resolved === null || typeof resolved !== 'object') {
    // JSON.stringify yields undefined for undefined/function/symbol; normalise to null.
    return JSON.stringify(resolved) ?? 'null'
  }
  if (Array.isArray(resolved)) {
    // Array.from visits holes (as undefined), unlike Array.prototype.map which skips them.
    return `[${Array.from(resolved, (item) => stableStringify(item)).join(',')}]`
  }
  const entries = Object.entries(resolved as Record<string, unknown>)
    .filter(([, v]) => v !== undefined)
    .sort(([a], [b]) => (a < b ? -1 : a > b ? 1 : 0))
  return `{${entries.map(([k, v]) => `${JSON.stringify(k)}:${stableStringify(v)}`).join(',')}}`
}

/** Replace an object by the result of its `toJSON()` when it defines one; pass anything else through. */
function applyToJson(value: unknown): unknown {
  if (value === null || typeof value !== 'object') return value
  const toJson = (value as { toJSON?: unknown }).toJSON
  return typeof toJson === 'function' ? toJson.call(value) : value
}
/**
 * In-memory `ResultBlobStore`. Content-addressed: `put` verifies the supplied
 * `outRef` matches the artifact's hash so a stale/forged ref fails loud rather than
 * silently rehydrating the wrong payload. Idempotent on an identical re-put.
 *
 * The artifact is stored by reference (no copy is taken), so `get` hands back the
 * same object that was `put`.
 */
export class InMemoryResultBlobStore implements ResultBlobStore {
  private readonly blobs = new Map<string, unknown>()

  /**
   * Store `artifact` under `outRef`.
   * @throws Error when `outRef` is not the content hash of `artifact`.
   */
  async put(outRef: string, artifact: unknown): Promise<void> {
    assertContentAddress(outRef, artifact)
    this.blobs.set(outRef, artifact)
  }

  /** Return the artifact stored under `outRef`, or `undefined` when nothing is stored there. */
  async get(outRef: string): Promise<unknown> {
    return this.blobs.get(outRef)
  }
}

/**
 * FS `ResultBlobStore`. One JSON file per artifact under `dir`, named by a
 * filesystem-safe encoding of the `outRef` (`sha256:<hex>` → `sha256-<hex>.json`).
 * `put` fsyncs so a crash between writes never loses an acknowledged blob.
 */
export class FileResultBlobStore implements ResultBlobStore {
  constructor(private readonly dir: string) {}

  /**
   * Persist `artifact` as JSON at the path derived from `outRef`, creating `dir` if needed.
   * @throws Error when `outRef` is not the content hash of `artifact`.
   */
  async put(outRef: string, artifact: unknown): Promise<void> {
    assertContentAddress(outRef, artifact)
    // Serialize BEFORE opening: open(..., 'w') truncates, so a payload problem must not
    // be discovered after the blob file has already been emptied. `JSON.stringify`
    // returns undefined for an undefined artifact; encode it as `null` so the write
    // cannot throw and leave a zero-byte file behind at a valid content address.
    const payload = JSON.stringify(artifact) ?? 'null'
    const fs = await import('node:fs/promises')
    await fs.mkdir(this.dir, { recursive: true })
    const fh = await fs.open(this.blobPath(outRef), 'w')
    try {
      await fh.write(payload)
      await fh.sync()
    } finally {
      await fh.close()
    }
  }

  /**
   * Read and JSON-parse the artifact stored under `outRef`.
   * @returns The parsed artifact, or `undefined` when no blob file exists for the ref.
   * @throws Any read error other than "file not found", and `SyntaxError` on invalid JSON.
   */
  async get(outRef: string): Promise<unknown> {
    const fs = await import('node:fs/promises')
    let text: string
    try {
      text = await fs.readFile(this.blobPath(outRef), 'utf8')
    } catch (err) {
      if (isNoEntError(err)) return undefined
      throw err
    }
    return JSON.parse(text)
  }

  /** Map an `outRef` to its file path under `dir`; `:` is replaced so the name is filesystem-safe. */
  private blobPath(outRef: string): string {
    return `${this.dir}/${outRef.replace(/:/g, '-')}.json`
  }
}

/**
 * Guard shared by both stores: re-derive the content hash of `artifact` and refuse a
 * ref that does not match it.
 * @throws Error naming both the supplied and the expected ref on a mismatch.
 */
function assertContentAddress(outRef: string, artifact: unknown): void {
  const expected = contentAddress(artifact)
  if (outRef !== expected) {
    throw new Error(
      `blob outRef '${outRef}' does not match the artifact content hash '${expected}'; ` +
        'a content-addressed store refuses a mismatched ref (breaks the replay invariant)',
    )
  }
}

// ── Spawn journal ──────────────────────────────────────────────────────────────
/**
 * In-memory `SpawnJournal`. Appends are observed-committed only; the impl enforces
 * the corruption guards a durable replay rests on:
 *  - an event before `beginTree` is a corrupted tree (fail loud),
 *  - a duplicate `seq` within a tree is a corrupted cursor (fail loud) — two
 *    settlements cannot share the cursor position replay orders by.
 */
export class InMemorySpawnJournal implements SpawnJournal {
  /** One entry per begun tree, keyed by root id: its begin timestamp and its event log. */
  private readonly trees = new Map<NodeId, { begunAt: string; events: SpawnEvent[] }>()

  /**
   * Return the events journaled for `root` in append order, or `undefined` when the
   * tree was never begun. Each event is shallow-copied, so replacing a top-level field
   * on a returned event does not alter the journal.
   */
  async loadTree(root: NodeId): Promise<SpawnEvent[] | undefined> {
    const tree = this.trees.get(root)
    if (!tree) return undefined
    return tree.events.map((ev) => ({ ...ev }))
  }

  /**
   * Open a tree. Re-beginning with the same `at` is a no-op.
   * @throws Error when the tree was already begun with a different `at`.
   */
  async beginTree(root: NodeId, at: string): Promise<void> {
    const existing = this.trees.get(root)
    if (existing) {
      if (existing.begunAt !== at) {
        throw new Error(
          `spawn tree '${root}' already begun at ${existing.begunAt}; refusing to overwrite with ${at}`,
        )
      }
      return
    }
    this.trees.set(root, { begunAt: at, events: [] })
  }

  /**
   * Append one event (stored as a shallow copy) to a begun tree.
   * @throws Error when the tree was never begun, or when the uniqueness guard on
   *   `seq` rejects the event.
   */
  async appendEvent(root: NodeId, ev: SpawnEvent): Promise<void> {
    const tree = this.trees.get(root)
    if (!tree) {
      throw new Error(`appendEvent called for unknown spawn tree '${root}'; call beginTree first`)
    }
    assertSeqUnique(root, tree.events, ev)
    tree.events.push({ ...ev })
  }
}
/**
 * JSONL on disk. One line per record: the first record is `begin`, subsequent records
 * are `event` envelopes wrapping a `SpawnEvent`. `loadTree` replays the whole file,
 * filtering by `root`, and applies the same begin-precedes-events + unique-seq
 * corruption guards as the in-memory impl. Each append fsyncs so a crash between
 * writes never loses an acknowledged event.
 *
 * NOTE(review): `beginTree`/`appendEvent` re-read the file and then append; nothing
 * here serialises that check-then-write against another writer — confirm callers
 * append from a single writer.
 */
export class FileSpawnJournal implements SpawnJournal {
  constructor(private readonly path: string) {}

  /**
   * Replay the file and return the events recorded for `root`, in file order.
   * @returns The events, or `undefined` when the file is missing or holds no `begin`
   *   record for `root`.
   * @throws Error when an event for `root` precedes its `begin` record or repeats a
   *   cursor `seq`; `SyntaxError` when a record read on the way is not valid JSON.
   */
  async loadTree(root: NodeId): Promise<SpawnEvent[] | undefined> {
    const lines = await this.readLines()
    if (lines === undefined) return undefined
    let begun = false
    const events: SpawnEvent[] = []
    // Records are parsed lazily, one at a time, so a guard violation on an earlier
    // record is reported before a parse failure on a later one.
    for (const [index, line] of lines.entries()) {
      const record = parseSpawnJournalRecord(this.path, index, line)
      if (record.root !== root) continue
      if (record.kind === 'begin') {
        begun = true
        continue
      }
      if (!begun) {
        throw new Error(
          `spawn journal corrupted: event for tree '${root}' precedes its begin record`,
        )
      }
      assertSeqUnique(root, events, record.event)
      events.push(record.event)
    }
    return begun ? events : undefined
  }

  /**
   * Write the `begin` record for `root`. Re-beginning with the same `at` is a no-op.
   * @throws Error when the tree was already begun with a different `at`.
   */
  async beginTree(root: NodeId, at: string): Promise<void> {
    const existing = await this.loadTreeBegin(root)
    if (existing) {
      if (existing !== at) {
        throw new Error(
          `spawn tree '${root}' already begun in ${this.path} at ${existing}; refusing to overwrite with ${at}`,
        )
      }
      return
    }
    await this.appendRecord({ kind: 'begin', root, at })
  }

  /**
   * Append one event envelope for `root` after re-validating it against the events
   * already on disk.
   * @throws Error when the tree was never begun or the event repeats a cursor `seq`.
   */
  async appendEvent(root: NodeId, ev: SpawnEvent): Promise<void> {
    const events = await this.loadTree(root)
    if (events === undefined) {
      throw new Error(`appendEvent called for unknown spawn tree '${root}'; call beginTree first`)
    }
    assertSeqUnique(root, events, ev)
    await this.appendRecord({ kind: 'event', root, event: ev })
  }

  /** Return the `at` of the first `begin` record for `root`, or `undefined` when there is none. */
  private async loadTreeBegin(root: NodeId): Promise<string | undefined> {
    const lines = (await this.readLines()) ?? []
    for (const [index, line] of lines.entries()) {
      const record = parseSpawnJournalRecord(this.path, index, line)
      if (record.root === root && record.kind === 'begin') return record.at
    }
    return undefined
  }

  /** Read the journal's non-empty lines; `undefined` when the file does not exist. */
  private async readLines(): Promise<string[] | undefined> {
    const fs = await import('node:fs/promises')
    let text: string
    try {
      text = await fs.readFile(this.path, 'utf8')
    } catch (err) {
      if (isNoEntError(err)) return undefined
      throw err
    }
    return text.split('\n').filter((line) => line.length > 0)
  }

  /** Append one JSONL record, creating the parent directory if needed, and fsync it. */
  private async appendRecord(record: SpawnJournalRecord): Promise<void> {
    const fs = await import('node:fs/promises')
    const path = await import('node:path')
    await fs.mkdir(path.dirname(this.path), { recursive: true })
    const fh = await fs.open(this.path, 'a')
    try {
      await fh.write(`${JSON.stringify(record)}\n`)
      await fh.sync()
    } finally {
      await fh.close()
    }
  }
}

/** The on-disk envelope: a tree's `begin` marker, or one journaled `SpawnEvent`. */
type SpawnJournalRecord =
  | { kind: 'begin'; root: NodeId; at: string }
  | { kind: 'event'; root: NodeId; event: SpawnEvent }

/**
 * Parse one JSONL record. A line that is not valid JSON is reported with the journal
 * path and the 1-based position among the non-empty lines; the error stays a
 * `SyntaxError`, as a bare `JSON.parse` failure would be. The parsed shape itself is
 * trusted, not validated.
 */
function parseSpawnJournalRecord(path: string, index: number, line: string): SpawnJournalRecord {
  try {
    return JSON.parse(line) as SpawnJournalRecord
  } catch (err) {
    const detail = err instanceof Error ? err.message : String(err)
    throw new SyntaxError(
      `spawn journal '${path}' corrupted: record ${index + 1} is not valid JSON (${detail})`,
    )
  }
}

/**
 * Two `seq` namespaces share the journal: a `spawned` event's `seq` is the spawn ordinal
 * (the order children were created), and a `settled`/`cancelled` event's `seq` is the
 * monotonic CURSOR order `scope.next()` yielded that settlement (B2). The uniqueness
 * replay rests on is the cursor namespace — two settlements cannot share the position
 * replay orders by — so the guard checks only settled/cancelled events. A `spawned`
 * ordinal legitimately equals a later `settled` cursor seq and is not a collision.
 *
 * @param root - Tree id, used only in the error message.
 * @param events - Events already journaled for the tree.
 * @param ev - The event about to be appended.
 * @throws Error when `ev` is not `spawned` and a non-`spawned` event in `events`
 *   already carries the same `seq`.
 */
function assertSeqUnique(root: NodeId, events: SpawnEvent[], ev: SpawnEvent): void {
  if (ev.kind === 'spawned') return
  if (events.some((e) => e.kind !== 'spawned' && e.seq === ev.seq)) {
    throw new Error(
      `spawn journal corrupted: duplicate cursor seq ${ev.seq} in tree '${root}'; ` +
        'the cursor order replay relies on is not unique',
    )
  }
}

// ── Replay executor (build step 7) ───────────────────────────────────────────────

/**
 * Re-feed a journaled spawn tree in strict `seq` order, rehydrating each settled
 * child's `out` from the blob store by `outRef`, and return the `Settled[]` exactly
 * as `scope.next()` originally delivered them.
 */
/**
 * Rebuild the `Settled` sequence of a journaled spawn tree.
 *
 * The journaled events are sorted by `seq` before any blob is fetched, and blobs are
 * fetched one at a time inside that loop, so the output order depends only on the
 * recorded `seq` values. `spawned` events contribute node labels only; `cancelled` and
 * `settled` events each become one entry.
 *
 * @param journal source of the recorded events (`loadTree`).
 * @param blobs   store the `done` results are rehydrated from, keyed by `outRef`.
 * @param root    id of the tree to replay.
 * @returns one `Settled` per cancelled/settled event, in ascending `seq`.
 * @throws when the tree was never journaled, when a settled-done event has no
 *   `outRef`, or when the blob store returns `undefined` for an `outRef`.
 */
export async function replaySpawnTree(
  journal: SpawnJournal,
  blobs: ResultBlobStore,
  root: NodeId,
): Promise<Settled<unknown>[]> {
  const journaled = await journal.loadTree(root)
  if (journaled === undefined) {
    throw new Error(`replaySpawnTree: no journaled tree for root '${root}'`)
  }
  // Copy before sorting: the journal's own array is left untouched.
  const bySeq = [...journaled].sort((left, right) => left.seq - right.seq)

  const labelById = new Map<NodeId, string>()
  for (const ev of bySeq) {
    if (ev.kind === 'spawned') labelById.set(ev.id, ev.label)
  }
  // A node with no journaled spawn falls back to its id as the label.
  const labelOf = (id: NodeId): string => labelById.get(id) ?? id

  const replayed: Settled<unknown>[] = []
  for (const ev of bySeq) {
    if (ev.kind === 'spawned') continue

    if (ev.kind === 'cancelled') {
      replayed.push({
        kind: 'down',
        handle: replayHandle(ev.id, labelOf(ev.id), 'cancelled'),
        reason: ev.reason,
        infra: false,
        restartCount: 0,
        seq: ev.seq,
      })
      continue
    }

    if (ev.status === 'down') {
      replayed.push({
        kind: 'down',
        handle: replayHandle(ev.id, labelOf(ev.id), 'failed'),
        reason: ev.verdict?.notes ?? 'child down',
        infra: ev.infra === true,
        restartCount: 0,
        seq: ev.seq,
      })
      continue
    }

    // Settled-done: the result must be recoverable, otherwise replay fails loud.
    if (ev.outRef === undefined) {
      throw new Error(
        `replaySpawnTree: settled-done event for '${ev.id}' (seq ${ev.seq}) has no outRef; ` +
          'cannot rehydrate the result the driver branched on',
      )
    }
    const out = await blobs.get(ev.outRef)
    if (out === undefined) {
      throw new Error(
        `replaySpawnTree: blob store has no artifact for outRef '${ev.outRef}' (node '${ev.id}', seq ${ev.seq})`,
      )
    }
    replayed.push({
      kind: 'done',
      handle: replayHandle(ev.id, labelOf(ev.id), 'done'),
      out,
      outRef: ev.outRef,
      verdict: ev.verdict,
      spent: ev.spent,
      seq: ev.seq,
    })
  }
  return replayed
}
/**
 * True when `err` is a non-null object whose `code` property (own or inherited) is
 * exactly the string `'ENOENT'`. Every other value yields `false`.
 */
function isNoEntError(err: unknown): boolean {
  if (err === null || typeof err !== 'object') return false
  if (!('code' in err)) return false
  const { code } = err as { code: unknown }
  return code === 'ENOENT'
}
+export { + contentAddress, + FileResultBlobStore, + FileSpawnJournal, + InMemoryResultBlobStore, + InMemorySpawnJournal, + materializeTreeView, + replaySpawnTree, +} from '../durable/spawn-journal' export { type CompletionAnalyst, type CompletionEvidence, @@ -57,7 +70,15 @@ export { type LoopOptionsForDispatch, loopDispatch, } from './loop-dispatch' -export type { Agent, Program, ProgramResult, RunProgramOptions } from './program' +// The recursive execution atom owns the headline `Agent` (re-exported from +// `./supervise/types` below). The program op-set's static-tree atom is a distinct +// concept (`act` returns a `Program`), surfaced as `ProgramAgent`. +export type { + Agent as ProgramAgent, + Program, + ProgramResult, + RunProgramOptions, +} from './program' export { agentProgramPlanner, compileProgram, @@ -68,7 +89,7 @@ export { } from './program' export { reportLoopUsage, type UsageSink } from './report-usage' export type { RunLoopOptions } from './run-loop' -export { createSandboxForSpec, runLoop } from './run-loop' +export { createSandboxForSpec, defaultSelectWinner, runLoop } from './run-loop' export { type AcquireOptions, acquireSandbox } from './sandbox-acquire' export { type CriuCapableClient, @@ -84,6 +105,58 @@ export { type SandboxLineageHandle, type SessionCapableBox, } from './sandbox-lineage' +export { + type BudgetPool, + type BudgetReadout, + createBudgetPool, + type ReservationTicket, + spendFromUsageEvents, +} from './supervise/budget' +export { + type CliSeam, + cliExecutor, + createExecutorRegistry, + type RouterSeam, + routerInlineExecutor, + type SandboxSeam, + sandboxExecutor, +} from './supervise/runtime' +export { createScope, type ScopeArgs, settledToIteration } from './supervise/scope' +export { + createRootHandle, + createSupervisor, +} from './supervise/supervisor' +export type { + Agent, + AgentSpec, + Budget, + ExecutorContext, + ExecutorRegistry, + Handle, + LeafExecutor, + LeafExecutorFactory, + LeafResult, + NodeId, + 
NodeSnapshot, + NodeStatus, + Restart, + ResultBlobStore, + RootHandle, + RootSignal, + Runtime, + Scope, + Settled, + SpawnEvent, + SpawnJournal, + SpawnOpts, + Spend, + SupervisedResult, + Supervisor, + SupervisorOpts, + TreeView, + UsageEvent, + WidenGate, +} from './supervise/types' export type { AgentRunSpec, DefaultVerdict, diff --git a/src/loops/program.ts b/src/loops/program.ts index 4787a65..2c58129 100644 --- a/src/loops/program.ts +++ b/src/loops/program.ts @@ -345,10 +345,35 @@ async function runParallel( throw new PlannerError('Program parallel{} must carry a non-empty branches[]') } const limit = opts.maxParallel ?? branches.length - const runs = await mapPool(branches, limit, (branch, i) => + const settled = await mapPool(branches, limit, (branch, i) => runProgram(branch, opts, `${idSuffix}/p${i}`, depth + 1), ) - return concatRuns(runs, 'max', opts) + // One-for-one: a branch that threw is a `down` record EXCLUDED from the merge `n`; + // survivors still merge. A real cancel (abort signal fired) is NOT a branch failure — + // it propagates so the abort cascade stays loud. + if (opts.ctx.signal?.aborted) { + const aborted = settled.find((r) => !r.ok) + if (aborted && !aborted.ok) throw aborted.error + } + const survivors = settled.filter( + (r): r is { ok: true; value: ProgramResult } => r.ok, + ) + if (survivors.length === 0) { + // Every branch went down: there is nothing to merge, so the program genuinely + // failed. Surface the FIRST branch's original error (its real type + message — e.g. + // a maxDepth guard) rather than a lossy summary; a structural guard must not be + // swallowed as an excluded infra `down`. 
/** Per-item result of `mapPool`: the resolved value, or the error the item threw. */
type MapPoolOutcome<R> = { ok: true; value: R } | { ok: false; error: unknown }

/**
 * Bounded-concurrency map that preserves input order.
 *
 * One-for-one isolation: an item whose `fn` throws is captured as `{ ok: false, error }`
 * at its own index. It neither aborts its siblings nor stops scheduling, so every item
 * runs and the returned array has exactly one outcome per input.
 *
 * @param items inputs, processed in index order as workers become free.
 * @param limit maximum number of items in flight. Values below 1 behave as 1, values
 *   above `items.length` behave as `items.length`, and `NaN` behaves as 1.
 * @param fn    async mapper, called with the item and its index.
 * @returns one outcome per item, in the same order as `items`. Never rejects on behalf
 *   of `fn`.
 */
async function mapPool<T, R>(
  items: T[],
  limit: number,
  fn: (item: T, index: number) => Promise<R>,
): Promise<MapPoolOutcome<R>[]> {
  const results = new Array<MapPoolOutcome<R>>(items.length)
  // `Math.min(NaN, n)` is NaN and `Array.from({ length: NaN })` is empty: without this
  // guard a NaN limit would start zero workers and return an array of holes.
  const cap = Number.isNaN(limit) ? 1 : limit
  const workers = Math.max(1, Math.min(cap, items.length))
  let next = 0
  const worker = async (): Promise<void> => {
    // Claiming an index is synchronous, so two workers can never take the same item.
    for (let i = next++; i < items.length; i = next++) {
      try {
        results[i] = { ok: true, value: await fn(items[i] as T, i) }
      } catch (err) {
        results[i] = { ok: false, error: err }
      }
    }
  }
  await Promise.all(Array.from({ length: workers }, () => worker()))
  return results
}
A `reserve`/`reconcile` ticket is single-use (fail-loud on double or + * unknown reconcile) so a child can never refund twice. + */ + +import { addTokenUsage, zeroTokenUsage } from '../util' +import type { Budget, LoopTokenUsage, Spend, UsageEvent } from './types' + +export type { Budget, Spend, UsageEvent } + +/** Opaque, single-use reservation handle returned by `reserve` and consumed by + * `reconcile`. Carries the reserved ceilings so reconciliation needs no lookup. */ +export interface ReservationTicket { + readonly id: number + readonly reserved: { + readonly tokens: number + readonly usd: number + readonly iterations: number + } +} + +/** Post-reservation pool readout — the shape `Scope.budget` exposes. `tokensLeft`, + * `usdLeft`, and `reservedTokens` reflect committed-but-unsettled reservations; + * `deadlineMs` is the ABSOLUTE wall-clock deadline (0 when the root set none). */ +export type BudgetReadout = Readonly<{ + tokensLeft: number + usdLeft: number + deadlineMs: number + reservedTokens: number +}> + +export interface BudgetPool { + /** + * Atomically reserve a child's full ceiling from the free balance. Fails closed + * ({ ok: false }) when the pool can't cover tokens, usd, or iterations — the + * caller inspects `ok` before `ticket`. + */ + reserve( + b: Budget, + ): { ok: true; ticket: ReservationTicket } | { ok: false; reason: 'budget-exhausted' } + /** + * Release a reservation: commit the actual `spent`, refund the unspent remainder + * to the free pool. Throws on an unknown or already-reconciled ticket (fail loud — + * a double refund would silently break conservation). + */ + reconcile(ticket: ReservationTicket, spent: Spend): void + /** Fold a normalized `UsageEvent` stream (or array) into a `Spend`. Tokens via + * `addTokenUsage`, usd on its own channel, iterations from `'iteration'` events. + * `ms` is left zero — wall-clock duration is the caller's to record, not the pool's. 
/**
 * Create a conserved reservation pool from a root `Budget`.
 *
 * Per channel (tokens, usd, iterations) the pool keeps `free + reserved + committed`
 * equal to the root total: `reserve` moves a child's ceiling from free to reserved, and
 * `reconcile` releases it, commits the actual spend, and refunds the remainder to free.
 *
 * @param root the root budget. `maxUsd` undefined means the usd channel is uncapped:
 *   any child that asks for usd is refused and `readout().usdLeft` is 0.
 * @param now  clock used once, at construction, to fix the absolute deadline
 *   (`now() + root.deadlineMs`, or 0 when the root sets no deadline).
 * @returns a pool whose `reserve` fails closed (never throws) and whose `reconcile`
 *   throws on an unknown or already-settled ticket, on a negative or NaN spend, and on
 *   spend above the reservation. A throwing `reconcile` changes nothing: the ticket
 *   stays open and the reservation stays in `reserved`.
 */
export function createBudgetPool(root: Budget, now: () => number = Date.now): BudgetPool {
  // free + reserved + committed ≡ root totals, per channel, always.
  let freeTokens = root.maxTokens
  let reservedTokens = 0
  let committedTokens = 0

  const usdCapped = root.maxUsd !== undefined
  let freeUsd = root.maxUsd ?? 0
  let reservedUsd = 0
  let committedUsd = 0

  let freeIterations = root.maxIterations
  let reservedIterations = 0
  let committedIterations = 0

  const absoluteDeadlineMs = root.deadlineMs !== undefined ? now() + root.deadlineMs : 0

  let nextTicketId = 0
  const open = new Set<number>()

  // NaN fails every `>` comparison and a negative amount would grow `free`; either one
  // would slip past the admission checks and break conservation, so both are rejected.
  const isAmount = (n: number): boolean => n >= 0

  function reserve(
    b: Budget,
  ): { ok: true; ticket: ReservationTicket } | { ok: false; reason: 'budget-exhausted' } {
    const wantTokens = b.maxTokens
    const wantUsd = b.maxUsd ?? 0
    const wantIterations = b.maxIterations
    // Fail closed on a malformed request instead of admitting it.
    if (!isAmount(wantTokens) || !isAmount(wantUsd) || !isAmount(wantIterations)) {
      return { ok: false, reason: 'budget-exhausted' }
    }
    // Fail-closed admission: every requested channel must fit the free balance. A
    // usd request against an uncapped root is unsatisfiable (the root declared no $).
    if (wantTokens > freeTokens) return { ok: false, reason: 'budget-exhausted' }
    if (wantIterations > freeIterations) return { ok: false, reason: 'budget-exhausted' }
    if (wantUsd > 0 && (!usdCapped || wantUsd > freeUsd)) {
      return { ok: false, reason: 'budget-exhausted' }
    }

    freeTokens -= wantTokens
    reservedTokens += wantTokens
    freeIterations -= wantIterations
    reservedIterations += wantIterations
    if (wantUsd > 0) {
      freeUsd -= wantUsd
      reservedUsd += wantUsd
    }

    const id = nextTicketId++
    open.add(id)
    return {
      ok: true,
      ticket: { id, reserved: { tokens: wantTokens, usd: wantUsd, iterations: wantIterations } },
    }
  }

  function reconcile(ticket: ReservationTicket, spent: Spend): void {
    if (!open.has(ticket.id)) {
      throw new Error(`budget pool: reconcile of unknown or already-settled ticket ${ticket.id}`)
    }

    const { tokens: rTokens, usd: rUsd, iterations: rIterations } = ticket.reserved
    const spentTokens = totalTokens(spent.tokens)

    // Validate BEFORE touching any state. Consuming the ticket first would leave the
    // reservation stranded in `reserved` whenever one of these checks throws.
    if (!isAmount(spentTokens) || !isAmount(spent.iterations) || !isAmount(spent.usd)) {
      throw new Error(
        `budget pool: ticket ${ticket.id} reported an invalid spend (negative or NaN)`,
      )
    }
    // Over-spend is rejected, not clamped: a child must never commit more than it
    // reserved (that would overdraw the conserved pool).
    if (spentTokens > rTokens) {
      throw new Error(
        `budget pool: ticket ${ticket.id} spent ${spentTokens} tokens > reserved ${rTokens}`,
      )
    }
    if (spent.iterations > rIterations) {
      throw new Error(
        `budget pool: ticket ${ticket.id} spent ${spent.iterations} iterations > reserved ${rIterations}`,
      )
    }
    // NOTE(review): a child that reserved no usd (rUsd === 0) but reports a cost is
    // rejected here — confirm that is intended for uncapped roots.
    if (spent.usd > rUsd) {
      throw new Error(`budget pool: ticket ${ticket.id} spent $${spent.usd} > reserved $${rUsd}`)
    }

    open.delete(ticket.id)

    // Release the whole reservation, then commit actual spend; the difference is the
    // refund that flows back to `free`.
    reservedTokens -= rTokens
    committedTokens += spentTokens
    freeTokens += rTokens - spentTokens

    reservedIterations -= rIterations
    committedIterations += spent.iterations
    freeIterations += rIterations - spent.iterations

    if (rUsd > 0) {
      reservedUsd -= rUsd
      committedUsd += spent.usd
      freeUsd += rUsd - spent.usd
    }
  }

  function readout(): BudgetReadout {
    return {
      tokensLeft: freeTokens,
      usdLeft: usdCapped ? freeUsd : 0,
      deadlineMs: absoluteDeadlineMs,
      reservedTokens,
    }
  }

  return {
    reserve,
    reconcile,
    spendFrom: foldUsage,
    readout,
  }
}
+ * - sandbox : COMPOSES the existing `runLoop` kernel as a single-task + * leaf and surfaces its token/cost usage as `UsageEvent`s; + * forwards PR #150's optional `lineage` passthrough WITHOUT + * reinventing checkpoint/fork (streaming). + * - cli : a Halo/RLM subprocess; `budgetExempt` (no token accounting), + * excluded from the equal-k arms by construction (streaming). + * Every metered runtime reports through the SAME normalized `UsageEvent` channel + * so the conserved budget pool meters them identically. A user's own agent is + * first-class the moment it implements `LeafExecutor` — register it by name or + * pass it as `AgentSpec.executor`. + * + * Layering: `estimateCost`/`isModelPriced` are substrate primitives from + * `@tangle-network/agent-eval`; `runLoop`/`acquireSandbox` are runtime kernels + * from this package. No per-vendor adapters live here. + */ + +import { spawn } from 'node:child_process' +import { estimateCost, isModelPriced } from '@tangle-network/agent-eval' +import type { BackendType, SandboxEvent } from '@tangle-network/sandbox' +import { ValidationError } from '../../errors' +import type { RunLoopOptions } from '../run-loop' +import { runLoop } from '../run-loop' +import type { + AgentRunSpec, + Driver, + ExecCtx, + Iteration, + LoopSandboxClient, + OutputAdapter, +} from '../types' +import { zeroTokenUsage } from '../util' +import type { + AgentSpec, + DefaultVerdict, + ExecutorContext, + ExecutorRegistry, + LeafExecutor, + LeafExecutorFactory, + LeafResult, + Runtime, + Spend, + UsageEvent, +} from './types' + +// ── Seam contracts (read off ExecutorContext.seams, narrowed per built-in) ───── + +/** + * Router/inline connection seam. A direct OpenAI-compatible Router endpoint — + * the cheapest leaf, no box, no tools. `model` overrides the profile's model + * hint when present; otherwise the profile's `model.default` is required. 
/**
 * Build a deterministic `prefix:hhhhhhhh` pointer for a result value.
 *
 * The value is serialized with `JSON.stringify`, falling back to `String(value)` when
 * that returns `undefined` or throws. The text is then folded with 32-bit FNV-1a, one
 * UTF-16 code unit at a time, and rendered as 8 lowercase hex digits. The hash is not
 * cryptographic, and `JSON.stringify` follows property insertion order, so two objects
 * with the same fields in a different order produce different pointers.
 */
function contentRef(prefix: string, value: unknown): string {
  const FNV_OFFSET_BASIS = 0x811c9dc5
  const FNV_PRIME = 0x01000193

  let serialized: string
  try {
    serialized = JSON.stringify(value) ?? String(value)
  } catch {
    serialized = String(value)
  }

  let hash = FNV_OFFSET_BASIS
  let index = 0
  while (index < serialized.length) {
    // FNV-1a: xor the unit in first, then multiply (32-bit wrap via Math.imul).
    hash = Math.imul(hash ^ serialized.charCodeAt(index), FNV_PRIME)
    index += 1
  }

  const hex = (hash >>> 0).toString(16).padStart(8, '0')
  return `${prefix}:${hex}`
}
spec.profile.model?.default + if (!model) { + throw new ValidationError( + 'routerInlineExecutor: no model — set RouterSeam.model or AgentProfile.model.default', + ) + } + if (!seam.routerBaseUrl || !seam.routerKey) { + throw new ValidationError('routerInlineExecutor: RouterSeam.routerBaseUrl + routerKey required') + } + + const controller = new AbortController() + const abortIfSignalled = () => { + if (ctx.signal.aborted) controller.abort() + } + abortIfSignalled() + if (!ctx.signal.aborted) ctx.signal.addEventListener('abort', abortIfSignalled, { once: true }) + + let artifact: LeafResult | undefined + + return { + runtime: 'router' as Runtime, + async execute(task, signal): Promise> { + const messages = taskToMessages(task, spec) + const started = Date.now() + const linked = linkSignals(signal, controller.signal) + const res = await fetch(`${seam.routerBaseUrl.replace(/\/$/, '')}/chat/completions`, { + method: 'POST', + headers: { 'content-type': 'application/json', authorization: `Bearer ${seam.routerKey}` }, + body: JSON.stringify({ model, messages, temperature: 0.2 }), + ...(linked ? { signal: linked } : {}), + }) + if (!res.ok) { + throw new ValidationError( + `routerInlineExecutor: router ${res.status}: ${(await res.text()).slice(0, 200)}`, + ) + } + const data = (await res.json()) as { + choices?: Array<{ message?: { content?: string } }> + usage?: { prompt_tokens?: number; completion_tokens?: number } + } + const u = data.usage + const usage = + u && typeof u.prompt_tokens === 'number' && typeof u.completion_tokens === 'number' + ? { input: u.prompt_tokens, output: u.completion_tokens } + : undefined + const usd = usage && isModelPriced(model) ? estimateCost(usage.input, usage.output, model) : 0 + const content = data.choices?.[0]?.message?.content ?? '' + const spent: Spend = { + iterations: 1, + tokens: usage ? 
{ input: usage.input, output: usage.output } : zeroTokenUsage(), + usd, + ms: Date.now() - started, + } + const out = { content } as unknown + artifact = { outRef: contentRef('router', { model, content }), out, spent } + return artifact + }, + teardown(_grace): Promise<{ destroyed: boolean }> { + controller.abort() + return Promise.resolve({ destroyed: true }) + }, + resultArtifact() { + if (!artifact) { + throw new ValidationError('routerInlineExecutor: resultArtifact() read before execute()') + } + return { ...artifact, spent: artifact.spent } + }, + } +} + +// ── sandbox executor (harness is a BackendType) ──────────────────────────────── + +/** + * COMPOSES `runLoop` as a single-task leaf: one box, a refine driver bounded to + * the seam's `maxIterations` (default 1), the spec's profile as the agent run. + * Surfaces the loop's aggregated `tokenUsage` + `costUsd` as `UsageEvent`s after + * it drains, and yields one `iteration` event per loop iteration. Forwards the + * optional `lineage` passthrough WITHOUT importing sandbox-lineage / reinventing + * checkpoint/fork. + * + * Streaming shape: the loop runs to completion inside the first `next()`, then + * the recorded usage events are yielded; the terminal artifact is read from + * `resultArtifact()` after the stream drains. + */ +export const sandboxExecutor: LeafExecutorFactory = (spec, ctx) => { + if (spec.harness === null) { + throw new ValidationError('sandboxExecutor: harness is null (router/inline) — wrong executor') + } + const harness = spec.harness as BackendType + const seam = readSeam(ctx, sandboxSeamKey, 'sandbox') + if (!seam.sandboxClient || typeof seam.sandboxClient.create !== 'function') { + throw new ValidationError('sandboxExecutor: SandboxSeam.sandboxClient.create required') + } + const maxIterations = seam.maxIterations ?? 
1 + if (!Number.isFinite(maxIterations) || maxIterations <= 0) { + throw new ValidationError('sandboxExecutor: maxIterations must be > 0') + } + + const controller = new AbortController() + const abortIfSignalled = () => { + if (ctx.signal.aborted) controller.abort() + } + abortIfSignalled() + if (!ctx.signal.aborted) ctx.signal.addEventListener('abort', abortIfSignalled, { once: true }) + + let artifact: LeafResult | undefined + + // The leaf runs an opaque, self-parallelizing coding harness; the loop just + // refines once over it. Output is the raw event stream parsed to its tail text. + const output: OutputAdapter = { + parse(events: SandboxEvent[]): SandboxLeafOut { + return { events } + }, + } + const driver = singleShotDriver(maxIterations) + + return { + runtime: 'sandbox' as Runtime, + execute(task, signal): AsyncIterable { + return streamSandboxLeaf({ + task, + signal, + harness, + spec, + seam, + output, + driver, + maxIterations, + controller, + loopCtx: seam.loopCtx, + onArtifact: (a) => { + artifact = a + }, + }) + }, + teardown(_grace): Promise<{ destroyed: boolean }> { + // The composed runLoop owns its box teardown (finally{allSettled(destroy)}); + // aborting the loop's signal cascades into that barrier. 
+ controller.abort() + return Promise.resolve({ destroyed: true }) + }, + resultArtifact() { + if (!artifact) { + throw new ValidationError('sandboxExecutor: resultArtifact() read before stream drained') + } + return artifact + }, + } +} + +interface SandboxLeafOut { + events: SandboxEvent[] +} + +interface StreamSandboxArgs { + task: unknown + signal: AbortSignal + harness: BackendType + spec: AgentSpec + seam: SandboxSeam + output: OutputAdapter + driver: Driver + maxIterations: number + controller: AbortController + loopCtx?: Partial> + onArtifact: (a: LeafResult) => void +} + +async function* streamSandboxLeaf(args: StreamSandboxArgs): AsyncIterable { + const linked = new AbortController() + const cascade = () => linked.abort() + if (args.signal.aborted || args.controller.signal.aborted) linked.abort() + else { + args.signal.addEventListener('abort', cascade, { once: true }) + args.controller.signal.addEventListener('abort', cascade, { once: true }) + } + + const agentRun: AgentRunSpec = { + profile: args.spec.profile, + taskToPrompt: (t) => taskToPrompt(t), + name: args.spec.profile.name ?? args.harness, + sandboxOverrides: { backend: { type: args.harness } }, + } + const started = Date.now() + + // `lineage` is a PR #150 RunLoopOptions field absent on this branch — forwarded + // forward-compatibly without coupling to its (not-yet-present) static type. + const loopOptions = { + driver: args.driver, + agentRun, + output: args.output, + task: args.task, + maxIterations: args.maxIterations, + maxConcurrency: 1, + ctx: { + ...(args.loopCtx ?? {}), + sandboxClient: args.seam.sandboxClient, + signal: linked.signal, + } as ExecCtx, + ...(args.seam.lineage !== undefined ? { lineage: args.seam.lineage } : {}), + } as RunLoopOptions + + try { + const result = await runLoop(loopOptions) + const out = result.winner?.output ?? 
{ events: [] } + const verdict = result.winner?.verdict + const spent: Spend = { + iterations: result.iterations.length, + tokens: { input: result.tokenUsage.input, output: result.tokenUsage.output }, + usd: result.costUsd, + ms: Date.now() - started, + } + args.onArtifact({ + outRef: contentRef('sandbox', { harness: args.harness, out }), + out, + ...(verdict ? { verdict } : {}), + spent, + }) + for (let i = 0; i < result.iterations.length; i += 1) yield { kind: 'iteration' } + if (result.tokenUsage.input || result.tokenUsage.output) { + yield { kind: 'tokens', input: result.tokenUsage.input, output: result.tokenUsage.output } + } + if (result.costUsd) yield { kind: 'cost', usd: result.costUsd } + } finally { + args.signal.removeEventListener('abort', cascade) + args.controller.signal.removeEventListener('abort', cascade) + } +} + +// ── cli executor (Halo / external RLM subprocess) ────────────────────────────── + +/** + * Spawns a subprocess (`bin` + `args`). It cannot account tokens, so it is + * `budgetExempt: true`: its spend is NOT metered against the conserved pool and + * its iterations are EXCLUDED from the equal-k arms by construction (the + * resolver/equal-k path checks `budgetExempt`). teardown is SIGTERM → SIGKILL + * with a grace window. Streaming: yields one `iteration` event on clean exit. 
*/
export const cliExecutor: LeafExecutorFactory = (_spec, ctx) => {
  // Fail loud on a missing seam or binary: there is no sensible default subprocess.
  const seam = readSeam<CliSeam>(ctx, cliSeamKey, 'cli')
  if (!seam.bin) throw new ValidationError('cliExecutor: CliSeam.bin required')

  // Executor-local controller: it mirrors the spawn context's signal and is also fired
  // by `teardown`, so either source reaches the stream's kill handler.
  const controller = new AbortController()
  const abortIfSignalled = () => {
    if (ctx.signal.aborted) controller.abort()
  }
  abortIfSignalled()
  if (!ctx.signal.aborted) ctx.signal.addEventListener('abort', abortIfSignalled, { once: true })

  let proc: ReturnType<typeof spawn> | undefined
  let artifact: LeafResult | undefined

  return {
    runtime: 'cli' as Runtime,
    budgetExempt: true,
    execute(task, signal): AsyncIterable<UsageEvent> {
      return streamCliLeaf({
        task,
        signal,
        seam,
        controller,
        onProc: (p) => {
          proc = p
        },
        onArtifact: (a) => {
          artifact = a
        },
      })
    },
    async teardown(grace): Promise<{ destroyed: boolean }> {
      // NOTE(review): aborting the controller fires the stream's abort handler, which
      // SIGKILLs immediately — so `grace` only matters when no stream is attached.
      // Confirm whether a live stream should get the SIGTERM grace window instead.
      controller.abort()
      // `proc.killed` only records that a signal was SENT, not that the process is gone;
      // termination is observable through `exitCode` / `signalCode`.
      if (!proc || proc.exitCode !== null || proc.signalCode !== null) return { destroyed: true }
      return killWithGrace(proc, grace)
    },
    resultArtifact() {
      if (!artifact) {
        throw new ValidationError('cliExecutor: resultArtifact() read before stream drained')
      }
      return artifact
    },
  }
}

/** Arguments threaded from `cliExecutor.execute` into the subprocess stream. */
interface StreamCliArgs {
  task: unknown
  signal: AbortSignal
  seam: CliSeam
  controller: AbortController
  onProc: (p: ReturnType<typeof spawn>) => void
  onArtifact: (a: LeafResult) => void
}

/**
 * Run the CLI subprocess to completion: feed the prompt on stdin, buffer stdout/stderr,
 * and on a clean exit publish the artifact and yield a single `iteration` event.
 *
 * @throws ValidationError when the process cannot be spawned or exits non-zero
 *   (including a `null` exit code after it was killed by a signal).
 */
async function* streamCliLeaf(args: StreamCliArgs): AsyncIterable<UsageEvent> {
  const prompt = taskToPrompt(args.task)
  const proc = spawn(args.seam.bin, args.seam.args ?? [], {
    ...(args.seam.cwd ? { cwd: args.seam.cwd } : {}),
    env: { ...process.env, ...(args.seam.env ?? {}) },
    stdio: ['pipe', 'pipe', 'pipe'],
  })
  args.onProc(proc)

  // Either abort source (per-execute signal or the executor's controller) kills at once.
  const onAbort = () => {
    void killWithGrace(proc, 'brutalKill')
  }
  if (args.signal.aborted || args.controller.signal.aborted) onAbort()
  else {
    args.signal.addEventListener('abort', onAbort, { once: true })
    args.controller.signal.addEventListener('abort', onAbort, { once: true })
  }

  // Buffer raw bytes and decode ONCE at the end: decoding chunk-by-chunk corrupts any
  // multi-byte UTF-8 character that happens to straddle a chunk boundary.
  const outChunks: Buffer[] = []
  const errChunks: Buffer[] = []
  proc.stdout?.on('data', (d: Buffer) => outChunks.push(d))
  proc.stderr?.on('data', (d: Buffer) => errChunks.push(d))

  // Feed the task on stdin; the subprocess owns its own tool/agent loop.
  if (proc.stdin) {
    // A child that exits (or is killed) before draining stdin raises EPIPE on this
    // stream; without a listener that is an unhandled 'error' event. The failure is
    // still reported through the exit code below, so the stream error is dropped here.
    proc.stdin.on('error', () => {})
    proc.stdin.write(prompt)
    proc.stdin.end()
  }

  const exit = await new Promise<{ code: number | null; error?: Error }>((resolve) => {
    proc.once('error', (err) => resolve({ code: null, error: err }))
    proc.once('close', (code) => resolve({ code }))
  })
  args.signal.removeEventListener('abort', onAbort)
  args.controller.signal.removeEventListener('abort', onAbort)

  if (exit.error) {
    throw new ValidationError(`cliExecutor: spawn failed: ${exit.error.message}`, {
      cause: exit.error,
    })
  }
  if (exit.code !== 0) {
    const stderrText = Buffer.concat(errChunks).toString('utf8')
    throw new ValidationError(
      `cliExecutor: ${args.seam.bin} exited ${exit.code}: ${stderrText.slice(0, 200)}`,
    )
  }
  const out = { content: Buffer.concat(outChunks).toString('utf8') } as unknown
  // budgetExempt: spend is recorded zero (not metered) — never a fabricated cost.
  args.onArtifact({ outRef: contentRef('cli', out), out, spent: zeroSpend() })
  yield { kind: 'iteration' }
}

/** SIGTERM, then SIGKILL after `grace` ms (`'brutalKill'` = immediate SIGKILL,
 * `'infinity'` = await clean exit, never escalate).
*/
function killWithGrace(
  proc: ReturnType<typeof spawn>,
  grace: number | 'brutalKill' | 'infinity',
): Promise<{ destroyed: boolean }> {
  // Liveness must be read from `exitCode` / `signalCode`. `proc.killed` flips to true as
  // soon as ANY signal has been sent successfully, so using it here would (a) report a
  // merely-signalled process as destroyed and (b) disable the SIGKILL escalation below,
  // because it is always true right after the SIGTERM.
  const hasTerminated = (): boolean => proc.exitCode !== null || proc.signalCode !== null

  if (hasTerminated()) return Promise.resolve({ destroyed: true })

  return new Promise((resolve) => {
    let timer: ReturnType<typeof setTimeout> | undefined
    // Resolve only once the process has really gone away (stdio closed).
    proc.once('close', () => {
      if (timer) clearTimeout(timer)
      resolve({ destroyed: true })
    })
    if (grace === 'brutalKill') {
      proc.kill('SIGKILL')
      return
    }
    proc.kill('SIGTERM')
    // 'infinity': wait for a clean exit for as long as it takes, never escalate.
    if (grace === 'infinity') return
    timer = setTimeout(() => {
      // Escalate only if the SIGTERM did not bring the process down within the window.
      if (!hasTerminated()) proc.kill('SIGKILL')
    }, grace)
  })
}

// ── The open registry ──────────────────────────────────────────────────────────

/**
 * The open resolver/registry. Pre-registers the three built-ins under their
 * runtime tags (`'router'` — also aliased as `'inline'` — `'sandbox'`, `'cli'`) and
 * accepts `register(name, factory)` for any additional runtime — and a BYO
 * `AgentSpec.executor` resolves without touching the registry at all. NOT a closed
 * switch; registration + BYO ARE the extension points.
 *
 * `resolve` precedence (frozen in `ExecutorRegistry`): a BYO `spec.executor` →
 * `harness === null` → the `'router'` factory; else a registered factory for the
 * harness-derived runtime (`'sandbox'` for any `BackendType`); else fail loud.
+ */ +export function createExecutorRegistry(): ExecutorRegistry { + const factories = new Map>() + factories.set('router', routerInlineExecutor) + factories.set('inline', routerInlineExecutor) + factories.set('sandbox', sandboxExecutor) + factories.set('cli', cliExecutor) + + return { + register(runtime: Runtime, factory: LeafExecutorFactory): void { + if (factories.has(runtime)) { + throw new ValidationError(`executor registry: runtime "${runtime}" already registered`) + } + factories.set(runtime, factory as LeafExecutorFactory) + }, + resolve( + spec: AgentSpec, + ): { succeeded: true; value: LeafExecutorFactory } | { succeeded: false; error: string } { + // BYO: a caller-supplied executor wins, wrapped in a trivial per-spawn factory. + if (spec.executor) { + const byo = spec.executor + return { succeeded: true, value: (() => byo) as LeafExecutorFactory } + } + // router/inline: an agent with no harness is a direct Router call. + if (spec.harness === null) { + const f = factories.get('router') + if (!f) return { succeeded: false, error: 'executor registry: no "router" factory' } + return { succeeded: true, value: f as LeafExecutorFactory } + } + // sandbox: any BackendType maps to the sandbox-composing-runLoop executor. + const runtimeTag: Runtime = 'sandbox' + const f = factories.get(runtimeTag) + if (!f) { + return { + succeeded: false, + error: `executor registry: no factory for runtime "${runtimeTag}" (harness "${spec.harness}") and no BYO executor`, + } + } + return { succeeded: true, value: f as LeafExecutorFactory } + }, + } +} + +// ── Shared helpers ────────────────────────────────────────────────────────────── + +/** Narrow a named seam off the `ExecutorContext`, failing loud when absent — no + * silent default for a required external-boundary seam. 
*/
function readSeam<T>(ctx: ExecutorContext, key: string, who: string): T {
  const seam = ctx.seams[key]
  if (seam === undefined || seam === null) {
    throw new ValidationError(`${who} executor: missing required seam "${key}" on ExecutorContext`)
  }
  // The seam bag is untyped; the caller names the shape it expects.
  return seam as T
}

/** A leaf task is opaque (`unknown`). A string is the prompt verbatim; an object
 * with a `prompt`/`content`/`task`/`message` string field uses the first one present
 * (in that order); otherwise the task is serialized. Always returns a string. */
function taskToPrompt(task: unknown): string {
  if (typeof task === 'string') return task
  if (task && typeof task === 'object') {
    const obj = task as Record<string, unknown>
    for (const k of ['prompt', 'content', 'task', 'message']) {
      if (typeof obj[k] === 'string') return obj[k] as string
    }
  }
  // `JSON.stringify` yields `undefined` (not a string) for `undefined`, functions and
  // symbols, which would break the declared return type and crash the stdin write /
  // message body downstream. Fall back to plain string conversion for those inputs.
  const serialized: string | undefined = JSON.stringify(task)
  return serialized ?? String(task)
}

/** Router messages from the opaque task + the profile's system prompt, when set. */
function taskToMessages(task: unknown, spec: AgentSpec): Array<{ role: string; content: string }> {
  const messages: Array<{ role: string; content: string }> = []
  const system = spec.profile.prompt?.systemPrompt
  if (typeof system === 'string' && system.length > 0) {
    messages.push({ role: 'system', content: system })
  }
  messages.push({ role: 'user', content: taskToPrompt(task) })
  return messages
}

/** A driver that refines a single task up to `maxIterations` times then stops —
 * the minimal policy that lets the sandbox executor run `runLoop` as one leaf. */
function singleShotDriver(maxIterations: number): Driver {
  return {
    name: 'leaf',
    // Re-plan the same task until the history holds `maxIterations` entries.
    plan(task, history) {
      return Promise.resolve(history.length >= maxIterations ? [] : [task])
    },
    // Only the history length matters to this policy, never its contents.
    decide(history: ReadonlyArray<unknown>): string {
      return history.length >= maxIterations ? 'stop' : 'continue'
    },
  }
}

/** Link two abort signals into one that fires when either does. If either input
 * is already aborted, the returned signal is aborted from the start.
*/ +function linkSignals(a: AbortSignal, b: AbortSignal): AbortSignal | undefined { + if (a.aborted || b.aborted) { + const c = new AbortController() + c.abort() + return c.signal + } + const c = new AbortController() + const onAbort = () => c.abort() + a.addEventListener('abort', onAbort, { once: true }) + b.addEventListener('abort', onAbort, { once: true }) + return c.signal +} + +// Re-export the verdict + spend surface so a consumer importing the runtime +// built-ins gets the budget vocabulary from one place. +export type { DefaultVerdict, LeafExecutor, LeafResult, Spend, UsageEvent } diff --git a/src/loops/supervise/scope.ts b/src/loops/supervise/scope.ts new file mode 100644 index 0000000..a5e775f --- /dev/null +++ b/src/loops/supervise/scope.ts @@ -0,0 +1,560 @@ +/** + * @experimental + * + * The reactive `Scope` impl (KEYSTONE, build step 4 + the step-8 adapter). + * + * An `Agent.act` runs inside a `Scope`. It `spawn`s children dynamically and reacts to + * them via `next()`. The scope owns ONE in-memory nursery — the authoritative live set — + * and is the single place that drives a child's lifecycle: reserve budget atomically, + * resolve a `LeafExecutor` through the open registry, run it (one-shot OR streaming), + * fold its normalized `UsageEvent`s into a conserved `Spend`, reconcile the reservation + * (refunding the unspent remainder), persist the result blob + journal records, and + * deliver the `Settled` through the `next()` cursor. + * + * Three invariants this impl enforces by construction: + * - `next()` is a ray.wait n=1 cursor over THIS scope's live set; it assigns the + * monotonic `seq` (the recorded cursor order) at the moment it yields a settlement, so + * replay re-delivers in the identical order — `seq` is never wall-clock. + * - Budget is reserved at spawn and reconciled at settle through the shared `BudgetPool`, + * so `spawn` fails CLOSED on an exhausted pool and total ≡ free + reserved + committed. 
+ * - `view` reads the in-memory nursery, never the journal — O(live), synchronous. + * + * The settle path is the only writer of journal `settled` events; the spawn path the only + * writer of `spawned` events. The result blob is `put` BEFORE the journal `settled` record + * references its `outRef`, so a crash can never leave a journaled ref with no blob. + */ + +import { contentAddress } from '../../durable/spawn-journal' +import { ValidationError } from '../../errors' +import type { Iteration } from '../types' +import type { BudgetPool, ReservationTicket } from './budget' +import type { + Agent, + AgentSpec, + Budget, + DefaultVerdict, + ExecutorContext, + ExecutorRegistry, + Handle, + LeafExecutor, + LeafResult, + NodeId, + NodeSnapshot, + NodeStatus, + ResultBlobStore, + Scope, + Settled, + SpawnJournal, + SpawnOpts, + Spend, + TreeView, + UsageEvent, +} from './types' + +/** Construction args for `createScope`. The supervisor threads the shared pool, journal, + * blob store, and executor registry through; `depth`/`maxDepth` pair the runtime + * recursion ceiling with the conserved pool (R3). */ +export interface ScopeArgs { + /** This scope's owning node id — children get `${parentId}:s${seq}` ids. */ + readonly parentId: NodeId + /** Journal/blob root key the supervisor `beginTree`'d. */ + readonly root: NodeId + /** The shared conserved reservation pool (one per supervised run). */ + readonly pool: BudgetPool + /** Append-only spawn journal; this scope writes `spawned` + `settled` records. */ + readonly journal: SpawnJournal + /** Content-addressed result store backing `outRef` rehydration. */ + readonly blobs: ResultBlobStore + /** The open executor resolver (BYO → router/inline → registered harness factory). */ + readonly executors: ExecutorRegistry + /** Per-spawn executor-construction seams (sandbox client, router config, cli bin). */ + readonly seams: Readonly> + /** This scope's recursion depth (root = 0). 
*/ + readonly depth: number + /** Runtime recursion-depth ceiling — a spawn past it fails closed `depth-exceeded`. */ + readonly maxDepth?: number + /** Abort signal for this scope; an abort cascades into every live child's executor. */ + readonly signal: AbortSignal + /** Injected clock — keeps the journal `at` timestamp deterministic in tests. */ + readonly now?: () => number +} + +/** + * Internal live-set entry. `settled` resolves once the child's executor has fully drained, + * its reservation reconciled, and its result blob persisted; `next()` awaits these to drive + * the cursor. `resolved` mirrors that terminal value synchronously so a concurrent `next()` + * can pick the next undelivered settlement without re-racing. `delivered` guards exactly-once + * delivery; `seq` is stamped by `next()`, never here. + */ +interface LiveChild { + readonly id: NodeId + status: NodeStatus + runtime: NodeSnapshot['runtime'] + readonly budget: Budget + readonly label: string + spent: Spend + outRef?: string + /** Resolves with the terminal settlement WITHOUT a `seq` — `next()` stamps the seq. */ + readonly settled: Promise + /** Synchronous mirror of `settled`'s value once it has resolved (else `undefined`). */ + resolved?: PreSeqSettled + /** True once `next()` has yielded this child's settlement. */ + delivered: boolean +} + +/** A child's terminal settlement before the cursor stamps the monotonic `seq`. */ +type PreSeqSettled = + | { kind: 'done'; out: unknown; outRef: string; verdict?: DefaultVerdict; spent: Spend } + | { kind: 'down'; reason: string; infra: boolean; restartCount: number } + +export function createScope(args: ScopeArgs): Scope { + const children = new Map() + // Two distinct monotonic counters in two namespaces: + // - `spawnOrdinal` is the spawn order (0,1,2,…); it mints the deterministic node id + // `${parent}:s${ordinal}` and stamps the `spawned` event's `seq`. Known at spawn. 
+ // - `cursorSeq` is the order `next()` yields settlements (B2); it stamps the + // `settled`/`cancelled` event's `seq` and the `Settled.seq` the driver branches on. + // They are separate so a `spawned` event never collides with a `settled` event in the + // journal's per-tree uniqueness guard (which is scoped to the cursor namespace). + let spawnOrdinal = 0 + let cursorSeq = 0 + const now = args.now ?? Date.now + + function spawn( + agent: Agent, + task: unknown, + opts: SpawnOpts, + ): + | { ok: true; handle: Handle } + | { ok: false; reason: 'budget-exhausted' | 'depth-exceeded' } { + if (args.maxDepth !== undefined && args.depth >= args.maxDepth) { + return { ok: false, reason: 'depth-exceeded' } + } + + // Resolve the leaf executor through the OPEN registry FIRST (no reservation to unwind + // if the agent is misconfigured). An agent carries its executor mapping as the + // `executorSpec` (an `AgentSpec`); resolution precedence (BYO → router/inline → harness + // factory) lives in the registry, not in a call-site switch. + const spec = (agent as unknown as { executorSpec?: unknown }).executorSpec + if (!isAgentSpec(spec)) { + throw new ValidationError( + `scope.spawn: agent "${agent.name}" exposes no \`executorSpec\` (AgentSpec) to resolve a LeafExecutor`, + ) + } + const resolved = args.executors.resolve(spec) + if (!resolved.succeeded) throw new ValidationError(`scope.spawn: ${resolved.error}`) + + // Reserve the child's whole ceiling atomically; fail CLOSED when the pool can't cover + // it (never read-then-spawn overcommit, so Σk is conserved by construction). + const reservation = args.pool.reserve(opts.budget) + if (!reservation.ok) return { ok: false, reason: reservation.reason } + + const ordinal = spawnOrdinal++ + const id: NodeId = `${args.parentId}:s${ordinal}` + + // The child's abort chains off this scope's signal (a scope abort reaps every child) + // AND off its own handle.abort(). 
Aborting mid-acquire cascades through the executor's + // signal into its acquireSandbox find-by-name reap, so an acquiring node never leaks. + const childAbort = new AbortController() + const cascadeAbort = () => childAbort.abort() + if (args.signal.aborted) childAbort.abort() + else args.signal.addEventListener('abort', cascadeAbort, { once: true }) + + const ctx: ExecutorContext = { signal: childAbort.signal, seams: args.seams } + const executor = resolved.value(spec, ctx) as LeafExecutor + + const handle: Handle = { + id, + label: opts.label, + get status(): NodeStatus { + return children.get(id)?.status ?? 'cancelled' + }, + abort(reason?: string): void { + childAbort.abort(reason) + }, + } + + const live: LiveChild = { + id, + status: 'acquiring', + runtime: executor.runtime, + budget: opts.budget, + label: opts.label, + spent: zeroSpend(), + settled: undefined as unknown as Promise, + delivered: false, + } + children.set(id, live) + + void args.journal.appendEvent(args.root, { + kind: 'spawned', + id, + parent: args.parentId, + label: opts.label, + budget: opts.budget, + runtime: executor.runtime, + seq: ordinal, + at: new Date(now()).toISOString(), + }) + + // Drive the executor to settlement off to the side; `next()` awaits the resulting + // promise. A thrown executor (or a real abort) is TYPED into a `down` record by + // `runChild` (never re-thrown) so a single failing child never rejects the cursor. 
+ const settled = runChild( + live, + executor, + childAbort, + task, + opts, + args.pool, + reservation.ticket, + args.blobs, + ) + .then((s) => { + live.resolved = s + return s + }) + .finally(() => { + args.signal.removeEventListener('abort', cascadeAbort) + }) + ;(live as { settled: Promise }).settled = settled + + return { ok: true, handle } + } + + async function next(): Promise | null> { + const undelivered = () => [...children.values()].filter((c) => !c.delivered) + if (undelivered().length === 0) return null + + // ray.wait n=1: await the FIRST not-yet-delivered child to settle. Loop because a + // concurrent `next()` may take the race winner between the await and the pick. + for (;;) { + const pending = undelivered() + if (pending.length === 0) return null + // Prefer an already-resolved-but-undelivered child (no await needed). + const ready = pending.find((c) => c.resolved !== undefined) + const chosen = ready ?? (await raceFirstSettled(pending)) + if (chosen.delivered) continue + chosen.delivered = true + + const seq = cursorSeq++ + const settlement = chosen.resolved + if (!settlement) { + throw new ValidationError( + `scope.next: child '${chosen.id}' won the settle race without a resolved value`, + ) + } + return finalizeSettlement(chosen, settlement, seq, args, now) + } + } + + return { + spawn, + next, + get view(): TreeView { + return makeTreeView(args.parentId, children) + }, + get budget() { + return args.pool.readout() + }, + } +} + +/** Await whichever pending child settles first, returning the child (its `resolved` is set + * by the time this resolves because `runChild`'s `.then` sets it before the promise + * resolves downstream). */ +async function raceFirstSettled(pending: LiveChild[]): Promise { + return Promise.race(pending.map((c) => c.settled.then(() => c))) +} + +/** Stamp the cursor `seq`, write the `settled` journal record, and project the + * `PreSeqSettled` into the frozen `Settled` the driver branches on. 
*/
async function finalizeSettlement<Out>(
  child: LiveChild,
  settlement: PreSeqSettled,
  seq: number,
  args: ScopeArgs,
  now: () => number,
): Promise<Settled<Out>> {
  if (settlement.kind === 'down') {
    child.status = 'failed'
    // Snapshot the handle AFTER the terminal status is written: `frozenHandle` copies
    // `child.status` by value, so taking it earlier would hand the driver a settled
    // child whose handle still says 'running'/'acquiring'.
    const handle = frozenHandle(child)
    await args.journal.appendEvent(args.root, {
      kind: 'settled',
      id: child.id,
      status: 'down',
      spent: child.spent,
      infra: settlement.infra,
      seq,
      at: new Date(now()).toISOString(),
    })
    return {
      kind: 'down',
      handle,
      reason: settlement.reason,
      infra: settlement.infra,
      restartCount: settlement.restartCount,
      seq,
    }
  }

  // Mirror the terminal result onto the live-set entry so `view` reflects it.
  child.status = 'done'
  child.outRef = settlement.outRef
  child.spent = settlement.spent
  // Same ordering rule as above: freeze the handle once the status is terminal.
  const handle = frozenHandle(child)
  await args.journal.appendEvent(args.root, {
    kind: 'settled',
    id: child.id,
    status: 'done',
    outRef: settlement.outRef,
    ...(settlement.verdict ? { verdict: settlement.verdict } : {}),
    spent: settlement.spent,
    seq,
    at: new Date(now()).toISOString(),
  })
  return {
    kind: 'done',
    handle,
    out: settlement.out as Out,
    outRef: settlement.outRef,
    ...(settlement.verdict ? { verdict: settlement.verdict } : {}),
    spent: settlement.spent,
    seq,
  }
}

/**
 * Drive one child's `LeafExecutor` to a terminal `PreSeqSettled`, folding usage into the
 * conserved `Spend`, reconciling the reservation, and persisting the result blob. Both
 * executor shapes are handled here: a one-shot `Promise` and a streaming
 * `AsyncIterable` whose terminal artifact is read from `resultArtifact()`.
 *
 * A thrown executor (or a real abort) becomes a TYPED `down` — never re-thrown — so a
 * single failing child cannot reject the `next()` cursor (the M2 typed-result discipline,
 * applied per child). The reservation is reconciled on EVERY path (success, abort, throw)
 * so the conserved pool can never leak a reservation.
*/
async function runChild(
  live: LiveChild,
  executor: LeafExecutor,
  childAbort: AbortController,
  task: unknown,
  opts: SpawnOpts,
  pool: BudgetPool,
  ticket: ReservationTicket,
  blobs: ResultBlobStore,
): Promise<PreSeqSettled> {
  let reconciled = false
  const reconcileOnce = (spend: Spend) => {
    if (reconciled) return
    reconciled = true
    // A budgetExempt executor reports zero spend by contract; the reconcile refunds its
    // whole reservation, keeping it out of the conserved Σk by construction.
    pool.reconcile(ticket, clampSpend(spend, opts.budget))
  }
  try {
    live.status = 'running'
    const ran = executor.execute(task, childAbort.signal)
    let artifact: LeafResult
    if (isAsyncIterable(ran)) {
      // Streaming: fold the incremental usage events as they arrive (the conserved-pool
      // authority), then read the terminal artifact after the stream drains. The running
      // total is mirrored into `live.spent` per event so that a stream which throws
      // part-way still has its consumed usage reconciled in the `catch` below instead of
      // being refunded as if nothing was spent.
      const spend = await foldStream(ran, (partial) => {
        live.spent = partial
      })
      live.spent = spend
      artifact = executor.resultArtifact() as LeafResult
      reconcileOnce(spend)
    } else {
      const terminal = await ran
      live.spent = terminal.spent
      artifact = terminal
      reconcileOnce(terminal.spent)
    }

    if (childAbort.signal.aborted) {
      await teardownSafe(executor, opts.shutdown ?? 'brutalKill')
      return downRecord('aborted before settle', true)
    }

    // The durable record is keyed by the canonical content address of the output — the
    // single addressing scheme the blob store enforces and the supervisor's winner path
    // uses. An executor's self-minted `resultArtifact().outRef` is its own internal dedup
    // hint; the journal/blob `outRef` is re-derived here so replay rehydrates by one
    // scheme. Persist the blob BEFORE the journal `settled` record references its `outRef`,
    // so a crash never leaves a journaled ref pointing at a missing blob.
    const outRef = contentAddress(artifact.out)
    await blobs.put(outRef, artifact.out)
    await teardownSafe(executor, opts.shutdown ?? 'infinity')
    return {
      kind: 'done',
      out: artifact.out,
      outRef,
      ...(artifact.verdict ? { verdict: artifact.verdict } : {}),
      spent: live.spent,
    }
  } catch (err) {
    // Reconcile the (likely partial) spend so the reservation is refunded even on a throw.
    reconcileOnce(live.spent)
    await teardownSafe(executor, 'brutalKill')
    const aborted = childAbort.signal.aborted || isAbortError(err)
    return downRecord(errMessage(err), aborted || isInfraError(err))
  }
}

/**
 * The step-8 merge-boundary adapter (M4): rehydrate a `Settled.done` into the kernel's
 * `Iteration` shape so `defaultSelectWinner` stays single-sourced — the supervisor selects
 * across settled children with the SAME argmax the loop kernel uses, not a forked copy.
 *
 * `index` is the cursor `seq` (the recorded, replay-stable order); `output`/`verdict`/
 * `tokenUsage`/`costUsd` are read straight off the settlement (already rehydrated from the
 * `outRef` blob by `next()`). Events are empty — a settled child is an opaque leaf result,
 * not a sandbox event stream — and the timing/cost fields project its conserved `Spend`.
 * Fail loud on a `down` settlement: only a `done` child is an iteration.
 */
export function settledToIteration(settled: Settled): Iteration {
  if (settled.kind === 'down') {
    throw new ValidationError(
      `settledToIteration: cannot adapt a 'down' settlement (node '${settled.handle.id}', seq ${settled.seq}) to an Iteration`,
    )
  }
  return {
    index: settled.seq,
    task: undefined,
    agentRunName: settled.handle.label,
    output: settled.out,
    ...(settled.verdict ? { verdict: settled.verdict } : {}),
    events: [],
    startedAt: 0,
    endedAt: settled.spent.ms,
    costUsd: settled.spent.usd,
    tokenUsage: { input: settled.spent.tokens.input, output: settled.spent.tokens.output },
  }
}

// ── Helpers ─────────────────────────────────────────────────────────────────────

/** Project the in-memory live set into a `TreeView`; every node is parented to `root`. */
function makeTreeView(root: NodeId, children: Map<NodeId, LiveChild>): TreeView {
  const nodes: NodeSnapshot[] = [...children.values()].map((c) => ({
    id: c.id,
    parent: root,
    label: c.label,
    status: c.status,
    runtime: c.runtime,
    budget: c.budget,
    spent: c.spent,
    ...(c.outRef ? { outRef: c.outRef } : {}),
  }))
  return {
    root,
    nodes,
    inFlight: nodes.filter((n) => n.status === 'running' || n.status === 'acquiring').length,
  }
}

/** Snapshot a child as a handle: `status` is copied by value at call time and `abort`
 * does nothing. */
function frozenHandle(child: LiveChild): Handle {
  return {
    id: child.id,
    label: child.label,
    status: child.status,
    abort(): void {
      // A settled child is terminal; abort is a no-op (its executor already tore down).
    },
  }
}

/**
 * Drain a usage stream into a `Spend`: `tokens` and `cost` events are summed, every other
 * event counts as one iteration, and `ms` is always reported as 0.
 *
 * @param stream     The executor's usage events.
 * @param onProgress Optional sink called after each event with a snapshot of the running
 *                   total, so a caller keeps the partial spend if the stream throws.
 * @returns The total once the stream has fully drained.
 */
async function foldStream(
  stream: AsyncIterable<UsageEvent>,
  onProgress?: (spend: Spend) => void,
): Promise<Spend> {
  const tokens = { input: 0, output: 0 }
  let usd = 0
  let iterations = 0
  // Copy the token counters so an emitted snapshot is not mutated by later events.
  const snapshot = (): Spend => ({ iterations, tokens: { ...tokens }, usd, ms: 0 })
  for await (const ev of stream) {
    if (ev.kind === 'tokens') {
      tokens.input += ev.input
      tokens.output += ev.output
    } else if (ev.kind === 'cost') {
      usd += ev.usd
    } else {
      iterations += 1
    }
    onProgress?.(snapshot())
  }
  return snapshot()
}

/** Clamp a child's reported spend to its reservation so the pool's fail-loud over-spend
 * guard never trips on a benign overshoot from an external usage report; the difference
 * refunds to the pool as if the child stopped at its ceiling.
*/
function clampSpend(spend: Spend, budget: Budget): Spend {
  const { input, output } = spend.tokens
  const tokenSum = input + output

  const withinTokens = tokenSum <= budget.maxTokens
  const withinIterations = spend.iterations <= budget.maxIterations
  const withinUsd = budget.maxUsd === undefined || spend.usd <= budget.maxUsd
  // Nothing over its ceiling: hand back the very same object.
  if (withinTokens && withinIterations && withinUsd) return spend

  // Scale input and output by one common factor so their ratio is kept while the floored
  // sum fits under `maxTokens`.
  const scale = !withinTokens && tokenSum > 0 ? budget.maxTokens / tokenSum : 1
  let tokens = spend.tokens
  if (scale < 1) {
    tokens = { input: Math.floor(input * scale), output: Math.floor(output * scale) }
  }
  const usd = budget.maxUsd === undefined ? spend.usd : Math.min(spend.usd, budget.maxUsd)

  return {
    iterations: Math.min(spend.iterations, budget.maxIterations),
    tokens,
    usd,
    ms: spend.ms,
  }
}

/** Run the executor's teardown and absorb any failure, so a broken teardown can never
 * replace the settlement that is being reported. */
async function teardownSafe(
  executor: LeafExecutor,
  grace: number | 'brutalKill' | 'infinity',
): Promise<void> {
  try {
    await executor.teardown(grace)
  } catch {
    // Teardown failure is observable through the node staying live; swallow so it never
    // masks the settlement itself. The supervisor's join barrier reaps on its own grace.
  }
}

/** Build a `down` settlement; this path never restarts, so `restartCount` is 0. */
function downRecord(reason: string, infra: boolean): PreSeqSettled {
  const restartCount = 0
  return { kind: 'down', reason, infra, restartCount }
}

/** A fresh all-zero `Spend` (a new object on every call). */
function zeroSpend(): Spend {
  const tokens = { input: 0, output: 0 }
  return { iterations: 0, tokens, usd: 0, ms: 0 }
}

/** True when `value` is a non-null object exposing a `Symbol.asyncIterator` function. */
function isAsyncIterable(value: unknown): value is AsyncIterable<UsageEvent> {
  if (typeof value !== 'object' || value === null) return false
  const candidate = value as Partial<AsyncIterable<unknown>>
  return typeof candidate[Symbol.asyncIterator] === 'function'
}

/** An `AgentSpec` is identified structurally — it carries a `profile` and a `harness`
 * field (`null` or a `BackendType`) and optionally an `executor`.
*/ +function isAgentSpec(value: unknown): value is AgentSpec { + if (typeof value !== 'object' || value === null) return false + const v = value as Record + return 'profile' in v && 'harness' in v +} + +function isAbortError(err: unknown): boolean { + return ( + typeof err === 'object' && + err !== null && + 'name' in err && + (err as { name: unknown }).name === 'AbortError' + ) +} + +/** External-boundary failures (network/FS/subprocess) are infra — excluded from the merge + * `n` and the equal-k assertion. A `ValidationError` from a built-in executor wraps a + * config/transport failure, so it counts as infra; other throws are a real bad result. */ +function isInfraError(err: unknown): boolean { + return err instanceof ValidationError +} + +function errMessage(err: unknown): string { + if (err instanceof Error) return err.message + return String(err) +} diff --git a/src/loops/supervise/supervisor.ts b/src/loops/supervise/supervisor.ts new file mode 100644 index 0000000..e95c7b5 --- /dev/null +++ b/src/loops/supervise/supervisor.ts @@ -0,0 +1,395 @@ +/** + * @experimental + * + * The `Supervisor` impl (KEYSTONE, build step 5). + * + * Owns the four things a free-running recursive `act` cannot own itself: the GLOBAL + * conserved budget pool, the event-sourced spawn log, the abort cascade over the whole + * live tree, and the OTP intensity breaker. `run` builds the root `Scope` over those, + * runs the root `Agent.act`, and returns a TYPED `SupervisedResult` — a no-winner is + * never coerced into a best-effort `Out`. + * + * Three lifecycle invariants this impl enforces by construction: + * - Join barrier: when `act()` settles (resolve OR reject), every still-live child is + * torn down before `run` returns — the generalization of the kernel's + * `finally{ Promise.allSettled(destroy) }` barrier (run-loop.ts) from boxes to the + * whole sub-tree. A teardown failure is `allSettled`'d and journaled as a + * `cancelled` event; it NEVER masks act()'s own outcome. 
act()'s rejection is the + * PRIMARY error (the kernel's firstError precedence), so a teardown throw during the + * barrier can never overwrite the real failure. + * - Abort cascade: a root abort (caller signal, `RootHandle.abort`, a tripped breaker, + * or pool exhaustion) aborts ONE internal controller whose signal is the root scope's + * signal. The scope cascades that into every live child's executor abort — which, for + * an `acquiring` child, chains into the `acquireSandbox` signal and reaps the + * find-by-name orphan box (M1). The supervisor never reaps children directly. + * - The supervisor NEVER re-enters a child (m3): the kernel/`acquireSandbox` already + * retried at the leaf, and a driver re-spawns through `scope.spawn`. The breaker only + * COUNTS `down` settlements within the intensity window and trips to a typed + * no-winner; it does not restart anything. + * + * Selection lives in the driver, not here (selector≠judge): `act` returns the synthesized + * winner `Out`. The supervisor content-addresses that `Out` for its replay `outRef`, + * reads `spentTotal` off the conserved pool, and wraps it as a typed `winner` — it does + * not re-rank children behind the driver's back. + */ + +import { contentAddress } from '../../durable/spawn-journal' +import { RuntimeRunStateError } from '../../errors' +import { type BudgetPool, createBudgetPool } from './budget' +import { createScope } from './scope' +import type { + Agent, + RootHandle, + RootSignal, + Scope, + SpawnEvent, + SpawnJournal, + Spend, + SupervisedResult, + Supervisor, + SupervisorOpts, + TreeView, +} from './types' + +/** The default runtime recursion-depth ceiling, paired with the conserved pool so a + * runaway recursion hits budget-exhaustion first and depth-exceeded second (R3). */ +const defaultMaxDepth = 4 + +/** A no-winner reason the supervisor can prove from its OWN lifecycle state — pinned to + * the frozen `SupervisedResult` reason union. 
A driver rejecting for a domain reason + * (not budget/abort) is classed `all-children-down`, the only typed bucket for "the tree + * produced no usable result". */ +type NoWinnerReason = (SupervisedResult & { kind: 'no-winner' })['reason'] + +export function createSupervisor(): Supervisor { + let attached: RootControl | undefined + + async function run( + root: Agent, + task: Task, + opts: SupervisorOpts, + ): Promise> { + const now = opts.now ?? Date.now + const pool = createBudgetPool(opts.budget, now) + await opts.journal.beginTree(opts.runId, new Date(now()).toISOString()) + + // ONE internal controller is the root scope's abort source. Every cascade path + // (caller signal, RootHandle.abort, breaker trip, deadline) aborts it; the scope + // fans it out to each live child's executor (acquire-aware reap included). + const controller = new AbortController() + const cascadeAbort = (reason?: string) => { + if (controller.signal.aborted) return + // Carry the reason on the signal so it chains down to each child's abort signal + // (`childAbort.signal.reason`) — the diagnostic the scope's executors observe. + controller.abort(reason) + } + + const onCallerAbort = () => cascadeAbort('caller signal aborted') + if (opts.signal) { + if (opts.signal.aborted) cascadeAbort('caller signal aborted') + else opts.signal.addEventListener('abort', onCallerAbort, { once: true }) + } + + // The breaker watches `down` settlements via a counting journal decorator, so it + // observes every child failure without intercepting `scope.next()` (the driver's + // private channel). Tripping aborts the same controller; the trip is recorded so the + // final result can name it. 
+ const breaker = createIntensityBreaker(opts, () => cascadeAbort('intensity breaker tripped')) + const journal = wrapJournalForBreaker(opts.journal, breaker) + + const scope = createScope({ + parentId: opts.runId, + root: opts.runId, + pool, + journal, + blobs: opts.blobs, + executors: opts.executors, + seams: {}, + depth: 0, + maxDepth: opts.maxDepth ?? defaultMaxDepth, + signal: controller.signal, + now, + }) + + // `view`/drain read the scope opaquely (`Out` erased) — the supervisor never `spawn`s + // on it, so the live-tree readout and the join barrier are `Out`-agnostic. + const openScope = scope as unknown as Scope + + // Bind any attached RootHandle to THIS live run so view()/signal()/abort() reach the + // live scope + the one cascade controller. Detached again in the finally barrier. + if (attached) { + attached.bind({ scope: openScope, cascadeAbort, signal: pushRootSignal(cascadeAbort) }) + } + + let actOutcome: { ok: true; out: Out } | { ok: false; error: unknown } + try { + const out = await root.act(task, scope) + actOutcome = { ok: true, out } + } catch (error) { + // act()'s rejection is the PRIMARY error; capture it before the join barrier so a + // teardown failure in the barrier can never overwrite it (firstError precedence). + actOutcome = { ok: false, error } + } finally { + // Join barrier: tear down every still-live child. Generalizes the kernel's + // `finally{ Promise.allSettled(destroy) }` — a teardown throw is allSettled'd and + // journaled, never re-thrown. + await drainLiveChildren(openScope, controller) + if (opts.signal) opts.signal.removeEventListener('abort', onCallerAbort) + if (attached) attached.unbind() + } + + const tree = scope.view + if (actOutcome.ok) { + // The driver synthesized a winner. Content-address it for the replay `outRef`, put + // it once, and sum the conserved spend off every journaled settlement. No + // re-ranking — the driver already selected. 
+ const out = actOutcome.out + const outRef = contentAddress(out) + await opts.blobs.put(outRef, out) + return { + kind: 'winner', + out, + outRef, + tree, + spentTotal: await spentTotalFromJournal(journal, opts.runId), + } + } + + // act() rejected. The reason is proven from lifecycle state, in precedence order: + // a tripped breaker outranks any abort (it is the most specific cause) outranks + // budget-exhaustion outranks the residual "the tree produced nothing usable" bucket. + // A no-winner is TYPED — never a best-effort coercion of a partial child (M2). + return { + kind: 'no-winner', + reason: classifyNoWinner(controller, pool, opts, breaker), + tree, + downCount: breaker.downCount(), + } + } + + function attach(h: RootHandle): void { + const control = rootControls.get(h as RootHandle) + if (!control) { + throw new RuntimeRunStateError( + 'supervisor.attach: handle was not minted by createRootHandle (no control channel)', + ) + } + attached = control + } + + return { run, attach } +} + +// ── Root handle ─────────────────────────────────────────────────────────────── + +/** The live binding the supervisor populates while a run is in flight. `view` reads the + * live scope; `cascadeAbort`/`signal` reach the one cascade controller. */ +interface RunBinding { + readonly scope: Scope + readonly cascadeAbort: (reason?: string) => void + readonly signal: (msg: RootSignal) => void +} + +/** The supervisor-private control behind a `RootHandle`. `createRootHandle` mints it and + * registers it in `rootControls`; `attach` looks it up and `bind`s it to the live run. */ +interface RootControl { + bind(binding: RunBinding): void + unbind(): void +} + +/** Module-private channel from a minted `RootHandle` to its `RootControl`, so `attach` + * can prove a handle is ours and reach its binding without leaking the control onto the + * frozen `RootHandle` shape. 
/**
 * Module-private registry linking each minted `RootHandle` to its supervisor-side
 * `RootControl`. `attach` uses it to prove a handle came from `createRootHandle` and to
 * reach its binding; the control never appears on the public `RootHandle` shape.
 */
const rootControls = new WeakMap<RootHandle<unknown>, RootControl>()

/**
 * Mint a `RootHandle` together with its supervisor-private control.
 *
 * The handle exposes three operations — `view()` (read the live tree), `signal()`
 * (deliver an out-of-band message) and `abort()` (cascade an abort). While no run is
 * bound (before `bind`, after `unbind`) every operation throws a
 * `RuntimeRunStateError`; nothing is silently ignored.
 *
 * @returns the handle; its control is registered in `rootControls` as a side effect.
 * @throws RuntimeRunStateError from any handle method called while unbound.
 */
export function createRootHandle<Out>(): RootHandle<Out> {
  // The run this handle currently drives; `undefined` whenever no run is bound.
  let live: RunBinding | undefined

  // Single fail-loud accessor: hand back the live binding or throw the caller's message.
  const bound = (message: string): RunBinding => {
    if (!live) throw new RuntimeRunStateError(message)
    return live
  }

  const handle: RootHandle<Out> = {
    view: (): TreeView =>
      bound(
        'RootHandle.view: handle is not bound to a live run (attach it before run, read after run starts)',
      ).scope.view,
    signal: (msg: RootSignal): void => {
      bound('RootHandle.signal: handle is not bound to a live run').signal(msg)
    },
    abort: (reason?: string): void => {
      // The binding check runs first, so an unbound handle throws before any default applies.
      bound('RootHandle.abort: handle is not bound to a live run').cascadeAbort(
        reason ?? 'root handle aborted',
      )
    },
  }

  const control: RootControl = {
    bind: (next: RunBinding): void => {
      live = next
    },
    unbind: (): void => {
      live = undefined
    },
  }
  rootControls.set(handle as RootHandle<unknown>, control)
  return handle
}

/**
 * Build the `RootSignal` sink for one run. Only `cancel` has an effect here: it cascades
 * an abort, using the message's reason or a fixed default. Every other signal kind is
 * accepted and ignored by this sink.
 */
function pushRootSignal(cascadeAbort: (reason?: string) => void): (msg: RootSignal) => void {
  return function deliver(msg: RootSignal): void {
    if (msg.kind !== 'cancel') return
    cascadeAbort(msg.reason ?? 'root signal: cancel')
  }
}
/**
 * OTP-style intensity breaker over `down` settlements.
 *
 * Keeps a sliding window of `down` timestamps; once MORE than `maxRestarts` of them fall
 * within `withinMs` it latches tripped and invokes `trip` exactly once. If either bound is
 * unset the breaker is inert — it never trips, but still tallies every `down` for
 * `downCount`. It only counts; it never restarts anything.
 */
interface IntensityBreaker {
  /** Record one `down` settlement that happened at `at` (epoch ms). */
  recordDown(at: number): void
  /** Whether the breaker has latched tripped. */
  tripped(): boolean
  /** Total `down` settlements recorded, armed or not, tripped or not. */
  downCount(): number
}

/**
 * Build an `IntensityBreaker` from the supervisor options.
 *
 * @param opts supplies `maxRestarts` / `withinMs`; both must be set to arm the breaker.
 * @param trip invoked once, at the moment the window first exceeds `maxRestarts`.
 */
function createIntensityBreaker(opts: SupervisorOpts, trip: () => void): IntensityBreaker {
  const limit = opts.maxRestarts
  const windowMs = opts.withinMs
  // Timestamps of downs still inside the window, oldest first.
  const windowed: number[] = []
  let downs = 0
  let latched = false

  function recordDown(at: number): void {
    downs += 1
    // Inert (a bound is missing) or already tripped: keep counting, skip the window.
    if (latched || limit === undefined || windowMs === undefined) return

    windowed.push(at)
    // Evict the leading run of entries strictly older than the window start; an entry
    // exactly on the boundary stays in.
    const oldestAllowed = at - windowMs
    let stale = 0
    while (stale < windowed.length && windowed[stale]! < oldestAllowed) stale += 1
    if (stale > 0) windowed.splice(0, stale)

    if (windowed.length > limit) {
      latched = true
      trip()
    }
  }

  return {
    recordDown,
    tripped: (): boolean => latched,
    downCount: (): number => downs,
  }
}
/**
 * Controllers whose abort was raised by the join barrier itself (see
 * `drainLiveChildren`), not by a caller signal, a root handle, or the breaker.
 * `classifyNoWinner` consults it so a barrier-only abort is never reported as `'aborted'`.
 * Held weakly so a finished run's controller is not retained.
 */
const barrierAborted = new WeakSet<AbortController>()

/**
 * Decorate the journal so the breaker observes every `settled`/`down` event appended
 * through it. All three methods forward to the wrapped journal unchanged; the only added
 * behavior is `breaker.recordDown(Date.parse(ev.at))` before a `down` settlement is
 * forwarded.
 */
function wrapJournalForBreaker(journal: SpawnJournal, breaker: IntensityBreaker): SpawnJournal {
  return {
    loadTree: (root) => journal.loadTree(root),
    beginTree: (root, at) => journal.beginTree(root, at),
    appendEvent: (root, ev: SpawnEvent) => {
      if (ev.kind === 'settled' && ev.status === 'down') breaker.recordDown(Date.parse(ev.at))
      return journal.appendEvent(root, ev)
    },
  }
}

// ── Join barrier + result classification ─────────────────────────────────────────

/**
 * The join barrier: if the scope reports any in-flight child, abort the cascade
 * controller (unless it is already aborted) and then pull `scope.next()` until it yields
 * `null`. The drain is wrapped in `Promise.allSettled`, so a rejection from the cursor is
 * swallowed here and never replaces the caller's primary outcome.
 *
 * When the barrier is the one that aborts the controller, that controller is remembered
 * in `barrierAborted`. The abort itself is raised exactly as before (no reason argument),
 * so what child executors observe is unchanged.
 *
 * @param scope      the root scope, read opaquely (`view.inFlight`, `next()`).
 * @param controller the run's single cascade controller.
 */
async function drainLiveChildren(
  scope: Scope<unknown>,
  controller: AbortController,
): Promise<void> {
  const hasLive = scope.view.inFlight > 0
  if (!hasLive) return
  // Cascade the abort into every live child before draining. Tag it as barrier-raised so
  // the later classification does not mistake it for a caller/handle abort.
  if (!controller.signal.aborted) {
    barrierAborted.add(controller)
    controller.abort()
  }
  await Promise.allSettled([drainCursor(scope)])
}

/** Pull `scope.next()` repeatedly until it reports an empty live set (`null`). */
async function drainCursor(scope: Scope<unknown>): Promise<void> {
  for (;;) {
    const settled = await scope.next()
    if (settled === null) return
  }
}

/**
 * Classify a rejected `act()` into a typed no-winner reason, in precedence order:
 *   1. tripped breaker            → `'all-children-down'`
 *   2. an abort that was NOT raised by the join barrier → `'aborted'`
 *   3. exhausted pool             → `'budget-exhausted'`
 *   4. otherwise                  → `'all-children-down'`
 *
 * Step 2 ignores barrier-raised aborts: `drainLiveChildren` aborts the controller whenever
 * children are still live at settle time, which previously made every such rejection read
 * as `'aborted'` and hid a genuine `'budget-exhausted'`.
 */
function classifyNoWinner(
  controller: AbortController,
  pool: BudgetPool,
  opts: SupervisorOpts,
  breaker: IntensityBreaker,
): NoWinnerReason {
  if (breaker.tripped()) return 'all-children-down'
  const abortedBeforeBarrier = controller.signal.aborted && !barrierAborted.has(controller)
  if (abortedBeforeBarrier) return 'aborted'
  if (poolExhausted(pool, opts)) return 'budget-exhausted'
  return 'all-children-down'
}

/**
 * Whether the pool readout shows an exhausted budget. True when no tokens are left; or a
 * usd ceiling is configured and no usd is left; or a deadline is configured, the readout
 * carries a positive `deadlineMs`, and the clock (`opts.now`, else `Date.now`) has
 * reached it.
 */
function poolExhausted(pool: BudgetPool, opts: SupervisorOpts): boolean {
  const r = pool.readout()
  if (r.tokensLeft <= 0) return true
  if (opts.budget.maxUsd !== undefined && r.usdLeft <= 0) return true
  if (
    opts.budget.deadlineMs !== undefined &&
    r.deadlineMs > 0 &&
    (opts.now ?? Date.now)() >= r.deadlineMs
  ) {
    return true
  }
  return false
}
+ */ +async function spentTotalFromJournal(journal: SpawnJournal, root: string): Promise { + const events = await journal.loadTree(root) + if (events === undefined) { + throw new RuntimeRunStateError( + `supervisor: spawn tree '${root}' is missing from the journal after run (corrupted log)`, + ) + } + const total: Spend = { iterations: 0, tokens: { input: 0, output: 0 }, usd: 0, ms: 0 } + for (const ev of events) { + if (ev.kind !== 'settled') continue + total.iterations += ev.spent.iterations + total.tokens.input += ev.spent.tokens.input + total.tokens.output += ev.spent.tokens.output + total.usd += ev.spent.usd + total.ms += ev.spent.ms + } + return total +} diff --git a/src/loops/supervise/types.ts b/src/loops/supervise/types.ts new file mode 100644 index 0000000..fd06232 --- /dev/null +++ b/src/loops/supervise/types.ts @@ -0,0 +1,443 @@ +/** + * @experimental + * + * Recursive execution atom — the FROZEN type surface (the keystone contract). + * + * One self-similar `Agent` atom runs inside a budget-conserving reactive `Scope`, + * orchestrated by a `Supervisor` over an event-sourced `SpawnJournal`. A leaf is an + * `Agent` that never calls `scope.spawn`; a driver is an `Agent` that spawns and runs + * a policy over its children's streaming results. + * + * Two invariants the surface exists to make enforceable: + * - Budget is an atomically-reserved CONSERVED pool, so `Σk(treatment) ≡ Σk(blind)` by + * construction (reserve-on-spawn, refund-unspent-on-settle, fail-closed admission). + * - The journal records a content-addressed `outRef` per child result, so replay + * rehydrates the exact `Settled` the driver branched on (the replay invariant below). + * + * The leaf RUNTIME is one OPEN `LeafExecutor` interface, not a closed `inline|sandbox|cli` + * union the call site switches on. The built-ins (router/inline, sandbox, cli) are the + * initial IMPLEMENTATIONS; any user agent is first-class the moment it implements the + * interface. 
The interface IS the extension point — no per-vendor adapters live here. + * + * Layering: substrate types (`DefaultVerdict`) come from `@tangle-network/agent-eval`; + * runtime-shaped types (everything else) live here. Pure types/interfaces only — this + * module typechecks standalone and is imported by every keystone impl. + */ + +import type { DefaultVerdict } from '@tangle-network/agent-eval' +import type { AgentProfile, BackendType } from '@tangle-network/sandbox' +import type { LoopTokenUsage } from '../types' + +// `LoopTokenUsage = { input, output }` ONLY (../types). Re-exported so keystone impls +// import the budget surface from one place. `usd` is a SEPARATE channel (see `UsageEvent`). +export type { DefaultVerdict, LoopTokenUsage } + +// ── The atom ──────────────────────────────────────────────────────────────── + +/** + * One self-similar atom. A leaf is an `Agent` that never calls `scope.spawn`; a driver + * is an `Agent` whose `act` spawns children and reacts to them via `scope.next()`. An + * analyst is an `Agent` whose task is "read these traces → findings" — `where` it runs + * is its executor, not a separate type. + * + * `act` MUST be replay-safe: it may read `verdict`, `spent`, and `out` (rehydrated by + * `outRef`) off each `Settled`; it MUST NOT read `Date.now`, `Math.random`, or any + * unordered collection. `scope.next()` delivers strictly in recorded `seq` order. + */ +export interface Agent { + readonly name: string + act(task: Task, scope: Scope): Promise +} + +// ── The open leaf runtime ───────────────────────────────────────────────────── + +/** + * The leaf runtime — ONE open interface, not a closed union. `execute` returns a + * `Promise` for one-shot executors OR an `AsyncIterable` for + * streaming ones; a streaming executor reports incremental normalized usage as it runs + * (the budget pool reconciles against it) and exposes its terminal artifact via + * `resultArtifact()`. 
Both shapes normalize usage to `UsageEvent` so the conserved pool + * meters every runtime identically. + * + * Built-in implementations (in `runtime.ts`, NOT variants here): router/inline (a direct + * Router/HTTP inference call, no box), sandbox (COMPOSES `runLoop` as a leaf, forwarding + * PR #150's optional `lineage` passthrough — does NOT reinvent checkpoint/fork), cli + * (Halo/RLM subprocess; `budgetExempt`, excluded from equal-k by construction). A user's + * own agent (mastra/agno/raw HTTP/anything) is first-class by implementing this interface. + */ +export interface LeafExecutor { + /** Stable runtime tag for traces + the equal-k exemption check. */ + readonly runtime: Runtime + /** + * When true, this executor's spend is NOT metered against the conserved pool and its + * iterations are excluded from the equal-k assertion (a `cli` subprocess without + * token accounting). Fail-loud everywhere else: a metered executor MUST report usage. + */ + readonly budgetExempt?: boolean + /** + * One-shot → resolves a `LeafResult`; streaming → yields incremental `UsageEvent`s and + * the terminal artifact is read from `resultArtifact()` after the stream drains. + * `signal` is the spawn-scoped abort (chains the acquire lifecycle for sandbox). + */ + execute(task: unknown, signal: AbortSignal): Promise> | AsyncIterable + /** + * Tear the executor's resources down. `grace` mirrors the OTP shutdown spec + * (`'brutalKill'` = immediate, a number = ms grace, `'infinity'` = await clean exit). + */ + teardown(grace: number | 'brutalKill' | 'infinity'): Promise<{ destroyed: boolean }> + /** + * The replay source (B1): the content-addressed `outRef` + the materialized output the + * driver branched on, its verdict, and the conserved spend. Read once, after settle. + */ + resultArtifact(): { outRef: string; out: Out; verdict?: DefaultVerdict; spent: Spend } +} + +/** Terminal artifact of a one-shot `LeafExecutor.execute`. 
*/ +export interface LeafResult { + outRef: string + out: Out + verdict?: DefaultVerdict + spent: Spend +} + +/** + * Normalized usage event — the single channel every executor reports through, so the + * conserved pool meters all runtimes identically. `tokens` carries `LoopTokenUsage`'s + * `{ input, output }`; `usd` is a SEPARATE channel (never folded into tokens). + */ +export type UsageEvent = + | { kind: 'tokens'; input: number; output: number } + | { kind: 'cost'; usd: number } + | { kind: 'iteration' } + +/** The runtime tag of a `LeafExecutor` impl. Open by intent — `string` so a BYO executor + * names its own runtime; the built-ins use these literals. */ +export type Runtime = 'router' | 'inline' | 'sandbox' | 'cli' | (string & {}) + +// ── Executor resolution (OPEN registry, not a switch) ───────────────────────── + +/** + * `AgentProfile` does NOT carry a `harness`/backend field — `harness` lives on the + * sandbox SDK's `BackendConfig`, not the portable profile. So an agent is mapped to its + * executor through this MINIMAL wrapper, never by fabricating a field onto `AgentProfile`. + * + * Resolution (in `runtime.ts`): + * - `executor` present → BYO: use it verbatim (a user's own `LeafExecutor`). + * - `harness === null` → router/inline: a direct Router call, no box. + * - `harness` is a `BackendType` → sandbox: compose `runLoop` against `profile` on that backend. + * Fail loud on an unresolvable spec (no executor and an unknown harness). + */ +export interface AgentSpec { + readonly profile: AgentProfile + /** `null` selects router/inline; a `BackendType` selects the sandboxed harness. */ + readonly harness: BackendType | null + /** Bring-your-own executor: when set, overrides harness-based resolution entirely. */ + readonly executor?: LeafExecutor +} + +/** + * Builds a fresh `LeafExecutor` for one spawn from the resolved spec. Per-spawn (not + * shared) so each child owns its own box/abort/teardown lifecycle. 
A BYO factory lets a + * user supply construction args without pre-instantiating. + */ +export type LeafExecutorFactory = (spec: AgentSpec, ctx: ExecutorContext) => LeafExecutor + +/** Construction context handed to a `LeafExecutorFactory` — the seams a built-in needs + * (sandbox client for the sandbox executor, router config for router/inline) without + * the factory reaching into module globals. */ +export interface ExecutorContext { + readonly signal: AbortSignal + /** Opaque seams the registry threads through; a built-in narrows what it needs. */ + readonly seams: Readonly> +} + +/** + * The OPEN resolver: maps an `AgentSpec` to a `LeafExecutorFactory`. The default + * registry resolves the three built-ins AND accepts a BYO `executor`/factory; callers + * register more runtimes by name. NOT a closed switch — registration is the extension + * point, mirroring the open `LeafExecutor` interface. + */ +export interface ExecutorRegistry { + /** Register a factory for a named runtime. Throws on a duplicate name (fail loud). */ + register(runtime: Runtime, factory: LeafExecutorFactory): void + /** + * Resolve a spec to a factory. Precedence: a BYO `spec.executor` → a trivial factory + * returning it; else `harness === null` → the `'router'` factory; else a registered + * factory for the harness-derived runtime. Returns a typed outcome — the caller + * inspects `succeeded` before `value` (no silent fallback). + */ + resolve( + spec: AgentSpec, + ): { succeeded: true; value: LeafExecutorFactory } | { succeeded: false; error: string } +} + +// ── Budget — the conserved reservation pool ─────────────────────────────────── + +/** A budget envelope on a spawn or the root. All ceilings; the pool reserves against them. */ +export interface Budget { + readonly maxIterations: number + readonly maxTokens: number + readonly maxUsd?: number + readonly deadlineMs?: number +} + +/** Conserved spend, reconciled from the normalized `UsageEvent` stream. 
Tokens and usd + * are separate channels (never folded). */ +export interface Spend { + iterations: number + tokens: LoopTokenUsage + usd: number + ms: number +} + +// ── Node lifecycle ──────────────────────────────────────────────────────────── + +/** OTP child-spec restart class. */ +export type Restart = 'temporary' | 'transient' | 'permanent' + +/** `'acquiring'` is first-class (M1): a node spends real time + reaps an orphan box + * during sandbox acquire BEFORE it is `running`, so abort must be defined over it. */ +export type NodeStatus = 'pending' | 'acquiring' | 'running' | 'done' | 'failed' | 'cancelled' + +/** Deterministic node id — `${parent}:s${seq}` from the cursor order, never wall-clock. */ +export type NodeId = string + +export interface SpawnOpts { + readonly budget: Budget + readonly label: string + readonly restart?: Restart + /** Teardown grace handed to the executor when this node is reaped. */ + readonly shutdown?: number | 'brutalKill' | 'infinity' +} + +/** + * A live child handle. `abort()` is defined over the ACQUIRE lifecycle: it chains into + * the `acquireSandbox` signal and reaps a find-by-name orphan box, so a node aborted + * mid-acquire never leaks (M1). + */ +export interface Handle { + readonly id: NodeId + readonly label: string + readonly status: NodeStatus + abort(reason?: string): void + /** Phantom: binds the handle to the child's output type so `spawn` returns a + * `Handle` distinct from a `Handle`. Type-only — never present at runtime. */ + readonly __out?: Out +} + +/** + * A settled child, delivered by `scope.next()`. `seq` is the monotonic cursor order + * `next()` yielded this settlement (B2) — NOT wall-clock — and replay delivers strictly + * in `seq` order. `outRef` rehydrates `out` from the `ResultBlobStore` on replay. 
+ */ +export type Settled = + | { + kind: 'done' + handle: Handle + out: Out + outRef: string + verdict?: DefaultVerdict + spent: Spend + seq: number + } + | { + kind: 'down' + handle: Handle + reason: string + /** True = infrastructure failure (excluded from merge `n` / equal-k), not a bad result. */ + infra: boolean + restartCount: number + seq: number + } + +// ── The reactive Scope ───────────────────────────────────────────────────────── + +/** + * The budget-conserving reactive scope an `Agent.act` runs inside. `spawn` reserves + * budget atomically from the shared pool and FAILS CLOSED when the pool can't cover it; + * `next()` is a ray.wait cursor (n=1) over THIS scope's IN-MEMORY live set; `view` reads + * the in-memory nursery (NOT the log), O(live). + */ +export interface Scope { + /** + * Spawn a child. Reserves `opts.budget` from the conserved pool atomically; refunds the + * unspent remainder on settle. Returns a typed outcome — fail-closed on an exhausted + * pool or an exceeded depth ceiling (the caller inspects `ok` before `handle`). + */ + spawn( + agent: Agent, + task: unknown, + opts: SpawnOpts, + ): { ok: true; handle: Handle } | { ok: false; reason: 'budget-exhausted' | 'depth-exceeded' } + /** ray.wait n=1 over this scope's in-memory live set; resolves as each child settles; + * `null` when the live set is empty. */ + next(): Promise | null> + /** The live tree — reads the in-memory nursery, not the journal. */ + readonly view: TreeView + /** Conserved-pool readouts (post-reservation). */ + readonly budget: Readonly<{ + tokensLeft: number + usdLeft: number + deadlineMs: number + reservedTokens: number + }> +} + +// ── Observability view (read off the in-memory nursery) ──────────────────────── + +export interface NodeSnapshot { + readonly id: NodeId + readonly parent?: NodeId + readonly label: string + readonly status: NodeStatus + readonly runtime: Runtime + readonly budget: Budget + /** Conserved spend so far for this node. 
*/ + readonly spent: Spend + /** `outRef` once the node is `done` (the replay/result pointer). */ + readonly outRef?: string +} + +/** The live tree — what `scope.view` / `RootHandle.view()` materialize for a viewer. */ +export interface TreeView { + readonly root: NodeId + readonly nodes: ReadonlyArray + /** Count of nodes in `running` or `acquiring` — the "what's in flow?" answer. */ + readonly inFlight: number +} + +// ── Event source — the decision/payload split the replay argument rests on ───── + +/** Journaled spawn-tree events (B1/B2). `seq` is the cursor order; `at` is an ISO + * timestamp for human inspection only (NOT a replay input). */ +export type SpawnEvent = + | { + kind: 'spawned' + id: NodeId + parent?: NodeId + label: string + budget: Budget + runtime: Runtime + seq: number + at: string + } + | { + kind: 'settled' + id: NodeId + status: 'done' | 'down' + /** Content-addressed result pointer; rehydrates `out` from `ResultBlobStore`. */ + outRef?: string + verdict?: DefaultVerdict + spent: Spend + infra?: boolean + seq: number + at: string + } + | { kind: 'cancelled'; id: NodeId; reason: string; seq: number; at: string } + +/** + * The spawn-tree event source (mirrors `ConversationJournal`'s begin/append/load shape). + * `loadTree` replays the full ordered event list for resume/replay; `appendEvent` is + * called only AFTER the event is observed-committed (never speculative). + */ +export interface SpawnJournal { + loadTree(root: NodeId): Promise + beginTree(root: NodeId, at: string): Promise + appendEvent(root: NodeId, ev: SpawnEvent): Promise +} + +/** Content-addressed result blobs (the `outRef` → artifact map) backing the replay + * invariant. Split from the journal so the journal stays small (decisions) and the + * payloads (evidence) live where a viewer/replayer rehydrates them. 
*/ +export interface ResultBlobStore { + put(outRef: string, artifact: unknown): Promise + get(outRef: string): Promise +} + +// ── The Supervisor ───────────────────────────────────────────────────────────── + +/** + * Owns the conserved pool, the spawn log, the abort cascade, the OTP intensity breaker, + * and the root handle. `run` executes the root `Agent` to completion; `attach` wires a + * live `RootHandle` (the Q2 substrate the chat/pi-viz client later consumes). + */ +export interface Supervisor { + run(root: Agent, task: Task, opts: SupervisorOpts): Promise> + attach(h: RootHandle): void +} + +export interface SupervisorOpts { + /** The root conserved-pool ceiling (tokens + usd + iterations + deadline). */ + readonly budget: Budget + /** Trace-correlation root + the journal/blob root key. */ + readonly runId: NodeId + /** Event source — REQUIRED: the supervisor impl calls it directly and applies no default. Pass an in-memory journal for ephemeral runs or a JSONL/FS journal for durability. */ + readonly journal: SpawnJournal + /** Result payload store backing `outRef` rehydration. */ + readonly blobs: ResultBlobStore + /** Executor resolution — the open registry mapping `AgentSpec` → `LeafExecutor`. */ + readonly executors: ExecutorRegistry + /** Runtime recursion-depth ceiling (paired with the conserved pool per R3); the supervisor impl falls back to 4 when omitted. */ + readonly maxDepth?: number + /** + * OTP intensity breaker: more than `maxRestarts` child `down` settlements within + * `withinMs` trips the supervisor to `no-winner`. The supervisor only counts — it never + * restarts a child itself. The breaker is inert unless BOTH bounds are set. + */ + readonly maxRestarts?: number + /** Sliding-window length in ms over which `down` settlements are counted against `maxRestarts`. */ + readonly withinMs?: number + /** Clock source (epoch ms); the supervisor impl falls back to `Date.now` when omitted. */ + readonly now?: () => number + /** Caller abort signal; if it is already aborted or aborts during `run`, the supervisor cascades the abort through the tree. */ + readonly signal?: AbortSignal +} + +/** Typed terminal result (M2) — a no-winner is NEVER coerced to a best-effort output. 
*/ +export type SupervisedResult = + | { + kind: 'winner' + out: Out + outRef: string + verdict?: DefaultVerdict + tree: TreeView + spentTotal: Spend + } + | { + kind: 'no-winner' + reason: 'all-children-down' | 'budget-exhausted' | 'aborted' + tree: TreeView + downCount: number + } + +/** Live root handle — the substrate a chat/pi-viz client attaches to (Q2). `signal` + * delivers an out-of-band message to the running root; `view()` materializes the tree. */ +export interface RootHandle { + view(): TreeView + signal(msg: RootSignal): void + abort(reason?: string): void + /** Phantom: binds the handle to the supervised run's output type. Type-only — never + * present at runtime; lets `attach(h: RootHandle)` stay output-typed. */ + readonly __out?: Out +} + +/** Out-of-band message to a running root. Open by intent — a client extends it. */ +export type RootSignal = + | { kind: 'pause' } + | { kind: 'resume' } + | { kind: 'cancel'; reason?: string } + | { kind: 'ask'; question: string } + +// ── Widening governor ──────────────────────────────────────────────────────── + +/** + * The progressive-widening gate (MCTS-PW). Decides whether a settled child is + * `promising` enough to spawn another under the remaining pool. DEFAULTS TO FLAT + * (`shouldWiden` always false) so a gate run never widens and the selector≠judge + * firewall conflict (R2) stays dormant. When widening IS enabled, `promising` MUST be + * derived from TRACE findings (`analyses`), never raw `verdict` — or the gate carries + * an explicit, argued `judgeExempt: true` (the documented escape hatch, off by default). + */ +export interface WidenGate { + /** Default impl returns false for every settlement (flat — never widens). */ + shouldWiden(settled: Settled, budget: Scope['budget']): boolean + /** When true, widening may read `verdict` directly (collides with the steer firewall — + * must be explicitly argued per cell, never defaulted on). 
*/ + readonly judgeExempt?: boolean +} diff --git a/tests/loops/supervise.test.ts b/tests/loops/supervise.test.ts new file mode 100644 index 0000000..a1de89e --- /dev/null +++ b/tests/loops/supervise.test.ts @@ -0,0 +1,766 @@ +import type { AgentProfile } from '@tangle-network/sandbox' +import { describe, expect, it } from 'vitest' +import { + InMemoryResultBlobStore, + InMemorySpawnJournal, + materializeTreeView, + replaySpawnTree, +} from '../../src/durable/spawn-journal' +import { ValidationError } from '../../src/errors' +import { defaultSelectWinner } from '../../src/loops/run-loop' +import { createBudgetPool, spendFromUsageEvents } from '../../src/loops/supervise/budget' +import { createExecutorRegistry } from '../../src/loops/supervise/runtime' +import { createScope, settledToIteration } from '../../src/loops/supervise/scope' +import { createRootHandle, createSupervisor } from '../../src/loops/supervise/supervisor' +import type { + Agent, + AgentSpec, + Budget, + DefaultVerdict, + LeafExecutor, + LeafResult, + Scope, + Settled, + SpawnEvent, + Spend, + SupervisorOpts, + UsageEvent, + WidenGate, +} from '../../src/loops/supervise/types' + +// ── The mock LeafExecutor — the whole keystone runs offline against this ───────── +// +// A scripted leaf: a fixed `UsageEvent` program drives the conserved-pool fold, a +// scripted `out` (+ optional verdict) is the artifact the driver branches on, and a +// `failWith` knob lets a child go `down` (typed, never re-thrown by the scope) so the +// supervisor join barrier can be exercised. No network, no sandbox, no subprocess. +interface MockScript { + readonly out: unknown + readonly events: UsageEvent[] + readonly verdict?: DefaultVerdict + /** When set, `execute` throws — the scope types it into a `down` settlement. */ + readonly failWith?: string + /** When set, `execute` blocks on this promise until the scope aborts it. 
/**
 * Turns a scripted mock output into the string embedded in the mock `outRef`.
 *
 * The value is JSON-encoded. When `JSON.stringify` produces no string (e.g. for
 * `undefined` or a symbol) the plain `String(value)` form is used instead. Object keys
 * are emitted in their own order; nothing is sorted.
 */
function stableKey(value: unknown): string {
  const json: string | undefined = JSON.stringify(value)
  return json == null ? String(value) : json
}
/**
 * Builds a scripted usage program: one `iteration` marker per requested iteration,
 * followed by exactly one `tokens` event carrying the given input/output counts.
 *
 * @param input - Input-token count placed on the trailing `tokens` event.
 * @param output - Output-token count placed on the trailing `tokens` event.
 * @param iterations - How many `iteration` markers precede the `tokens` event (default 1).
 *   Zero or a negative value yields no markers.
 * @returns A fresh array; every event object is newly allocated on each call.
 */
const tokensOnly = (input: number, output: number, iterations = 1): UsageEvent[] => {
  const program: UsageEvent[] = []
  let remaining = iterations
  while (remaining > 0) {
    program.push({ kind: 'iteration' })
    remaining -= 1
  }
  program.push({ kind: 'tokens', input, output })
  return program
}
+ const b = pool.reserve({ maxIterations: 2, maxTokens: 500, label: '' } as Budget) + expect(b).toEqual({ ok: false, reason: 'budget-exhausted' }) + expect(pool.readout().tokensLeft).toBe(400) + expect(pool.readout().reservedTokens).toBe(600) + }) + + it('refunds the unspent remainder on reconcile (Σ conservation)', () => { + const pool = createBudgetPool({ maxIterations: 10, maxTokens: 1000 }, () => 0) + const r = pool.reserve({ maxIterations: 5, maxTokens: 800, label: '' } as Budget) + if (!r.ok) throw new Error('reserve should have succeeded') + expect(pool.readout().tokensLeft).toBe(200) + expect(pool.readout().reservedTokens).toBe(800) + // Spent 300 of the 800 reserved → 500 refunds to free; reserved drops to 0. + pool.reconcile(r.ticket, { + iterations: 2, + tokens: { input: 100, output: 200 }, + usd: 0, + ms: 0, + }) + expect(pool.readout().tokensLeft).toBe(700) + expect(pool.readout().reservedTokens).toBe(0) + }) + + it('fails loud on a double reconcile (no silent double refund)', () => { + const pool = createBudgetPool({ maxIterations: 10, maxTokens: 1000 }, () => 0) + const r = pool.reserve({ maxIterations: 5, maxTokens: 800, label: '' } as Budget) + if (!r.ok) throw new Error('reserve should have succeeded') + const spend: Spend = { iterations: 1, tokens: { input: 0, output: 0 }, usd: 0, ms: 0 } + pool.reconcile(r.ticket, spend) + expect(() => pool.reconcile(r.ticket, spend)).toThrow(/unknown or already-settled/) + }) + + it('a usd request against an uncapped root is unsatisfiable (fail closed)', () => { + const pool = createBudgetPool({ maxIterations: 10, maxTokens: 1000 }, () => 0) + const r = pool.reserve({ maxIterations: 1, maxTokens: 10, maxUsd: 0.5, label: '' } as Budget) + expect(r).toEqual({ ok: false, reason: 'budget-exhausted' }) + }) + + it('spendFromUsageEvents folds tokens + usd on separate channels', () => { + const spend = spendFromUsageEvents([ + { kind: 'iteration' }, + { kind: 'tokens', input: 10, output: 5 }, + { kind: 'tokens', 
// ── 2. equal-k by construction ──────────────────────────────────────────────────

describe('equal-k by construction', () => {
  it('two arms at equal per-child budget spend equal total iterations', async () => {
    // Each arm runs in its OWN scope: its own root and, because no `pool` override is
    // handed to `beginScope`, its own default pool. An arm spawns 3 children at a fixed
    // 1-iteration budget, and every scripted child emits exactly one `iteration` event.
    // The realized Σiterations over the `done` children must therefore be 3 in both arms.
    const runArm = async (label: string) => {
      const armRoot = `arm-${label}`
      const { scope } = await beginScope({ root: armRoot, parentId: armRoot })

      // Fan-out: count only the spawns the scope actually admitted.
      let spawned = 0
      for (const i of [0, 1, 2]) {
        const childLabel = `${label}-${i}`
        const outcome = scope.spawn(
          leafAgent(childLabel, { out: { label, i }, events: tokensOnly(10, 10, 1) }),
          'task',
          { budget: { maxIterations: 1, maxTokens: 100 }, label: childLabel },
        )
        spawned += outcome.ok ? 1 : 0
      }

      // Drain: `next()` returning null marks the end of the settlements.
      let total = 0
      let settled = await scope.next()
      while (settled !== null) {
        if (settled.kind === 'done') total += settled.spent.iterations
        settled = await scope.next()
      }
      return { spawned, total }
    }

    const treatment = await runArm('t')
    const blind = await runArm('b')

    expect(treatment.spawned).toBe(3)
    expect(blind.spawned).toBe(3)
    expect(treatment.total).toBe(blind.total)
    expect(treatment.total).toBe(3)
  })
})
One spawn → one drain must not corrupt the journal. + it('one spawn → one drain does not collide the journal seq namespace', async () => { + const journal = new InMemorySpawnJournal() + const { scope } = await beginScope({ journal }) + scope.spawn(leafAgent('only', { out: 1, events: tokensOnly(1, 1, 1) }), 'task', { + budget: { maxIterations: 1, maxTokens: 10 }, + label: 'only', + }) + const settled = await scope.next() + expect(settled?.kind).toBe('done') + expect(settled?.seq).toBe(0) + const events = (await journal.loadTree('run')) as SpawnEvent[] + const settledSeqs = events.filter((e) => e.kind === 'settled').map((e) => e.seq) + expect(new Set(settledSeqs).size).toBe(settledSeqs.length) + }) + + it('next() yields in monotonic seq order and view reflects the in-memory tree', async () => { + const { scope } = await beginScope() + for (let i = 0; i < 4; i += 1) { + const res = scope.spawn( + leafAgent(`c${i}`, { out: { i }, events: tokensOnly(5, 5, 1) }), + 'task', + { budget: { maxIterations: 1, maxTokens: 50 }, label: `c${i}` }, + ) + expect(res.ok).toBe(true) + } + expect(scope.view.nodes).toHaveLength(4) + + const seqs: number[] = [] + const ids: string[] = [] + for (let settled = await scope.next(); settled !== null; settled = await scope.next()) { + seqs.push(settled.seq) + ids.push(settled.handle.id) + } + // seq is the monotonic cursor order, contiguous from 0. + expect(seqs).toEqual([0, 1, 2, 3]) + // ids are the deterministic `${parent}:s${seq}` form minted at spawn order. + expect(ids.every((id) => /^run:s\d+$/.test(id))).toBe(true) + expect(scope.view.inFlight).toBe(0) + }) + + it('inFlight shrinks as children settle (live set is the nursery, not the log)', async () => { + // Both children park on their own gate so neither settles before the assertion — + // inFlight is read off the in-memory nursery, deterministically, with no race. 
+ const gateA = deferred() + const gateB = deferred() + const { scope } = await beginScope() + scope.spawn( + leafAgent('a', { out: 'a', events: tokensOnly(1, 1, 1), block: gateA.promise }), + 'task', + { budget: { maxIterations: 1, maxTokens: 10 }, label: 'a' }, + ) + scope.spawn( + leafAgent('b', { out: 'b', events: tokensOnly(1, 1, 1), block: gateB.promise }), + 'task', + { budget: { maxIterations: 1, maxTokens: 10 }, label: 'b' }, + ) + expect(scope.view.inFlight).toBe(2) + gateA.resolve() + const first = await scope.next() + expect(first?.kind).toBe('done') + expect(scope.view.inFlight).toBe(1) + gateB.resolve() + const second = await scope.next() + expect(second?.kind).toBe('done') + expect(scope.view.inFlight).toBe(0) + expect(await scope.next()).toBeNull() + }) + + it('a thrown executor becomes a typed `down` (infra), never rejects the cursor', async () => { + const { scope } = await beginScope() + scope.spawn(leafAgent('boom', { out: null, events: [], failWith: 'leaf exploded' }), 'task', { + budget: { maxIterations: 1, maxTokens: 10 }, + label: 'boom', + }) + scope.spawn(leafAgent('ok', { out: 'ok', events: tokensOnly(1, 1, 1) }), 'task', { + budget: { maxIterations: 1, maxTokens: 10 }, + label: 'ok', + }) + const settles: Settled[] = [] + for (let s = await scope.next(); s !== null; s = await scope.next()) settles.push(s) + const down = settles.find((s) => s.kind === 'down') + const done = settles.find((s) => s.kind === 'done') + expect(down).toBeDefined() + if (down?.kind === 'down') { + expect(down.infra).toBe(true) + expect(down.reason).toContain('leaf exploded') + } + expect(done?.kind).toBe('done') + }) + + it('spawn fails closed on depth-exceeded', async () => { + const { scope } = await beginScope({ depth: 2, maxDepth: 2 }) + const res = scope.spawn(leafAgent('deep', { out: 1, events: tokensOnly(1, 1) }), 'task', { + budget: { maxIterations: 1, maxTokens: 10 }, + label: 'deep', + }) + expect(res).toEqual({ ok: false, reason: 'depth-exceeded' }) + 
}) + + it('spawn fails closed on budget-exhausted', async () => { + const { scope } = await beginScope({ + pool: createBudgetPool({ maxIterations: 1, maxTokens: 10 }, () => 0), + }) + const ok = scope.spawn(leafAgent('a', { out: 1, events: tokensOnly(1, 1) }), 'task', { + budget: { maxIterations: 1, maxTokens: 10 }, + label: 'a', + }) + expect(ok.ok).toBe(true) + const overflow = scope.spawn(leafAgent('b', { out: 2, events: tokensOnly(1, 1) }), 'task', { + budget: { maxIterations: 1, maxTokens: 10 }, + label: 'b', + }) + expect(overflow).toEqual({ ok: false, reason: 'budget-exhausted' }) + }) + + it('abort mid-flight reaps the live child (down, no throw)', async () => { + const controller = new AbortController() + const gate = deferred() // never resolves — the child only ends via abort. + const { scope } = await beginScope({ signal: controller.signal }) + scope.spawn( + leafAgent('parked', { out: 'p', events: tokensOnly(1, 1, 1), block: gate.promise }), + 'task', + { budget: { maxIterations: 1, maxTokens: 10 }, label: 'parked' }, + ) + expect(scope.view.inFlight).toBe(1) + controller.abort('test reap') + const settled = await scope.next() + expect(settled?.kind).toBe('down') + expect(scope.view.inFlight).toBe(0) + }) +}) + +// ── 4. 
describe('settledToIteration adapter', () => {
  it('projects a done settlement into the kernel Iteration so defaultSelectWinner is shared', async () => {
    const { scope } = await beginScope()

    // Two valid arms that differ only in label and score; the higher score must win.
    const arms = [
      ['lo', 0.2],
      ['hi', 0.9],
    ] as const
    for (const [label, score] of arms) {
      scope.spawn(
        leafAgent(label, {
          out: label,
          events: tokensOnly(1, 1, 1),
          verdict: { valid: true, score },
        }),
        'task',
        { budget: { maxIterations: 1, maxTokens: 10 }, label },
      )
    }

    // Adapt every `done` settlement; anything else is left out of the selection input.
    const iterations = []
    let settled = await scope.next()
    while (settled !== null) {
      if (settled.kind === 'done') iterations.push(settledToIteration(settled))
      settled = await scope.next()
    }

    const winner = defaultSelectWinner(iterations)
    expect(winner?.output).toBe('hi')
    expect(winner?.verdict?.score).toBe(0.9)
  })

  it('fails loud when handed a `down` settlement (only a done child is an iteration)', () => {
    // A hand-built `down` settlement: the adapter must throw rather than fabricate an Iteration.
    const failedHandle = { id: 'run:s0', label: 'x', status: 'failed', abort() {} }
    const down: Settled = {
      kind: 'down',
      handle: failedHandle,
      reason: 'boom',
      infra: false,
      restartCount: 0,
      seq: 0,
    }
    const adapt = () => settledToIteration(down)
    expect(adapt).toThrow(/cannot adapt a 'down'/)
  })
})
Open executor registry resolution ───────────────────────────────────────── + +describe('open executor registry', () => { + it('resolves a BYO executor verbatim (highest precedence)', () => { + const registry = createExecutorRegistry() + const byo = mockExecutor({ out: 'x', events: [] }) + const spec: AgentSpec = { + profile: { name: 'byo' } as AgentProfile, + harness: null, + executor: byo, + } + const r = registry.resolve(spec) + expect(r.succeeded).toBe(true) + if (r.succeeded) { + const built = r.value(spec, { signal: new AbortController().signal, seams: {} }) + // BYO factory returns the SAME instance — not a re-constructed router executor. + expect(built).toBe(byo) + } + }) + + it('harness:null resolves the router factory; a BackendType resolves the sandbox factory', () => { + const registry = createExecutorRegistry() + const router = registry.resolve({ profile: { name: 'r' } as AgentProfile, harness: null }) + const sandbox = registry.resolve({ + profile: { name: 's' } as AgentProfile, + harness: 'claude-code', + }) + expect(router.succeeded).toBe(true) + expect(sandbox.succeeded).toBe(true) + // Distinct factories: router/inline vs the sandbox-composing-runLoop built-in. 
/**
 * Returns a zero-argument factory for registry tests. Every call to the returned
 * function builds a new mock executor scripted with `out: 'x'` and no usage events.
 */
function mockRouterFactory() {
  const buildExecutor = () => mockExecutor({ out: 'x', events: [] })
  return buildExecutor
}
/**
 * Builds a complete `SupervisorOpts` for a test run from a partial override.
 *
 * Fields that fall back to a default when the override is `null`/`undefined`:
 * `budget` (100 iterations / 100_000 tokens), `runId` (`'sup'`), `journal`, `blobs` and
 * `executors` (each a fresh in-memory instance), and `now` (a clock pinned at 0).
 * `maxDepth`, `maxRestarts`, `withinMs` and `signal` are passed through untouched, so
 * they stay `undefined` unless the caller sets them.
 *
 * @param over - Fields to override; omitted fields take the defaults above.
 * @returns A new options object on every call.
 */
function supervisorOpts(over: Partial<SupervisorOpts> = {}): SupervisorOpts {
  return {
    budget: over.budget ?? { maxIterations: 100, maxTokens: 100_000 },
    runId: over.runId ?? 'sup',
    // Fresh stores per call so tests never share journal or blob state by accident.
    journal: over.journal ?? new InMemorySpawnJournal(),
    blobs: over.blobs ?? new InMemoryResultBlobStore(),
    executors: over.executors ?? createExecutorRegistry(),
    maxDepth: over.maxDepth,
    maxRestarts: over.maxRestarts,
    withinMs: over.withinMs,
    // Frozen clock keeps timestamps deterministic.
    now: over.now ?? (() => 0),
    signal: over.signal,
  }
}
/**
 * A flat-harness driver for the supervisor tests.
 *
 * `act` spawns one child per arm at a fixed budget (1 iteration / 1000 tokens), drains
 * the scope until `next()` returns null, adapts every `done` settlement with
 * `settledToIteration`, and picks the winner with `defaultSelectWinner`. The result of
 * each `scope.spawn` call is not inspected, and settlements of any other kind are left
 * out of the selection input.
 *
 * @param arms - One entry per child: its name (also used as the spawn label) and the
 *   mock script that drives it.
 * @returns An agent named `'flat-harness'` whose `act` resolves to the winner's `output`.
 * @throws ValidationError from `act` when `defaultSelectWinner` returns no winner.
 */
function flatHarness(arms: Array<{ name: string; script: MockScript }>): Agent {
  return {
    name: 'flat-harness',
    async act(task, scope: Scope): Promise<unknown> {
      // Fan-out: every arm receives the same task and the same fixed budget.
      for (const arm of arms) {
        scope.spawn(leafAgent(arm.name, arm.script), task, {
          budget: { maxIterations: 1, maxTokens: 1000 },
          label: arm.name,
        })
      }
      // Join: collect the adapted `done` settlements in the order the cursor yields them.
      const iterations = []
      for (let s = await scope.next(); s !== null; s = await scope.next()) {
        if (s.kind === 'done') iterations.push(settledToIteration(s))
      }
      const winner = defaultSelectWinner(iterations)
      if (!winner) throw new ValidationError('flat-harness: no valid child')
      return winner.output
    },
  }
}
when every child is down', async () => { + const supervisor = createSupervisor() + const result = await supervisor.run( + flatHarness([ + { name: 'd1', script: { out: null, events: [], failWith: 'down 1' } }, + { name: 'd2', script: { out: null, events: [], failWith: 'down 2' } }, + ]), + 'task', + supervisorOpts(), + ) + expect(result.kind).toBe('no-winner') + if (result.kind === 'no-winner') { + expect(result.reason).toBe('all-children-down') + expect(result.downCount).toBe(2) + } + }) + + it('a caller abort cascades teardown over live children (allSettled, no throw)', async () => { + const controller = new AbortController() + const gate = deferred() // children never settle on their own. + const supervisor = createSupervisor() + const driver: Agent = { + name: 'parker', + async act(_t, scope: Scope): Promise { + scope.spawn( + leafAgent('p1', { out: 1, events: tokensOnly(1, 1, 1), block: gate.promise }), + 't', + { budget: { maxIterations: 1, maxTokens: 10 }, label: 'p1' }, + ) + scope.spawn( + leafAgent('p2', { out: 2, events: tokensOnly(1, 1, 1), block: gate.promise }), + 't', + { budget: { maxIterations: 1, maxTokens: 10 }, label: 'p2' }, + ) + // Abort arrives while both children are parked; the first next() must see the reap. + controller.abort('caller cancel') + const settled = await scope.next() + if (settled?.kind === 'down') throw new ValidationError('aborted') + return 'unreachable' + }, + } + const result = await supervisor.run(driver, 't', supervisorOpts({ signal: controller.signal })) + expect(result.kind).toBe('no-winner') + if (result.kind === 'no-winner') expect(result.reason).toBe('aborted') + }) + + it('a bound RootHandle reads the live tree and is fail-loud when detached', async () => { + const handle = createRootHandle() + // Detached: every method is a typed throw, never a silent no-op. 
+ expect(() => handle.view()).toThrow() + const supervisor = createSupervisor() + supervisor.attach(handle) + let observed = -1 + const driver: Agent = { + name: 'observe', + async act(_t, scope: Scope): Promise { + scope.spawn(leafAgent('c', { out: 'c', events: tokensOnly(1, 1, 1) }), 't', { + budget: { maxIterations: 1, maxTokens: 10 }, + label: 'c', + }) + observed = handle.view().nodes.length + await scope.next() + return 'c' + }, + } + const result = await supervisor.run(driver, 't', supervisorOpts()) + expect(result.kind).toBe('winner') + expect(observed).toBe(1) + // Unbound again after the run completes. + expect(() => handle.view()).toThrow() + }) + + it('attach rejects a foreign handle not minted by createRootHandle', () => { + const supervisor = createSupervisor() + const foreign = { + view() { + return { root: '', nodes: [], inFlight: 0 } + }, + signal() {}, + abort() {}, + } + expect(() => supervisor.attach(foreign)).toThrow(/createRootHandle/) + }) +}) + +// ── 7. Replay determinism ──────────────────────────────────────────────────────── + +describe('replay determinism', () => { + it('replaying a recorded journal yields the same tree + winner in the same seq order', async () => { + const journal = new InMemorySpawnJournal() + const blobs = new InMemoryResultBlobStore() + const supervisor = createSupervisor() + const arms = [ + { + name: 'a', + script: { + out: { ans: 'a' }, + events: tokensOnly(10, 5, 1), + verdict: { valid: true, score: 0.4 }, + }, + }, + { + name: 'b', + script: { + out: { ans: 'b' }, + events: tokensOnly(8, 4, 1), + verdict: { valid: true, score: 0.9 }, + }, + }, + { + name: 'c', + script: { + out: { ans: 'c' }, + events: tokensOnly(6, 3, 1), + verdict: { valid: true, score: 0.6 }, + }, + }, + ] + const live = await supervisor.run( + flatHarness(arms), + 'task', + supervisorOpts({ runId: 'replay-run', journal, blobs }), + ) + expect(live.kind).toBe('winner') + const liveWinner = live.kind === 'winner' ? 
/**
 * Creates a manually released gate: a promise and the function that resolves it.
 *
 * Tests use it to park a mock child until they choose to let it proceed. A gate whose
 * `resolve` is never called stays pending.
 *
 * @returns `promise` — pending until `resolve()` is called, then fulfilled with no value;
 *   `resolve` — releases the gate.
 */
function deferred(): { promise: Promise<void>; resolve: () => void } {
  // Definite assignment: the Promise executor runs synchronously inside the constructor,
  // so `resolve` is always set before it is returned.
  let resolve!: () => void
  const promise = new Promise<void>((r) => {
    resolve = r
  })
  return { promise, resolve }
}