From 4234c94396ac15bccda04ff0ce047aa037c39414 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Sun, 31 May 2026 03:18:11 -0600 Subject: [PATCH 1/9] feat(profiles/coder): default-on no-op + secret-path floor on the coder validator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit First increment of the canonical MCP delegate hardening (the techniques the ai-trading-blueprint delegation fork proved, folded back into agent-runtime so delegate_code is reliable for the whole fleet — not re-forked per product): - No-op rejection: an empty patch can trivially pass tests/typecheck (nothing changed) yet does no work — now valid=false (scores.nonEmpty=0). - Secret-path floor: always-on, independent of task.forbiddenPaths — rejects a patch touching credential-shaped paths (.env, *.pem/*.key/*.p12/*.pfx, keystore, wallet, id_rsa/id_ed25519, secrets/credentials.json). valid=false. Both are hard gates (flip valid), additive to the existing forbidden-path / diff-size / tests / typecheck checks; the weighted composite is unchanged so clean patches don't regress. Tests: empty patch → invalid; secret path → invalid even when not in forbiddenPaths; normal patch still valid. Full suite 407 green, tsc + biome clean. Remaining hardening increments (this branch): reviewer/audit gate + winner-selection strategy on delegate_code; physim's valid-only KB-growth (passage-present storage guard, fail-closed judge registry, correct-on-veto/escalate, circular-citation detection) on delegate_research. Umbrella: #828 (loop-runner). 
--- src/profiles/coder.ts | 30 ++++++++++++++++++++++ tests/profiles/coder.test.ts | 48 ++++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+) diff --git a/src/profiles/coder.ts b/src/profiles/coder.ts index 80149d3..51fe3f4 100644 --- a/src/profiles/coder.ts +++ b/src/profiles/coder.ts @@ -250,6 +250,15 @@ function parseCoderEvents(events: SandboxEvent[]): CoderOutput { * * @experimental */ +/** + * Default-on safety floor (folded from the ai-trading-blueprint delegation + * MCP): a coder patch that touches a credential-shaped path is rejected + * regardless of `forbiddenPaths` config. Catches `.env`, private keys, + * keystores, wallets, and the common secret/credential JSON files. + */ +const SECRET_PATH_RE = + /(^|\/)(\.env(\.|$)|.*\.(pem|key|p12|pfx|keystore|wallet)|id_rsa|id_ed25519|secrets?\.json|credentials?\.json)$/i + export function createCoderValidator(task: CoderTask): Validator { const maxDiff = task.maxDiffLines ?? DEFAULT_MAX_DIFF_LINES const forbidden = task.forbiddenPaths ?? [] @@ -260,6 +269,27 @@ export function createCoderValidator(task: CoderTask): Validator { let pass = true const touched = touchedPathsFromPatch(output.patch) + + // No-op rejection: an empty patch can trivially "pass" tests/typecheck + // (nothing changed) yet does no work — never a valid coder result. + if (touched.length === 0 || output.patch.trim().length === 0) { + pass = false + scores.nonEmpty = 0 + notes.push('empty patch — no files changed') + } else { + scores.nonEmpty = 1 + } + + // Secret-path floor: always-on, independent of `forbiddenPaths`. + const touchedSecrets = touched.filter((p) => SECRET_PATH_RE.test(p)) + if (touchedSecrets.length > 0) { + pass = false + scores.noSecrets = 0 + notes.push(`touched secret-shaped paths: ${touchedSecrets.join(', ')}`) + } else { + scores.noSecrets = 1 + } + const touchedForbidden = forbidden.filter((path) => { const prefix = path.endsWith('/') ? 
path : `${path}/` const exact = prefix.slice(0, -1) diff --git a/tests/profiles/coder.test.ts b/tests/profiles/coder.test.ts index 4b356c7..fc2d725 100644 --- a/tests/profiles/coder.test.ts +++ b/tests/profiles/coder.test.ts @@ -184,3 +184,51 @@ describe('multiHarnessCoderFanout — heterogeneous fanout bundle', () => { expect(bundle.agentRuns.every((s) => s.profile.tools?.git === true)).toBe(true) }) }) + +describe('createCoderValidator — default-on safety floor (no-op + secrets)', () => { + it('rejects an empty patch (no-op) even when tests + typecheck pass', async () => { + const validator = createCoderValidator(baseTask) + const output: CoderOutput = { + branch: 'feat/x', + patch: '', + testResult: { passed: true, output: 'ok' }, + typecheckResult: { passed: true, output: 'ok' }, + diffStats: { filesChanged: 0, insertions: 0, deletions: 0 }, + } + const verdict = await validator.validate(output, ctx) + expect(verdict.valid).toBe(false) + expect(verdict.scores?.nonEmpty).toBe(0) + expect(verdict.notes).toMatch(/empty patch/i) + }) + + it('rejects a patch touching a secret-shaped path regardless of forbiddenPaths', async () => { + // `.env` is NOT in baseTask.forbiddenPaths — the secret floor is always-on. 
+ const validator = createCoderValidator(baseTask) + const output: CoderOutput = { + branch: 'feat/x', + patch: diff(['config/.env', 'src/ok.ts'], 2, 0), + testResult: { passed: true, output: 'ok' }, + typecheckResult: { passed: true, output: 'ok' }, + diffStats: { filesChanged: 2, insertions: 2, deletions: 0 }, + } + const verdict = await validator.validate(output, ctx) + expect(verdict.valid).toBe(false) + expect(verdict.scores?.noSecrets).toBe(0) + expect(verdict.notes).toMatch(/secret-shaped/i) + }) + + it('passes a normal non-empty, non-secret patch (floor does not regress clean work)', async () => { + const validator = createCoderValidator(baseTask) + const output: CoderOutput = { + branch: 'feat/x', + patch: diff(['src/foo.ts'], 3, 1), + testResult: { passed: true, output: 'ok' }, + typecheckResult: { passed: true, output: 'ok' }, + diffStats: { filesChanged: 1, insertions: 3, deletions: 1 }, + } + const verdict = await validator.validate(output, ctx) + expect(verdict.valid).toBe(true) + expect(verdict.scores?.nonEmpty).toBe(1) + expect(verdict.scores?.noSecrets).toBe(1) + }) +}) From 688d701b318cd8d59a89e62e45ef31751549629a Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Sun, 31 May 2026 03:26:48 -0600 Subject: [PATCH 2/9] feat(mcp): reviewer gate + winner-selection on delegate_code; createKbGate for valid-only research MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Increments 2 + 3 of the canonical-MCP delegate hardening (folding the proven techniques from the ai-trading-blueprint fork + physim's KB subsystem back into agent-runtime, so every product's delegated loops are reliable without re-forking). delegate_code (createDefaultCoderDelegate): - Optional `reviewer` (CoderReviewer): a candidate that passes mechanical validation must ALSO be approved by an adversarial reviewer to win — catches the "compiles + tests pass but wrong/unsafe" class. No reviewer → unchanged behavior. 
- `winnerSelection`: highest-score (default, = kernel) | smallest-diff | highest-readiness | first-approved, over ALL valid candidates not just the kernel's single winner. Fails loud when nothing survives validation (+ review). delegate_research (createKbGate): - Reusable, dependency-free valid-only KB-growth gate distilled from physim: fail-closed judge registry, first-veto-wins. Always-on floor — passage-non-empty, passage-present anti-hallucination guard (verbatim passage MUST appear in source), value-in-passage (literal / comma-grouped / billion-million shorthand), no-circular-citation (laundering catch). Consumer judges append after the floor. Operates on fact candidates, not a store — composes with agent-knowledge without importing it. Verdict only; remediation is the caller's (never drops silently). Tests: delegate selection + reviewer fail-loud + backward-compat; kb-gate floor + shorthand + circular + consumer-judge. Full suite 420 green, tsc + biome clean. Engine for the loop-runner (#828). Increment 1 (no-op + secret floor) = 4234c94. --- src/mcp/delegates.ts | 155 +++++++++++++++++++-- src/mcp/index.ts | 11 ++ src/mcp/kb-gate.ts | 153 ++++++++++++++++++++ tests/mcp/coder-delegate-selection.test.ts | 110 +++++++++++++++ tests/mcp/kb-gate.test.ts | 94 +++++++++++++ 5 files changed, 511 insertions(+), 12 deletions(-) create mode 100644 src/mcp/kb-gate.ts create mode 100644 tests/mcp/coder-delegate-selection.test.ts create mode 100644 tests/mcp/kb-gate.test.ts diff --git a/src/mcp/delegates.ts b/src/mcp/delegates.ts index 59b3040..41375a9 100644 --- a/src/mcp/delegates.ts +++ b/src/mcp/delegates.ts @@ -16,9 +16,9 @@ * pass `researcherDelegate` explicitly when constructing the server. 
*/ -import type { LoopSandboxClient } from '../loops' +import type { Iteration, LoopSandboxClient } from '../loops' import { runLoop } from '../loops' -import { coderProfile, multiHarnessCoderFanout } from '../profiles/coder' +import { type CoderOutput, coderProfile, multiHarnessCoderFanout } from '../profiles/coder' import { createSiblingSandboxExecutor, type DelegationExecutor } from './executor' import type { CoderTask, @@ -46,6 +46,43 @@ export type ResearcherDelegate = ( ctx: DelegateRunCtx, ) => Promise +/** @experimental Structured review verdict over a coder candidate. */ +export interface CoderReview { + /** Gate: only approved candidates are eligible to win. */ + approved: boolean + /** Reviewer's recommendation — surfaced in traces. */ + recommendation: 'ship' | 'approve-with-nits' | 'changes-requested' | 'reject' + /** Readiness 0..1, used by the `highest-readiness` winner-selection strategy. */ + readiness: number + notes?: string +} + +/** + * @experimental + * + * Optional adversarial reviewer over a coder candidate that already passed + * mechanical validation (tests/typecheck/forbidden/diff/no-op/secrets). Folded + * from the ai-trading-blueprint delegation MCP: a candidate is only eligible to + * win if the reviewer approves it. The reviewer is the consumer's seam — an LLM + * judge, a `pnpm review` command, anything returning a `CoderReview`. + */ +export type CoderReviewer = ( + output: import('../profiles/coder').CoderOutput, + task: CoderTask, + ctx: { signal: AbortSignal }, +) => Promise | CoderReview + +/** + * @experimental Winner-selection strategy among validated (+ reviewed) + * candidates. `highest-readiness` requires a `reviewer`. Default `highest-score` + * (the kernel's behavior — preserves backward compatibility). 
+ */ +export type CoderWinnerSelection = + | 'highest-score' + | 'smallest-diff' + | 'highest-readiness' + | 'first-approved' + /** @experimental */ export interface CreateDefaultCoderDelegateOptions { /** @@ -64,6 +101,15 @@ export interface CreateDefaultCoderDelegateOptions { fanoutHarnesses?: string[] /** Hard cap on the kernel's per-batch concurrency. Default 4. */ maxConcurrency?: number + /** + * Optional adversarial reviewer. When set, a candidate must pass mechanical + * validation AND `reviewer.approved` to be eligible to win — empty/secret/ + * test-failing patches are already gone; this catches the "compiles + passes + * but wrong/unsafe" class the deterministic validator can't see. + */ + reviewer?: CoderReviewer + /** Winner-selection strategy among eligible candidates. Default `highest-score`. */ + winnerSelection?: CoderWinnerSelection } /** @@ -103,12 +149,16 @@ export function createDefaultCoderDelegate( maxIterations: 1, maxConcurrency, }) - const winner = result.winner - if (!winner) { - throw new Error('coder delegate produced no winner') - } + const chosen = await pickCoderWinner({ + iterations: result.iterations, + reviewer: options.reviewer, + selection: options.winnerSelection ?? 'highest-score', + task, + signal: ctx.signal, + }) + if (!chosen) throw new Error(noWinnerMessage(options.reviewer)) ctx.report({ iteration: 1, phase: 'completed' }) - return winner.output + return chosen } const fanout = multiHarnessCoderFanout( fanoutHarnesses && fanoutHarnesses.length > 0 @@ -126,15 +176,96 @@ export function createDefaultCoderDelegate( maxIterations: variants, maxConcurrency: Math.min(maxConcurrency, variants), }) - const winner = result.winner - if (!winner) { - throw new Error('coder delegate fanout produced no winner') - } + const chosen = await pickCoderWinner({ + iterations: result.iterations, + reviewer: options.reviewer, + selection: options.winnerSelection ?? 
'highest-score', + task, + signal: ctx.signal, + }) + if (!chosen) throw new Error(noWinnerMessage(options.reviewer)) ctx.report({ iteration: agentRuns.length, phase: 'completed' }) - return winner.output + return chosen } } +interface PickCoderWinnerArgs { + iterations: ReadonlyArray> + reviewer: CoderReviewer | undefined + selection: CoderWinnerSelection + task: CoderTask + signal: AbortSignal +} + +interface CoderCandidate { + index: number + output: CoderOutput + score: number + readiness: number +} + +/** + * Pick the winning coder candidate from a finished loop's iterations: + * 1. keep only mechanically-VALID candidates (the validator already gated + * tests/typecheck/forbidden/diff/no-op/secrets), + * 2. if a `reviewer` is wired, keep only those it APPROVES, + * 3. select among survivors by the chosen strategy. + * Returns `undefined` when nothing survives — the delegate fails loud. + */ +async function pickCoderWinner(args: PickCoderWinnerArgs): Promise { + const valid: CoderCandidate[] = [] + for (const iter of args.iterations) { + if (iter.output === undefined || iter.error || iter.verdict?.valid !== true) continue + valid.push({ + index: iter.index, + output: iter.output, + score: iter.verdict.score ?? 0, + readiness: iter.verdict.score ?? 0, + }) + } + if (valid.length === 0) return undefined + + let eligible = valid + if (args.reviewer) { + eligible = [] + for (const c of valid) { + const review = await args.reviewer(c.output, args.task, { signal: args.signal }) + if (review.approved) eligible.push({ ...c, readiness: review.readiness }) + } + if (eligible.length === 0) return undefined + } + + return selectCoderCandidate(eligible, args.selection).output +} + +/** Apply the winner-selection strategy; ties broken by earliest iteration. 
*/ +function selectCoderCandidate( + candidates: CoderCandidate[], + selection: CoderWinnerSelection, +): CoderCandidate { + const diffLines = (c: CoderCandidate) => + c.output.diffStats.insertions + c.output.diffStats.deletions + const sorted = [...candidates].sort((a, b) => { + switch (selection) { + case 'smallest-diff': + return diffLines(a) - diffLines(b) || a.index - b.index + case 'highest-readiness': + return b.readiness - a.readiness || a.index - b.index + case 'first-approved': + return a.index - b.index + default: + return b.score - a.score || a.index - b.index + } + }) + return sorted[0]! +} + +function noWinnerMessage(reviewer: CoderReviewer | undefined): string { + return reviewer + ? 'coder delegate: no candidate passed validation + review' + : 'coder delegate: no candidate passed validation' +} + function buildCoderGoal(args: DelegateCodeArgs): string { if (!args.contextHint) return args.goal return [args.goal, '', '## Context', args.contextHint].join('\n') diff --git a/src/mcp/index.ts b/src/mcp/index.ts index 6ab9b2b..24b5fc7 100644 --- a/src/mcp/index.ts +++ b/src/mcp/index.ts @@ -17,6 +17,9 @@ export type { DetectExecutorArgs } from './bin-helpers' export { detectExecutor } from './bin-helpers' export type { CoderDelegate, + CoderReview, + CoderReviewer, + CoderWinnerSelection, CreateDefaultCoderDelegateOptions, DelegateRunCtx, ResearcherDelegate, @@ -36,6 +39,14 @@ export type { InProcessExecutorOptions, } from './in-process-executor' export { createInProcessExecutor } from './in-process-executor' +export { + type CreateKbGateOptions, + createKbGate, + type FactCandidate, + type FactJudge, + type FactJudgeVerdict, + type KbGateResult, +} from './kb-gate' export type { LocalHarness, LocalHarnessResult, RunLocalHarnessOptions } from './local-harness' export { runLocalHarness } from './local-harness' export { mcpToolsForRuntimeMcp, mcpToolsForRuntimeMcpSubset } from './openai-tools' diff --git a/src/mcp/kb-gate.ts b/src/mcp/kb-gate.ts new file 
mode 100644 index 0000000..2f2b340 --- /dev/null +++ b/src/mcp/kb-gate.ts @@ -0,0 +1,153 @@ +/** + * @experimental + * + * `createKbGate` — the valid-only knowledge-base growth gate, distilled from + * physim's KB-research subsystem. A research-in-a-loop delegate (or any KB + * writer) runs candidate facts through this before persisting, so the KB grows + * with ONLY grounded facts — hallucinated, unsourced, or laundered claims are + * vetoed at the gate. + * + * Fail-closed by construction: every judge must `accept`; the FIRST veto wins + * and the fact is rejected. The non-negotiable floor (always on, can't be + * disabled) is the **passage-present guard** — a fact's `verbatimPassage` MUST + * literally appear in its `sourceText`. That single check kills the dominant + * failure mode (a confident claim decoupled from any real source). + * + * Pure + dependency-free: it operates on fact candidates, not on a store, so it + * composes with `@tangle-network/agent-knowledge` or any persistence layer + * without importing it. The remediation policy (correct-on-veto vs + * escalate-as-unverified) is the caller's — this returns the verdict; it never + * drops a fact silently. + */ + +/** @experimental A fact proposed for the KB, with its grounding. */ +export interface FactCandidate { + /** The atomic claim text. */ + claim: string + /** Optional extracted value (number or string) the claim asserts. */ + value?: string | number + /** Verbatim span lifted from the source that backs the claim. */ + verbatimPassage: string + /** The raw source text the passage must be grounded in. */ + sourceText: string + /** Where the fact claims to come from — checked for circular/self citations. */ + citation?: string +} + +/** @experimental */ +export interface FactJudgeVerdict { + accept: boolean + reason?: string +} + +/** @experimental A pluggable fact validator. Throw is NOT allowed — return a + * verdict; a thrown judge is a programmer error, not a veto. 
*/ +export interface FactJudge { + name: string + judge(candidate: FactCandidate): FactJudgeVerdict | Promise +} + +/** @experimental */ +export interface KbGateResult { + accepted: boolean + /** Name of the judge that vetoed; undefined when accepted. */ + vetoedBy?: string + reason?: string +} + +/** @experimental */ +export interface CreateKbGateOptions { + /** Extra judges appended after the built-in floor (e.g. an LLM judge). */ + judges?: FactJudge[] + /** Minimum verbatim-passage length. Default 12 — kills empty/stub passages. */ + minPassageChars?: number + /** + * Citation tokens that denote a SELF-generated artifact (e.g. `'spec'`, + * `'cad_params'`, `'requirements'`). A citation naming one is circular + * (laundering) — the fact cites a derived artifact, not a real source. + * Default `[]` (no circular check unless the consumer declares its kinds). + */ + selfArtifactKinds?: string[] +} + +const norm = (s: string): string => s.toLowerCase().replace(/\s+/g, ' ').trim() + +/** Does `value` appear in the (normalized) passage — literally, comma-grouped, + * or in billion/million shorthand (the forms a source actually writes). */ +function valueAppears(value: string | number, passageNorm: string): boolean { + if (passageNorm.includes(norm(String(value)))) return true + if (typeof value !== 'number' || !Number.isFinite(value)) return false + const forms = [value.toLocaleString('en-US')] + if (Math.abs(value) >= 1e9) forms.push(`${trimZero(value / 1e9)} billion`) + if (Math.abs(value) >= 1e6) forms.push(`${trimZero(value / 1e6)} million`) + return forms.some((f) => passageNorm.includes(norm(f))) +} + +function trimZero(n: number): string { + return Number.isInteger(n) ? String(n) : String(Number(n.toFixed(2))) +} + +/** The always-on floor judges. Order matters: cheapest / most-fundamental first. 
*/ +function builtinJudges(minPassageChars: number, selfArtifactKinds: string[]): FactJudge[] { + const kinds = selfArtifactKinds.map((k) => k.toLowerCase()) + return [ + { + name: 'passage-non-empty', + judge: (c) => + c.verbatimPassage.trim().length >= minPassageChars + ? { accept: true } + : { accept: false, reason: `passage shorter than ${minPassageChars} chars` }, + }, + { + // THE anti-hallucination floor — the passage must literally be in the source. + name: 'passage-present', + judge: (c) => + norm(c.sourceText).includes(norm(c.verbatimPassage)) + ? { accept: true } + : { accept: false, reason: 'verbatim passage not found in source (unbacked fact)' }, + }, + { + name: 'value-in-passage', + judge: (c) => + c.value === undefined || valueAppears(c.value, norm(c.verbatimPassage)) + ? { accept: true } + : { accept: false, reason: `value ${JSON.stringify(c.value)} not present in passage` }, + }, + { + name: 'no-circular-citation', + judge: (c) => { + if (!c.citation || kinds.length === 0) return { accept: true } + const cite = c.citation.toLowerCase() + const hit = kinds.find((k) => cite.includes(k)) + return hit + ? { accept: false, reason: `circular citation to self-generated artifact "${hit}"` } + : { accept: true } + }, + }, + ] +} + +/** + * @experimental + * + * Build a fail-closed KB gate. The returned function runs the built-in floor + * (passage-non-empty → passage-present → value-in-passage → no-circular-citation) + * then any consumer judges, returning on the first veto. + */ +export function createKbGate( + options: CreateKbGateOptions = {}, +): (candidate: FactCandidate) => Promise { + const judges = [ + ...builtinJudges(options.minPassageChars ?? 12, options.selfArtifactKinds ?? []), + ...(options.judges ?? 
[]), + ] + return async (candidate) => { + for (const j of judges) { + const verdict = await j.judge(candidate) + if (!verdict.accept) { + return { accepted: false, vetoedBy: j.name, reason: verdict.reason } + } + } + return { accepted: true } + } +} diff --git a/tests/mcp/coder-delegate-selection.test.ts b/tests/mcp/coder-delegate-selection.test.ts new file mode 100644 index 0000000..92da502 --- /dev/null +++ b/tests/mcp/coder-delegate-selection.test.ts @@ -0,0 +1,110 @@ +import type { CreateSandboxOptions, SandboxEvent, SandboxInstance } from '@tangle-network/sandbox' +import { describe, expect, it } from 'vitest' +import { + type CoderReview, + type CoderReviewer, + type CoderWinnerSelection, + createDefaultCoderDelegate, +} from '../../src/mcp/delegates' +import type { CoderOutput } from '../../src/profiles/coder' + +function diff(path: string, plus: number, minus: number): string { + const out = [`diff --git a/${path} b/${path}`, `--- a/${path}`, `+++ b/${path}`] + for (let i = 0; i < plus; i += 1) out.push(`+line ${i}`) + for (let i = 0; i < minus; i += 1) out.push(`-line ${i}`) + return out.join('\n') +} + +// Two distinct, mechanically-VALID candidates that DIVERGE on diff-size vs +// readiness, so the selection strategy is observable: +// - candidate "small": tiny diff (2 lines), low reviewer readiness +// - candidate "big": larger diff (10 lines), high reviewer readiness +const CANDIDATES: CoderOutput[] = [ + { + branch: 'small', + patch: diff('src/small.ts', 1, 1), + testResult: { passed: true, output: 'ok' }, + typecheckResult: { passed: true, output: 'ok' }, + diffStats: { filesChanged: 1, insertions: 1, deletions: 1 }, + }, + { + branch: 'big', + patch: diff('src/big.ts', 5, 5), + testResult: { passed: true, output: 'ok' }, + typecheckResult: { passed: true, output: 'ok' }, + diffStats: { filesChanged: 1, insertions: 5, deletions: 5 }, + }, +] + +// Stub sandbox client: each create() serves the next candidate (by call order) +// as a parseable 
`result` event. Two harnesses → two branches → two candidates. +function candidateClient() { + let i = 0 + return { + async create(_opts?: CreateSandboxOptions): Promise { + const out = CANDIDATES[i++ % CANDIDATES.length]! + return { + async *streamPrompt() { + yield { type: 'result', data: { result: out } } satisfies SandboxEvent + }, + } as unknown as SandboxInstance + }, + } +} + +const ctx = { signal: new AbortController().signal, report() {} } +const args = { goal: 'fix it', repoRoot: '/repo', variants: 2 } + +// Reviewer that approves both but rates the BIG candidate more ready. +const readinessReviewer: CoderReviewer = (output) => ({ + approved: true, + recommendation: 'ship', + readiness: output.branch === 'big' ? 0.9 : 0.4, +}) + +describe('createDefaultCoderDelegate — reviewer gate + winner selection', () => { + it('smallest-diff selects the smaller valid patch', async () => { + const delegate = createDefaultCoderDelegate({ + sandboxClient: candidateClient(), + fanoutHarnesses: ['claude-code', 'codex'], + winnerSelection: 'smallest-diff' satisfies CoderWinnerSelection, + }) + const out = await delegate(args, ctx) + expect(out.branch).toBe('small') + }) + + it('highest-readiness selects by the reviewer score, diverging from diff size', async () => { + const delegate = createDefaultCoderDelegate({ + sandboxClient: candidateClient(), + fanoutHarnesses: ['claude-code', 'codex'], + reviewer: readinessReviewer, + winnerSelection: 'highest-readiness', + }) + const out = await delegate(args, ctx) + expect(out.branch).toBe('big') + }) + + it('rejects when the reviewer approves nothing (fails loud, no winner)', async () => { + const rejectAll: CoderReviewer = (): CoderReview => ({ + approved: false, + recommendation: 'changes-requested', + readiness: 0, + }) + const delegate = createDefaultCoderDelegate({ + sandboxClient: candidateClient(), + fanoutHarnesses: ['claude-code', 'codex'], + reviewer: rejectAll, + }) + await expect(delegate(args, 
ctx)).rejects.toThrow(/validation \+ review/) + }) + + it('default highest-score (no reviewer) still returns a valid winner', async () => { + const delegate = createDefaultCoderDelegate({ + sandboxClient: candidateClient(), + fanoutHarnesses: ['claude-code', 'codex'], + }) + const out = await delegate(args, ctx) + // smaller diff → higher diffSize score → highest-score favors it; either way a valid winner. + expect(['small', 'big']).toContain(out.branch) + }) +}) diff --git a/tests/mcp/kb-gate.test.ts b/tests/mcp/kb-gate.test.ts new file mode 100644 index 0000000..4899bd6 --- /dev/null +++ b/tests/mcp/kb-gate.test.ts @@ -0,0 +1,94 @@ +import { describe, expect, it } from 'vitest' +import { createKbGate, type FactCandidate, type FactJudge } from '../../src/mcp/kb-gate' + +const SOURCE = + 'The 2025 annual report states total revenue was $1,200,000,000 for the fiscal year, up 12% year over year.' + +function fact(overrides: Partial): FactCandidate { + return { + claim: 'revenue was 1.2B', + verbatimPassage: 'total revenue was $1,200,000,000 for the fiscal year', + sourceText: SOURCE, + ...overrides, + } +} + +describe('createKbGate — valid-only KB growth', () => { + it('accepts a grounded fact whose passage is present in the source', async () => { + const gate = createKbGate() + const r = await gate(fact({})) + expect(r.accepted).toBe(true) + expect(r.vetoedBy).toBeUndefined() + }) + + it('vetoes a fact whose passage is NOT in the source (the anti-hallucination floor)', async () => { + const gate = createKbGate() + const r = await gate(fact({ verbatimPassage: 'revenue tripled to nine billion dollars' })) + expect(r.accepted).toBe(false) + expect(r.vetoedBy).toBe('passage-present') + }) + + it('vetoes a too-short passage', async () => { + const gate = createKbGate({ minPassageChars: 12 }) + const r = await gate(fact({ verbatimPassage: 'revenue' })) + expect(r.accepted).toBe(false) + expect(r.vetoedBy).toBe('passage-non-empty') + }) + + it('vetoes a value not present 
in the passage', async () => { + const gate = createKbGate() + const r = await gate(fact({ value: 999 })) + expect(r.accepted).toBe(false) + expect(r.vetoedBy).toBe('value-in-passage') + }) + + it('accepts a numeric value via comma-grouped form', async () => { + const gate = createKbGate() + const r = await gate(fact({ value: 1_200_000_000 })) + expect(r.accepted).toBe(true) + }) + + it('accepts a numeric value via billion shorthand when the source uses it', async () => { + const gate = createKbGate() + const r = await gate( + fact({ + verbatimPassage: 'revenue reached 1.2 billion in 2025', + sourceText: 'Per the filing, revenue reached 1.2 billion in 2025.', + value: 1_200_000_000, + }), + ) + expect(r.accepted).toBe(true) + }) + + it('vetoes a circular citation to a self-generated artifact (laundering)', async () => { + const gate = createKbGate({ selfArtifactKinds: ['spec', 'cad_params'] }) + const r = await gate(fact({ citation: '[cad_params.v2]' })) + expect(r.accepted).toBe(false) + expect(r.vetoedBy).toBe('no-circular-citation') + }) + + it('runs consumer judges after the floor, fail-closed on first veto', async () => { + const domainJudge: FactJudge = { + name: 'requires-year', + judge: (c) => + /\b20\d{2}\b/.test(c.verbatimPassage) + ? 
{ accept: true } + : { accept: false, reason: 'no year' }, + } + const gate = createKbGate({ judges: [domainJudge] }) + // passage is grounded but has no year → the consumer judge vetoes + const r = await gate( + fact({ + verbatimPassage: 'total revenue was $1,200,000,000 for the fiscal year', + }), + ) + expect(r.accepted).toBe(false) + expect(r.vetoedBy).toBe('requires-year') + }) + + it('accepts when no value is asserted (value check is conditional)', async () => { + const gate = createKbGate() + const r = await gate(fact({ value: undefined })) + expect(r.accepted).toBe(true) + }) +}) From 97382c2e0e0e96a7254e9b43997bb0212869c2bc Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Sun, 31 May 2026 03:27:02 -0600 Subject: [PATCH 3/9] =?UTF-8?q?chore(release):=200.36.0=20=E2=80=94=20MCP?= =?UTF-8?q?=20delegate=20hardening=20(reviewer=20gate,=20winner-selection,?= =?UTF-8?q?=20no-op+secret=20floor,=20createKbGate)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index ad7c33b..556408a 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@tangle-network/agent-runtime", - "version": "0.35.0", + "version": "0.36.0", "description": "Reusable runtime lifecycle for domain-specific agents.", "homepage": "https://github.com/tangle-network/agent-runtime#readme", "repository": { From 5f5fbbff19e63bc0a68469a9360ebd1e313a2b71 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Sun, 31 May 2026 03:35:00 -0600 Subject: [PATCH 4/9] =?UTF-8?q?feat(loop-runner):=20runDelegatedLoop=20?= =?UTF-8?q?=E2=80=94=20configured=20mode=20dispatcher=20over=20the=20harde?= =?UTF-8?q?ned=20engines?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The thin façade that makes the hardened delegation engines (this branch) usable as ONE configured, schedulable entrypoint — the "configured 
delegated loop runner" (#828). - runDelegatedLoop(mode, registry): dispatches code | review | research | audit | self-improve | dynamic to a pre-configured runner. Owns mode routing, timing, fail-loud on an unregistered mode (ConfigError), and a uniform DelegatedLoopResult (a thrown engine becomes { ok:false, error } so unattended/scheduled runs record and move on rather than crash). - coderLoopRunner / reviewLoopRunner: default code/review runners over the hardened coder delegate (no-op + secret floor, reviewer gate, winner-selection). review mode TYPE-requires a reviewer — a review loop with no reviewer is just a code loop. - Registry is partial + injectable: products/routines register only the modes they use; tests inject stubs; the engines stay the canonical agent-runtime ones (no fork). This is the layer a scheduled routine targets (research/audit/self-improve on a cadence; code/review/dynamic on demand). Tests: dispatch routing, fail-loud unregistered mode, thrown-engine → ok:false, coderLoopRunner real wiring via stub. Full suite green, tsc + biome clean. Engine = 4234c94 + 688d701. 
--- src/index.ts | 13 +++- src/loop-runner.ts | 143 ++++++++++++++++++++++++++++++++++++++ tests/loop-runner.test.ts | 71 +++++++++++++++++++ 3 files changed, 226 insertions(+), 1 deletion(-) create mode 100644 src/loop-runner.ts create mode 100644 tests/loop-runner.test.ts diff --git a/src/index.ts b/src/index.ts index dc2e491..3cda443 100644 --- a/src/index.ts +++ b/src/index.ts @@ -105,13 +105,24 @@ export { RuntimeRunStateError, ValidationError, } from './errors' +// ── Delegated loop-runner (configured code/research/review/audit/self-improve/dynamic) ── +export { + type CoderLoopRunnerOptions, + coderLoopRunner, + type DelegatedLoopMode, + type DelegatedLoopRegistry, + type DelegatedLoopResult, + type DelegatedLoopRunner, + type RunDelegatedLoopOptions, + reviewLoopRunner, + runDelegatedLoop, +} from './loop-runner' // ── MCP → OpenAI tools projection ──────────────────────────────────── // Helper for eval / orchestrator code that routes through the // OpenAI-compat backend and needs the 5 delegation tools surfaced to // the model. Sandbox-SDK callers discover tools via the runtime's MCP // mount and don't need this projection. export { mcpToolsForRuntimeMcp, mcpToolsForRuntimeMcpSubset } from './mcp/openai-tools' - // ── Chat-model resolution ──────────────────────────────────────────── // Router catalog fetch + fail-closed id validation + precedence resolver. export type { ModelInfo, ResolvedChatModel, RouterEnv } from './model-resolution' diff --git a/src/loop-runner.ts b/src/loop-runner.ts new file mode 100644 index 0000000..058aba4 --- /dev/null +++ b/src/loop-runner.ts @@ -0,0 +1,143 @@ +/** + * @experimental + * + * `runDelegatedLoop` — the configured delegated loop-runner. 
+ * + * One typed entrypoint a worker agent (or a scheduled routine) calls to run a + * disciplined loop in a chosen MODE, over agent-runtime's hardened engines: + * + * code → build-in-a-loop via the coder delegate (no-op + secret floor, + * optional reviewer gate, winner-selection) + * review → code mode with a REQUIRED reviewer (the gate is the point) + * research → research-in-a-loop with valid-only KB growth (createKbGate) + * audit → analyze trace/run data → findings (runAnalystLoop, caller-wired) + * self-improve → identity-gated prompt optimization (optimizePrompt, caller-wired) + * dynamic → agent-authored topology (runLoop + createDynamicDriver) + * + * It is intentionally a thin façade: the value is that EVERY product reuses the + * one hardened engine instead of forking delegation logic. The dispatcher owns + * mode routing, timing, fail-loud on an unregistered mode, and a uniform result + * shape; each mode's engine is a pre-configured runner in the registry (build it + * with the factories below, or inject your own / a stub). + */ + +import { ConfigError } from './errors' +import type { LoopSandboxClient } from './loops' +import { + type CoderReviewer, + type CoderWinnerSelection, + createDefaultCoderDelegate, + type DelegateRunCtx, +} from './mcp/delegates' +import type { DelegateCodeArgs } from './mcp/types' +import type { CoderOutput } from './profiles/coder' + +/** @experimental */ +export type DelegatedLoopMode = + | 'code' + | 'review' + | 'research' + | 'audit' + | 'self-improve' + | 'dynamic' + +/** @experimental A pre-configured loop for one mode. Returns the mode's raw + * output; the dispatcher wraps it in a {@link DelegatedLoopResult}. */ +export type DelegatedLoopRunner = (signal: AbortSignal) => Promise + +/** @experimental Mode → configured runner. Partial: only register the modes a + * given product/routine actually uses. 
*/ +export type DelegatedLoopRegistry = Partial> + +/** @experimental Uniform result — never throws from a registered runner; a + * thrown engine becomes `{ ok: false, error }` so a routine can record + move on. */ +export interface DelegatedLoopResult { + mode: DelegatedLoopMode + ok: boolean + output?: T + error?: string + durationMs: number +} + +/** @experimental */ +export interface RunDelegatedLoopOptions { + signal?: AbortSignal + /** Clock override for deterministic tests. */ + now?: () => number +} + +/** + * @experimental + * + * Dispatch a configured loop by mode. Fails loud (throws `ConfigError`) when no + * runner is registered for the mode — a routine pointed at an unwired mode is a + * config bug, not a silent no-op. A runner that throws is captured as + * `{ ok: false }` so unattended runs record the failure rather than crash. + */ +export async function runDelegatedLoop( + mode: DelegatedLoopMode, + registry: DelegatedLoopRegistry, + options: RunDelegatedLoopOptions = {}, +): Promise> { + const runner = registry[mode] as DelegatedLoopRunner | undefined + if (!runner) { + throw new ConfigError( + `runDelegatedLoop: no runner registered for mode '${mode}' (registered: ${ + Object.keys(registry).join(', ') || 'none' + })`, + ) + } + const now = options.now ?? Date.now + const signal = options.signal ?? new AbortController().signal + const start = now() + try { + const output = await runner(signal) + return { mode, ok: true, output, durationMs: now() - start } + } catch (err) { + return { + mode, + ok: false, + error: err instanceof Error ? err.message : String(err), + durationMs: now() - start, + } + } +} + +/** @experimental Options for the default `code`/`review` runner. */ +export interface CoderLoopRunnerOptions { + sandboxClient: LoopSandboxClient + /** What to build — the delegate args (goal, repoRoot, variants, config, …). */ + args: DelegateCodeArgs + /** Adversarial reviewer. REQUIRED for `review` mode (see `reviewLoopRunner`). 
*/ + reviewer?: CoderReviewer + /** Winner-selection strategy. Default `highest-score`. */ + winnerSelection?: CoderWinnerSelection + /** Harnesses for `variants > 1` fanout. */ + fanoutHarnesses?: string[] +} + +/** @experimental Build a `code`-mode runner over the hardened coder delegate. */ +export function coderLoopRunner(options: CoderLoopRunnerOptions): DelegatedLoopRunner { + const delegate = createDefaultCoderDelegate({ + sandboxClient: options.sandboxClient, + ...(options.reviewer ? { reviewer: options.reviewer } : {}), + ...(options.winnerSelection ? { winnerSelection: options.winnerSelection } : {}), + ...(options.fanoutHarnesses ? { fanoutHarnesses: options.fanoutHarnesses } : {}), + }) + return async (signal) => { + const ctx: DelegateRunCtx = { signal, report: () => {} } + return delegate(options.args, ctx) + } +} + +/** + * @experimental + * + * `review` mode = `code` with a REQUIRED reviewer. The gate is the whole point, + * so the type forces a reviewer (a "review loop" with no reviewer is a code loop). 
+ */ +export function reviewLoopRunner( + options: CoderLoopRunnerOptions & { reviewer: CoderReviewer }, +): DelegatedLoopRunner { + return coderLoopRunner(options) +} diff --git a/tests/loop-runner.test.ts b/tests/loop-runner.test.ts new file mode 100644 index 0000000..2e67dc3 --- /dev/null +++ b/tests/loop-runner.test.ts @@ -0,0 +1,71 @@ +import type { CreateSandboxOptions, SandboxEvent, SandboxInstance } from '@tangle-network/sandbox' +import { describe, expect, it } from 'vitest' +import { ConfigError } from '../src/errors' +import { coderLoopRunner, type DelegatedLoopRegistry, runDelegatedLoop } from '../src/loop-runner' +import type { CoderOutput } from '../src/profiles/coder' + +const clock = () => { + let t = 0 + return () => (t += 100) +} + +describe('runDelegatedLoop — mode dispatch', () => { + it('routes to the registered runner and returns a uniform ok result', async () => { + const registry: DelegatedLoopRegistry = { + research: async () => ({ grounded: 3 }), + } + const r = await runDelegatedLoop('research', registry, { now: clock() }) + expect(r.mode).toBe('research') + expect(r.ok).toBe(true) + expect(r.output).toEqual({ grounded: 3 }) + expect(r.durationMs).toBeGreaterThan(0) + }) + + it('fails loud (ConfigError) on a mode with no registered runner', async () => { + await expect(runDelegatedLoop('audit', {})).rejects.toThrow(ConfigError) + await expect(runDelegatedLoop('audit', {})).rejects.toThrow( + /no runner registered for mode 'audit'/, + ) + }) + + it('captures a thrown engine as ok:false (unattended runs record, not crash)', async () => { + const registry: DelegatedLoopRegistry = { + 'self-improve': async () => { + throw new Error('reflection model 502') + }, + } + const r = await runDelegatedLoop('self-improve', registry, { now: clock() }) + expect(r.ok).toBe(false) + expect(r.error).toBe('reflection model 502') + expect(r.durationMs).toBeGreaterThan(0) + }) +}) + +describe('coderLoopRunner — code mode over the hardened delegate', () => { + 
it('runs the coder delegate and returns its winning CoderOutput', async () => { + const out: CoderOutput = { + branch: 'feat/fix', + patch: 'diff --git a/src/x.ts b/src/x.ts\n--- a/src/x.ts\n+++ b/src/x.ts\n+ok\n', + testResult: { passed: true, output: 'ok' }, + typecheckResult: { passed: true, output: 'ok' }, + diffStats: { filesChanged: 1, insertions: 1, deletions: 0 }, + } + const sandboxClient = { + async create(_o?: CreateSandboxOptions): Promise { + return { + async *streamPrompt() { + yield { type: 'result', data: { result: out } } satisfies SandboxEvent + }, + } as unknown as SandboxInstance + }, + } + const runner = coderLoopRunner({ + sandboxClient, + args: { goal: 'fix x', repoRoot: '/repo' }, + }) + const registry: DelegatedLoopRegistry = { code: runner } + const r = await runDelegatedLoop('code', registry) + expect(r.ok).toBe(true) + expect(r.output?.branch).toBe('feat/fix') + }) +}) From eea49c9549a603b7b393457fb0371e52b338e8bd Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Sun, 31 May 2026 03:35:18 -0600 Subject: [PATCH 5/9] =?UTF-8?q?chore(release):=200.37.0=20=E2=80=94=20runD?= =?UTF-8?q?elegatedLoop=20configured=20loop-runner=20over=20the=20hardened?= =?UTF-8?q?=20engines=20(#828)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 556408a..3081efb 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@tangle-network/agent-runtime", - "version": "0.36.0", + "version": "0.37.0", "description": "Reusable runtime lifecycle for domain-specific agents.", "homepage": "https://github.com/tangle-network/agent-runtime#readme", "repository": { From dc589965ec8d351ad6d66fff26d22582d87f3c99 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Sun, 31 May 2026 03:41:26 -0600 Subject: [PATCH 6/9] feat(loop-runner): default runner factories for all six modes MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rounds out the configured loop-runner (#828) — every mode now has a default factory wiring a shipped engine, so a routine can run any of them with config only (still registry-injectable for stubs/custom engines): - dynamicLoopRunner — runLoop + createDynamicDriver (agent-authored topology) - researchLoopRunner — research-in-a-loop with valid-only KB growth: each round research → createKbGate (fail-closed) → accept clean facts, re-research vetoed ones up to maxRounds (correct-on-veto), and RETURN final vetoes (escalate, never silently drop). VetoedFact carries the gate reason. - selfImproveLoopRunner — optimizePrompt (identity-gated) - auditLoopRunner — runAnalystLoop over captured trace/run data (code/review shipped previously.) Tests: research single-round accept/veto + escalation, research correct-on-veto across rounds, dynamic real runLoop via stub. Full suite 427 green, tsc + biome clean. Completes the engine (#827 target) + runner; the thin scheduled-routine wrapper is the only remaining layer. 
--- src/index.ts | 8 +++ src/loop-runner.ts | 137 +++++++++++++++++++++++++++++++++++++- tests/loop-runner.test.ts | 92 +++++++++++++++++++++++++ 3 files changed, 236 insertions(+), 1 deletion(-) diff --git a/src/index.ts b/src/index.ts index 3cda443..e86335e 100644 --- a/src/index.ts +++ b/src/index.ts @@ -107,15 +107,23 @@ export { } from './errors' // ── Delegated loop-runner (configured code/research/review/audit/self-improve/dynamic) ── export { + auditLoopRunner, type CoderLoopRunnerOptions, coderLoopRunner, type DelegatedLoopMode, type DelegatedLoopRegistry, type DelegatedLoopResult, type DelegatedLoopRunner, + type DynamicLoopRunnerOptions, + dynamicLoopRunner, + type ResearchLoopResult, + type ResearchLoopRunnerOptions, type RunDelegatedLoopOptions, + researchLoopRunner, reviewLoopRunner, runDelegatedLoop, + selfImproveLoopRunner, + type VetoedFact, } from './loop-runner' // ── MCP → OpenAI tools projection ──────────────────────────────────── // Helper for eval / orchestrator code that routes through the diff --git a/src/loop-runner.ts b/src/loop-runner.ts index 058aba4..d93876b 100644 --- a/src/loop-runner.ts +++ b/src/loop-runner.ts @@ -21,14 +21,33 @@ * with the factories below, or inject your own / a stub). 
*/ +import type { Scenario } from '@tangle-network/agent-eval/campaign' +import { runAnalystLoop } from './analyst-loop' +import type { RunAnalystLoopOpts, RunAnalystLoopResult } from './analyst-loop/types' import { ConfigError } from './errors' -import type { LoopSandboxClient } from './loops' +import { + type OptimizePromptOptions, + type OptimizePromptResult, + optimizePrompt, +} from './improvement/optimize-prompt' +import { + type AgentRunSpec, + createDynamicDriver, + type DynamicDecision, + type LoopResult, + type LoopSandboxClient, + type OutputAdapter, + runLoop, + type TopologyPlanner, + type Validator, +} from './loops' import { type CoderReviewer, type CoderWinnerSelection, createDefaultCoderDelegate, type DelegateRunCtx, } from './mcp/delegates' +import { type CreateKbGateOptions, createKbGate, type FactCandidate } from './mcp/kb-gate' import type { DelegateCodeArgs } from './mcp/types' import type { CoderOutput } from './profiles/coder' @@ -141,3 +160,119 @@ export function reviewLoopRunner( ): DelegatedLoopRunner { return coderLoopRunner(options) } + +/** @experimental Options for the default `dynamic` runner. */ +export interface DynamicLoopRunnerOptions { + sandboxClient: LoopSandboxClient + /** The agent-authored topology planner (e.g. `createSandboxPlanner(...)`). */ + planner: TopologyPlanner + task: Task + output: OutputAdapter + validator?: Validator + /** Exactly one of `agentRun` / `agentRuns` (runLoop validates). */ + agentRun?: AgentRunSpec + agentRuns?: AgentRunSpec[] + maxIterations?: number + maxFanout?: number +} + +/** @experimental `dynamic` mode — agent-authored topology over `runLoop`. */ +export function dynamicLoopRunner( + o: DynamicLoopRunnerOptions, +): DelegatedLoopRunner> { + return async (signal) => + runLoop({ + driver: createDynamicDriver({ + planner: o.planner, + ...(o.maxIterations !== undefined ? { maxIterations: o.maxIterations } : {}), + ...(o.maxFanout !== undefined ? 
{ maxFanout: o.maxFanout } : {}), + }), + ...(o.agentRun ? { agentRun: o.agentRun } : {}), + ...(o.agentRuns ? { agentRuns: o.agentRuns } : {}), + output: o.output, + ...(o.validator ? { validator: o.validator } : {}), + task: o.task, + ctx: { sandboxClient: o.sandboxClient, signal }, + ...(o.maxIterations !== undefined ? { maxIterations: o.maxIterations } : {}), + }) +} + +/** @experimental A fact rejected at the KB gate — surfaced, never dropped. */ +export interface VetoedFact { + candidate: FactCandidate + vetoedBy?: string + reason?: string +} + +/** @experimental */ +export interface ResearchLoopResult { + /** Facts that passed the fail-closed gate — safe to write to the KB. */ + accepted: FactCandidate[] + /** Facts the gate vetoed in the final round — escalate, do not silently drop. */ + vetoed: VetoedFact[] + /** Research rounds actually run. */ + rounds: number +} + +/** @experimental Options for the default `research` runner. */ +export interface ResearchLoopRunnerOptions { + /** + * The research engine (the consumer's web/doc searcher + extractor). Called + * each round with the prior round's vetoes so it can re-research the gaps. + * Returns fact candidates carrying their grounding (`verbatimPassage` + + * `sourceText`). + */ + research: (round: number, vetoed: VetoedFact[]) => Promise + /** Gate config (extra judges, self-artifact kinds, …). The floor is always on. */ + gate?: CreateKbGateOptions + /** Max research rounds (correct-on-veto remediation). Default 1. */ + maxRounds?: number +} + +/** + * @experimental `research` mode — research-in-a-loop with valid-only KB growth. + * + * Each round: research → gate every candidate (fail-closed; passage MUST be in + * the source) → accept the clean ones → re-research the vetoed ones next round, + * up to `maxRounds`. Vetoed facts in the final round are RETURNED (escalate, + * never silently dropped) so the caller audits vs retries. 
+ */ +export function researchLoopRunner( + o: ResearchLoopRunnerOptions, +): DelegatedLoopRunner { + const gate = createKbGate(o.gate) + const maxRounds = Math.max(1, Math.trunc(o.maxRounds ?? 1)) + return async (signal) => { + const accepted: FactCandidate[] = [] + let vetoed: VetoedFact[] = [] + let rounds = 0 + for (let round = 0; round < maxRounds; round += 1) { + if (signal.aborted) break + rounds += 1 + const candidates = await o.research(round, vetoed) + if (candidates.length === 0) break + vetoed = [] + for (const c of candidates) { + const v = await gate(c) + if (v.accepted) accepted.push(c) + else vetoed.push({ candidate: c, vetoedBy: v.vetoedBy, reason: v.reason }) + } + if (vetoed.length === 0) break + } + return { accepted, vetoed, rounds } + } +} + +/** @experimental `self-improve` mode — identity-gated prompt optimization. */ +export function selfImproveLoopRunner( + options: OptimizePromptOptions, +): DelegatedLoopRunner> { + return async () => optimizePrompt(options) +} + +/** @experimental `audit` mode — analyst loop over captured trace/run data. 
*/ +export function auditLoopRunner( + options: RunAnalystLoopOpts, +): DelegatedLoopRunner> { + return async () => runAnalystLoop(options) +} diff --git a/tests/loop-runner.test.ts b/tests/loop-runner.test.ts index 2e67dc3..4499c10 100644 --- a/tests/loop-runner.test.ts +++ b/tests/loop-runner.test.ts @@ -69,3 +69,95 @@ describe('coderLoopRunner — code mode over the hardened delegate', () => { expect(r.output?.branch).toBe('feat/fix') }) }) + +import { dynamicLoopRunner, researchLoopRunner, type VetoedFact } from '../src/loop-runner' +import type { AgentRunSpec, OutputAdapter, TopologyPlanner, Validator } from '../src/loops' +import type { FactCandidate } from '../src/mcp/kb-gate' + +const neverAbort = new AbortController().signal + +describe('researchLoopRunner — valid-only KB growth with remediation', () => { + const grounded: FactCandidate = { + claim: 'revenue was 100', + verbatimPassage: 'revenue was 100 in 2025', + sourceText: 'The filing notes revenue was 100 in 2025.', + } + const ungrounded: FactCandidate = { + claim: 'profit was 50', + verbatimPassage: 'profit was 50', + sourceText: 'this source says nothing of the sort', + } + + it('accepts grounded facts, vetoes ungrounded ones (single round, escalates the veto)', async () => { + const runner = researchLoopRunner({ research: async () => [grounded, ungrounded] }) + const res = await runner(neverAbort) + expect(res.rounds).toBe(1) + expect(res.accepted).toHaveLength(1) + expect(res.accepted[0]?.claim).toBe('revenue was 100') + expect(res.vetoed).toHaveLength(1) + expect(res.vetoed[0]?.vetoedBy).toBe('passage-present') + }) + + it('re-researches vetoed facts next round and accepts once grounded (correct-on-veto)', async () => { + const research = async (round: number, vetoed: VetoedFact[]): Promise => { + if (round === 0) return [grounded, ungrounded] + // round 1: re-ground the previously-vetoed candidate with a real source + return vetoed.map((v) => ({ + ...v.candidate, + sourceText: 'a better source: 
profit was 50 last year', + })) + } + const runner = researchLoopRunner({ research, maxRounds: 2 }) + const res = await runner(neverAbort) + expect(res.rounds).toBe(2) + expect(res.accepted.map((f) => f.claim).sort()).toEqual(['profit was 50', 'revenue was 100']) + expect(res.vetoed).toHaveLength(0) + }) +}) + +describe('dynamicLoopRunner — agent-authored topology over runLoop', () => { + interface T { + goal: string + } + interface O { + score: number + } + it('runs the planner-driven loop and returns a finished LoopResult', async () => { + const moves = [{ kind: 'refine' as const, task: { goal: 'g' } }, { kind: 'stop' as const }] + let i = 0 + const planner: TopologyPlanner = () => moves[i++]! + const output: OutputAdapter = { + parse: (events) => ({ score: (events.at(-1)?.data as { score?: number })?.score ?? 0 }), + } + const validator: Validator = { + async validate(o) { + return { valid: o.score >= 0.5, score: o.score } + }, + } + const spec: AgentRunSpec = { + profile: { name: 'w' }, + name: 'w', + taskToPrompt: (t) => t.goal, + } + const client = { + async create() { + return { + async *streamPrompt() { + yield { type: 'result', data: { score: 0.9 } } + }, + } as unknown as import('@tangle-network/sandbox').SandboxInstance + }, + } + const runner = dynamicLoopRunner({ + sandboxClient: client, + planner, + task: { goal: 'g' }, + output, + validator, + agentRun: spec, + }) + const res = await runner(neverAbort) + expect(res.decision).toBe('done') + expect(res.winner?.output.score).toBeCloseTo(0.9, 6) + }) +}) From 7f1f96e05a318aec889d185da1174f1fdc8e6cca Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Sun, 31 May 2026 03:41:31 -0600 Subject: [PATCH 7/9] =?UTF-8?q?chore(release):=200.38.0=20=E2=80=94=20loop?= =?UTF-8?q?-runner=20default=20factories=20for=20all=20six=20modes=20(#828?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/package.json b/package.json index 3081efb..8afd346 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@tangle-network/agent-runtime", - "version": "0.37.0", + "version": "0.38.0", "description": "Reusable runtime lifecycle for domain-specific agents.", "homepage": "https://github.com/tangle-network/agent-runtime#readme", "repository": { From 3193fb7ffdebe2b580a21809f52e4b1218a57f6e Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Sun, 31 May 2026 03:47:19 -0600 Subject: [PATCH 8/9] =?UTF-8?q?feat(loop-runner):=20agent-runtime-loop=20b?= =?UTF-8?q?in=20=E2=80=94=20the=20schedulable=20entrypoint?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the loop-runner (#828): a cron/routine/Makefile invokes `agent-runtime-loop --mode <mode> --config <module>`. The config module wires the DelegatedLoopRegistry (with full env/creds access — deps live there, not in the generic bin), the bin runs the mode, prints the DelegatedLoopResult as JSON, exits 0 ok / 1 recorded-failure / 2 usage-or-config-error. - runLoopRunnerCli: pure, IO-free CLI core (mode validation → load registry → dispatch → exit code) — exported + unit-tested. - parseLoopRunnerArgv, DELEGATED_LOOP_MODES, isDelegatedLoopMode exported. - New bin `agent-runtime-loop` → dist/loop-runner-bin.js (tsup entry + package bin). Tests: argv parsing (space + = forms), exit 0/1/2 paths (success, recorded failure, unknown mode, no-runner-for-mode, config load failure). Full suite green, tsc + biome clean. 
--- package.json | 3 +- src/index.ts | 8 +++ src/loop-runner-bin.ts | 126 ++++++++++++++++++++++++++++++++++ src/loop-runner.ts | 23 +++++-- tests/loop-runner-bin.test.ts | 64 +++++++++++++++++ tsup.config.ts | 1 + 6 files changed, 217 insertions(+), 8 deletions(-) create mode 100644 src/loop-runner-bin.ts create mode 100644 tests/loop-runner-bin.test.ts diff --git a/package.json b/package.json index 8afd346..f2d1fed 100644 --- a/package.json +++ b/package.json @@ -56,7 +56,8 @@ } }, "bin": { - "agent-runtime-mcp": "./dist/mcp/bin.js" + "agent-runtime-mcp": "./dist/mcp/bin.js", + "agent-runtime-loop": "./dist/loop-runner-bin.js" }, "files": [ "dist", diff --git a/src/index.ts b/src/index.ts index e86335e..ae1ded6 100644 --- a/src/index.ts +++ b/src/index.ts @@ -110,12 +110,14 @@ export { auditLoopRunner, type CoderLoopRunnerOptions, coderLoopRunner, + DELEGATED_LOOP_MODES, type DelegatedLoopMode, type DelegatedLoopRegistry, type DelegatedLoopResult, type DelegatedLoopRunner, type DynamicLoopRunnerOptions, dynamicLoopRunner, + isDelegatedLoopMode, type ResearchLoopResult, type ResearchLoopRunnerOptions, type RunDelegatedLoopOptions, @@ -125,6 +127,12 @@ export { selfImproveLoopRunner, type VetoedFact, } from './loop-runner' +export { + type LoopRunnerCliArgs, + type LoopRunnerCliResult, + parseLoopRunnerArgv, + runLoopRunnerCli, +} from './loop-runner-bin' // ── MCP → OpenAI tools projection ──────────────────────────────────── // Helper for eval / orchestrator code that routes through the // OpenAI-compat backend and needs the 5 delegation tools surfaced to diff --git a/src/loop-runner-bin.ts b/src/loop-runner-bin.ts new file mode 100644 index 0000000..de9981d --- /dev/null +++ b/src/loop-runner-bin.ts @@ -0,0 +1,126 @@ +#!/usr/bin/env node +/** + * @experimental + * + * `agent-runtime-loop` — the schedulable entrypoint for the configured + * delegated loop-runner. 
A cron job / routine / Makefile target invokes: + * + * agent-runtime-loop --mode research --config ./loops.config.js + * + * The config module wires the registry (with full access to env / creds — + * which is why the deps live there, not in this generic bin). It must default- + * export a `DelegatedLoopRegistry`, or a `() => DelegatedLoopRegistry | Promise<…>`. + * The bin runs the selected mode, prints the `DelegatedLoopResult` as JSON, and + * exits 0 on `ok`, 1 on a recorded failure, 2 on a usage/config error. + */ + +import { + DELEGATED_LOOP_MODES, + type DelegatedLoopMode, + type DelegatedLoopRegistry, + type DelegatedLoopResult, + isDelegatedLoopMode, + runDelegatedLoop, +} from './loop-runner' + +/** @experimental Parsed CLI invocation. */ +export interface LoopRunnerCliArgs { + mode: string + /** Loads the registry — the bin wires this from `--config`; tests inject a stub. */ + loadRegistry: () => Promise | DelegatedLoopRegistry + now?: () => number +} + +/** @experimental */ +export interface LoopRunnerCliResult { + exitCode: number + result?: DelegatedLoopResult + error?: string +} + +/** + * @experimental + * + * Pure CLI core (no process / argv / IO) so it's unit-testable: validate the + * mode, load the registry, dispatch, map to an exit code (0 ok / 1 failed / + * 2 usage). Exported for embedding in custom runners + tests. 
+ */ +export async function runLoopRunnerCli(args: LoopRunnerCliArgs): Promise { + if (!isDelegatedLoopMode(args.mode)) { + return { + exitCode: 2, + error: `unknown mode '${args.mode}' (expected one of: ${DELEGATED_LOOP_MODES.join(', ')})`, + } + } + let registry: DelegatedLoopRegistry + try { + registry = await args.loadRegistry() + } catch (err) { + return { exitCode: 2, error: `failed to load registry: ${errMsg(err)}` } + } + if (!registry[args.mode]) { + return { + exitCode: 2, + error: `config registers no runner for mode '${args.mode}' (registered: ${ + Object.keys(registry).join(', ') || 'none' + })`, + } + } + // runDelegatedLoop throws only on a missing runner (guarded above); a failing + // engine is captured as { ok: false } → exit 1, not a crash. + const result = await runDelegatedLoop(args.mode as DelegatedLoopMode, registry, { + ...(args.now ? { now: args.now } : {}), + }) + return { exitCode: result.ok ? 0 : 1, result } +} + +/** Parse `--mode X --config Y` from an argv tail (`process.argv.slice(2)`). */ +export function parseLoopRunnerArgv(argv: string[]): { mode?: string; config?: string } { + const out: { mode?: string; config?: string } = {} + for (let i = 0; i < argv.length; i += 1) { + const a = argv[i] + if (a === '--mode') out.mode = argv[++i] + else if (a === '--config') out.config = argv[++i] + else if (a?.startsWith('--mode=')) out.mode = a.slice('--mode='.length) + else if (a?.startsWith('--config=')) out.config = a.slice('--config='.length) + } + return out +} + +/** Normalize a config module's default export → a registry. */ +function resolveRegistry(mod: unknown): DelegatedLoopRegistry { + const def = (mod as { default?: unknown })?.default ?? mod + const value = typeof def === 'function' ? (def as () => unknown)() : def + return value as DelegatedLoopRegistry +} + +function errMsg(err: unknown): string { + return err instanceof Error ? err.message : String(err) +} + +/** The argv → IO → exit shell. 
Kept thin; logic lives in `runLoopRunnerCli`. */ +async function main(): Promise<void> { + const { mode, config } = parseLoopRunnerArgv(process.argv.slice(2)) + if (!mode || !config) { + process.stderr.write( + 'usage: agent-runtime-loop --mode <mode> --config <module>\n' + + ` modes: ${DELEGATED_LOOP_MODES.join(' | ')}\n` + + ' config: a JS/TS module default-exporting a DelegatedLoopRegistry (or a factory)\n', + ) + process.exit(2) + } + const { pathToFileURL } = await import('node:url') + const { resolve } = await import('node:path') + const cli = await runLoopRunnerCli({ + mode, + loadRegistry: async () => resolveRegistry(await import(pathToFileURL(resolve(config)).href)), + }) + process.stdout.write(`${JSON.stringify(cli.result ?? { error: cli.error }, null, 2)}\n`) + if (cli.error) process.stderr.write(`${cli.error}\n`) + process.exit(cli.exitCode) +} + +// Run only when executed as the bin (not when imported for the testable core). npm/yarn symlink the bin as `agent-runtime-loop` and Node does not realpath argv[1], so the installed command name must match too. +if (process.argv[1] && /(loop-runner-bin\.(js|ts|mjs)|agent-runtime-loop)$/.test(process.argv[1])) { + void main() +} diff --git a/src/loop-runner.ts b/src/loop-runner.ts index d93876b..ede4bb8 100644 --- a/src/loop-runner.ts +++ b/src/loop-runner.ts @@ -51,14 +51,23 @@ import { type CreateKbGateOptions, createKbGate, type FactCandidate } from './mc import type { DelegateCodeArgs } from './mcp/types' import type { CoderOutput } from './profiles/coder' +/** @experimental Every delegated-loop mode, for validation + CLI surfaces. */ +export const DELEGATED_LOOP_MODES = [ + 'code', + 'review', + 'research', + 'audit', + 'self-improve', + 'dynamic', +] as const + /** @experimental */ -export type DelegatedLoopMode = - | 'code' - | 'review' - | 'research' - | 'audit' - | 'self-improve' - | 'dynamic' +export type DelegatedLoopMode = (typeof DELEGATED_LOOP_MODES)[number] + +/** @experimental Type guard for an untrusted mode string (CLI / config input). 
*/ +export function isDelegatedLoopMode(value: unknown): value is DelegatedLoopMode { + return typeof value === 'string' && (DELEGATED_LOOP_MODES as readonly string[]).includes(value) +} /** @experimental A pre-configured loop for one mode. Returns the mode's raw * output; the dispatcher wraps it in a {@link DelegatedLoopResult}. */ diff --git a/tests/loop-runner-bin.test.ts b/tests/loop-runner-bin.test.ts new file mode 100644 index 0000000..27420a6 --- /dev/null +++ b/tests/loop-runner-bin.test.ts @@ -0,0 +1,64 @@ +import { describe, expect, it } from 'vitest' +import type { DelegatedLoopRegistry } from '../src/loop-runner' +import { parseLoopRunnerArgv, runLoopRunnerCli } from '../src/loop-runner-bin' + +describe('parseLoopRunnerArgv', () => { + it('parses --mode/--config in both space and = forms', () => { + expect(parseLoopRunnerArgv(['--mode', 'research', '--config', './c.js'])).toEqual({ + mode: 'research', + config: './c.js', + }) + expect(parseLoopRunnerArgv(['--mode=code', '--config=./c.js'])).toEqual({ + mode: 'code', + config: './c.js', + }) + }) +}) + +describe('runLoopRunnerCli', () => { + it('exit 0 when the selected runner succeeds', async () => { + const registry: DelegatedLoopRegistry = { research: async () => ({ grounded: 2 }) } + const r = await runLoopRunnerCli({ mode: 'research', loadRegistry: () => registry }) + expect(r.exitCode).toBe(0) + expect(r.result?.ok).toBe(true) + expect(r.result?.output).toEqual({ grounded: 2 }) + }) + + it('exit 1 when the runner fails (recorded, not crashed)', async () => { + const registry: DelegatedLoopRegistry = { + 'self-improve': async () => { + throw new Error('router 502') + }, + } + const r = await runLoopRunnerCli({ mode: 'self-improve', loadRegistry: () => registry }) + expect(r.exitCode).toBe(1) + expect(r.result?.ok).toBe(false) + expect(r.result?.error).toBe('router 502') + }) + + it('exit 2 on an unknown mode', async () => { + const r = await runLoopRunnerCli({ mode: 'nonsense', loadRegistry: () => 
({}) }) + expect(r.exitCode).toBe(2) + expect(r.error).toMatch(/unknown mode/) + }) + + it('exit 2 when the config registers no runner for the mode', async () => { + const r = await runLoopRunnerCli({ + mode: 'audit', + loadRegistry: () => ({ code: async () => 1 }), + }) + expect(r.exitCode).toBe(2) + expect(r.error).toMatch(/registers no runner for mode 'audit'/) + }) + + it('exit 2 when the config module fails to load', async () => { + const r = await runLoopRunnerCli({ + mode: 'code', + loadRegistry: () => { + throw new Error('module not found') + }, + }) + expect(r.exitCode).toBe(2) + expect(r.error).toMatch(/failed to load registry/) + }) +}) diff --git a/tsup.config.ts b/tsup.config.ts index 67381f8..8c5a06a 100644 --- a/tsup.config.ts +++ b/tsup.config.ts @@ -11,6 +11,7 @@ export default defineConfig({ profiles: 'src/profiles/index.ts', 'mcp/index': 'src/mcp/index.ts', 'mcp/bin': 'src/mcp/bin.ts', + 'loop-runner-bin': 'src/loop-runner-bin.ts', }, format: ['esm'], dts: true, From 210c43c0acd2f2a7649e22c6025b31bd76006961 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Sun, 31 May 2026 03:47:25 -0600 Subject: [PATCH 9/9] =?UTF-8?q?chore(release):=200.39.0=20=E2=80=94=20agen?= =?UTF-8?q?t-runtime-loop=20schedulable=20bin=20(#828=20complete)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index f2d1fed..e3d39c1 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@tangle-network/agent-runtime", - "version": "0.38.0", + "version": "0.39.0", "description": "Reusable runtime lifecycle for domain-specific agents.", "homepage": "https://github.com/tangle-network/agent-runtime#readme", "repository": {