tangle-network · drewstone · Jun 4, 2026 · Jun 4, 2026 · Jun 4, 2026 · Jun 4, 2026
diff --git a/bench/src/drivers/flat-harness.ts b/bench/src/drivers/flat-harness.ts
diff --git a/bench/src/drivers/llm-meta-driver.ts b/bench/src/drivers/llm-meta-driver.ts
diff --git a/bench/src/drivers/progressive-widening.ts b/bench/src/drivers/progressive-widening.ts
@@ -0,0 +1,223 @@
+/**
+ * @experimental
+ *
+ * Coded progressive-widening driver — the CONTROL variant of the recursive execution
+ * atom's two driver-act bodies (the LLM meta-driver in `./llm-meta-driver.ts` is the
+ * treatment). Both share the `WidenGate` below.
+ *
+ * The policy (MCTS progressive widening, the governor that keeps "full generality" from
+ * becoming "boil the ocean"): seed a NARROW frontier (one child per seed), then react to
+ * each `scope.next()` completion. A node widens — spawns ONE more child toward the same
+ * promising lineage under the conserved pool — only when the `WidenGate` says so. No
+ * eager fan-out: the frontier grows by at most one per settlement, bounded by the
+ * conserved budget reservation (`scope.spawn` fails closed when the pool can't cover it).
+ *
+ * Two firewall invariants this driver upholds by construction (critique R2):
+ *  - `WidenGate` DEFAULTS TO FLAT: `defaultWidenGate.shouldWiden` returns false for every
+ *    settlement, so a gate run never widens and the selector≠judge conflict stays dormant.
+ *  - When widening IS enabled, `promising` is derived from TRACE findings (the `analyze`
+ *    hook → `AnalystFinding[]`), NEVER from a raw `verdict.score`. Reading the judge
+ *    verdict for a spawn decision requires the gate's explicit `judgeExempt: true` (off by
+ *    default) — the documented escape hatch that re-couples steering to the judge.
+ *
+ * Selection stays single-sourced: settled children adapt to `Iteration` via
+ * `settledToIteration` and `defaultSelectWinner` picks the winner — the driver never
+ * forks the kernel's argmax (selector ≠ judge).
+ */
+
+import type { AnalystFinding } from '@tangle-network/agent-eval'
+import { defaultSelectWinner } from '../../../src/loops/run-loop.ts'
+import { settledToIteration } from '../../../src/loops/supervise/scope.ts'
+import type {
+  Agent,
+  AgentSpec,
+  Budget,
+  DefaultVerdict,
+  Scope,
+  Settled,
+  WidenGate,
+} from '../../../src/loops/supervise/types.ts'
+
+/** A child the driver can spawn: a leaf `Agent`, the task to hand it, and a label. There
+ *  is no separate spec field: the `AgentSpec` (`harness: null` → router/inline;
+ *  `BackendType` → sandbox; or a BYO `executor`) must ride on the agent itself as
+ *  `executorSpec`. `asSpawnable` throws before spawning if that field is absent. */
+export interface ChildAgent<Out> {
+  readonly agent: Agent<unknown, Out> // must carry `executorSpec` with `profile` + `harness`
+  readonly task: unknown // forwarded as the task argument of `scope.spawn`
+  readonly label: string // forwarded as the spawn `label`; also named in the missing-spec error
+}
+
+/** One child to spawn plus its per-child budget — returned by both `seed` and `widen`. */
+export interface WideningSeed<Out> {
+  readonly child: ChildAgent<Out>
+  readonly budget: Budget // forwarded as the `budget` option of `scope.spawn`
+}
+
+/**
+ * Trace-analyst hook: read a settled child's TRACE (rehydrated `out` + lineage) into
+ * `AnalystFinding[]`. This is the analyst→driver wire (mirrors `PlannerContext.analyses`)
+ * and the ONLY signal `isPromising` reads unless the gate is `judgeExempt`. The hook
+ * MUST return trace-derived findings. The driver invokes it only for `done` settlements,
+ * and only after `gate.shouldWiden` has returned true for that settlement.
+ */
+export type AnalyzeSettled<Out> = (
+  settled: Extract<Settled<Out>, { kind: 'done' }>,
+) => Promise<ReadonlyArray<AnalystFinding>>
+
+export interface ProgressiveWideningOptions<Out> {
+  readonly name?: string // agent name; the driver falls back to 'progressive-widening'
+  /** The narrow initial frontier — one child per seed, no eager fan-out. */
+  readonly seed: (task: unknown) => ReadonlyArray<WideningSeed<Out>>
+  /** Build the next child to widen toward a promising lineage. Returns `null` to stop
+   *  widening this lineage (e.g. the lineage has converged). */
+  readonly widen: (
+    settled: Extract<Settled<Out>, { kind: 'done' }>,
+    findings: ReadonlyArray<AnalystFinding>,
+  ) => WideningSeed<Out> | null
+  /** Trace-analyst wire feeding `isPromising`. When omitted the findings are `[]`, so
+   *  nothing is promising unless the gate is `judgeExempt`. */
+  readonly analyze?: AnalyzeSettled<Out>
+  /** The widening governor. Defaults to `defaultWidenGate` (flat — never widens). */
+  readonly gate?: WidenGate<Out>
+}
+
+/**
+ * Build the coded progressive-widening `Agent`. Its `act` body is the control policy:
+ * seed narrow → react to each `next()` → widen toward a promising lineage under budget →
+ * return the output of the winner `defaultSelectWinner` picks among the `done` children.
+ * `WidenGate` defaults flat, so with no `gate` supplied this is exactly the "spawn the
+ * seeds, pick the best" flat harness. `act` throws when no winner can be selected.
+ */
+export function createProgressiveWideningDriver<Out>(
+  opts: ProgressiveWideningOptions<Out>,
+): Agent<unknown, Out> {
+  const gate = opts.gate ?? defaultWidenGate<Out>()
+  const analyze = opts.analyze
+
+  return {
+    name: opts.name ?? 'progressive-widening',
+    async act(task: unknown, scope: Scope<Out>): Promise<Out> {
+      // Seed the NARROW frontier: one `scope.spawn` per seed. `asSpawnable` throws first if
+      // a child's agent lacks an `executorSpec`. NOTE(review): the spawn result is ignored —
+      // this presumes `scope.spawn` itself fails closed on admission; confirm in scope.ts.
+      for (const s of opts.seed(task)) {
+        scope.spawn(asSpawnable(s.child), s.child.task, { budget: s.budget, label: s.child.label })
+      }
+
+      const done: Array<Extract<Settled<Out>, { kind: 'done' }>> = []
+      // React to settlements one at a time (ray.wait n=1). `next()` is null only when the
+      // live set is empty — every spawned child eventually settles done or down.
+      for (let settled = await scope.next(); settled !== null; settled = await scope.next()) {
+        if (settled.kind === 'down') continue // infra/bad child: excluded from merge n + equal-k
+        done.push(settled)
+
+        // Progressive widening: spawn AT MOST one more child toward this lineage. Order
+        // matters: the cheap `gate.shouldWiden` check runs first, so `analyze` is only paid
+        // for when the gate allows a widen; `isPromising` then decides from the findings.
+        if (!gate.shouldWiden(settled, scope.budget)) continue
+        const findings = analyze ? await analyze(settled) : []
+        if (!isPromising(findings, gate, settled)) continue
+        const next = opts.widen(settled, findings)
+        if (next === null) continue
+        scope.spawn(asSpawnable(next.child), next.child.task, {
+          budget: next.budget,
+          label: next.child.label,
+        })
+      }
+
+      // Single-sourced selection: adapt the done children to the kernel's Iteration shape
+      // and let `defaultSelectWinner` pick (best-valid-score, ties → earliest). The driver
+      // does NOT fork the argmax (selector ≠ judge).
+      const iterations = done.map((s) => settledToIteration(s))
+      const winner = defaultSelectWinner(iterations)
+      if (!winner) {
+        throw new Error(
+          'progressive-widening: no done child to select a winner from (all children were down)',
+        )
+      }
+      return winner.output as Out
+    },
+  }
+}
+
+/**
+ * The flat-by-default widening governor (the shared `WidenGate`). `shouldWiden` ignores
+ * its arguments and returns false for EVERY settlement, so a run under this gate never
+ * widens — the firewall conflict (R2) stays dormant by construction. Pass a different
+ * `gate` (e.g. `findingsWidenGate`) to enable widening; only a gate that sets
+ * `judgeExempt: true` makes the driver read `verdict.score`.
+ */
+export function defaultWidenGate<Out>(): WidenGate<Out> {
+  return {
+    shouldWiden(): boolean {
+      return false
+    },
+  }
+}
+
+/**
+ * A findings-driven widening gate (opt-in, never the default). `shouldWiden` itself is
+ * only the budget guard: it allows a widen while `budget.tokensLeft >= minTokensLeft`, so
+ * a widen cannot starve the pool below a usable per-child floor. The findings test (a
+ * high/critical finding carrying a `recommended_action`) is applied by `isPromising`;
+ * this gate sets no `judgeExempt`, so the verdict is never read for the spawn decision.
+ */
+export function findingsWidenGate<Out>(opts: { minTokensLeft: number }): WidenGate<Out> {
+  return {
+    shouldWiden(_settled: Settled<Out>, budget: Scope<Out>['budget']): boolean {
+      return budget.tokensLeft >= opts.minTokensLeft
+    },
+  }
+}
+
+/**
+ * Is this lineage promising enough to widen? Promise is computed from TRACE findings, not
+ * the judge verdict: a `high`/`critical` finding that names a `recommended_action` is a
+ * correctable middle band worth one more shot. Empty findings are NOT promising (flat).
+ *
+ * The ONLY path that reads `verdict.score` is the gate's explicit `judgeExempt: true`
+ * escape hatch: findings are then ignored and any score > 0 counts as promising. It
+ * re-couples steering to the judge, so it must be argued per cell and is off by default.
+ */
+function isPromising<Out>(
+  findings: ReadonlyArray<AnalystFinding>,
+  gate: WidenGate<Out>,
+  settled: Extract<Settled<Out>, { kind: 'done' }>,
+): boolean {
+  if (gate.judgeExempt === true) return judgeScore(settled.verdict) > 0
+  return findings.some(
+    (f) =>
+      (f.severity === 'high' || f.severity === 'critical') &&
+      typeof f.recommended_action === 'string' &&
+      f.recommended_action.length > 0,
+  )
+}
+
+/** Read a verdict's scalar score; 0 when the verdict is missing or `score` is not a number.
+ *  Used ONLY behind the explicit `judgeExempt` hatch — the path the firewall otherwise forbids. */
+function judgeScore(verdict: DefaultVerdict | undefined): number {
+  if (!verdict) return 0
+  const score = (verdict as { score?: unknown }).score
+  return typeof score === 'number' ? score : 0
+}
+
+/** Validate that a child is spawnable and return its agent unchanged. Nothing is attached
+ *  here: the agent must ALREADY carry an `AgentSpec` as `executorSpec` (the field
+ *  `scope.spawn` resolves the runtime from), since only the agent author knows its
+ *  profile/harness. Throws, naming the child's label, when that field fails `isAgentSpec`. */
+function asSpawnable<Out>(child: ChildAgent<Out>): Agent<unknown, Out> {
+  const carried = (child.agent as { executorSpec?: unknown }).executorSpec
+  if (!isAgentSpec(carried)) {
+    throw new Error(
+      `progressive-widening: child "${child.label}" agent carries no executorSpec (AgentSpec); cannot resolve its LeafExecutor`,
+    )
+  }
+  return child.agent
+}
+
+function isAgentSpec(value: unknown): value is AgentSpec { // structural guard used by `asSpawnable`
+  if (typeof value !== 'object' || value === null) return false // `typeof null` is 'object', so exclude it explicitly
+  const v = value as Record<string, unknown>
+  return 'profile' in v && 'harness' in v // key presence only; the values themselves are not validated
+}
diff --git a/bench/src/rsi.ts b/bench/src/rsi.ts
@@ -0,0 +1,86 @@
+/**
+ * The RSI driver experiment, instantiated. The whole thing in one file: pick a
+ * benchmark adapter, pick the steer POLICIES (the arms), run them through the one
+ * flow at equal compute, read the result. Everything else is the library
+ * (src/experiment.ts). Adding a benchmark is one import; adding a policy is one
+ * steer function.
+ *
+ *   BENCH=swe-bench N=20 ROUNDS=3 tsx src/rsi.ts
+ *
+ * Caveat: `blind`/`random` are independent fresh attempts (the compute control).
+ * A `continue` / "build on your prior work" policy is only meaningful with
+ * CONTINUED-SESSION execution (the kernel reusing one box across turns); the loop
+ * is fresh-box-per-attempt today, so it would degrade to a re-attempt. The
+ * prompt-steering policies below (critical-audit, aggressive-push) are live now.
+ */
+import { Sandbox } from '@tangle-network/sandbox'
+import { createFinsearchcompAdapter } from './benchmarks/finsearchcomp'
+import { createSweBenchAdapter } from './benchmarks/swe-bench'
+import type { BenchmarkAdapter } from './benchmarks/types'
+import { type Arm, analystArm, arm, llmAnalyst, randomArm, runExperiment, sandboxAgentRun } from './experiment'
+
+const must = (k: string): string => { // read a required env var or throw
+  const v = process.env[k]
+  if (!v) throw new Error(`env ${k} is required`) // an empty string counts as missing, same as unset
+  return v
+}
+
+// The benchmark roster, keyed by the value `main` accepts in the BENCH env var. Long-horizon
+// adapters (commit0, swe-lancer, tau2, appworld, blueprint) slot in here as one entry each.
+const ADAPTERS: Record<string, () => BenchmarkAdapter> = {
+  'swe-bench': createSweBenchAdapter,
+  finsearchcomp: createFinsearchcompAdapter,
+}
+
+async function main() { // Entry point: resolve config from env, run every policy arm, print the summary.
+  const make = ADAPTERS[process.env.BENCH ?? 'swe-bench']
+  if (!make) throw new Error(`unknown BENCH=${process.env.BENCH} (have: ${Object.keys(ADAPTERS).join(', ')})`)
+  const adapter = make()
+  const model = process.env.WORKER_MODEL ?? 'gpt-5'
+  const routerBaseUrl = process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1'
+  const routerKey = must('TANGLE_API_KEY')
+  const rounds = Number(process.env.ROUNDS || 3) // `||` not `??`: an empty `ROUNDS=` means unset (as in `must`), not 0 rounds
+  const router = { routerBaseUrl, routerKey, model }
+  const client = new Sandbox({
+    baseUrl: process.env.SANDBOX_BASE_URL ?? 'https://sandbox.tangle.tools',
+    apiKey: routerKey, // the sandbox client is given the same key as the router
+    timeoutMs: 1_200_000,
+  } as never)
+
+  // The steer policies under test. Each is an arm = a steer f(rootPrompt, history).
+  const policies: [Arm, ...Arm[]] = [
+    randomArm('blind'), // compute control: independent retries, no steer
+    analystArm('critical-audit', llmAnalyst(router)), // audit the prior attempt, steer on the findings
+    arm('aggressive-push', (root, _h, r) =>
+      r === 0 ? root : `${root}\n\nShip the most complete working end-to-end result NOW. Prefer done over polish; finish it.`),
+  ]
+
+  const corpus = process.env.CORPUS ?? `${process.cwd()}/corpus/rsi-${adapter.name}.jsonl`
+  const r = await runExperiment({
+    adapter,
+    sandboxClient: client,
+    agentRun: sandboxAgentRun({ model, routerBaseUrl, routerKey }),
+    arms: policies,
+    model,
+    rounds,
+    n: Number(process.env.N || 10), // empty `N=` falls back to the default instead of becoming 0
+    ids: process.env.IDS ? process.env.IDS.split(',').map((id) => id.trim()).filter((id) => id !== '') : undefined, // tolerate "a, b," lists
+    concurrency: Number(process.env.CONCURRENCY || 3), // empty `CONCURRENCY=` falls back to the default instead of becoming 0
+    ...(adapter.output ? { output: adapter.output } : {}),
+    corpusPath: corpus,
+  })
+
+  const pct = (x: number) => (r.n > 0 ? `${((x / r.n) * 100).toFixed(1)}%` : 'n/a') // share of the clean n; 'n/a' when nothing ran clean
+  console.log(`\n=== ${adapter.name}: ${r.arms.length} policies x rounds=${rounds} (clean n=${r.n}, excluded ${r.errored}) ===`)
+  console.log(`  blind (1 attempt): ${pct(r.blind)}`)
+  for (const a of r.arms) {
+    const tag = a.label === r.arms[0]?.label ? '  <- compute control' : `  delta vs control ${((a.deltaVsControl / Math.max(r.n, 1)) * 100).toFixed(1)}pp`
+    console.log(`  ${a.label}@${rounds}: ${pct(a.resolved)}${tag}`)
+  }
+  console.log(`corpus: ${corpus}  ->  paired CI + BH via: tsx src/corpus-report.mts ${corpus}`)
+}
+
+main().catch((e) => {
+  console.error(e instanceof Error ? (e.stack ?? e.message) : String(e)) // prefer the stack; fall back to the message, or the stringified value
+  process.exit(1) // exit with a non-zero status on any failure
+})
diff --git a/docs/README.md b/docs/README.md
@@ -14,6 +14,17 @@ Read top-to-bottom for the full picture.
 | 4 | [learning-flywheel.md](./learning-flywheel.md) | theory deep-dive | The moat thesis — the `(π, τ, J, D, O)` recursion and cross-run flywheel. Points to `architecture.md` as the canonical entry. |
 | 5 | [../bench/README.md](../bench/README.md) | empirical harness | The benchmark surface and current empirical status (what's been run, what wins, what's untested). |
 
+## Research track
+
+Forward-looking design research — surveys, multi-agent design passes, decision logs. Not the canonical spine; promotions into `architecture.md` happen explicitly once a design ships.
+
+| Doc | Role | Purpose |
+|---|---|---|
+| [research/README.md](./research/README.md) | research index | The active design thread + decision log + source-artifact pointers. |
+| [research/recursive-execution-atom.md](./research/recursive-execution-atom.md) | design (in progress) | The next generation: one recursive `Agent` atom run as a durable, observable supervision tree (drivers-of-drivers, analyst-as-agent-with-runtime, async dynamic spawning). Plane B — contains the flat harness. |
+| [research/flat-harness-design.md](./research/flat-harness-design.md) | design synthesis | Plane A — the assumption-free experiment harness (profiles × steer × executionMode × allocation). Recovered as the simplest `act` body on Plane B. |
+| [research/long-horizon-benchmark-survey.md](./research/long-horizon-benchmark-survey.md) | survey | Adversarially-verified long-horizon + multi-turn benchmark survey. Top picks: Commit0, τ²-bench. |
+
 ## Reference track
 
 The package API and subsystems.