Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 70 additions & 31 deletions examples/self-improving-loop/self-improving-loop.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,13 @@
// See README.md for the conceptual map.

import {
runJudge,
runMultishot,
type JudgeConfig,
type MultishotMessage,
type MultishotPersona,
type MultishotResult,
type MultishotShape,
runJudge,
runMultishot,
} from '@tangle-network/agent-eval/multishot'
import type { AgentProfile } from '@tangle-network/sandbox'

Expand Down Expand Up @@ -48,11 +48,16 @@ function installMockRouter(replies: ScriptedReply[]): () => void {
return {
ok: true,
status: 200,
json: async () => ({ choices: [{ message }], usage: { prompt_tokens: 100, completion_tokens: 200 } }),
json: async () => ({
choices: [{ message }],
usage: { prompt_tokens: 100, completion_tokens: 200 },
}),
text: async () => 'ok',
} as Response
}) as typeof fetch
return () => { global.fetch = original }
return () => {
global.fetch = original
}
}

// ── 2. A tiny domain — viral content scoring ─────────────────────────────────
Expand All @@ -70,8 +75,10 @@ const PERSONAS: FounderPersona[] = [
]

/**
 * Multishot conversation shape for the founder personas.
 *
 * Both builders are pure string templates over the persona's `name` and
 * `domain`; each property is defined exactly once (duplicate keys in an
 * object literal are a compile error).
 */
const shape: MultishotShape<FounderPersona> = {
  // First message text, written in the persona's own voice.
  buildOpener: (persona) =>
    `I'm ${persona.name}, ${persona.domain}. Help me write content that actually gets engagement.`,
  // System prompt text instructing the driver to role-play the persona.
  buildDriverSystemPrompt: (persona) =>
    `You are ${persona.name} working in ${persona.domain}. Push back on vague advice; demand concrete posts.`,
}

// ── 3. Baseline AgentProfile (v0) — intentionally weak ──────────────────────
Expand All @@ -84,20 +91,24 @@ const baseline: AgentProfile = {
// ── 4. Judge — scores how concrete + audience-fit the agent's output is ────

/**
 * Scoring dimensions handed to the judge.
 *
 * Each dimension is listed exactly once. The keys match the JSON fields the
 * judge prompt asks for (`concreteness`, `audience_fit`); listing a key twice
 * would make the judge config carry duplicate dimensions.
 */
const dims = [
  {
    key: 'concreteness',
    description: 'Real posts vs vague descriptions (0=descriptions, 10=ready-to-post)',
  },
  {
    key: 'audience_fit',
    description: "Tailored to the persona's domain (0=generic, 10=spot-on)",
  },
] as const

/**
 * Judge configuration that scores one transcript for one persona.
 *
 * Declared once (a second `const conversationJudge` would be a redeclaration
 * error). `buildPrompt` renders every non-tool message as `role: content`,
 * separated by blank lines, and asks for a JSON object holding the two
 * dimension scores plus free-form notes.
 */
const conversationJudge: JudgeConfig<{ transcript: MultishotMessage[]; persona: FounderPersona }> =
  {
    name: 'content-quality',
    systemPrompt: 'You are a strict judge. Output ONLY valid JSON.',
    // Spread into a fresh array because `dims` is declared `as const` (readonly tuple).
    dimensions: [...dims],
    buildPrompt: ({ transcript, persona }) => {
      // Tool messages are left out of the text shown to the judge.
      const renderedTranscript = transcript
        .filter((message) => message.role !== 'tool')
        .map((message) => `${message.role}: ${message.content}`)
        .join('\n\n')
      return `Score this agent's output for ${persona.name} (${persona.domain}). 0-10 each.\n\n${renderedTranscript}\n\nRespond with ONLY: {"concreteness":N,"audience_fit":N,"notes":"..."}`
    },
  }

// ── 5. Analyst — reads v0 transcripts + scores, proposes a mutation ────────

Expand All @@ -106,13 +117,16 @@ interface AnalystFinding {
proposedMutation: string
}

/**
 * Picks the lowest-scoring v0 run and turns it into a finding with a proposed
 * prompt mutation.
 *
 * In a real product the analyst would be an LLM call
 * (@tangle-network/agent-runtime/analyst-loop). Here the finding is
 * synthesised deterministically so the demo is reproducible.
 *
 * @param v0Runs - Baseline runs, one per persona, each with its judge score. Must not be empty.
 * @returns A finding whose `rootCause` names the worst run and its composite
 *   score, and whose `proposedMutation` is a fixed instruction string.
 * @throws Error (as a rejected promise) when `v0Runs` is empty, instead of the
 *   opaque "cannot read properties of undefined" the unguarded lookup produced.
 */
async function runAnalyst(
  v0Runs: Array<{ persona: FounderPersona; result: MultishotResult; score: { composite: number } }>,
): Promise<AnalystFinding> {
  const [firstRun, ...otherRuns] = v0Runs
  if (firstRun === undefined) {
    throw new Error('runAnalyst: v0Runs is empty — no baseline runs to analyse')
  }

  // Strict `<` keeps the earliest run on ties, matching what a stable
  // ascending sort followed by `[0]` would select.
  const worst = otherRuns.reduce(
    (lowest, run) => (run.score.composite < lowest.score.composite ? run : lowest),
    firstRun,
  )

  return {
    rootCause: `${worst.persona.name} run scored ${worst.score.composite.toFixed(1)} — output was too generic, no concrete posts.`,
    proposedMutation:
      "Always include 2 ready-to-post examples tailored to the persona's exact domain (use specific verbs, numbers, and audience language).",
  }
}

Expand All @@ -128,10 +142,19 @@ function applyMutation(base: AgentProfile, mutation: string): AgentProfile {

// ── 6. Gate — promote v1 only if it beats v0 by >= delta ───────────────────

/**
 * Promotion gate: ship v1 only if its mean score beats v0's by at least
 * `requiredDelta`.
 *
 * @param v0Mean - Mean composite score of the baseline variant.
 * @param v1Mean - Mean composite score of the candidate variant.
 * @param requiredDelta - Minimum improvement needed to ship (default 0.5, inclusive).
 * @returns `ship` (the decision), `delta` (`v1Mean - v0Mean`), and a
 *   human-readable `reason` describing the outcome.
 */
function gate(
  v0Mean: number,
  v1Mean: number,
  requiredDelta = 0.5,
): { ship: boolean; delta: number; reason: string } {
  const delta = v1Mean - v0Mean

  // A NaN delta (e.g. a mean computed over zero runs) can never satisfy the
  // gate; say so explicitly rather than reporting "beat v0 by NaN".
  if (Number.isNaN(delta)) {
    return {
      ship: false,
      delta,
      reason: `cannot compare v1 to v0: delta is NaN (v0=${v0Mean}, v1=${v1Mean})`,
    }
  }

  if (delta >= requiredDelta) {
    return { ship: true, delta, reason: `v1 beat v0 by ${delta.toFixed(2)} (>= ${requiredDelta})` }
  }

  // A negative delta is a regression; "only beat v0 by -0.30" would be wrong.
  if (delta < 0) {
    return {
      ship: false,
      delta,
      reason: `v1 regressed vs v0 by ${Math.abs(delta).toFixed(2)} (< ${requiredDelta})`,
    }
  }

  return {
    ship: false,
    delta,
    reason: `v1 only beat v0 by ${delta.toFixed(2)} (< ${requiredDelta})`,
  }
}

// ── 7. Wire it together ─────────────────────────────────────────────────────
Expand All @@ -140,7 +163,11 @@ async function runVariant(profile: AgentProfile, scriptedReplies: ScriptedReply[
const restore = installMockRouter(scriptedReplies)
process.env.TANGLE_API_KEY ??= 'test-key'
try {
const runs: Array<{ persona: FounderPersona; result: MultishotResult; score: { composite: number } }> = []
const runs: Array<{
persona: FounderPersona
result: MultishotResult
score: { composite: number }
}> = []
for (const persona of PERSONAS) {
const result = await runMultishot({ profile, persona, shape, maxTurns: 1 })
const score = await runJudge(conversationJudge, { transcript: result.transcript, persona })
Expand Down Expand Up @@ -168,7 +195,8 @@ async function main(): Promise<void> {
console.log('— Phase 1: v0 baseline run')
const v0 = await runVariant(baseline, v0Replies)
console.log(` v0 mean: ${v0.mean.toFixed(2)} (over ${v0.runs.length} personas)`)
for (const r of v0.runs) console.log(` ${r.persona.id.padEnd(14)} composite=${r.score.composite.toFixed(2)}`)
for (const r of v0.runs)
console.log(` ${r.persona.id.padEnd(14)} composite=${r.score.composite.toFixed(2)}`)

console.log('\n— Phase 2: analyst proposes mutation')
const finding = await runAnalyst(v0.runs)
Expand All @@ -180,26 +208,37 @@ async function main(): Promise<void> {

// v1 replies: now concrete + audience-fit
const v1Replies: ScriptedReply[] = [
{ text: 'Here are 2 tweets for Maya: "Just opened our 50th retailer in TX — onboarding playbook is up on Notion." / "Why we said no to Kroger: margin math + ops bandwidth."' },
{
text: 'Here are 2 tweets for Maya: "Just opened our 50th retailer in TX — onboarding playbook is up on Notion." / "Why we said no to Kroger: margin math + ops bandwidth."',
},
{ text: '{"concreteness":8,"audience_fit":9,"notes":"concrete + retail-specific"}' },
{ text: 'Here are 2 LinkedIn posts for Theo: "We cut MRR churn 32% by routing every renewal through a forecasted-risk score." / "Why your B2B PLG playbook stalls at $5M ARR (and what to do)."' },
{
text: 'Here are 2 LinkedIn posts for Theo: "We cut MRR churn 32% by routing every renewal through a forecasted-risk score." / "Why your B2B PLG playbook stalls at $5M ARR (and what to do)."',
},
{ text: '{"concreteness":9,"audience_fit":8,"notes":"B2B-specific metrics"}' },
{ text: 'Two TikTok hooks for Aurora: "POV: you finally found the foundation that matches NC15 + has SPF" / "What I wish I knew before booking my first brand deal at 50k followers."' },
{
text: 'Two TikTok hooks for Aurora: "POV: you finally found the foundation that matches NC15 + has SPF" / "What I wish I knew before booking my first brand deal at 50k followers."',
},
{ text: '{"concreteness":8,"audience_fit":9,"notes":"creator-economy-specific"}' },
]

console.log('\n— Phase 4: v1 re-run')
const v1Result = await runVariant(v1, v1Replies)
console.log(` v1 mean: ${v1Result.mean.toFixed(2)} (over ${v1Result.runs.length} personas)`)
for (const r of v1Result.runs) console.log(` ${r.persona.id.padEnd(14)} composite=${r.score.composite.toFixed(2)}`)
for (const r of v1Result.runs)
console.log(` ${r.persona.id.padEnd(14)} composite=${r.score.composite.toFixed(2)}`)

console.log('\n— Phase 5: gate decision')
const verdict = gate(v0.mean, v1Result.mean)
console.log(` ship: ${verdict.ship} | delta: ${verdict.delta >= 0 ? '+' : ''}${verdict.delta.toFixed(2)} | ${verdict.reason}`)
console.log(
` ship: ${verdict.ship} | delta: ${verdict.delta >= 0 ? '+' : ''}${verdict.delta.toFixed(2)} | ${verdict.reason}`,
)

if (verdict.ship) {
console.log('\n═══ PROMOTED v1 → production ═══')
console.log('In a real product the new systemPrompt would land in the production composer\nand subsequent chat turns would use it. See agent-eval-adoption skill Phase 3.')
console.log(
'In a real product the new systemPrompt would land in the production composer\nand subsequent chat turns would use it. See agent-eval-adoption skill Phase 3.',
)
} else {
console.log('\n═══ HELD — keep v0 ═══')
}
Expand Down
5 changes: 4 additions & 1 deletion src/mcp/bin-helpers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,10 @@ const KNOWN_HARNESSES: ReadonlyArray<LocalHarness> = ['claude', 'codex', 'openco

function parseHarnesses(raw: string | undefined): ReadonlyArray<LocalHarness> | undefined {
if (!raw) return undefined
const parts = raw.split(',').map((s) => s.trim()).filter(Boolean)
const parts = raw
.split(',')
.map((s) => s.trim())
.filter(Boolean)
if (parts.length === 0) return undefined
for (const part of parts) {
if (!KNOWN_HARNESSES.includes(part as LocalHarness)) {
Expand Down
82 changes: 57 additions & 25 deletions src/mcp/in-process-executor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,12 @@ import { randomUUID } from 'node:crypto'
import type { CreateSandboxOptions, SandboxEvent, SandboxInstance } from '@tangle-network/sandbox'
import type { LoopSandboxClient, LoopSandboxPlacement } from '../loops'
import type { DelegationExecutor } from './executor'
import { runLocalHarness, type LocalHarness } from './local-harness'
import { type LocalHarness, runLocalHarness } from './local-harness'
import {
captureWorktreeDiff,
createWorktree,
removeWorktree,
type GitRunner,
removeWorktree,
type WorktreeHandle,
} from './worktree'

Expand Down Expand Up @@ -85,7 +85,11 @@ export interface InProcessExecutorOptions {
* Test seam — override the post-check runner. Defaults to spawning the
* configured `testCmd` / `typecheckCmd` via `child_process.spawn`.
*/
runPostCheck?: (cmd: string, cwd: string, signal?: AbortSignal) => Promise<{ exitCode: number; stdout: string; stderr: string }>
runPostCheck?: (
cmd: string,
cwd: string,
signal?: AbortSignal,
) => Promise<{ exitCode: number; stdout: string; stderr: string }>
}

/** @experimental */
Expand Down Expand Up @@ -125,7 +129,10 @@ const DEFAULT_POSTCHECK_TIMEOUT_MS = 2 * 60 * 1000
* @experimental
*/
export function createInProcessExecutor(options: InProcessExecutorOptions): DelegationExecutor {
const harnesses = options.harnesses && options.harnesses.length > 0 ? [...options.harnesses] : (['claude'] as const)
const harnesses =
options.harnesses && options.harnesses.length > 0
? [...options.harnesses]
: (['claude'] as const)
const runHarness = options.runHarness ?? runLocalHarness
const runPostCheck = options.runPostCheck ?? defaultRunPostCheck

Expand All @@ -145,10 +152,21 @@ export function createInProcessExecutor(options: InProcessExecutorOptions): Dele
id: `in-process-${runId}`,
__inProcess: { runId, harness },
// eslint-disable-next-line require-yield
async *streamPrompt(this: VirtualSandbox, message: string | unknown[], promptOpts?: { signal?: AbortSignal }): AsyncGenerator<SandboxEvent> {
const taskPrompt = typeof message === 'string'
? message
: message.map((p) => (typeof p === 'object' && p && 'text' in p ? String((p as { text: unknown }).text) : '')).join('\n')
async *streamPrompt(
this: VirtualSandbox,
message: string | unknown[],
promptOpts?: { signal?: AbortSignal },
): AsyncGenerator<SandboxEvent> {
const taskPrompt =
typeof message === 'string'
? message
: message
.map((p) =>
typeof p === 'object' && p && 'text' in p
? String((p as { text: unknown }).text)
: '',
)
.join('\n')

let worktree: WorktreeHandle | undefined
try {
Expand Down Expand Up @@ -198,18 +216,22 @@ export function createInProcessExecutor(options: InProcessExecutorOptions): Dele
// Optional post-checks. Each runs in the WORKTREE so it sees the
// harness's edits.
const testCheck = options.testCmd
? await runPostCheck(options.testCmd, worktree.path, promptOpts?.signal).catch((err) => ({
exitCode: -1,
stdout: '',
stderr: err instanceof Error ? err.message : String(err),
}))
? await runPostCheck(options.testCmd, worktree.path, promptOpts?.signal).catch(
(err) => ({
exitCode: -1,
stdout: '',
stderr: err instanceof Error ? err.message : String(err),
}),
)
: { exitCode: 0, stdout: '', stderr: '' }
const typecheckCheck = options.typecheckCmd
? await runPostCheck(options.typecheckCmd, worktree.path, promptOpts?.signal).catch((err) => ({
exitCode: -1,
stdout: '',
stderr: err instanceof Error ? err.message : String(err),
}))
? await runPostCheck(options.typecheckCmd, worktree.path, promptOpts?.signal).catch(
(err) => ({
exitCode: -1,
stdout: '',
stderr: err instanceof Error ? err.message : String(err),
}),
)
: { exitCode: 0, stdout: '', stderr: '' }

const coderOutput = {
Expand All @@ -224,9 +246,10 @@ export function createInProcessExecutor(options: InProcessExecutorOptions): Dele
output: tail(typecheckCheck.stderr || typecheckCheck.stdout, 4000),
},
diffStats: diff.stats,
reviewerNotes: harnessResult.exitCode === 0
? undefined
: `harness ${harness} exited ${harnessResult.exitCode}${harnessResult.timedOut ? ' (timed out)' : ''}`,
reviewerNotes:
harnessResult.exitCode === 0
? undefined
: `harness ${harness} exited ${harnessResult.exitCode}${harnessResult.timedOut ? ' (timed out)' : ''}`,
}

// The terminal event the coderProfile parser looks for.
Expand Down Expand Up @@ -286,10 +309,16 @@ async function defaultRunPostCheck(
const child = spawn('sh', ['-c', cmd], { cwd, stdio: 'pipe' })
let stdout = ''
let stderr = ''
child.stdout?.on('data', (c) => { stdout += String(c) })
child.stderr?.on('data', (c) => { stderr += String(c) })
child.stdout?.on('data', (c) => {
stdout += String(c)
})
child.stderr?.on('data', (c) => {
stderr += String(c)
})
if (signal) {
const onAbort = () => { if (!child.killed) child.kill('SIGTERM') }
const onAbort = () => {
if (!child.killed) child.kill('SIGTERM')
}
if (signal.aborted) onAbort()
else signal.addEventListener('abort', onAbort, { once: true })
}
Expand All @@ -299,7 +328,10 @@ async function defaultRunPostCheck(
if (typeof (killTimer as { unref?: () => void }).unref === 'function') {
;(killTimer as { unref: () => void }).unref()
}
child.on('error', (err) => { clearTimeout(killTimer); reject(err) })
child.on('error', (err) => {
clearTimeout(killTimer)
reject(err)
})
child.on('close', (code) => {
clearTimeout(killTimer)
resolve({ exitCode: code ?? -1, stdout, stderr })
Expand Down
22 changes: 11 additions & 11 deletions src/mcp/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,24 +29,15 @@ export type {
SiblingSandboxExecutorOptions,
} from './executor'
export { createFleetWorkspaceExecutor, createSiblingSandboxExecutor } from './executor'
export type { FeedbackEvent, FeedbackStore } from './feedback-store'
export { eventToSnapshot, InMemoryFeedbackStore } from './feedback-store'
export type {
InProcessExecutorDescribePlacement,
InProcessExecutorOptions,
} from './in-process-executor'
export { createInProcessExecutor } from './in-process-executor'
export type { LocalHarness, LocalHarnessResult, RunLocalHarnessOptions } from './local-harness'
export { runLocalHarness } from './local-harness'
export type {
CreateWorktreeOptions,
DiffOptions,
DiffResult,
GitRunner,
RemoveWorktreeOptions,
WorktreeHandle,
} from './worktree'
export { captureWorktreeDiff, createWorktree, removeWorktree } from './worktree'
export type { FeedbackEvent, FeedbackStore } from './feedback-store'
export { eventToSnapshot, InMemoryFeedbackStore } from './feedback-store'
export { mcpToolsForRuntimeMcp, mcpToolsForRuntimeMcpSubset } from './openai-tools'
export type {
JsonRpcMessage,
Expand Down Expand Up @@ -130,3 +121,12 @@ export type {
ResearchOutputShape,
ResearchSource,
} from './types'
export type {
CreateWorktreeOptions,
DiffOptions,
DiffResult,
GitRunner,
RemoveWorktreeOptions,
WorktreeHandle,
} from './worktree'
export { captureWorktreeDiff, createWorktree, removeWorktree } from './worktree'
Loading
Loading