tangle-network · tangletools · May 31, 2026 · May 31, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,17 @@ All notable changes to `@tangle-network/agent-eval` and its sibling `agent-eval-
 
 ---
 
+## [0.70.0] — 2026-05-31 — error-grounded reflection (the driver targets real failures, not blind rewrites)
+
+Adversarial verification on TWO domains (legal + tax, two worker models) found the same root cause: the gepaDriver's candidates **regressed** against the baseline, so the gate correctly held — but nothing improved. The driver was reflecting on per-scenario *scores* only; the judge's `notes` (the "why it failed") were computed but **dropped** before the reflection step. As a result it proposed generic rewrites that restate what a capable model already knows, which distract rather than help.
+
+### Fixed
+
+- **Judge `notes` now reach the reflective driver.** `campaignBreakdown` collects each scenario's judge `notes` (deduped) into `scenarios[].notes`; `GenerationCandidate.scenarios` + `CampaignBreakdown.scenarios` carry it; `gepaDriver`'s `buildEvidence` surfaces it as `TrialTrace.failureNote`; `buildReflectionPrompt` renders a **"Why it scored low"** block per bottom trial. The optimizer now grounds its next edit on the actual failure pattern.
+- **Anti-overfit by contract + by construction.** The `notes` are documented as GENERALIZABLE failure patterns (which checks/lines/dimensions failed, and how) — NOT case-specific ground truth; leaking expected answers into the prompt would teach memorization. The held-out gate is the structural backstop: a candidate that overfits the training scenarios cannot clear the paired-bootstrap CI on cases the driver never saw.
+
+The mechanism is generic — any agent benefits when its judge emits informative `notes`. 3 new tests (notes surfaced, deduped, and rendered into the reflection prompt); full suite (1645) green.
+
 ## [0.69.0] — 2026-05-30 — strong generic baseline roles (engineer / researcher / generalist)
 
 The structured profile (0.68.0) had a hollow top zone — `baselineProfile` took an arbitrary `role` string. Products are file-producing, tool-using agents living in a sandbox, but nothing gave them a strong operator foundation. This adds three generically-useful, verification-first baseline roles distilled from agent-runtime's `coderProfile` doctrine.

diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@tangle-network/agent-eval",
-  "version": "0.69.0",
+  "version": "0.70.0",
   "description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
   "homepage": "https://github.com/tangle-network/agent-eval#readme",
   "repository": {

diff --git a/src/campaign/drivers/gepa.ts b/src/campaign/drivers/gepa.ts
@@ -310,9 +310,12 @@ function buildEvidence(
   if (!best) return { top: [], bottom: [], target: baseTarget }
 
   const byScore = [...best.scenarios].sort((a, b) => b.composite - a.composite)
-  const toTrace = (s: { scenarioId: string; composite: number }): TrialTrace => ({
+  const toTrace = (s: { scenarioId: string; composite: number; notes?: string }): TrialTrace => ({
     id: s.scenarioId,
     score: s.composite,
+    // The judge's "why it scored low" — grounds the reflection on real failure
+    // patterns instead of blind rephrasing. Generalizable by the judge contract.
+    ...(s.notes ? { failureNote: s.notes } : {}),
   })
   const top = byScore.slice(0, evidenceK).map(toTrace)
   const bottom = byScore.slice(-evidenceK).reverse().map(toTrace)

diff --git a/src/campaign/score-utils.ts b/src/campaign/score-utils.ts
@@ -28,8 +28,9 @@ export function campaignMeanComposite<TArtifact, TScenario extends Scenario>(
 export interface CampaignBreakdown {
   /** Mean score per judge dimension across all cells. */
   dimensions: Record<string, number>
-  /** Per-scenario composite (mean over reps + judges). */
-  scenarios: Array<{ scenarioId: string; composite: number }>
+  /** Per-scenario composite (mean over reps + judges) + the judge's free-form
+   *  `notes` for that scenario (the "why" a reflective driver grounds on). */
+  scenarios: Array<{ scenarioId: string; composite: number; notes?: string }>
 }
 
 /** Per-candidate evidence a reflective/patch driver grounds its next proposal
@@ -40,13 +41,24 @@ export function campaignBreakdown<TArtifact, TScenario extends Scenario>(
   const dimSums: Record<string, number> = {}
   const dimCounts: Record<string, number> = {}
   const byScenario = new Map<string, number[]>()
+  const notesByScenario = new Map<string, Set<string>>()
   for (const cell of campaign.cells) {
     const judgeScores = Object.values(cell.judgeScores)
     if (judgeScores.length === 0) continue
     const cellComposite = judgeScores.reduce((a, s) => a + s.composite, 0) / judgeScores.length
     const arr = byScenario.get(cell.scenarioId) ?? []
     arr.push(cellComposite)
     byScenario.set(cell.scenarioId, arr)
+    // Collect the judges' free-form notes per scenario (deduped) — the failure
+    // evidence the reflective driver grounds on. Generalizable by contract;
+    // the judge must not put case-specific ground truth here.
+    for (const s of judgeScores) {
+      if (s.notes && s.notes.trim()) {
+        const set = notesByScenario.get(cell.scenarioId) ?? new Set<string>()
+        set.add(s.notes.trim())
+        notesByScenario.set(cell.scenarioId, set)
+      }
+    }
     for (const score of judgeScores) {
       for (const [key, value] of Object.entries(score.dimensions)) {
         dimSums[key] = (dimSums[key] ?? 0) + value
@@ -59,9 +71,14 @@ export function campaignBreakdown<TArtifact, TScenario extends Scenario>(
     const count = dimCounts[key] ?? 0
     dimensions[key] = count > 0 ? (dimSums[key] ?? 0) / count : 0
   }
-  const scenarios = [...byScenario.entries()].map(([scenarioId, comps]) => ({
-    scenarioId,
-    composite: comps.reduce((a, b) => a + b, 0) / comps.length,
-  }))
+  const scenarios = [...byScenario.entries()].map(([scenarioId, comps]) => {
+    const notesSet = notesByScenario.get(scenarioId)
+    const notes = notesSet && notesSet.size > 0 ? [...notesSet].join(' | ') : undefined
+    return {
+      scenarioId,
+      composite: comps.reduce((a, b) => a + b, 0) / comps.length,
+      ...(notes ? { notes } : {}),
+    }
+  })
   return { dimensions, scenarios }
 }
diff --git a/src/campaign/types.ts b/src/campaign/types.ts
@@ -503,8 +503,13 @@ export interface GenerationCandidate {
   /** Mean score per judge dimension across all cells (scenarios × reps ×
    *  judges that reported the dimension). */
   dimensions: Record<string, number>
-  /** Per-scenario composite (mean over reps + judges). */
-  scenarios: Array<{ scenarioId: string; composite: number }>
+  /** Per-scenario composite (mean over reps + judges), plus the judge's
+   *  free-form `notes` for that scenario — the "why it scored low" evidence a
+   *  reflective driver grounds its next edit on. Keep `notes` GENERALIZABLE
+   *  (which checks/lines/dimensions failed and how), NOT case-specific ground
+   *  truth: leaking expected answers into the prompt is memorization, and the
+   *  held-out gate would reject it anyway. */
+  scenarios: Array<{ scenarioId: string; composite: number; notes?: string }>
   /** Driver-supplied short label for the change. Present when the driver
    *  returned a `ProposedCandidate`; absent for bare-surface mutators. */
   label?: string

diff --git a/src/reflective-mutation.ts b/src/reflective-mutation.ts
@@ -31,6 +31,11 @@ export interface TrialTrace {
   expectations?: Array<{ id: string; phrase: string; matched: boolean }>
   /** Free-form text — what the agent actually emitted (e.g., findings, plan). */
   emitted?: string
+  /** The judge's free-form note on WHY this trial scored low — the failure
+   *  evidence the reflection targets. Should be a generalizable pattern (which
+   *  checks/lines/dimensions failed, and how), never case-specific ground
+   *  truth (that would teach memorization, not capability). */
+  failureNote?: string
   /** Optional structured metrics (recall, precision, cost, latency). */
   metrics?: Record<string, number>
 }
@@ -88,6 +93,10 @@ export function buildReflectionPrompt(ctx: ReflectionContext): string {
       sections.push(
         `### Trial \`${trial.id}\` — score ${trial.score.toFixed(2)}${trial.inputName ? ` (${trial.inputName})` : ''}`,
       )
+      if (trial.failureNote) {
+        sections.push('')
+        sections.push(`**Why it scored low:** ${truncate(trial.failureNote, 600)}`)
+      }
       const missed = (trial.expectations ?? []).filter((e) => !e.matched)
       if (missed.length > 0) {
         sections.push('')

diff --git a/tests/campaign/error-grounding.test.ts b/tests/campaign/error-grounding.test.ts
@@ -0,0 +1,69 @@
+import { describe, expect, it } from 'vitest'
+import { campaignBreakdown } from '../../src/campaign/score-utils'
+import type { CampaignCellResult, CampaignResult, Scenario } from '../../src/campaign/types'
+import { buildReflectionPrompt } from '../../src/reflective-mutation'
+
+// The conjunct-2 fix (error-grounded reflection): the reflective driver was
+// proposing blind generic rewrites because it only saw per-scenario SCORES,
+// never the judge's "why it failed". These tests guard the now-threaded path:
+// judge.notes → campaignBreakdown.scenarios → the reflection prompt's failure
+// evidence. Notes are generalizable by contract; the held-out gate is the backstop.
+
+function cell(scenarioId: string, composite: number, notes: string): CampaignCellResult<unknown> {
+  return {
+    cellId: `${scenarioId}:0`,
+    scenarioId,
+    rep: 0,
+    artifact: { text: 'x' },
+    judgeScores: { j: { composite, dimensions: { d: composite }, notes } },
+    costUsd: 0,
+    tokenUsage: { input: 0, output: 0 },
+    durationMs: 0,
+    seed: 0,
+    cached: false,
+  }
+}
+
+describe('error-grounding — judge notes reach the reflective driver', () => {
+  it('campaignBreakdown surfaces the per-scenario judge notes (the "why")', () => {
+    const campaign = {
+      cells: [
+        cell('weak-case', 0.5, 'missed lines: 16 (tax recompute), 25d (withholding sum)'),
+        cell('strong-case', 1.0, 'all lines correct'),
+      ],
+    } as unknown as CampaignResult<unknown, Scenario>
+    const bd = campaignBreakdown(campaign)
+    const weak = bd.scenarios.find((s) => s.scenarioId === 'weak-case')
+    expect(weak?.composite).toBeCloseTo(0.5)
+    expect(weak?.notes).toMatch(/16 \(tax recompute\)/i)
+    // notes are carried, not dropped
+    expect(bd.scenarios.every((s) => typeof s.notes === 'string')).toBe(true)
+  })
+
+  it('dedups repeated notes across reps of the same scenario', () => {
+    const campaign = {
+      cells: [
+        cell('c', 0.4, 'missed line 16'),
+        { ...cell('c', 0.6, 'missed line 16'), cellId: 'c:1', rep: 1 },
+      ],
+    } as unknown as CampaignResult<unknown, Scenario>
+    const bd = campaignBreakdown(campaign)
+    const c = bd.scenarios.find((s) => s.scenarioId === 'c')
+    expect(c?.notes).toBe('missed line 16') // deduped, not 'missed line 16 | missed line 16'
+  })
+
+  it('buildReflectionPrompt quotes the failure note so the model targets it', () => {
+    const prompt = buildReflectionPrompt({
+      target: 'tax guidance',
+      parentPayload: 'BASE',
+      topTrials: [{ id: 'strong', score: 1.0 }],
+      bottomTrials: [
+        { id: 'weak', score: 0.5, failureNote: 'missed lines: 16 (tax recompute), 25d' },
+      ],
+      childCount: 1,
+    })
+    expect(prompt).toMatch(/Why it scored low.*tax recompute/is)
+    // the failure pattern is in the prompt the optimizer sees — not just a score
+    expect(prompt).toContain('25d')
+  })
+})