From 99e81e21dfe95dd756b035bf690cd6f8e4f40397 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Fri, 5 Jun 2026 16:52:50 -0600 Subject: [PATCH] =?UTF-8?q?chore(release):=200.81.0=20=E2=80=94=20aggregat?= =?UTF-8?q?eJudgeVerdicts=20+=20token-recall=20checker=20+=20ErrorCluster?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lockstep version bump (npm + pyproject + python __version__ fallback) for the eval-campaign scaffold prep primitives merged in #223 + #224. --- CHANGELOG.md | 14 ++++++++++++++ clients/python/pyproject.toml | 2 +- clients/python/src/agent_eval_rpc/__init__.py | 2 +- package.json | 2 +- 4 files changed, 17 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 04b398dc..7c6f5f52 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,20 @@ All notable changes to `@tangle-network/agent-eval` and its sibling `agent-eval- --- +## [0.81.0] — 2026-06-05 — eval-campaign scaffold prep primitives + +### Added + +- **`aggregateJudgeVerdicts` (root).** Generic judge-ensemble reducer: fan out N uncorrelated judges, mean each rubric dimension over the SURVIVORS, report the inter-rater disagreement spread, sum cost. Replaces the same reduction hand-rolled in legal (`aggregateEnsemble`), creative (`production-loop/judges.ts`), and tax (`judge-ensemble.ts`). Fail-loud: a failed judge (`perDimension: null`) is recorded in `failedJudges`, never folded into a zero; all-failed throws; a failed judge's cost is still summed. Composite reuses `weightedComposite`. +- **`createTokenRecallChecker` (root).** The deterministic, no-LLM `CorrectnessChecker` — sibling of `createLlmCorrectnessChecker`. A produced item fulfils a requirement when its content is substantive and recalls ≥ `minRecall` of the requirement title's significant tokens. The default completion gate for apps/tests without an LLM judge. +- **`ErrorCluster` (root + `/analyst`).** The failure-cluster element type is now a named export, so consumers import it instead of deriving `DatasetOverview['error_clusters'][number]`. + +### Fixed + +- **Lint drift + non-executable pre-commit hook.** `.husky/pre-commit` was tracked `100644`, so the hook silently no-op'd and unformatted code reached `main`; marked executable and reformatted the drift. + +--- + ## [0.72.3] — 2026-06-01 — workflow trace hardening and driver backtests ### Added diff --git a/clients/python/pyproject.toml b/clients/python/pyproject.toml index 38a7fbe3..0961232a 100644 --- a/clients/python/pyproject.toml +++ b/clients/python/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "agent-eval-rpc" -version = "0.80.0" +version = "0.81.0" description = "Python RPC client for @tangle-network/agent-eval — judge content against rubrics over HTTP or stdio RPC. Eval logic runs in the Node runtime; this package is a thin wire client." readme = "README.md" requires-python = ">=3.10" diff --git a/clients/python/src/agent_eval_rpc/__init__.py b/clients/python/src/agent_eval_rpc/__init__.py index 3a0f166d..6c490740 100644 --- a/clients/python/src/agent_eval_rpc/__init__.py +++ b/clients/python/src/agent_eval_rpc/__init__.py @@ -58,7 +58,7 @@ try: __version__ = version("agent-eval-rpc") except PackageNotFoundError: - __version__ = "0.80.0" + __version__ = "0.81.0" __all__ = [ "Client", diff --git a/package.json b/package.json index 2315ccc0..d1961078 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@tangle-network/agent-eval", - "version": "0.80.0", + "version": "0.81.0", "description": "Evaluate and improve AI agents from runs, traces, judges, and feedback. Compare candidates, cluster failures, measure lift, and gate releases.", "homepage": "https://github.com/tangle-network/agent-eval#readme", "repository": {