diff --git a/examples/agents-of-all-shapes/README.md b/examples/agents-of-all-shapes/README.md new file mode 100644 index 0000000..cfdbc36 --- /dev/null +++ b/examples/agents-of-all-shapes/README.md @@ -0,0 +1,69 @@ +# Agents of all shapes → one Tangle Intelligence pipe + +Proof that Tangle Intelligence works with **any agent, not just our sandbox**. +Every shape — the Tangle runtime, an OpenAI-compatible router (tcloud / +OpenRouter), a Mastra agent, the Claude Agent SDK, a Python agno agent — +converges on the **same** canonical OpenTelemetry GenAI spans, and the **same** +in-process engine produces the decision packet: + +``` +your agent (any framework) + → OTel GenAI spans (gen_ai.request.model, gen_ai.usage.*, score) + → fromOtelSpans() → RunRecord[] + → analyzeRuns() → InsightReport (composite, lift CI, Pareto, + failureModes, recommendations) +``` + +No sandbox. No deploy. No server. The analysis runs **in-process**. + +## Run it + +```bash +# Verified QA path — in-process, no key, no infra: +npx tsx examples/agents-of-all-shapes/run.ts + +# CI verification (what proves it): +pnpm test -- tests/agents-of-all-shapes.test.ts +``` + +Set `TANGLE_API_KEY=sk-tan-...` to *also* POST the same spans to the hosted +`/v1/otlp/v1/traces` ingest for the dashboard — identical analysis, server-side. + +## The one contract every shape meets + +`shared/intelligence.ts` is the whole integration surface. 
A shape only has to +emit OTel spans carrying the standard GenAI attributes plus a `score`: + +| attribute | meaning | +|---|---| +| `gen_ai.request.model` | model snapshot (also `llm.model`, `tangle.model`) | +| `gen_ai.usage.input_tokens` / `output_tokens` | token usage | +| `gen_ai.usage.cost_usd` | cost (also `cost.usd`) | +| `score` | your eval/judge/rubric outcome 0..1 (also `tangle.score`, `eval.score`) | +| an `ERROR`-status span's `name` | → `RunRecord.failureMode` | + +These are **standard OpenTelemetry GenAI semantic conventions** — most +frameworks already emit them; you add `score`. + +## The shapes + +| Shape | File | Live wiring | +|---|---|---| +| **Tangle runtime / router (tcloud)** | `shapes.ts` → `tangleRuntimeRuns` | `createOtelExporter` + `loopEventToOtelSpan` (see `examples/with-intelligence-export`) | +| **OpenAI-compatible** (tcloud / OpenRouter / OpenAI / vLLM) | `shapes.ts` → `openAiCompatibleRuns` | any OpenAI client at the router's `baseURL`; emit a GenAI span per call | +| **Mastra** | `shapes.ts` → `mastraRuns` | Mastra's native OTLP exporter → `${INTELLIGENCE_BASE}/v1/traces` (`INTELLIGENCE_BASE` already ends in `/v1/otlp`) | +| **Claude Agent SDK** | `shapes.ts` → `claudeAgentSdkRuns` | wrap `query()`, one GenAI span per turn from `msg.usage` | +| **Python agno** | `python-agno/agno_to_intelligence.py` | agno run → OTLP/HTTP POST (or `pip install agent-eval-rpc`) | + +The TypeScript shapes ship deterministic batches so the showcase is +**verifiable in CI with no key** (`tests/agents-of-all-shapes.test.ts`). Each +shape's header comment shows the exact live wiring — swap the batch for your +framework's real telemetry and it lands on the identical engine. + +## Why this matters + +The integration point is the **OTel wire**, not the Tangle SDK or sandbox. 
Any +team with agent traces — whatever framework, whatever runtime — gets the full +`InsightReport` (failure clustering, cost/quality Pareto, ranked +recommendations, and lift CI once they emit two cohorts) without adopting our +execution stack. diff --git a/examples/agents-of-all-shapes/python-agno/agno_to_intelligence.py b/examples/agents-of-all-shapes/python-agno/agno_to_intelligence.py new file mode 100644 index 0000000..d38bab5 --- /dev/null +++ b/examples/agents-of-all-shapes/python-agno/agno_to_intelligence.py @@ -0,0 +1,133 @@ +""" +Python agno agent -> Tangle Intelligence. No sandbox, no Tangle SDK. + +The same canonical OTel GenAI spans the TypeScript shapes emit, from a +Python agno agent. Two ways, same engine: + + 1. Hosted: POST OTLP/HTTP-JSON straight to the ingest route. Works with + any Python agent; no Tangle dependency at all. + 2. Substrate (via the published `agent-eval-rpc` client): judge/analyze + over the wire — `pip install agent-eval-rpc`. + +Run (live): TANGLE_API_KEY=sk-tan-... python agno_to_intelligence.py +Without agno installed it falls back to a recorded batch so the wiring is +runnable as-is. +""" + +import json +import os +import sys +import time +import urllib.request + +INTELLIGENCE_BASE = os.environ.get( + "INTELLIGENCE_BASE", "https://intelligence.tangle.tools/v1/otlp" +) +API_KEY = os.environ.get("TANGLE_API_KEY", "sk-tan-...") + + +def run_agno_agent(prompt: str) -> dict: + """Run a real agno agent if installed; else a recorded run so this + file is runnable without the dep. 
Live wiring shown inline.""" + try: + from agno.agent import Agent # type: ignore + from agno.models.openai import OpenAIChat # type: ignore + + agent = Agent(model=OpenAIChat(id="gpt-4o")) + resp = agent.run(prompt) + usage = getattr(resp, "metrics", {}) or {} + return { + "model": "openai/gpt-4o", + "input_tokens": int(usage.get("input_tokens", 0) or 0), + "output_tokens": int(usage.get("output_tokens", 0) or 0), + "cost_usd": float(usage.get("cost", 0.0) or 0.0), + # Your acceptance check / judge score in 0..1. + "score": 1.0 if resp and getattr(resp, "content", None) else 0.0, + "failure_mode": None if getattr(resp, "content", None) else "format_drift", + } + except Exception as exc: + # Recorded run — agno not installed or no key. Wiring stays valid. + print(f"(agno live run unavailable: {exc!r}; using a recorded run)", file=sys.stderr) + return { + "model": "openai/gpt-4o", + "input_tokens": 1240, + "output_tokens": 320, + "cost_usd": 0.018, + "score": 0.83, + "failure_mode": None, + } + + +def otlp_spans_for_run(run_id: str, r: dict) -> list[dict]: + """Build the OTLP/JSON spans for one run: a `gen_ai.chat` span carrying the + GenAI attributes, plus an ERROR-status span named after `failure_mode` if set.""" + now_ns = time.time_ns() + attrs = [ + {"key": "gen_ai.request.model", "value": {"stringValue": r["model"]}}, + {"key": "gen_ai.usage.input_tokens", "value": {"doubleValue": r["input_tokens"]}}, + {"key": "gen_ai.usage.output_tokens", "value": {"doubleValue": r["output_tokens"]}}, + {"key": "gen_ai.usage.cost_usd", "value": {"doubleValue": r["cost_usd"]}}, + {"key": "score", "value": {"doubleValue": r["score"]}}, + ] + spans = [ + { + "traceId": run_id, + "spanId": f"{run_id}-llm", + "name": "gen_ai.chat", + "startTimeUnixNano": str(now_ns), + "endTimeUnixNano": str(now_ns + 800_000_000), + "attributes": attrs, + "status": {"code": "STATUS_CODE_ERROR" if r["failure_mode"] else "STATUS_CODE_OK"}, # NOTE(review): the TS shipper posts "ERROR"/"OK" here — confirm the ingest accepts both spellings + } + ] + if r["failure_mode"]: + spans.append( + { + "traceId": run_id, + "spanId": f"{run_id}-err", + "name": r["failure_mode"], + "startTimeUnixNano": str(now_ns + 800_000_000), + "endTimeUnixNano": str(now_ns + 800_000_000), + "attributes": [], + "status": {"code": "STATUS_CODE_ERROR"}, + } + ) + return spans + 
+ +def ship(spans: list[dict]) -> None: # POST all spans as one OTLP/HTTP-JSON resourceSpans payload to the hosted ingest + body = json.dumps( + { + "resourceSpans": [ + { + "resource": { + "attributes": [ + {"key": "service.name", "value": {"stringValue": "agno-agent"}} + ] + }, + "scopeSpans": [{"scope": {"name": "agno"}, "spans": spans}], + } + ] + } + ).encode() + req = urllib.request.Request( + f"{INTELLIGENCE_BASE}/v1/traces", + data=body, + headers={"content-type": "application/json", "authorization": f"Bearer {API_KEY}"}, + method="POST", + ) + with urllib.request.urlopen(req, timeout=30) as resp: # bounded wait: urlopen blocks indefinitely when no timeout is given + if resp.status >= 300: # 4xx/5xx already raise HTTPError inside urlopen; this guards any other non-2xx + raise RuntimeError(f"ingest failed: {resp.status}") + + +if __name__ == "__main__": + prompts = ["Summarise the Q3 report", "Draft a follow-up email", "Classify this ticket"] + all_spans: list[dict] = [] + for i, p in enumerate(prompts): + all_spans += otlp_spans_for_run(f"agno-{i}", run_agno_agent(p)) + if API_KEY and API_KEY != "sk-tan-...": # an empty TANGLE_API_KEY counts as "no key", not as a credential + ship(all_spans) + print(f"Shipped {len(all_spans)} spans from agno → Tangle Intelligence.") + else: + print("(set TANGLE_API_KEY to ship; printing spans)\n", json.dumps(all_spans, indent=2)[:600]) diff --git a/examples/agents-of-all-shapes/run.ts b/examples/agents-of-all-shapes/run.ts new file mode 100644 index 0000000..379af80 --- /dev/null +++ b/examples/agents-of-all-shapes/run.ts @@ -0,0 +1,59 @@ +/** + * Agents of all shapes → one decision packet. No sandbox. No deploy. + * + * pnpm tsx examples/agents-of-all-shapes/run.ts + * + * Runs every shape (Tangle runtime / OpenAI-compatible router / Mastra / + * Claude Agent SDK), converts each to canonical OTel GenAI spans, and feeds + * the merged stream through the in-process intelligence engine + * (`fromOtelSpans → analyzeRuns`). Prints the fleet `InsightReport` plus a + * per-shape breakdown. + * + * Optional hosted path: set TANGLE_API_KEY (and INTELLIGENCE_BASE) to also + * POST the spans to the hosted OTLP ingest for the dashboard. 
+ */ + +import { allShapes } from './shapes' +import { shipToTangleOtlp, spansForRuns, toInsightReport } from './shared/intelligence' + +async function main() { + const shapes = allShapes() + const allRuns = Object.values(shapes).flat() + const allSpans = spansForRuns(allRuns) + + // The fleet view — every framework's runs in one vocabulary. + const fleet = await toInsightReport(allSpans) + console.log('=== Fleet InsightReport (all shapes) ===') + console.log(`runs: ${fleet.composite.n}`) + console.log(`composite mean: ${fleet.composite.mean.toFixed(3)}`) + console.log(`composite p50: ${fleet.composite.p50.toFixed(3)}`) + console.log(`failure modes: ${JSON.stringify(fleet.failureModes ?? [])}`) + console.log(`recommendations: ${fleet.recommendations.length}`) + for (const r of fleet.recommendations.slice(0, 3)) { + console.log(` [${r.priority}] ${r.title}`) + } + + // Per-shape — prove the SAME engine works on each framework alone. + console.log('\n=== Per-shape composite ===') + for (const [name, runs] of Object.entries(shapes)) { + const report = await toInsightReport(spansForRuns(runs)) + console.log( + `${name.padEnd(20)} n=${report.composite.n} mean=${report.composite.mean.toFixed(3)}`, + ) + } + + // Optional: also ship to the hosted ingest for the dashboard. + const apiKey = process.env.TANGLE_API_KEY + if (apiKey) { + const endpoint = process.env.INTELLIGENCE_BASE ?? 
'https://intelligence.tangle.tools/v1/otlp' + await shipToTangleOtlp(allSpans, { endpoint, apiKey }) + console.log(`\nShipped ${allSpans.length} spans to ${endpoint} for the dashboard.`) + } else { + console.log('\n(set TANGLE_API_KEY to also ship to the hosted dashboard)') + } +} + +main().catch((err) => { + console.error(err) + process.exit(1) +}) diff --git a/examples/agents-of-all-shapes/shapes.ts b/examples/agents-of-all-shapes/shapes.ts new file mode 100644 index 0000000..369da2c --- /dev/null +++ b/examples/agents-of-all-shapes/shapes.ts @@ -0,0 +1,144 @@ +/** + * Agents of all shapes → one Tangle Intelligence pipe. + * + * Each shape is a different way to PRODUCE agent runs. They all converge on + * the same canonical OTel GenAI spans (`shared/intelligence.ts`), so the + * `InsightReport` is computed identically no matter who ran the agent — + * Tangle's runtime, an OpenAI-compatible router (tcloud / OpenRouter), a + * Mastra agent, the Claude Agent SDK, or a Python agno agent. + * + * The runs below are deterministic so the showcase is verifiable in CI with + * no LLM key. Each shape's header shows the REAL wiring — swap the seeded + * batch for your framework's telemetry and it lands on the same engine. None + * of this touches a Tangle sandbox. + */ + +import type { AgentRun } from './shared/intelligence' + +/** Deterministic pseudo-random in [0,1) from a string seed — keeps the + * showcase reproducible (no `Math.random()` in asserted output). */ +function rand(seed: string): number { + let h = 2166136261 + for (let i = 0; i < seed.length; i++) { + h ^= seed.charCodeAt(i) + h = Math.imul(h, 16777619) + } + return ((h >>> 0) % 100000) / 100000 +} + +interface BatchSpec { + shape: string + model: string + n: number + /** Fraction of runs that fail, with the failure tag to attach. 
*/ + failRate: number + failureMode: string +} + +function batch(spec: BatchSpec): AgentRun[] { + const runs: AgentRun[] = [] + for (let i = 0; i < spec.n; i++) { + const id = `${spec.shape}-${i}` + const failed = rand(`${id}:f`) < spec.failRate + runs.push({ + runId: id, + model: spec.model, + score: failed ? 0.1 + rand(`${id}:s`) * 0.25 : 0.62 + rand(`${id}:s`) * 0.35, + costUsd: 0.004 + rand(`${id}:c`) * 0.05, + inputTokens: 700 + Math.floor(rand(`${id}:i`) * 1500), + outputTokens: 120 + Math.floor(rand(`${id}:o`) * 600), + startMs: 1_700_000_000_000 + i * 1000, + durationMs: 800 + Math.floor(rand(`${id}:d`) * 4000), + ...(failed ? { failureMode: spec.failureMode } : {}), + }) + } + return runs +} + +/** + * 1. Tangle agent-runtime / router (tcloud). + * + * LIVE: agent-runtime already emits every loop event; ship them with the + * built-in exporter (see `examples/with-intelligence-export`): + * const exporter = createOtelExporter({ endpoint, headers }) + * for await (const e of runAgentTaskStream({ task, backend })) { + * exporter.exportSpan(loopEventToOtelSpan({ kind: e.type, runId, ... }, traceId)) + * } + * Attach your eval/judge score as a `score` attribute on the run's span. + */ +export function tangleRuntimeRuns(): AgentRun[] { + return batch({ + shape: 'tangle-runtime', + model: 'tcloud/claude-sonnet-4-6@2026-05-08', + n: 12, + failRate: 0.17, + failureMode: 'tool_recovery_failure', + }) +} + +/** + * 2. OpenAI-compatible router — tcloud / OpenRouter / OpenAI / vLLM. + * + * LIVE: any OpenAI-compatible client. Point it at the router's baseURL and + * record the OTel GenAI span per call: + * const res = await openai.chat.completions.create({ model, messages }) + * // emit a span with gen_ai.request.model, gen_ai.usage.{input,output}_tokens, + * // gen_ai.usage.cost_usd, and your `score`. 
+ */ +export function openAiCompatibleRuns(): AgentRun[] { + return batch({ + shape: 'openai-compatible', + model: 'openrouter/google/gemini-2.5-pro', + n: 10, + failRate: 0.2, + failureMode: 'format_drift', + }) +} + +/** + * 3. Mastra agent (TypeScript). + * + * LIVE: Mastra emits OpenTelemetry natively. Configure its OTLP exporter to + * point at `${INTELLIGENCE_BASE}/v1/traces` (hosted; the base already ends in `/v1/otlp`, as in `run.ts`) OR collect the spans and + * call `toInsightReport` in-process: + * export const mastra = new Mastra({ telemetry: { enabled: true, + * export: { type: 'otlp', endpoint: `${INTELLIGENCE_BASE}/v1/traces` } } }) + * Add a `score` attribute from your eval step. No Tangle SDK required. + */ +export function mastraRuns(): AgentRun[] { + return batch({ + shape: 'mastra', + model: 'openai/gpt-4o-2024-11-20', + n: 10, + failRate: 0.1, + failureMode: 'instruction_following', + }) +} + +/** + * 4. Claude Agent SDK (TypeScript). + * + * LIVE: wrap the SDK's query loop and emit one GenAI span per turn: + * for await (const msg of query({ prompt, options })) { ...collect usage... } + * // span: gen_ai.request.model='claude-...', gen_ai.usage.* from msg.usage, + * // score from your acceptance check. + */ +export function claudeAgentSdkRuns(): AgentRun[] { + return batch({ + shape: 'claude-agent-sdk', + model: 'anthropic/claude-sonnet-4-6@2026-05-08', + n: 10, + failRate: 0.12, + failureMode: 'reasoning_error', + }) +} + +/** Every shape, merged — the fleet view across frameworks. 
*/ +export function allShapes(): Record<string, AgentRun[]> { + return { + 'tangle-runtime': tangleRuntimeRuns(), + 'openai-compatible': openAiCompatibleRuns(), + mastra: mastraRuns(), + 'claude-agent-sdk': claudeAgentSdkRuns(), + } +} diff --git a/examples/agents-of-all-shapes/shared/intelligence.ts b/examples/agents-of-all-shapes/shared/intelligence.ts new file mode 100644 index 0000000..8163194 --- /dev/null +++ b/examples/agents-of-all-shapes/shared/intelligence.ts @@ -0,0 +1,151 @@ +/** + * The ONE pipe every agent shape converges on. + * + * Tangle Intelligence does not care what framework produced a run — it + * consumes OpenTelemetry GenAI spans. `fromOtelSpans` reads the standard + * `gen_ai.*` semantic conventions (plus `tangle.*` aliases and a generic + * `score`), turns each trace into a `RunRecord`, and `analyzeRuns` produces + * the `InsightReport` decision packet — composite distribution, lift CI, + * Pareto, failure clustering, ranked recommendations. + * + * Two ways to use it, same engine: + * - `toInsightReport(spans)` — in-process, zero infra. No sandbox, no + * hosted endpoint, no server. This is the QA path every shape verifies. + * - `shipToTangleOtlp(spans, opts)` — POST the same spans to the hosted + * `/v1/otlp/v1/traces` ingest for the dashboard. Optional. + */ + +import { analyzeRuns, fromOtelSpans, type InsightReport } from '@tangle-network/agent-eval/contract' +import type { TraceSpanEvent } from '@tangle-network/agent-eval/hosted' + +export type { InsightReport, TraceSpanEvent } + +/** One agent run, framework-agnostic. A shape produces a list of these. */ +export interface AgentRun { + runId: string + /** Snapshot model id, e.g. `claude-sonnet-4-6@2025-05-08`. */ + model: string + /** Outcome quality on 0..1 (your judge / eval / rubric score). 
*/ + score: number + costUsd: number + inputTokens: number + outputTokens: number + startMs: number + durationMs: number + /** When set, the run is marked failed and the tag becomes the failure + * span name (→ `RunRecord.failureMode`). */ + failureMode?: string +} + +const NANO = 1_000_000 + +/** + * Canonical OTel GenAI spans for one agent run. Any framework that emits + * these standard attributes (`gen_ai.request.model`, `gen_ai.usage.*`, + * `gen_ai.usage.cost_usd`) plus a `score` lands here byte-identically — + * Mastra, the Claude Agent SDK, agno, an OpenAI-compatible router, or the + * Tangle runtime. That is the whole point: one wire, every shape. + */ +export function otelSpansForRun(run: AgentRun): TraceSpanEvent[] { + const start = run.startMs * NANO + const end = (run.startMs + run.durationMs) * NANO + const spans: TraceSpanEvent[] = [ + { + traceId: run.runId, + spanId: `${run.runId}::llm`, + name: 'gen_ai.chat', + startTimeUnixNano: start, + endTimeUnixNano: end, + attributes: { + 'gen_ai.request.model': run.model, + 'gen_ai.usage.input_tokens': run.inputTokens, + 'gen_ai.usage.output_tokens': run.outputTokens, + 'gen_ai.usage.cost_usd': run.costUsd, + score: run.score, + }, + status: run.failureMode ? { code: 'ERROR' } : { code: 'OK' }, + }, + ] + // Failure span — its name becomes the RunRecord.failureMode. + if (run.failureMode) { + spans.push({ + traceId: run.runId, + spanId: `${run.runId}::err`, + name: run.failureMode, + startTimeUnixNano: end, + endTimeUnixNano: end, + attributes: {}, + status: { code: 'ERROR' }, + }) + } + return spans +} + +/** Flatten many runs into one OTel span stream. */ +export function spansForRuns(runs: AgentRun[]): TraceSpanEvent[] { + return runs.flatMap(otelSpansForRun) +} + +/** In-process intelligence: OTel spans → RunRecords → InsightReport. No + * sandbox, no server, no deploy. The verifiable QA path. 
*/ +export async function toInsightReport(spans: TraceSpanEvent[]): Promise<InsightReport> { + const runs = fromOtelSpans({ spans }) + return analyzeRuns({ runs }) +} + +export interface ShipOptions { + /** Hosted ingest base; the route `/v1/traces` is appended. */ + endpoint: string + /** `sk-tan-...` key — tenant resolves from the Bearer, never the payload. */ + apiKey: string + serviceName?: string +} + +/** Optional hosted path: POST the same OTel spans to Tangle Intelligence's + * OTLP/HTTP ingest. Identical analysis runs server-side. */ +export async function shipToTangleOtlp(spans: TraceSpanEvent[], opts: ShipOptions): Promise<void> { + const res = await fetch(`${opts.endpoint}/v1/traces`, { + method: 'POST', + headers: { + 'content-type': 'application/json', + authorization: `Bearer ${opts.apiKey}`, + }, + body: JSON.stringify({ + resourceSpans: [ + { + resource: { + attributes: [ + { + key: 'service.name', + value: { stringValue: opts.serviceName ?? 'agents-of-all-shapes' }, + }, + ], + }, + scopeSpans: [ + { + scope: { name: 'agents-of-all-shapes' }, + spans: spans.map((s) => ({ + traceId: s.traceId, + spanId: s.spanId, + name: s.name, + startTimeUnixNano: String(s.startTimeUnixNano), + endTimeUnixNano: String(s.endTimeUnixNano), + attributes: Object.entries(s.attributes).map(([key, value]) => ({ + key, + value: + typeof value === 'number' + ? 
{ doubleValue: value } + : { stringValue: String(value) }, + })), + status: s.status, + })), + }, + ], + }, + ], + }), + }) + if (!res.ok) { + throw new Error(`intelligence ingest failed: ${res.status} ${await res.text()}`) + } +} diff --git a/examples/with-intelligence-export/with-intelligence-export.ts b/examples/with-intelligence-export/with-intelligence-export.ts index 7cb1da3..4848c4f 100644 --- a/examples/with-intelligence-export/with-intelligence-export.ts +++ b/examples/with-intelligence-export/with-intelligence-export.ts @@ -39,10 +39,36 @@ const INTELLIGENCE_BASE = const backend = createIterableBackend({ kind: 'intel-demo', async *stream(_input, ctx) { - yield { type: 'text_delta', task: ctx.task, session: ctx.session, text: 'working...\n', timestamp: new Date().toISOString() } - yield { type: 'tool_call', task: ctx.task, session: ctx.session, toolName: 'web_search', args: {}, timestamp: new Date().toISOString() } - yield { type: 'tool_result', task: ctx.task, session: ctx.session, toolName: 'web_search', result: { ok: true }, timestamp: new Date().toISOString() } - yield { type: 'text_delta', task: ctx.task, session: ctx.session, text: 'done.\n', timestamp: new Date().toISOString() } + yield { + type: 'text_delta', + task: ctx.task, + session: ctx.session, + text: 'working...\n', + timestamp: new Date().toISOString(), + } + yield { + type: 'tool_call', + task: ctx.task, + session: ctx.session, + toolName: 'web_search', + args: {}, + timestamp: new Date().toISOString(), + } + yield { + type: 'tool_result', + task: ctx.task, + session: ctx.session, + toolName: 'web_search', + result: { ok: true }, + timestamp: new Date().toISOString(), + } + yield { + type: 'text_delta', + task: ctx.task, + session: ctx.session, + text: 'done.\n', + timestamp: new Date().toISOString(), + } }, }) diff --git a/tests/agents-of-all-shapes.test.ts b/tests/agents-of-all-shapes.test.ts new file mode 100644 index 0000000..f4a694c --- /dev/null +++ 
b/tests/agents-of-all-shapes.test.ts @@ -0,0 +1,54 @@ +import { describe, expect, it } from 'vitest' +import { allShapes } from '../examples/agents-of-all-shapes/shapes' +import { spansForRuns, toInsightReport } from '../examples/agents-of-all-shapes/shared/intelligence' + +/** + * Verifies the showcase end-to-end with NO sandbox, NO hosted endpoint, NO + * LLM key: every agent shape → canonical OTel spans → fromOtelSpans → + * analyzeRuns → a real InsightReport. This is the QA path a customer runs to + * prove "any agent, not just your sandbox" before wiring their own traces. + */ +describe('agents-of-all-shapes — one intelligence pipe, no sandbox', () => { + it('every shape produces a real InsightReport in-process', async () => { + const shapes = allShapes() + expect(Object.keys(shapes).sort()).toEqual([ + 'claude-agent-sdk', + 'mastra', + 'openai-compatible', + 'tangle-runtime', + ]) + + for (const [name, runs] of Object.entries(shapes)) { + expect(runs.length).toBeGreaterThan(0) + const report = await toInsightReport(spansForRuns(runs)) + // Real decision packet per framework — composite over its runs. + expect(report.composite.n).toBe(runs.length) + expect(report.composite.mean).toBeGreaterThan(0) + expect(report.composite.mean).toBeLessThanOrEqual(1) + expect(report.composite.min).toBeGreaterThanOrEqual(0) + expect(report.composite.max).toBeLessThanOrEqual(1) + expect(Array.isArray(report.recommendations)).toBe(true) + // Cost/quality Pareto is computed from the gen_ai.usage.cost_usd attrs. 
+ expect(report.costQuality).toBeDefined() + } + }) + + it('merges all shapes into one fleet report (cross-framework aggregation)', async () => { + const shapes = allShapes() + const total = Object.values(shapes).reduce((sum, r) => sum + r.length, 0) + const fleet = await toInsightReport(spansForRuns(Object.values(shapes).flat())) + expect(fleet.composite.n).toBe(total) + // The merged corpus carries failures from multiple frameworks; the + // model-free failureModes breakdown surfaces the dominant one. + expect(fleet.failureModes).toBeDefined() + expect(fleet.failureModes!.length).toBeGreaterThan(0) + expect(fleet.failureModes![0]!.count).toBeGreaterThan(0) + }) + + it('derives a real cost from gen_ai.usage.cost_usd across shapes', async () => { + const fleet = await toInsightReport(spansForRuns(Object.values(allShapes()).flat())) + // The Pareto/cost view is populated from the OTel cost attribute, not zeros. + expect(fleet.costQuality.cost.n).toBeGreaterThan(0) + expect(fleet.costQuality.cost.mean).toBeGreaterThan(0) + }) +})