# ACP Agent Evaluation Results

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/plaited/acp-evals/blob/main/notebooks/summary.ipynb)

**Public benchmark comparing agent web search capabilities.**

This notebook visualizes results from the latest evaluation run. GitHub renders it automatically, so you can scroll down to see the charts. Click the Colab badge above to explore the data interactively.

**Raw data:** All results are committed in `/data/results/runs/` for reproducibility.

**Analysis includes:**
- Overall quality rankings (weighted strategy)
- Head-to-head win/loss matrix
- Statistical significance testing (bootstrap)
- Raw metrics (pass rates, latency, errors)
- Historical trends across runs

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path

# latest.json is the pointer file describing the run to analyze
latest_file = Path('../data/results/latest.json')
with latest_file.open() as fh:
    latest = json.load(fh)

# Echo the identifying metadata of the selected run, one field per line
for label, key in (
    ('Analyzing run', 'date'),
    ('Prompts', 'promptCount'),
    ('Commit', 'commit'),
):
    print(f"{label}: {latest[key]}")

## Load Data

Load raw trajectory results and comparison analysis.

In [None]:
# Read agents and search providers from the manifest. Every non-blank line of
# MANIFEST.jsonl is parsed as one JSON object, and the last one is taken as
# the most recent run.
with open('../data/results/MANIFEST.jsonl') as f:
    # Drop blank lines so a stray empty line (e.g. at the end of the file)
    # is not handed to json.loads.
    lines = [line for line in f if line.strip()]
latest_entry = json.loads(lines[-1])  # Get most recent run
agents = latest_entry['agents']
search_providers = latest_entry['searchProviders']

# Load raw trajectory results into results[agent][provider] -> list of dicts.
# A missing file is reported and recorded as an empty list so the remaining
# agent/provider combinations still load.
results = {}
for agent in agents:
    results[agent] = {}
    for provider in search_providers:
        path = f"../data/results/{latest['path']}/{agent}/{provider}.jsonl"
        try:
            with open(path) as f:
                results[agent][provider] = [
                    json.loads(line) for line in f if line.strip()
                ]
        except FileNotFoundError:
            print(f"⚠️  Missing: {path}")
            results[agent][provider] = []

# Flatten to DataFrame for raw metrics (one row per trajectory result)
rows = []
for agent, providers_dict in results.items():
    for provider, result_list in providers_dict.items():
        for r in result_list:
            rows.append({
                'agent': agent,
                'search_provider': provider,
                'id': r['id'],
                'score': r.get('score', 0),
                # `or {}` also covers a record whose "timing" is JSON null
                'latency_ms': (r.get('timing') or {}).get('total', 0),
                'tool_errors': r.get('toolErrors', False),
            })

# Name the columns explicitly so the frame has the same schema even when no
# results were loaded; otherwise an empty `rows` yields a column-less frame
# and any later df['score'] lookup raises KeyError.
df = pd.DataFrame(
    rows,
    columns=['agent', 'search_provider', 'id', 'score', 'latency_ms', 'tool_errors'],
)

# Load comparison analysis (weighted strategy). `quality` and `head_to_head`
# are set to None when the file is absent so later cells can skip their charts.
comparison_path = f"../data/comparisons/{latest['path']}/all-weighted.json"
try:
    with open(comparison_path) as f:
        comparison = json.load(f)
except FileNotFoundError:
    print(f"⚠️  No comparison data found at {comparison_path}")
    print("   Run: bun scripts/compare.ts --mode full")
    quality = None
    head_to_head = None
else:
    quality = comparison['quality']
    head_to_head = comparison['headToHead']['pairwise']
    print(f"✓ Loaded comparison analysis with {len(quality['rankings'])} runs")

print(f"✓ Loaded {len(df)} trajectory results")
df.head()

## Overall Rankings

Quality rankings from the weighted comparison strategy.

In [None]:
if not quality:
    print("⚠️  Skipping rankings chart - no comparison data")
else:
    # One row per ranked run; add the score expressed as a percentage
    rankings_df = pd.DataFrame(quality['rankings'])
    rankings_df['score_pct'] = rankings_df['score'] * 100

    fig, ax = plt.subplots(figsize=(10, 6))
    bars = ax.barh(rankings_df['run'], rankings_df['score_pct'])

    # Green for bars at or above 50%, red for bars below it
    for bar, pct in zip(bars, rankings_df['score_pct']):
        bar.set_color('green' if pct >= 50 else 'red')

    ax.axvline(50, color='black', linestyle='--', linewidth=1, label='50% baseline')
    ax.set_xlabel('Quality Score (%)')
    ax.set_title('Overall Quality Rankings (Weighted Strategy)')
    ax.legend()
    plt.tight_layout()
    plt.show()

    # List the first three rows of the rankings table
    print("\nTop performers:")
    podium = rankings_df.head(3)
    for _, entry in podium.iterrows():
        print(f"  {entry['rank']}. {entry['run']}: {entry['score_pct']:.1f}%")

## Head-to-Head Comparison Matrix

Win confidence matrix from pairwise comparisons.

In [None]:
if head_to_head:
    # Collect every run that appears on either side of a pairwise comparison.
    # The labels are sorted instead of taken in set-iteration order, so the
    # heatmap axes come out in the same order in every kernel session.
    # key=str keeps the sort working should a label not be a string.
    runs = sorted(
        {p['winner'] for p in head_to_head} | {p['loser'] for p in head_to_head},
        key=str,
    )
    n = len(runs)

    # matrix[i, j] holds the confidence with which run i beat run j.
    # Cells with no recorded win (including the diagonal) stay at 0.
    matrix = np.zeros((n, n))

    run_to_idx = {run: i for i, run in enumerate(runs)}
    for pair in head_to_head:
        winner_idx = run_to_idx[pair['winner']]
        loser_idx = run_to_idx[pair['loser']]
        matrix[winner_idx, loser_idx] = pair['confidence']

    # Heatmap: rows are winners, columns are losers
    plt.figure(figsize=(12, 10))
    sns.heatmap(matrix, xticklabels=runs, yticklabels=runs, annot=True, fmt='.2f',
                cmap='RdYlGn', center=0.5, vmin=0, vmax=1, cbar_kws={'label': 'Win Confidence'})
    plt.title('Head-to-Head Win Confidence Matrix\n(row wins against column with confidence score)')
    plt.xlabel('Loser')
    plt.ylabel('Winner')
    plt.tight_layout()
    plt.show()

    # List the pairs whose confidence exceeds the 0.95 cut-off
    print("\nSignificant wins (confidence > 0.95):")
    for pair in head_to_head:
        if pair['confidence'] > 0.95:
            print(f"  {pair['winner']} > {pair['loser']} ({pair['confidence']:.3f})")
else:
    print("⚠️  Skipping head-to-head matrix - no comparison data")

## Pass Rates by Agent and Search Provider

Success rate (score >= 0.5) by agent/provider combination.

In [None]:
# A result counts as passed when its score reaches the 0.5 threshold
df['passed'] = df['score'] >= 0.5

# Share of passed results per (agent, search provider), as a percentage
by_combo = df.groupby(['agent', 'search_provider'])
pass_rates = by_combo['passed'].mean() * 100

# Agents along the x-axis, one bar per search provider
ax = pass_rates.unstack().plot(kind='bar', figsize=(10, 6))
ax.set_title('Pass Rate (%) by Agent and Search Provider')
ax.set_ylabel('Pass Rate (%)')
ax.set_xlabel('Agent')
# Draw the reference line before the legend so its label is listed too
ax.axhline(50, color='red', linestyle='--', label='50% threshold')
ax.legend(title='Search Provider')
plt.tight_layout()
plt.show()

## Latency Distribution

Response time histograms for each agent.

In [None]:
# One latency histogram panel per agent, two panels per row. The grid is
# sized from the number of agents instead of being fixed at 2x2, so more than
# four agents no longer index past the grid and fewer than four no longer
# leave blank panels. Four agents still produce a 2x2 grid at 14x10 inches.
n_cols = 2
n_rows = max(1, (len(agents) + n_cols - 1) // n_cols)  # ceiling division
# squeeze=False keeps `axes` two-dimensional even when there is a single row
fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, 5 * n_rows), squeeze=False)
panels = axes.ravel()

for ax, agent in zip(panels, agents):
    has_labelled_data = False
    for provider in search_providers:
        # latency_ms is divided by 1000 to plot seconds
        data = df[(df['agent'] == agent) & (df['search_provider'] == provider)]['latency_ms'] / 1000
        if len(data) > 0:
            ax.hist(data, bins=30, alpha=0.6, label=provider)
            has_labelled_data = True
    ax.set_title(f'{agent.title()} - Latency Distribution')
    ax.set_xlabel('Latency (seconds)')
    ax.set_ylabel('Frequency')
    # Only draw a legend when at least one histogram was plotted; calling
    # legend() with no labelled artists just emits a warning.
    if has_labelled_data:
        ax.legend(title='Search Provider')

# Hide any grid cells left over when the agent count is odd (or zero)
for spare in panels[len(agents):]:
    spare.set_visible(False)

plt.tight_layout()
plt.show()

## Tool Error Rates

Percentage of queries with tool errors by agent/provider.

In [None]:
# Mean of the tool_errors flag per (agent, search provider), as a percentage
by_combo = df.groupby(['agent', 'search_provider'])
error_rates = by_combo['tool_errors'].mean() * 100

# Agents along the x-axis, one bar per search provider
ax = error_rates.unstack().plot(kind='bar', figsize=(10, 6))
ax.set_title('Tool Error Rate (%) by Agent and Search Provider')
ax.set_ylabel('Error Rate (%)')
ax.set_xlabel('Agent')
ax.legend(title='Search Provider')
plt.tight_layout()
plt.show()

## Historical Trends

Performance over time (if multiple runs exist).

In [None]:
# Load all runs from MANIFEST (one JSON object per non-blank line).
# Blank lines are skipped so a stray empty line does not break json.loads.
with open('../data/results/MANIFEST.jsonl') as f:
    all_runs = [json.loads(line) for line in f if line.strip()]

if len(all_runs) > 1:
    # Track quality scores over time for each ranked run
    historical_data = []

    for run_entry in all_runs:
        run_date = run_entry['date']
        # NOTE(review): assumes each run's comparison lives under runs/<date>;
        # the data-loading cell builds its path from latest['path'] instead.
        # Confirm the two always agree.
        run_path = f"runs/{run_date}"
        comp_path = f"../data/comparisons/{run_path}/all-weighted.json"

        # Try to load comparison for this run; runs without one are reported
        # and skipped rather than aborting the whole cell.
        try:
            with open(comp_path) as f:
                comp = json.load(f)
        except FileNotFoundError:
            print(f"⚠️  No comparison data for {run_date}")
            continue

        for ranking in comp['quality']['rankings']:
            historical_data.append({
                'date': run_date,
                'run': ranking['run'],
                'score': ranking['score'] * 100,
            })

    if historical_data:
        hist_df = pd.DataFrame(historical_data)

        # Give every date a fixed x position, in the order the dates were
        # loaded from the manifest. Plotting the date strings directly lets
        # Matplotlib order them by first appearance across plot() calls, so
        # a run lacking data for some date would push that date after later
        # ones on the axis.
        date_order = list(dict.fromkeys(hist_df['date']))
        date_pos = {date: i for i, date in enumerate(date_order)}

        # One line per ranked run
        fig, ax = plt.subplots(figsize=(12, 6))
        for run_name in hist_df['run'].unique():
            run_data = hist_df[hist_df['run'] == run_name]
            ax.plot(run_data['date'].map(date_pos), run_data['score'], marker='o', label=run_name)

        ax.set_xticks(list(range(len(date_order))))
        ax.set_xticklabels(date_order, rotation=45)
        ax.set_xlabel('Run Date')
        ax.set_ylabel('Quality Score (%)')
        ax.set_title('Quality Score Trends Over Time')
        ax.axhline(50, color='black', linestyle='--', linewidth=1)
        # Place the legend outside the plot area, to the right
        ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.tight_layout()
        plt.show()
    else:
        # Previously this case produced no chart and no explanation
        print("⚠️  Skipping historical trends - no comparison data for any run")
else:
    print("Only one run in history. Historical trends will appear after future runs.")