# Web Search Agent Evaluations - Results Analysis

Analysis of evaluation results comparing 4 CLI agents (Claude Code, Gemini, Droid, Codex) with 2 search tools (builtin, You.com MCP).

## Quick Navigation

1. [Setup & Data Loading](#setup)
2. [Executive Summary](#summary)
3. [Rankings](#rankings)
4. [Score Distribution](#distribution)
5. [Pass Rates](#pass-rates)
6. [Latency Analysis](#latency)
7. [Tool Errors](#errors)
8. [Dataset Summary](#dataset)

## Methodology

- **Prompts**: 151 web search tasks
- **Configurations**: 8 (4 agents × 2 search tools)
- **Scoring**: Deterministic (60%) + LLM judge (35%) + Metadata (5%)
- **Pass Threshold**: 70% score

<a id='setup'></a>
## Setup & Data Loading

In [None]:
# Cell 1: Colab Setup (auto-detects environment)
#
# Purpose: when the notebook runs in Google Colab, make sure the evaluation
# repository is present under /content and switch the working directory to
# it; otherwise leave the working directory untouched.
#
# NOTE: the `!` and `%` lines below are IPython shell escapes / line magics,
# so this cell only runs inside an IPython/Jupyter kernel, not plain Python.
# NOTE(review): `os` is imported but not used in this cell.
import os
from pathlib import Path

# Detect if running in Colab: a successful `google.colab` import is treated
# as "running in Colab"; an ImportError as "running elsewhere".
try:
    import google.colab
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

if IN_COLAB:
    print("üîß Running in Google Colab - cloning repository...")
    
    # Clone repository if not already present
    repo_dir = Path('/content/web-search-agent-evals')
    if not repo_dir.exists():
        !git clone https://github.com/youdotcom-oss/web-search-agent-evals.git /content/web-search-agent-evals
        print("‚úì Repository cloned")
    else:
        print("‚úì Repository already exists")
        # Pull latest changes (from the `main` branch of `origin`)
        %cd /content/web-search-agent-evals
        !git pull origin main
    
    # Change to repo directory so Path.cwd() in later cells points at the
    # checkout (runs on both the fresh-clone and the already-exists path)
    %cd /content/web-search-agent-evals
    print(f"‚úì Working directory: {Path.cwd()}")
else:
    # Outside Colab nothing is cloned and the working directory is not changed.
    print("‚úì Running locally")

In [None]:
# Cell 2: Dependencies & Configuration
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Plot defaults: seaborn white-grid theme, figures rendered at 100 DPI.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.dpi': 100})

# Locate the project root. When the kernel was started from a directory
# named 'notebooks', the root is taken to be that directory's parent;
# otherwise the current working directory is used as-is.
PROJECT_ROOT = Path.cwd()
PROJECT_ROOT = PROJECT_ROOT.parent if PROJECT_ROOT.name == 'notebooks' else PROJECT_ROOT

DATA_DIR = PROJECT_ROOT.joinpath('data')
print(f"üìÅ Project root: {PROJECT_ROOT}")
print(f"üìä Data directory: {DATA_DIR}")

# Fail fast with an explicit message when the data directory is absent,
# since every later cell reads from it.
if not DATA_DIR.exists():
    raise FileNotFoundError(
        f"Data directory not found: {DATA_DIR}. Make sure you're in the project root."
    )

In [None]:
# Cell 3: Load Latest Run Metadata
# results/latest.json is parsed into `latest`; its 'date', 'promptCount'
# and 'path' entries are echoed below and 'path' is reused by later cells.
latest_pointer = DATA_DIR / 'results' / 'latest.json'
with latest_pointer.open() as pointer_file:
    latest = json.load(pointer_file)

print(f"üìä Latest Run: {latest['date']}")
print(f"   Prompts: {latest['promptCount']}")
print(f"   Path: {latest['path']}")

In [None]:
# Cell 4: Load Raw Trajectory Data
#
# Builds `df` with one row per record found in
# results/<latest path>/<agent>/<provider>.jsonl, for every agent/provider
# pair listed in the last entry of results/MANIFEST.jsonl.
#
# Columns: agent, provider, config ("<agent>-<provider>"), id, score, pass,
# latency_ms, tool_errors.

# Read MANIFEST for agent/provider list. Blank lines are dropped first so a
# trailing empty line cannot be mistaken for the last manifest entry.
with open(DATA_DIR / 'results' / 'MANIFEST.jsonl', encoding='utf-8') as f:
    manifest_lines = [line for line in f if line.strip()]

if not manifest_lines:
    raise ValueError(f"MANIFEST.jsonl contains no entries: {DATA_DIR / 'results' / 'MANIFEST.jsonl'}")

manifest_entry = json.loads(manifest_lines[-1])
agents = manifest_entry['agents']
providers = manifest_entry['searchProviders']

# Load all trajectory results
rows = []
for agent in agents:
    for provider in providers:
        result_file = DATA_DIR / 'results' / latest['path'] / agent / f"{provider}.jsonl"

        # A missing agent/provider file is reported and skipped, not fatal.
        if not result_file.exists():
            print(f"‚ö†Ô∏è  Missing: {result_file}")
            continue

        with open(result_file, encoding='utf-8') as f:
            for line in f:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                if not line.strip():
                    continue
                record = json.loads(line)

                # The numeric score and pass flag live inside a nested
                # 'score' object; anything that is not a dict (missing,
                # null, bare number) falls back to score 0 / pass False.
                score_obj = record.get('score')
                if not isinstance(score_obj, dict):
                    score_obj = {}

                # Same guard for 'timing', so a null or non-dict value
                # yields latency 0 instead of raising AttributeError.
                timing_obj = record.get('timing')
                if not isinstance(timing_obj, dict):
                    timing_obj = {}

                rows.append({
                    'agent': agent,
                    'provider': provider,
                    'config': f"{agent}-{provider}",
                    'id': record['id'],
                    'score': score_obj.get('score', 0),
                    'pass': score_obj.get('pass', False),
                    'latency_ms': timing_obj.get('total', 0),
                    'tool_errors': record.get('toolErrors', False),
                })

df = pd.DataFrame(rows)
print(f"‚úì Loaded {len(df):,} trajectory records")
print(f"  Agents: {', '.join(agents)}")
print(f"  Providers: {', '.join(providers)}")

In [None]:
# Cell 5: Load Comparison Data
# Reads comparisons/<latest path>/all-weighted.json when it exists and turns
# its 'quality' mapping into `rankings_df`, ordered by avgScore (highest
# first) with a 1-based 'rank' column. When the file is absent, both
# `comparison` and `rankings_df` are set to None so later cells can skip.
comp_file = DATA_DIR / 'comparisons' / latest['path'] / 'all-weighted.json'

if not comp_file.exists():
    print(f"‚ö†Ô∏è  No comparison file found: {comp_file}")
    comparison = None
    rankings_df = None
else:
    with open(comp_file) as comp_handle:
        comparison = json.load(comp_handle)

    # One flat record per configuration, keeping only the ranking metrics.
    metric_keys = ('avgScore', 'passRate', 'passCount', 'failCount')
    rankings = [
        {'config': config_name, **{key: config_metrics[key] for key in metric_keys}}
        for config_name, config_metrics in comparison['quality'].items()
    ]

    rankings_df = pd.DataFrame(rankings).sort_values('avgScore', ascending=False)
    # Rank follows the sorted order: best average score gets rank 1.
    rankings_df['rank'] = range(1, len(rankings_df) + 1)

    print(f"‚úì Loaded comparison data: {len(rankings_df)} configurations")

<a id='summary'></a>
## Executive Summary

In [None]:
# Cell 6: Executive Summary
# Prints the run header and the three best-ranked configurations.
# Nothing is printed when no comparison data was loaded (rankings_df is None).
if rankings_df is not None:
    print("üìä WEB SEARCH AGENT EVALUATIONS")
    print(f"{'='*70}")
    print(f"Run Date: {latest['date']}")
    print(f"Prompts: {latest['promptCount']}")
    print(f"Configurations: {len(rankings_df)} (4 agents √ó 2 search tools)")
    print(f"\n{'='*70}")
    print("TOP 3 CONFIGURATIONS")
    print(f"{'='*70}\n")

    # rankings_df is already sorted best-first, so the first three rows
    # are the top three configurations.
    for _, top_entry in rankings_df.iloc[:3].iterrows():
        print(f"#{int(top_entry['rank'])} {top_entry['config']}")
        print(f"   Score: {top_entry['avgScore']:.1%} | Pass Rate: {top_entry['passRate']:.1%}\n")

<a id='rankings'></a>
## Configuration Rankings

In [None]:
# Cell 7: Rankings Bar Chart
# Horizontal bars of average score (in percent) per configuration; the
# first three rows of rankings_df are green, the rest blue. Skipped when
# no comparison data was loaded.
if rankings_df is not None:
    fig, ax = plt.subplots(figsize=(10, 6))

    n_configs = len(rankings_df)
    # First three positions highlighted, everything after in the base colour.
    colors = ['#2ecc71'] * min(3, n_configs) + ['#3498db'] * max(0, n_configs - 3)
    bars = ax.barh(rankings_df['config'], rankings_df['avgScore'] * 100, color=colors)

    ax.set(
        xlabel='Average Score (%)',
        ylabel='Configuration',
        title='Agent Rankings by Quality Score',
    )
    # Dashed marker at the 70% pass threshold.
    ax.axvline(x=70, color='red', linestyle='--', alpha=0.5, label='Pass Threshold')
    ax.legend()

    # Write each bar's value just past its right edge, vertically centred.
    for bar, score in zip(bars, rankings_df['avgScore']):
        label_y = bar.get_y() + bar.get_height()/2
        ax.text(score * 100 + 0.5, label_y, f"{score*100:.1f}%", va='center')

    plt.tight_layout()
    plt.show()

<a id='distribution'></a>
## Score Distribution

In [None]:
# Cell 8: Score Distribution (Violin Plot)
# One violin per configuration over the per-record scores, followed by a
# printed mean / median / standard deviation per configuration.
fig, ax = plt.subplots(figsize=(12, 6))

sns.violinplot(data=df, x='score', y='config', inner='quartile', ax=ax)
# Dashed marker at the pass threshold (0.7 on the score axis).
ax.axvline(x=0.7, color='red', linestyle='--', alpha=0.5, label='Pass Threshold')
ax.set(
    xlabel='Score',
    ylabel='Configuration',
    title='Score Distribution by Configuration',
)
ax.legend()

plt.tight_layout()
plt.show()

# Summary statistics, listed in sorted configuration order
# (groupby iterates its groups sorted by key).
print("\nüìä SCORE STATISTICS")
print("="*70)
for config, scores in df.groupby('config')['score']:
    print(f"\n{config}:")
    print(f"  Mean: {scores.mean():.3f} | Median: {scores.median():.3f} | Std: {scores.std():.3f}")

<a id='pass-rates'></a>
## Pass Rates

In [None]:
# Cell 9: Pass Rates
# Share of records flagged as passing, per configuration, in percent,
# ordered from highest to lowest.
def _pass_rate_color(rate):
    """Return the bar colour for a pass rate in percent.

    Green at 20 or above, orange at 10 or above, red otherwise.
    """
    if rate >= 20:
        return '#2ecc71'
    if rate >= 10:
        return '#f39c12'
    return '#e74c3c'


pass_rates = df.groupby('config')['pass'].mean().sort_values(ascending=False).mul(100)

fig, ax = plt.subplots(figsize=(10, 6))
colors = [_pass_rate_color(rate) for rate in pass_rates]
bars = ax.barh(pass_rates.index, pass_rates.values, color=colors)

ax.set(
    xlabel='Pass Rate (%)',
    ylabel='Configuration',
    title='Pass Rates (Score ‚â• 70%)',
)

# Write each bar's value just past its right edge, vertically centred.
for bar, rate in zip(bars, pass_rates.values):
    label_y = bar.get_y() + bar.get_height()/2
    ax.text(rate + 0.5, label_y, f"{rate:.1f}%", va='center')

plt.tight_layout()
plt.show()

<a id='latency'></a>
## Latency Analysis

In [None]:
# Cell 10: Latency Analysis
# Median and 90th-percentile latency per configuration, sorted by median
# (fastest first), drawn as two overlaid horizontal bars and then printed.
latency_stats = (
    df.groupby('config')['latency_ms']
    .agg(median='median', p90=lambda durations: durations.quantile(0.9))
    .sort_values('median')
)

fig, ax = plt.subplots(figsize=(10, 6))
positions = range(len(latency_stats))
# Both series share the same bar positions; P90 is drawn second, on top.
ax.barh(positions, latency_stats['median'], label='Median', alpha=0.7, color='#3498db')
ax.barh(positions, latency_stats['p90'], label='P90', alpha=0.5, color='#e74c3c')

# Bars were placed at integer positions, so label the ticks by configuration.
ax.set_yticks(positions)
ax.set_yticklabels(latency_stats.index)
ax.set(
    xlabel='Latency (ms)',
    ylabel='Configuration',
    title='Response Time Distribution',
)
ax.legend()

plt.tight_layout()
plt.show()

print("\nLatency Summary (ms):")
print(latency_stats.round(0))

<a id='errors'></a>
## Tool Error Rates

In [None]:
# Cell 11: Tool Error Rates
# Mean of the per-record tool_errors flag per configuration, in percent,
# ordered from highest to lowest.
def _error_rate_color(rate):
    """Return the bar colour for an error rate in percent.

    Red above 20, orange above 10, green otherwise.
    """
    if rate > 20:
        return '#e74c3c'
    if rate > 10:
        return '#f39c12'
    return '#2ecc71'


error_rates = df.groupby('config')['tool_errors'].mean().sort_values(ascending=False).mul(100)

fig, ax = plt.subplots(figsize=(10, 6))
colors = [_error_rate_color(rate) for rate in error_rates]
bars = ax.barh(error_rates.index, error_rates.values, color=colors)

ax.set(
    xlabel='Error Rate (%)',
    ylabel='Configuration',
    title='Tool Error Rates',
)

# Write each bar's value just past its right edge, vertically centred.
for bar, rate in zip(bars, error_rates.values):
    label_y = bar.get_y() + bar.get_height()/2
    ax.text(rate + 0.5, label_y, f"{rate:.1f}%", va='center')

plt.tight_layout()
plt.show()

<a id='dataset'></a>
## Dataset Summary

In [None]:
# Cell 12: Data Summary
# Headline counts and aggregate metrics over every loaded record in `df`.
summary_lines = [
    "üìã DATASET SUMMARY",
    "="*70,
    f"Total Records: {len(df):,}",
    f"Agents: {df['agent'].nunique()}",
    f"Providers: {df['provider'].nunique()}",
    f"Configurations: {df['config'].nunique()}",
    f"Unique Prompts: {df['id'].nunique()}",
    f"\nScore Range: {df['score'].min():.3f} - {df['score'].max():.3f}",
    f"Pass Rate (overall): {df['pass'].mean():.1%}",
    f"Avg Latency: {df['latency_ms'].mean():.0f}ms",
    f"Tool Error Rate: {df['tool_errors'].mean():.1%}",
]
for summary_line in summary_lines:
    print(summary_line)