# 07. Evaluation Harness and Metrics

Aggregate run summaries into decision-oriented metrics.


In [None]:
from __future__ import annotations

import json
import os
import math
import random
import statistics
from pathlib import Path


def find_project_root(start: Path) -> Path:
    """Return the closest directory at or above *start* that looks like the project root.

    A directory qualifies when it contains both ``README.md`` and
    ``main_langgraph.py``. The search begins at *start* and moves upward one
    parent at a time. If no directory qualifies, *start* is returned unchanged.
    """
    markers = ('README.md', 'main_langgraph.py')
    lineage = (start, *start.parents)
    matches = (
        folder
        for folder in lineage
        if all((folder / marker).exists() for marker in markers)
    )
    # The generator is lazy, so the walk stops at the first qualifying folder.
    return next(matches, start)


# Locate the project root, starting the search from the resolved current
# working directory.
PROJECT_ROOT = find_project_root(Path.cwd().resolve())
# Make the located root the working directory so relative paths used afterwards
# resolve against it.
os.chdir(PROJECT_ROOT)
print('PROJECT_ROOT =', PROJECT_ROOT)


In [None]:
# Collect every run summary file under test_outputs/ at any depth.
# sorted() keeps the load order stable between executions.
summary_paths = sorted((PROJECT_ROOT / 'test_outputs').glob('**/run_*.summary.json'))

if not summary_paths:
    # No recorded runs were found: fall back to a small hand-written sample so
    # the metric computation still has something to work on.
    synthetic = [
        {'status': 'success', 'tool_error_count': 0, 'event_count': 20, 'report_len': 6000},
        {'status': 'success', 'tool_error_count': 1, 'event_count': 25, 'report_len': 7000},
        {'status': 'error', 'tool_error_count': 2, 'event_count': 10, 'report_len': 0},
    ]
else:
    # NOTE: despite its name, `synthetic` holds the real run summaries on this
    # branch. Files are decoded explicitly as UTF-8 so that loading does not
    # depend on the platform's default text encoding.
    synthetic = [json.loads(p.read_text(encoding='utf-8')) for p in summary_paths]

print('runs=', len(synthetic))


In [None]:
# Headline counts. Any run whose status is not exactly 'success' (including a
# missing status) is counted as an error run.
total = len(synthetic)
success = sum(1 for r in synthetic if r.get('status') == 'success')
error = total - success
success_rate = success / total if total else 0.0

# Per-run averages. `or 0` maps an explicit null (None) to 0 for both fields;
# without it a single null value would make statistics.mean raise TypeError.
avg_tool_errors = statistics.mean([r.get('tool_error_count', 0) or 0 for r in synthetic]) if total else 0.0
avg_report_len = statistics.mean([r.get('report_len', 0) or 0 for r in synthetic]) if total else 0.0

# Rounded values for display/reporting.
metrics = {
    'total_runs': total,
    'success_rate': round(success_rate, 4),
    'error_runs': error,
    'avg_tool_errors': round(avg_tool_errors, 4),
    'avg_report_len': round(avg_report_len, 2),
}
print(metrics)

# Sanity checks on the aggregated values.
assert metrics['total_runs'] >= 1
assert 0.0 <= metrics['success_rate'] <= 1.0
