# MADE Results Analysis

This notebook demonstrates how to load and analyze results from MADE benchmark runs.

In [None]:
from pathlib import Path
import matplotlib.pyplot as plt

from results_analysis_utils import (
    load_single_run_results,
    load_baseline_results,
    load_baseline_overall_summary,
    load_baseline_per_system_summary,
    load_experiment_metadata,
    load_experiment_progress,
    load_summary,
    plot_discovery_curves,
    plot_final_metrics_comparison,
    results_to_dataframe,
    STRATEGY_LABELS,
)

## 1. Loading Results from a Single Run

Results from `run_benchmark.py` are saved in a timestamped directory.

In [None]:
# Find the most recent results directory.
# Run directories are the sub-directories of `results_base` whose names do not
# start with "baselines"; the last one in sorted order is selected.
results_base = Path("../results")

# `Path.iterdir()` raises FileNotFoundError when the directory is missing
# (e.g. on a fresh checkout), which would crash this cell before the
# "No results found" hint below could be printed. Treat a missing results
# directory the same as an empty one.
if results_base.is_dir():
    run_dirs = sorted(
        d for d in results_base.iterdir() if d.is_dir() and not d.name.startswith("baselines")
    )
else:
    run_dirs = []

if run_dirs:
    latest_run = run_dirs[-1]
    print(f"Loading results from: {latest_run.name}")
else:
    print("No results found. Run a benchmark first with: uv run scripts/run_benchmark.py")
    # Downstream cells check `latest_run` before doing any work.
    latest_run = None

In [None]:
if latest_run:
    # Episode trajectories and the per-episode final metrics for the selected run
    metrics_histories, final_metrics_list = load_single_run_results(latest_run, verbose=True)
    print(f"Loaded {len(metrics_histories)} episodes")

    # Summary statistics for the same run; nothing is reported when the summary is falsy
    summary = load_summary(latest_run)
    if summary:
        print(f"\nSummary metrics (showing key discovery metrics):")
        key_metrics = [
            "final/novelty_stable_unique_novel_count",
            "final/novelty_stable_unique_novel_fraction",
            "final/recall_formula",
            "final/precision_formula",
            "final/area_under_discovery_curve_normalized",
        ]
        # Only report the key metrics that are actually present in the summary
        available_keys = [metric_key for metric_key in key_metrics if metric_key in summary]
        for metric_key in available_keys:
            stats = summary[metric_key]
            short_name = metric_key.replace("final/", "")
            # A missing "sem" entry is displayed as 0
            sem = stats.get('sem', 0)
            print(f"  {short_name}: {stats['mean']:.3f} ± {sem:.3f}")

## 2. Plot Discovery Curves

Discovery curves show the cumulative number of stable structures discovered as a function of the number of oracle queries.

In [None]:
if latest_run and metrics_histories:
    # Single-panel 8x6 figure; the drawing itself is delegated to plot_discovery_curves
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 6))
    plot_discovery_curves(
        metrics_histories,
        ax=ax,
        label="Agent",
    )
    # Title the plot with the name of the run directory being shown
    ax.set_title(f"Discovery Curve - {latest_run.name}")
    plt.show()

## 3. Final Metrics Comparison

Compare final metrics across episodes.

In [None]:
if latest_run and final_metrics_list:
    # Single-panel 10x6 figure; the drawing itself is delegated to plot_final_metrics_comparison
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 6))
    plot_final_metrics_comparison(
        final_metrics_list,
        ax=ax,
    )
    ax.set_title("Final Metrics (Mean ± Std across Episodes)")
    plt.show()