# Aggregated Seed Metrics
Generated from `outputs/LLM-evaluation/seed_aggregate_metrics_from_judged.json`.
This notebook loads the aggregated JSON and shows comparison tables and plots.


## Requirements
Install the plotting and data libraries if needed:
```
pip install pandas matplotlib seaborn
```


In [None]:
import json
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns  # seaborn should be installed in the notebook kernel
from IPython.display import display
# Load the seed-aggregated metrics, flatten them into one DataFrame row per
# group, then show the groups with the highest mean accuracy as a table and a
# bar plot.
# Use POSIX-style path to avoid Windows backslash escape warnings
PATH = '/home/infres/vperlic/projects/nl2atl/outputs/LLM-evaluation/seed_aggregate_metrics_from_judged.json'
with open(PATH, 'r', encoding='utf-8') as fh:
    aggregates = json.load(fh)

# Fields copied verbatim from each aggregated group into its table row.
GROUP_KEYS = ('model_short', 'condition', 'finetuned', 'few_shot', 'num_seeds')

rows = []
for g in aggregates:
    row = {k: g.get(k) for k in GROUP_KEYS}
    # Each metric contributes a <metric>_mean and a <metric>_std column.
    for metric, vals in (g.get('metrics') or {}).items():
        row[f'{metric}_mean'] = vals.get('mean')
        row[f'{metric}_std'] = vals.get('std')
    # Add confidence score (mean if aggregated).
    # `or {}` (rather than a .get default) also covers an explicit null value,
    # matching how 'metrics' is read above.
    judge_agreement = g.get('judge_agreement') or {}
    conf_score = judge_agreement.get('confidence_score')
    if isinstance(conf_score, dict):
        row['confidence_score'] = conf_score.get('mean')
    else:
        row['confidence_score'] = conf_score
    rows.append(row)

# With no groups, DataFrame(rows) would have no columns and the sort below
# would raise KeyError; keep the identifying columns so the cell still runs.
df = pd.DataFrame(rows) if rows else pd.DataFrame(columns=list(GROUP_KEYS))
display(df.sort_values(by=['model_short', 'condition']).head(20))

# Show top models by accuracy with confidence scores
if 'accuracy_mean' in df.columns:
    top_models = df.sort_values('accuracy_mean', ascending=False).head(10)
    display_cols = ['model_short', 'condition', 'accuracy_mean', 'confidence_score', 'num_seeds']
    print('\n=== Top 10 Most Accurate Models (with Confidence Scores) ===')
    display(top_models[[c for c in display_cols if c in top_models.columns]])

    # One bar per (model, condition) group. Using model_short alone as the
    # category would stack the bars of a model that appears under several
    # conditions on the same row (dodge=False), hiding all but one of them.
    plot_df = top_models.assign(
        group=top_models['model_short'].astype(str)
        + ' (' + top_models['condition'].astype(str) + ')'
    )
    plt.figure(figsize=(12, 6))
    ax = sns.barplot(data=plot_df, x='accuracy_mean', y='group', hue='condition', dodge=False)
    ax.set_ylabel('model_short (condition)')
    plt.title('Top 10 Groups by Accuracy (with Judge Agreement Scores)')
    plt.tight_layout()
    plt.show()


In [None]:
# Show the first 20 rows of the aggregated table: highest accuracy_mean first
# when that column exists, otherwise ordered by model and condition.
has_accuracy = 'accuracy_mean' in df.columns
sort_keys = 'accuracy_mean' if has_accuracy else ['model_short', 'condition']
# Descending only for the accuracy ranking; the fallback keeps ascending order.
ranked = df.sort_values(by=sort_keys, ascending=not has_accuracy)
display(ranked.head(20))
