# Model Efficiency Report
Report file: `efficiency_report.json`

This notebook explores model efficiency metrics including accuracy, latency, and derived efficiency scores.


In [None]:
import json
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Locate the report: prefer the pipeline output directory, then fall back to a
# copy in the current working directory.
report_candidates = [
    Path("outputs/LLM-evaluation/efficiency_report.json"),
    Path("efficiency_report.json"),
]
report_path = next((p for p in report_candidates if p.exists()), None)
if report_path is None:
    # Fail with every location that was tried, not just the last fallback.
    searched = ", ".join(str(p) for p in report_candidates)
    raise FileNotFoundError(f"efficiency report not found; looked in: {searched}")

# Decode explicitly as UTF-8 (the JSON interchange encoding) instead of relying
# on the platform's locale encoding, which is not UTF-8 on every system.
report = json.loads(report_path.read_text(encoding="utf-8"))
df = pd.DataFrame(report['models'])
print(f"Loaded {len(df)} model-condition entries")
print(f"Created: {report['created_at']}")


## Top Models by Efficiency Score


In [None]:
# Ten entries with the highest efficiency score. Columns that the loaded
# report does not provide are left out of the table rather than raising.
cols = ['model_short', 'condition', 'accuracy', 'latency_mean_ms', 'efficiency_score', 'confidence_score']
present_columns = [name for name in cols if name in df.columns]
efficiency_ranked = df[present_columns].sort_values('efficiency_score', ascending=False)
display(efficiency_ranked.head(10))


## Top 10 Most Accurate Models (with Judge Confidence Scores)


In [None]:
# Ten most accurate entries, shown next to their confidence scores.
# Requested columns missing from `df` are skipped.
acc_cols = ['model_short', 'condition', 'accuracy', 'confidence_score', 'num_seeds', 'latency_mean_ms']
available_columns = [name for name in acc_cols if name in df.columns]
accuracy_ranked = df[available_columns].sort_values('accuracy', ascending=False)
top_accurate = accuracy_ranked.head(10)
print('\n=== Top 10 Most Accurate Models (with Judge Agreement Confidence) ===')
display(top_accurate)


## Accuracy by Model and Condition


In [None]:
# Horizontal bar chart of accuracy, one bar per model-condition entry,
# coloured by whether the entry is flagged as finetuned.
fig = plt.figure(figsize=(12, 5))
df_sorted = df.sort_values('accuracy', ascending=False)

# A missing condition is rendered as an empty pair of parentheses.
condition_text = df_sorted['condition'].fillna('')
labels = df_sorted['model_short'] + ' (' + condition_text + ')'

# Truthy `finetuned` -> green, falsy -> blue.
palette = {True: 'green', False: 'blue'}
colors = [palette[bool(flag)] for flag in df_sorted['finetuned']]

positions = np.arange(len(df_sorted))
ax = fig.gca()
ax.barh(positions, df_sorted['accuracy'], color=colors)
ax.set_yticks(positions)
ax.set_yticklabels(labels, fontsize=8)
ax.set_xlabel('Accuracy')
ax.set_title('Accuracy by Model-Condition (Green=Finetuned, Blue=Baseline)')
fig.tight_layout()
plt.show()


## Mean Latency by Model and Condition


In [None]:
# Horizontal bar chart of mean latency, one bar per model-condition entry,
# ordered from fastest to slowest.
fig = plt.figure(figsize=(12, 5))
df_sorted = df.sort_values('latency_mean_ms', ascending=True)

# A missing condition is rendered as an empty pair of parentheses.
condition_text = df_sorted['condition'].fillna('')
labels = df_sorted['model_short'] + ' (' + condition_text + ')'

positions = np.arange(len(df_sorted))
ax = fig.gca()
ax.barh(positions, df_sorted['latency_mean_ms'])
ax.set_yticks(positions)
ax.set_yticklabels(labels, fontsize=8)
ax.set_xlabel('Mean Latency (ms)')
ax.set_title('Latency by Model-Condition (Lower is Better)')
fig.tight_layout()
plt.show()


## Efficiency Score Comparison


In [None]:
# Horizontal bar chart of the efficiency score, one bar per model-condition
# entry, ordered from highest to lowest score.
fig = plt.figure(figsize=(12, 5))
df_sorted = df.sort_values('efficiency_score', ascending=False)

# A missing condition is rendered as an empty pair of parentheses.
condition_text = df_sorted['condition'].fillna('')
labels = df_sorted['model_short'] + ' (' + condition_text + ')'

positions = np.arange(len(df_sorted))
ax = fig.gca()
ax.barh(positions, df_sorted['efficiency_score'])
ax.set_yticks(positions)
ax.set_yticklabels(labels, fontsize=8)
ax.set_xlabel('Efficiency Score')
ax.set_title('Composite Efficiency Score (Higher is Better)')
fig.tight_layout()
plt.show()


## Accuracy vs Latency Scatter


In [None]:
# Scatter of accuracy against mean latency, one point per row of `df`,
# coloured by the `finetuned` flag and annotated with the short model name.
fig = plt.figure(figsize=(10, 6))

# Truthy `finetuned` -> green, falsy -> blue.
palette = {True: 'green', False: 'blue'}
colors = [palette[bool(flag)] for flag in df['finetuned']]

ax = fig.gca()
ax.scatter(df['latency_mean_ms'], df['accuracy'], c=colors, alpha=0.7, s=100)

# Walk the three columns in lockstep to place one text label per point.
point_columns = zip(df['model_short'], df['latency_mean_ms'], df['accuracy'])
for model_name, latency_ms, accuracy in point_columns:
    ax.annotate(model_name, (latency_ms, accuracy), fontsize=7)

ax.set_xlabel('Mean Latency (ms)')
ax.set_ylabel('Accuracy')
ax.set_title('Accuracy vs Latency (Green=Finetuned, Blue=Baseline)')
fig.tight_layout()
plt.show()


## Summary by Condition


In [None]:
# One row per top-level key of report['by_condition'], one column per field.
# (Assumes each key maps to a dict of summary fields -- confirm against the
# report writer.)
#
# Built with orient='index' rather than `pd.DataFrame(...).T`: a transpose
# forces all columns to a single common dtype, which becomes `object` as soon
# as any field is non-numeric. Building row-wise keeps each column's own dtype.
by_condition = pd.DataFrame.from_dict(report['by_condition'], orient='index')
display(by_condition)


## Summary by Model


In [None]:
# One row per top-level key of report['by_model'], one column per field.
# (Assumes each key maps to a dict of summary fields -- confirm against the
# report writer.)
#
# Built with orient='index' rather than `pd.DataFrame(...).T`: a transpose
# forces all columns to a single common dtype, which becomes `object` as soon
# as any field is non-numeric. Building row-wise keeps each column's own dtype,
# so 'accuracy_best' is sorted with the dtype it was loaded with.
by_model = pd.DataFrame.from_dict(report['by_model'], orient='index')
display(by_model.sort_values('accuracy_best', ascending=False))


## Rankings Summary


In [None]:
# Print every ranking in the report as a headed, numbered list.
# Entries without a 'condition' key are shown with "N/A".
for title, entries in report['rankings'].items():
    print(f"\n=== {title.upper()} ===")
    for entry in entries:
        condition = entry.get('condition', 'N/A')
        print(f"  {entry['rank']}. {entry['model']} ({condition})")
