In [None]:
# OldBench Evaluation Analysis Notebook

# %% [markdown]
# # Analyze OldBench Evaluation Results
# This notebook loads a completed OldBench evaluation log and analyzes its results.

# %% Import libraries
from inspect_ai.log import read_eval_log
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from IPython.display import display, HTML

# Widen pandas' DataFrame rendering: no cap on the number of columns shown,
# cell text clipped at 100 characters, and the overall width left unset (None).
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_colwidth', 100),
    ('display.width', None),
):
    pd.set_option(option_name, option_value)

# %% Load the evaluation log
log_file = "2025-06-14T12-00-29-04-00_oldbench-eval_b5G4V3ymhC27acePkmyWjT.eval"
print(f"Loading evaluation log: {log_file}")

# Parse the .eval file into a log object; everything below reads from it.
eval_log = read_eval_log(log_file)
print("Evaluation loaded successfully!")
print(f"Model: {eval_log.eval.model}")
print(f"Total samples: {eval_log.eval.dataset.samples}")

# Run-level results are stored on the log itself (eval_log.results), not on
# the eval spec (eval_log.eval). They can be None, e.g. when the run did not
# get as far as scoring, so guard before reading the sample counters.
eval_results = eval_log.results
if eval_results is not None:
    print(f"Success rate: {eval_results.completed_samples}/{eval_results.total_samples}")
else:
    print(f"Success rate: unavailable (log status: {eval_log.status})")

# %% Convert to DataFrame
def _message_text(content):
    """Return the plain text carried by a chat message's ``content``.

    ``content`` may be a string, ``None``, or a list of content parts. For a
    list, string parts are kept as-is, parts exposing a string ``text``
    attribute contribute that text, and every other part is skipped. The
    pieces are joined with newlines.
    """
    if not content:
        return ""
    if isinstance(content, str):
        return content
    pieces = (
        part if isinstance(part, str) else getattr(part, 'text', None)
        for part in content
    )
    return "\n".join(piece for piece in pieces if isinstance(piece, str) and piece)


def eval_to_dataframe(eval_log):
    """Convert an evaluation log into a pandas DataFrame, one row per sample.

    Args:
        eval_log: Object exposing ``samples``; each sample is read for ``id``,
            ``input``, ``target``, ``messages``, ``score``, ``metadata`` and
            ``error``. ``samples`` may be ``None``, which is treated as "no
            samples".

    Returns:
        DataFrame with the columns ``sample_id``, ``question``,
        ``target_answer``, ``model_response``, ``score``, ``directorate``,
        ``question_type``, ``response_length``, ``target_length`` and
        ``error``. The columns are present even when there are no samples, so
        downstream column lookups do not fail on an empty log.
    """
    columns = [
        'sample_id', 'question', 'target_answer', 'model_response', 'score',
        'directorate', 'question_type', 'response_length', 'target_length',
        'error',
    ]
    records = []

    for sample in eval_log.samples or []:
        # Use the first assistant message that actually carries text. Content
        # can be a list of parts instead of a plain string, so flatten it
        # before it is stored and word-counted.
        model_response = ""
        for message in sample.messages:
            if message.role == "assistant":
                text = _message_text(message.content)
                if text:
                    model_response = text
                    break

        # Samples without a score keep None here (NaN once in the DataFrame).
        score_obj = sample.score
        score = score_obj.value if score_obj else None

        metadata = sample.metadata or {}
        target = sample.target

        records.append({
            'sample_id': sample.id,
            'question': sample.input,
            'target_answer': target,
            'model_response': model_response,
            'score': score,
            'directorate': metadata.get('directorate', 'unknown'),
            'question_type': metadata.get('question_type', 'unknown'),
            # Lengths are whitespace-separated word counts.
            'response_length': len(model_response.split()),
            'target_length': len(str(target).split()),
            'error': sample.error.message if sample.error else None
        })

    return pd.DataFrame(records, columns=columns)

# Build the per-sample table that every analysis step below reads from.
df = eval_to_dataframe(eval_log)
print(f"\nCreated DataFrame with {df.shape[0]} samples")

# %% Display overall statistics
# Headline counts followed by the basic descriptive statistics of the scores.
# A sample counts as "Failed" here when it has no score value.
score_column = df['score']
summary_rule = "=" * 60

print(summary_rule)
print("EVALUATION SUMMARY")
print(summary_rule)
print(f"Total Samples: {len(df)}")
print(f"Successful: {score_column.notna().sum()}")
print(f"Failed: {score_column.isna().sum()}")
print("\nScore Statistics:")
for stat_label, stat_method in (
    ("Mean Score", score_column.mean),
    ("Median Score", score_column.median),
    ("Std Dev", score_column.std),
    ("Min Score", score_column.min),
    ("Max Score", score_column.max),
):
    # Each statistic is computed right before it is printed.
    print(f"{stat_label}: {stat_method():.3f}")

# %% Score distribution
# Histogram of per-sample scores with the mean marked as a dashed red line.
fig, ax = plt.subplots(figsize=(10, 6))
df['score'].hist(bins=20, edgecolor='black', alpha=0.7, ax=ax)
ax.set_xlabel('Score')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Evaluation Scores')
overall_mean = df['score'].mean()
ax.axvline(overall_mean, color='red', linestyle='--', label=f'Mean: {overall_mean:.3f}')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

# %% Analyze by directorate
directorate_rule = "=" * 60
print("\n" + directorate_rule)
print("ANALYSIS BY DIRECTORATE")
print(directorate_rule)

# Per-directorate score statistics plus the mean response length, rounded to
# three decimals, relabelled with readable column names and ordered from the
# highest mean score to the lowest.
directorate_stats = (
    df.groupby('directorate')
    .agg({
        'score': ['count', 'mean', 'std', 'min', 'max'],
        'response_length': 'mean'
    })
    .round(3)
    .set_axis(
        ['Count', 'Mean Score', 'Std Dev', 'Min Score', 'Max Score', 'Avg Response Length'],
        axis=1,
    )
    .sort_values('Mean Score', ascending=False)
)
display(directorate_stats)

# %% Visualize by directorate
# Horizontal bar chart of mean score per directorate, lowest at the bottom,
# with each bar annotated by its value just past the bar end.
fig, ax = plt.subplots(figsize=(12, 6))
directorate_means = df.groupby('directorate')['score'].mean().sort_values(ascending=True)
ax.barh(directorate_means.index, directorate_means.values)
ax.set_xlabel('Mean Score')
ax.set_title('Mean Scores by Directorate')
ax.set_xlim(0, 1)
for bar_position, bar_value in enumerate(directorate_means.values):
    ax.text(bar_value + 0.01, bar_position, f'{bar_value:.3f}', va='center')
fig.tight_layout()
plt.show()

# %% Analyze by question type
question_type_rule = "=" * 60
print("\n" + question_type_rule)
print("ANALYSIS BY QUESTION TYPE")
print(question_type_rule)

# Same breakdown as for directorates, grouped by question type instead:
# score statistics and mean response length, rounded, relabelled, and sorted
# by mean score in descending order.
question_type_stats = (
    df.groupby('question_type')
    .agg({
        'score': ['count', 'mean', 'std', 'min', 'max'],
        'response_length': 'mean'
    })
    .round(3)
    .set_axis(
        ['Count', 'Mean Score', 'Std Dev', 'Min Score', 'Max Score', 'Avg Response Length'],
        axis=1,
    )
    .sort_values('Mean Score', ascending=False)
)
display(question_type_stats)

# %% Find best and worst performing questions
# List the five highest- and five lowest-scoring samples. Each entry shows the
# score, directorate and question type, then the first 150 characters of the
# question followed by an ellipsis.
ranking_columns = ['question', 'score', 'directorate', 'question_type']
ranking_rule = "=" * 60

print("\n" + ranking_rule)
print("TOP 5 BEST PERFORMING QUESTIONS")
print(ranking_rule)
best_questions = df.nlargest(5, 'score')[ranking_columns]
for entry in best_questions.itertuples(index=False):
    print(f"\nScore: {entry.score:.3f} | {entry.directorate} | {entry.question_type}")
    print(f"Q: {entry.question[:150]}...")

print("\n" + ranking_rule)
print("TOP 5 WORST PERFORMING QUESTIONS")
print(ranking_rule)
worst_questions = df.nsmallest(5, 'score')[ranking_columns]
for entry in worst_questions.itertuples(index=False):
    print(f"\nScore: {entry.score:.3f} | {entry.directorate} | {entry.question_type}")
    print(f"Q: {entry.question[:150]}...")

# %% Response length analysis
# Two side-by-side scatter plots of score against word count: target answer
# length on the left, model response length on the right.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

scatter_panels = (
    ('target_length', 'Target Answer Length (words)', 'Score vs Target Answer Length'),
    ('response_length', 'Model Response Length (words)', 'Score vs Model Response Length'),
)
for panel_ax, (length_column, x_label, panel_title) in zip(axes, scatter_panels):
    panel_ax.scatter(df[length_column], df['score'], alpha=0.6)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel('Score')
    panel_ax.set_title(panel_title)
    panel_ax.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

# %% Create a detailed results table
table_rule = "=" * 60
print("\n" + table_rule)
print("DETAILED RESULTS TABLE")
print(table_rule)

# Compact view of the results: questions cut to 80 characters plus an
# ellipsis, rows ordered by score from highest to lowest; only the first ten
# rows are displayed.
results_table = (
    df[['question', 'score', 'directorate', 'question_type']]
    .assign(question=lambda frame: frame['question'].str.slice(0, 80) + '...')
    .sort_values('score', ascending=False)
)
display(results_table.head(10))

# %% Export to CSV for further analysis
# Write the full per-sample table to a CSV file in the working directory,
# leaving out the positional row index.
output_filename = "oldbench_evaluation_results.csv"
df.to_csv(path_or_buf=output_filename, index=False)
print(f"\nResults exported to: {output_filename}")

# %% Create a summary report
def _best_and_worst(stats):
    """Return the (best, worst) index labels of a stats table by 'Mean Score'.

    ``stats`` is expected to be sorted by 'Mean Score' in descending order
    already, as ``directorate_stats`` and ``question_type_stats`` are. Groups
    whose mean is NaN (no scored samples) are dropped first: sorting places
    them last, so they would otherwise be reported as the worst group.
    Returns ``(None, None)`` when no group has a usable mean.
    """
    ranked = stats['Mean Score'].dropna()
    if ranked.empty:
        return None, None
    return ranked.index[0], ranked.index[-1]


best_directorate, worst_directorate = _best_and_worst(directorate_stats)
best_question_type, worst_question_type = _best_and_worst(question_type_stats)

# Headline figures for the whole run.
summary = {
    'total_samples': len(df),
    'mean_score': df['score'].mean(),
    'std_score': df['score'].std(),
    'best_directorate': best_directorate,
    'worst_directorate': worst_directorate,
    'best_question_type': best_question_type,
    'worst_question_type': worst_question_type,
}

print("\n" + "=" * 60)
print("EXECUTIVE SUMMARY")
print("=" * 60)
for key, value in summary.items():
    # Keys become labels ("mean_score" -> "Mean Score"); floats get 3 decimals.
    if isinstance(value, float):
        print(f"{key.replace('_', ' ').title()}: {value:.3f}")
    else:
        print(f"{key.replace('_', ' ').title()}: {value}")

In [2]:
# Installs seaborn for the `import seaborn as sns` line in the analysis cell;
# on a fresh environment run this cell first. %pip targets the running
# kernel's environment, and the version is pinned for reproducibility.
%pip install seaborn==0.13.2

Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
