# Multi-Sample BigCode Results Comparison

Enhanced analysis for comparing CAA vs Unsteered results with n_samples > 1.
Includes task-level pass@k analysis, per-sample success rates, and completion patterns.

## Configuration

In [1]:
import pandas as pd
import json
import numpy as np
from collections import defaultdict

In [31]:
# Base directory of the checkout. The trailing slash matters: every path in this
# notebook is built by appending a relative path directly to this string.
ROOT_PATH = "/workspace/wisent-guard/"

# Result files to compare - point these at other runs to compare different results
CAA_RESULTS_PATH = f"{ROOT_PATH}evaluation_results/caa_mbppplus_20250822_131751/consolidated_results_20250822_131751.json"
UNSTEERED_RESULTS_PATH = f"{ROOT_PATH}evaluation_results/unsteered_mbppplus_20250822_144202/consolidated_results_20250822_144202.json"

# Echo the resolved paths so the rendered notebook records which runs were compared
print("CAA Results:", CAA_RESULTS_PATH)
print("Unsteered Results:", UNSTEERED_RESULTS_PATH)

CAA Results: /workspace/wisent-guard/evaluation_results/caa_mbppplus_20250822_131751/consolidated_results_20250822_131751.json
Unsteered Results: /workspace/wisent-guard/evaluation_results/unsteered_mbppplus_20250822_144202/consolidated_results_20250822_144202.json


## Load and Examine Data

In [32]:
# Load the two consolidated result files.
# JSON is UTF-8 by specification, so decode explicitly instead of relying on the
# platform's locale-dependent default encoding.
with open(CAA_RESULTS_PATH, "r", encoding="utf-8") as f:
    caa_data = json.load(f)
with open(UNSTEERED_RESULTS_PATH, "r", encoding="utf-8") as f:
    unsteered_data = json.load(f)

# Build one DataFrame per run from each file's "detailed_results" list
df_caa = pd.DataFrame(caa_data["detailed_results"])
df_unsteered = pd.DataFrame(unsteered_data["detailed_results"])

print("=== Data Overview ===")
print(f"CAA: {len(df_caa)} completions")
print(f"Unsteered: {len(df_unsteered)} completions")

# The tasks / samples-per-task summary is only reported for runs that carry a
# 'completion_id' column (i.e. multi-sample runs).
if 'completion_id' in df_caa.columns:
    caa_tasks = df_caa['task_id'].nunique()
    caa_samples_per_task = len(df_caa) / caa_tasks
    print(f"CAA: {caa_tasks} tasks, {caa_samples_per_task:.1f} samples per task")

if 'completion_id' in df_unsteered.columns:
    unsteered_tasks = df_unsteered['task_id'].nunique()
    unsteered_samples_per_task = len(df_unsteered) / unsteered_tasks
    print(f"Unsteered: {unsteered_tasks} tasks, {unsteered_samples_per_task:.1f} samples per task")

# Show the schemas so a mismatch between the two runs is visible immediately
print(f"\nCAA columns: {list(df_caa.columns)}")
print(f"Unsteered columns: {list(df_unsteered.columns)}")

=== Data Overview ===
CAA: 3780 completions
Unsteered: 3780 completions
CAA: 378 tasks, 10.0 samples per task
Unsteered: 378 tasks, 10.0 samples per task

CAA columns: ['task_id', 'completion_id', 'generation', 'reference', 'passed', 'result']
Unsteered columns: ['task_id', 'completion_id', 'generation', 'reference', 'passed', 'result']


## Task-Level Analysis (Pass@K Logic)

In [34]:
# Group by task_id and calculate task-level pass status (pass@k logic)
def calculate_task_level_results(df, method_name):
    """Collapse per-completion results into one row per task.

    A task counts as passed when at least one of its completions passed
    (pass@k logic).

    Args:
        df: DataFrame with at least 'task_id' and 'passed' columns.
        method_name: Prefix used for every output column name.

    Returns:
        DataFrame indexed by task_id with columns
        '<method>_task_passed', '<method>_num_passed',
        '<method>_total_attempts' and '<method>_pass_rate'
        (num_passed / total_attempts, rounded to 3 decimals).
    """
    # any -> did the task pass at all, sum -> passing completions, count -> attempts
    per_task = df.groupby('task_id')['passed'].agg(['any', 'sum', 'count']).round(3)

    passed_col = f'{method_name}_task_passed'
    num_col = f'{method_name}_num_passed'
    total_col = f'{method_name}_total_attempts'
    per_task.columns = [passed_col, num_col, total_col]

    # Fraction of completions that passed for each task
    per_task[f'{method_name}_pass_rate'] = per_task[num_col].div(per_task[total_col]).round(3)

    return per_task

# Aggregate each run down to one row per task
caa_task_results = calculate_task_level_results(df_caa, 'caa')
unsteered_task_results = calculate_task_level_results(df_unsteered, 'unsteered')

# Align both runs on the task_id index; the inner join keeps only tasks present in both
task_comparison = pd.merge(
    caa_task_results,
    unsteered_task_results,
    how='inner',
    left_index=True,
    right_index=True,
)

print("=== Task-Level Results ===")
print("Total tasks compared: {}".format(len(task_comparison)))

# Task-level pass counts: a task passes when any of its completions passed
print("\nCAA: {} / {} tasks passed ({:.3f})".format(
    task_comparison['caa_task_passed'].sum(),
    len(task_comparison),
    task_comparison['caa_task_passed'].mean(),
))
print("Unsteered: {} / {} tasks passed ({:.3f})".format(
    task_comparison['unsteered_task_passed'].sum(),
    len(task_comparison),
    task_comparison['unsteered_task_passed'].mean(),
))

# Mean over tasks of each task's fraction of passing completions
print("\nAverage pass rate per task:")
print("CAA: {:.3f}".format(task_comparison['caa_pass_rate'].mean()))
print("Unsteered: {:.3f}".format(task_comparison['unsteered_pass_rate'].mean()))

task_comparison.head(10)

=== Task-Level Results ===
Total tasks compared: 378

CAA: 295 / 378 tasks passed (0.780)
Unsteered: 296 / 378 tasks passed (0.783)

Average pass rate per task:
CAA: 0.660
Unsteered: 0.673


Unnamed: 0_level_0,caa_task_passed,caa_num_passed,caa_total_attempts,caa_pass_rate,unsteered_task_passed,unsteered_num_passed,unsteered_total_attempts,unsteered_pass_rate
task_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,True,10,10,1.0,True,10,10,1.0
1,True,10,10,1.0,True,7,10,0.7
2,True,10,10,1.0,True,10,10,1.0
3,False,0,10,0.0,False,0,10,0.0
4,True,10,10,1.0,True,10,10,1.0
5,True,10,10,1.0,True,10,10,1.0
6,True,2,10,0.2,True,1,10,0.1
7,True,7,10,0.7,True,9,10,0.9
8,True,10,10,1.0,True,10,10,1.0
9,True,10,10,1.0,True,10,10,1.0


## XOR Analysis: Tasks Where Methods Differ

In [35]:
# Mark tasks whose task-level pass/fail verdict differs between the two methods
task_comparison['xor_diff'] = task_comparison['caa_task_passed'] ^ task_comparison['unsteered_task_passed']

# Split the tasks into the four possible (CAA, Unsteered) outcome combinations
caa_wins = task_comparison.loc[
    task_comparison['caa_task_passed'] & ~task_comparison['unsteered_task_passed']
]
unsteered_wins = task_comparison.loc[
    ~task_comparison['caa_task_passed'] & task_comparison['unsteered_task_passed']
]
both_pass = task_comparison.loc[
    task_comparison['caa_task_passed'] & task_comparison['unsteered_task_passed']
]
# Neither passed: not (CAA or Unsteered)
both_fail = task_comparison.loc[
    ~(task_comparison['caa_task_passed'] | task_comparison['unsteered_task_passed'])
]

print("=== XOR ANALYSIS (Task-Level Differences) ===")
print("Tasks where both methods agree: {}".format(len(both_pass) + len(both_fail)))
print("  - Both pass: {} tasks".format(len(both_pass)))
print("  - Both fail: {} tasks".format(len(both_fail)))
print("\nTasks where methods differ: {}".format(len(caa_wins) + len(unsteered_wins)))
print("  - CAA passes, Unsteered fails: {} tasks".format(len(caa_wins)))
print("  - Unsteered passes, CAA fails: {} tasks".format(len(unsteered_wins)))

# List the task ids behind each disagreement bucket (skipped when a bucket is empty)
print("\n=== TASK IDs WHERE RESULTS DIFFER ===")
if not caa_wins.empty:
    print("\nCAA WINS ({} tasks): {}".format(len(caa_wins), sorted(caa_wins.index.tolist())))

if not unsteered_wins.empty:
    print("\nUNSTEERED WINS ({} tasks): {}".format(len(unsteered_wins), sorted(unsteered_wins.index.tolist())))

=== XOR ANALYSIS (Task-Level Differences) ===
Tasks where both methods agree: 357
  - Both pass: 285 tasks
  - Both fail: 72 tasks

Tasks where methods differ: 21
  - CAA passes, Unsteered fails: 10 tasks
  - Unsteered passes, CAA fails: 11 tasks

=== TASK IDs WHERE RESULTS DIFFER ===

CAA WINS (10 tasks): [25, 30, 33, 53, 235, 275, 288, 321, 330, 366]

UNSTEERED WINS (11 tasks): [52, 61, 71, 148, 168, 177, 206, 223, 276, 308, 339]


## Per-Sample Success Rate Analysis

In [36]:
# Analyze per-sample (completion_id) success rates
def analyze_per_sample_rates(df, method_name):
    """Summarise how often each sample slot (completion_id) passed.

    Args:
        df: DataFrame with a 'passed' column and, for multi-sample runs,
            a 'completion_id' column.
        method_name: Prefix used for the output column names.

    Returns:
        With 'completion_id' present: DataFrame indexed by completion_id with
        '<method>_count', '<method>_passed' and '<method>_rate' columns
        (rounded to 3 decimals).
        Otherwise: a one-row DataFrame whose single column
        '<method>_sample_0' holds the overall pass rate.
    """
    if 'completion_id' in df.columns:
        stats = df.groupby('completion_id')['passed'].agg(['count', 'sum', 'mean'])
        stats = stats.round(3)
        stats.columns = [f'{method_name}_{suffix}' for suffix in ('count', 'passed', 'rate')]
        return stats

    # Single-sample fallback: report one overall rate
    print(f"{method_name}: No completion_id column found - assuming single sample per task")
    overall_rate = df['passed'].mean()
    return pd.DataFrame({f'{method_name}_sample_0': [overall_rate]})

# Pass statistics per sample slot (completion_id) for each run
caa_per_sample = analyze_per_sample_rates(df_caa, 'caa')
unsteered_per_sample = analyze_per_sample_rates(df_unsteered, 'unsteered')

# Outer join on the index keeps slots that exist in only one run;
# the gaps this creates are filled with 0
per_sample_comparison = pd.merge(
    caa_per_sample,
    unsteered_per_sample,
    how='outer',
    left_index=True,
    right_index=True,
)
per_sample_comparison = per_sample_comparison.fillna(0)

print("=== PER-SAMPLE SUCCESS RATES ===")
print(per_sample_comparison)

# Averaging the per-slot rates only makes sense when there is more than one slot
if len(per_sample_comparison) > 1:
    print("\n=== AVERAGE OF AVERAGES ===")
    caa_avg_of_avgs = per_sample_comparison['caa_rate'].mean()
    unsteered_avg_of_avgs = per_sample_comparison['unsteered_rate'].mean()

    print("CAA average across all samples: {:.3f}".format(caa_avg_of_avgs))
    print("Unsteered average across all samples: {:.3f}".format(unsteered_avg_of_avgs))
    print("CAA improvement: {:+.3f}".format(caa_avg_of_avgs - unsteered_avg_of_avgs))

=== PER-SAMPLE SUCCESS RATES ===
               caa_count  caa_passed  caa_rate  unsteered_count  \
completion_id                                                     
0                    378         248     0.656              378   
1                    378         252     0.667              378   
2                    378         252     0.667              378   
3                    378         247     0.653              378   
4                    378         250     0.661              378   
5                    378         248     0.656              378   
6                    378         242     0.640              378   
7                    378         259     0.685              378   
8                    378         247     0.653              378   
9                    378         248     0.656              378   

               unsteered_passed  unsteered_rate  
completion_id                                    
0                           250           0.661  
1           

## Completion Pattern Analysis

In [37]:
# Analyze when tasks first succeed (completion timing)
def analyze_first_success_patterns(df, method_name):
    """Find the first completion_id at which each task succeeds.

    Args:
        df: Per-completion results with 'task_id' and 'passed' columns and,
            for multi-sample runs, a 'completion_id' column.
        method_name: Label of the method being analysed. Not used by the
            computation; kept so the signature matches the other helpers.

    Returns:
        A 2-tuple ``(first_success_dist, first_success)``:
          * first_success: Series indexed by task_id holding the smallest
            completion_id that passed. Tasks with no passing completion
            are absent.
          * first_success_dist: number of tasks per first-passing
            completion_id, sorted by completion_id.
        ``(None, None)`` when ``df`` has no 'completion_id' column.
    """
    if 'completion_id' not in df.columns:
        # Return a pair, not a bare None: callers unpack the result into two
        # names, and unpacking a bare None raises TypeError.
        return None, None

    # Keep only passing completions, then take the earliest one per task
    passing = df[df['passed'] == True]
    first_success = passing.groupby('task_id')['completion_id'].min()

    # How many tasks first succeeded at each completion_id
    first_success_dist = first_success.value_counts().sort_index()

    return first_success_dist, first_success

# First-success statistics for each run
caa_first_success_dist, caa_first_success = analyze_first_success_patterns(df_caa, 'caa')
unsteered_first_success_dist, unsteered_first_success = analyze_first_success_patterns(df_unsteered, 'unsteered')

if caa_first_success_dist is None or unsteered_first_success_dist is None:
    # At least one run has no completion_id column, so there is nothing to compare
    print("=== COMPLETION PATTERN ANALYSIS ===")
    print("Single sample per task - no timing patterns to analyze")
else:
    print("=== FIRST SUCCESS TIMING DISTRIBUTION ===")
    # Side-by-side counts; a completion_id missing from one run becomes 0
    timing_comparison = pd.DataFrame({
        'caa_count': caa_first_success_dist,
        'unsteered_count': unsteered_first_success_dist,
    })
    timing_comparison = timing_comparison.fillna(0).astype(int)

    print(timing_comparison)

    print("\n=== SUCCESS TIMING STATS ===")
    # The mean is skipped when a run has no successful task at all
    if not caa_first_success.empty:
        print("CAA average first success at completion: {:.2f}".format(caa_first_success.mean()))
    if not unsteered_first_success.empty:
        print("Unsteered average first success at completion: {:.2f}".format(unsteered_first_success.mean()))

=== FIRST SUCCESS TIMING DISTRIBUTION ===
               caa_count  unsteered_count
completion_id                            
0                    248              250
1                     23               20
2                      7               10
3                      4                4
4                      5                2
5                      1                5
6                      0                1
7                      4                1
8                      2                0
9                      1                3

=== SUCCESS TIMING STATS ===
CAA average first success at completion: 0.43
Unsteered average first success at completion: 0.42


## Detailed Analysis of Differing Tasks

In [38]:
# Detailed breakdown of differing tasks
def _print_completion_summary(label, df, task_id):
    """Print '<label>: passed/total completions passed (rate)' for one task, if it has rows in df."""
    task_rows = df[df['task_id'] == task_id]
    if len(task_rows) > 0:
        num_passed = task_rows['passed'].sum()
        num_total = len(task_rows)
        print(f"  {label}: {num_passed}/{num_total} completions passed ({num_passed/num_total:.2f})")


def analyze_task_details(task_ids, df_caa, df_unsteered, title, max_tasks=None):
    """Show detailed completion patterns for specific tasks.

    Args:
        task_ids: Task ids to report; they are printed in sorted order.
        df_caa: Per-completion CAA results with 'task_id' and 'passed' columns.
        df_unsteered: Per-completion unsteered results, same columns.
        title: Heading printed above the breakdown.
        max_tasks: Optional cap on how many tasks are printed. The default
            (None) prints every task, which is what this function always did.

    Returns:
        None. Output is printed; nothing is printed for an empty task list.
    """
    if len(task_ids) == 0:
        return

    print(f"\n=== {title} ===")

    ordered_ids = sorted(task_ids)
    # Apply the readability cap only when the caller asked for one
    shown_ids = ordered_ids if max_tasks is None else ordered_ids[:max(0, max_tasks)]

    for task_id in shown_ids:
        print(f"\nTask {task_id}:")
        _print_completion_summary("CAA", df_caa, task_id)
        _print_completion_summary("Unsteered", df_unsteered, task_id)

    hidden = len(ordered_ids) - len(shown_ids)
    if hidden > 0:
        print(f"\n... {hidden} more task(s) not shown")

# Per-task breakdown for each bucket of disagreeing tasks (empty buckets are skipped)
if not caa_wins.empty:
    analyze_task_details(
        caa_wins.index.tolist(),
        df_caa,
        df_unsteered,
        "CAA WINS - DETAILED BREAKDOWN",
    )

if not unsteered_wins.empty:
    analyze_task_details(
        unsteered_wins.index.tolist(),
        df_caa,
        df_unsteered,
        "UNSTEERED WINS - DETAILED BREAKDOWN",
    )


=== CAA WINS - DETAILED BREAKDOWN ===

Task 25:
  CAA: 2/10 completions passed (0.20)
  Unsteered: 0/10 completions passed (0.00)

Task 30:
  CAA: 1/10 completions passed (0.10)
  Unsteered: 0/10 completions passed (0.00)

Task 33:
  CAA: 2/10 completions passed (0.20)
  Unsteered: 0/10 completions passed (0.00)

Task 53:
  CAA: 1/10 completions passed (0.10)
  Unsteered: 0/10 completions passed (0.00)

Task 235:
  CAA: 1/10 completions passed (0.10)
  Unsteered: 0/10 completions passed (0.00)

Task 275:
  CAA: 5/10 completions passed (0.50)
  Unsteered: 0/10 completions passed (0.00)

Task 288:
  CAA: 1/10 completions passed (0.10)
  Unsteered: 0/10 completions passed (0.00)

Task 321:
  CAA: 1/10 completions passed (0.10)
  Unsteered: 0/10 completions passed (0.00)

Task 330:
  CAA: 1/10 completions passed (0.10)
  Unsteered: 0/10 completions passed (0.00)

Task 366:
  CAA: 1/10 completions passed (0.10)
  Unsteered: 0/10 completions passed (0.00)

=== UNSTEERED WINS - DETAILED BREA

In [39]:
# Find task_ids where CAA wins
caa_win_task_ids = caa_wins.index.tolist()

# Build the combined long-format table here: no cell in this notebook defines
# `all_detailed_results`, so on a fresh kernel (Restart & Run All) the lookup
# below would raise NameError. Each completion is tagged with the run it came from.
# NOTE(review): the 'unsteered' label is the one the filter function in this cell
# compares against; 'caa' is assumed by symmetry - confirm it matches the
# original (missing) definition.
all_detailed_results = pd.concat(
    [
        df_caa.assign(method='caa'),
        df_unsteered.assign(method='unsteered'),
    ],
    ignore_index=True,
)

# Keep the rows of both methods for the tasks CAA won
caa_vs_unsteered = all_detailed_results[
    all_detailed_results['task_id'].isin(caa_win_task_ids)
]

# For each task, check if any unsteered generation does NOT end with '"""'
def unsteered_produced_output(group):
    """Return True when at least one unsteered generation in `group` has text after the prompt docstring.

    A generation counts as having output when its whitespace-stripped text does
    not end with three double-quote characters. Rows whose 'method' is not
    'unsteered' are ignored; a group with no unsteered rows yields False.
    """
    is_unsteered = group['method'] == 'unsteered'
    for generation in group.loc[is_unsteered, 'generation']:
        if not str(generation).strip().endswith('"""'):
            return True
    return False

# Keep only the tasks for which the predicate holds, then collect their ids
rows_with_unsteered_output = caa_vs_unsteered.groupby('task_id').filter(unsteered_produced_output)
tasks_with_unsteered_output = rows_with_unsteered_output['task_id'].unique()

# All rows (both methods) for those tasks, ordered for side-by-side reading
examples = caa_vs_unsteered.loc[
    caa_vs_unsteered['task_id'].isin(tasks_with_unsteered_output)
].sort_values(['task_id', 'method', 'completion_id'])

# Display the ids of every task that survived the filter
examples['task_id'].unique()

array([ 25,  30,  33,  53, 235, 275, 288, 321, 330, 366])

## Summary Export

In [40]:
print("=== EXPORTING RESULTS TO EXCEL ===")

# Combined per-completion table, written without the index column
all_detailed_results.to_excel(f"{ROOT_PATH}all_detailed_results.xlsx", index=False)
print("✅ All detailed results saved to: " + ROOT_PATH + "all_detailed_results.xlsx")

# Task-level comparison; the index (task_id) is written as the first column
task_comparison.to_excel(f"{ROOT_PATH}task_level_comparison.xlsx")
print("✅ Task-level comparison saved to: " + ROOT_PATH + "task_level_comparison.xlsx")

# Per-sample comparison; the index (completion_id) is written as the first column
per_sample_comparison.to_excel(f"{ROOT_PATH}per_sample_comparison.xlsx")
print("✅ Per-sample comparison saved to: " + ROOT_PATH + "per_sample_comparison.xlsx")


=== EXPORTING RESULTS TO EXCEL ===
✅ All detailed results saved to: /workspace/wisent-guard/all_detailed_results.xlsx
✅ Task-level comparison saved to: /workspace/wisent-guard/task_level_comparison.xlsx
✅ Per-sample comparison saved to: /workspace/wisent-guard/per_sample_comparison.xlsx
