In [None]:
import orjson
import multiprocessing as mp
import os
from utils.analysis_files.analysis import load_all_data
import numpy as np
import pandas as pd
from utils.analysis_files.analysis import load_all_data, identify_error_atomic_actions
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
def process_line(line):
    """Decode a single JSONL line into a Python object.

    Args:
        line: One raw line (bytes or str) read from a JSONL file.

    Returns:
        The object produced by ``orjson.loads`` for the line, or ``None``
        when the line contains only whitespace.
    """
    has_content = bool(line.strip())
    if has_content:
        # The full record is kept; a large field such as
        # "rollout_trajectories" could be popped here to save memory.
        return orjson.loads(line)
    return None

def load_jsonl_parallel(data_dir, n_workers=32):
    """Read a JSONL file and parse its lines in a pool of worker processes.

    Args:
        data_dir: Path to the ``.jsonl`` file itself. Despite the name, the
            value is passed straight to ``open`` and must be a file path.
        n_workers: Upper bound on the number of worker processes. The pool is
            never larger than the CPU count or the number of non-blank lines.
            ``None`` means "use every available CPU".

    Returns:
        list: One parsed record per non-blank line, in file order.
    """
    with open(data_dir, "rb") as f:
        # Drop blank lines in the parent so they are never pickled and sent
        # to a worker just to be discarded.
        lines = [line for line in f if line.strip()]

    if not lines:
        # Empty (or whitespace-only) file: no need to start any processes.
        return []

    cpu_count = os.cpu_count() or 1
    requested = cpu_count if n_workers is None else n_workers
    # Oversubscribing CPU-bound JSON parsing only adds fork and scheduling
    # overhead, so cap the pool at the CPU count and the amount of work.
    pool_size = min(requested, cpu_count, len(lines))

    with mp.Pool(pool_size) as pool:
        data = [x for x in pool.map(process_line, lines) if x is not None]

    return data

def find_jsonl_file(directory):
    """Return the path of a ``.jsonl`` entry located directly in ``directory``.

    ``os.listdir`` returns names in arbitrary order, so the entries are
    scanned in sorted order. When several ``.jsonl`` files exist, the
    alphabetically first one is always chosen.

    Args:
        directory: Directory to search (not recursive).

    Returns:
        str | None: ``directory`` joined with the matching file name, or
        ``None`` when no entry ends with ``.jsonl``.
    """
    for file in sorted(os.listdir(directory)):
        if file.endswith('.jsonl'):
            return os.path.join(directory, file)
    return None

In [None]:
def calculate_sample_pass1_auc(df, start=2, end=15):
    """
    Calculate the normalized area under the pass@1@T curve for each sample.

    For every trajectory, its length is ``max(step_idx) + 1``. A trajectory
    whose ``success`` column is truthy at any step counts as solved at that
    length; otherwise it is never solved. pass@1@T is the fraction of a
    sample's trajectories solved within ``T`` steps. The curve is evaluated
    for ``T = start..end`` (inclusive), shifted so its minimum is zero,
    integrated with the trapezoidal rule and divided by
    ``max(end - start, 1)``.

    Args:
        df: Step-level frame with columns ``sample_idx``, ``traj_idx``,
            ``step_idx`` and ``success``.
        start: First step budget ``T`` of the curve.
        end: Last step budget ``T`` of the curve (inclusive).

    Returns:
        pd.DataFrame: Columns ``sample_idx`` and ``pass1_auc``, one row per
        sample in order of first appearance. The columns are present even
        when ``df`` is empty, so the result can always be merged on
        ``sample_idx``.

    Raises:
        ValueError: If ``end < start`` (the curve would have no points).
    """
    columns = ['sample_idx', 'pass1_auc']

    if end < start:
        raise ValueError(f"end ({end}) must be >= start ({start})")
    if df.empty:
        return pd.DataFrame(columns=columns)

    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid;
    # fall back to the old name on older NumPy versions.
    integrate = getattr(np, 'trapezoid', None) or np.trapz

    thresholds = np.arange(start, end + 1)
    span = max(end - start, 1)

    # Summarize every trajectory once instead of re-filtering the whole frame
    # for each sample. sort=False keeps samples in order of first appearance.
    traj_summary = df.groupby(['sample_idx', 'traj_idx'], sort=False).agg(
        max_step=('step_idx', 'max'),
        traj_success=('success', 'max')
    ).reset_index()
    traj_summary['traj_length'] = traj_summary['max_step'] + 1
    # Unsuccessful trajectories get +inf so they never satisfy "<= T".
    traj_summary['first_success_step'] = np.where(
        traj_summary['traj_success'].astype(bool),
        traj_summary['traj_length'],
        np.inf
    )

    results = []
    for sample_idx, sample_trajs in traj_summary.groupby('sample_idx', sort=False):
        first_success = sample_trajs['first_success_step'].to_numpy()

        # Rows are trajectories, columns are step budgets T; the column mean
        # is the pass@1@T curve.
        pass1_curve = (first_success[:, None] <= thresholds[None, :]).mean(axis=0)

        # Area above the curve's own minimum, clamped at zero.
        adjusted_pass1 = pass1_curve - pass1_curve.min()
        auc = max(float(integrate(adjusted_pass1, dx=1)), 0.0)

        results.append({
            'sample_idx': sample_idx,
            'pass1_auc': auc / span
        })

    return pd.DataFrame(results, columns=columns)

def calculate_sample_loop_ratio(df):
    """
    Calculate the average loop ratio for each sample (using is_repeating).

    Each trajectory (one ``sample_idx``/``traj_idx`` pair) is passed, ordered
    by ``step_idx``, to ``identify_error_atomic_actions``. The returned marks
    are merged back onto the steps. A trajectory's loop ratio is the sum of
    ``is_repeating`` divided by its number of steps, and a sample's value is
    the mean loop ratio over its trajectories.

    Args:
        df: Step-level frame with columns ``sample_idx``, ``traj_idx`` and
            ``step_idx``, plus whatever ``identify_error_atomic_actions``
            reads.

    Returns:
        pd.DataFrame: Columns ``sample_idx`` and ``avg_loop_ratio``. An empty
        frame with these columns is returned when there are no trajectories.
    """
    # Apply identification function to each trajectory
    df_sorted = df.sort_values(['sample_idx', 'traj_idx', 'step_idx']).reset_index(drop=True)

    result_list = []
    for (sample_idx, traj_idx), group in df_sorted.groupby(['sample_idx', 'traj_idx']):
        group = group.reset_index(drop=True)
        # NOTE(review): assumes the helper returns one row per step of
        # `group`, in the same order, including an `is_repeating` column.
        # Confirm against utils.analysis_files.analysis.
        marks = identify_error_atomic_actions(group)
        marks['sample_idx'] = sample_idx
        marks['traj_idx'] = traj_idx
        marks['step_idx'] = group['step_idx'].values
        result_list.append(marks)

    if not result_list:
        # pd.concat raises ValueError on an empty list; return an empty,
        # correctly-labelled frame so downstream merges still work.
        return pd.DataFrame(columns=['sample_idx', 'avg_loop_ratio'])

    marks_df = pd.concat(result_list, ignore_index=True)

    # Merge back to original data
    df_sorted = df_sorted.merge(
        marks_df,
        on=['sample_idx', 'traj_idx', 'step_idx'],
        how='left'
    )

    # Calculate loop ratio for each traj
    traj_stats = df_sorted.groupby(['sample_idx', 'traj_idx']).agg(
        total_steps=('step_idx', 'count'),
        repeating_steps=('is_repeating', 'sum')
    ).reset_index()

    traj_stats['loop_ratio'] = traj_stats['repeating_steps'] / traj_stats['total_steps']

    # Calculate average loop ratio for each sample
    sample_loop_ratio = traj_stats.groupby('sample_idx')['loop_ratio'].mean().reset_index()
    sample_loop_ratio.columns = ['sample_idx', 'avg_loop_ratio']

    return sample_loop_ratio


In [None]:
# Load data: locate the run's JSONL file, parse it in parallel, then build
# the step-level DataFrame used by the analysis cells below.
data_dir = "res/frozen_lake/Qwen3-30B-A3B_user_assistant_format_20251202_001"
jsonl_path = find_jsonl_file(data_dir)
if jsonl_path is None:
    # find_jsonl_file returns None when nothing matches; fail here with a
    # clear message instead of a TypeError from open(None) further down.
    raise FileNotFoundError(f"No .jsonl file found in {data_dir!r}")
raw_data = load_jsonl_parallel(jsonl_path, n_workers=128)
df = load_all_data(raw_data)


In [None]:

# Per-sample metrics: normalized pass@1 AUC over T = 1..20, and mean loop ratio
auc_df = calculate_sample_pass1_auc(df, start=1, end=20)
loop_ratio_df = calculate_sample_loop_ratio(df)

# Keep only the samples that have both metrics
scatter_df = pd.merge(auc_df, loop_ratio_df, how='inner', on='sample_idx')

print(f"Total samples: {len(scatter_df)}")
print(scatter_df.head())

# Scatter plot: loop ratio on x, pass@1 AUC on y, one point per sample
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(figsize=(10, 6), dpi=150)

ax.scatter(
    x=scatter_df['avg_loop_ratio'],
    y=scatter_df['pass1_auc'],
    s=50,
    alpha=0.6,
)
ax.set_title('Pass@1 AUC vs. Loop Ratio', fontsize=14)
ax.set_xlabel('Average Loop Ratio (per sample)', fontsize=12)
ax.set_ylabel('Pass@1 AUC (Normalized)', fontsize=12)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Correlation (pandas default method) between the two per-sample metrics
correlation = scatter_df['avg_loop_ratio'].corr(scatter_df['pass1_auc'])
print(f"\nCorrelation coefficient: {correlation:.4f}")


In [None]:

# Write the two plotted columns to disk as a JSON list of records.
# (To export generic axis names, rename 'avg_loop_ratio' -> 'x' and
# 'pass1_auc' -> 'y' before writing.)
scatter_output = scatter_df.loc[:, ['avg_loop_ratio', 'pass1_auc']].copy()
scatter_output.to_json('scatter_auc_loop.json', indent=4, orient='records')
print("\nScatter plot data exported to scatter_auc_loop.json")