In [269]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json

## Note: an FPS value of 0 means that only the first and last frames are used.

In [270]:
# Result CSVs to evaluate. Several files may be listed; they are concatenated
# into a single DataFrame when loaded.
# NOTE: every entry ends with a trailing comma on purpose. Without it, enabling
# one of the commented-out alternatives below would make Python silently join
# two adjacent string literals into one bogus path.
paths = [
    '/workspaces/ares/data/eval_dump/pi_results.csv',

    # "/workspaces/ares/data/eval_dump/eval_results_gpt-4o-mini_2025-02-13_16-59-19_video.csv",
    # "/workspaces/ares/data/eval_dump/eval_results_gpt-4o_2025-02-13_17-22-06_video.csv",
]

In [None]:
import traceback
# Load every results CSV into one frame, derive a ground-truth label from
# `success_flag`, and score the stored mean/median predictions against it.
# ignore_index=True gives the pooled frame a fresh 0..n-1 index, so rows coming
# from different files never share an index label.
df = pd.concat([pd.read_csv(path) for path in paths], ignore_index=True)


def _success_label(flag):
    """Map the string 'success' to 1.0 and any other string to 0.0; pass non-strings through unchanged."""
    return float(flag == 'success') if isinstance(flag, str) else flag


# Decide per row instead of inspecting only the first row, so files that encode
# the flag differently (text vs. bool/number) can be pooled safely.
df['label'] = df['success_flag'].map(_success_label)
# calculate if each prediction is correct (a score above 0.5 counts as predicting success)
df['accuracy_of_mean'] = df['label'] == (df['mean_performance'] > 0.5)
df['accuracy_of_median'] = df['label'] == (df['median_performance'] > 0.5)
# keep a copy of the raw serialized votes under a separate column name
df['vote_str'] = df['performance']

def converter(performance):
    """Parse a serialized list of vote scores.

    A bare ``nan`` token is not valid JSON, so it is rewritten to ``null``
    before parsing. The resulting ``None`` entries are then turned into
    ``float('nan')`` so the parsed votes stay purely numeric: ``np.mean`` and
    ``np.median`` raise ``TypeError`` on a list containing ``None``, whereas a
    NaN simply propagates into their result.

    Args:
        performance: JSON-like text, e.g. ``"[1.0, nan, 0.0]"``.

    Returns:
        The decoded value. For a list, every missing entry is ``float('nan')``;
        a lone ``nan`` decodes to ``float('nan')``.
    """
    parsed = json.loads(performance.replace('nan', 'null'))  # convert for json loads
    if isinstance(parsed, list):
        return [float('nan') if vote is None else vote for vote in parsed]
    return float('nan') if parsed is None else parsed

# Decode each row's serialized votes into a Python list.
df['votes_float'] = df['performance'].apply(converter)

# For n = 1..5, aggregate only the first n votes of every row and check whether
# thresholding that aggregate at 0.5 reproduces the label.
for n_votes in range(1, 6):
    mean_col = f'mean_performance_n={n_votes}'
    median_col = f'median_performance_n={n_votes}'

    df[mean_col] = df['votes_float'].apply(lambda votes: np.mean(votes[:n_votes]))
    df[median_col] = df['votes_float'].apply(lambda votes: np.median(votes[:n_votes]))

    df[f'accuracy_of_mean_n={n_votes}'] = df['label'] == (df[mean_col] > 0.5)
    df[f'accuracy_of_median_n={n_votes}'] = df['label'] == (df[median_col] > 0.5)

In [None]:
# Print the number of rows in every (vlm, method, fps) combination present in the data.
for group_key, group_rows in df.groupby(['vlm', 'method', 'fps']):
    print(group_key, len(group_rows))

In [None]:
# Overall accuracy of the mean vote, of the median vote, and the average
# number of votes recorded per row.
overall_mean_accuracy = df['accuracy_of_mean'].mean()
overall_median_accuracy = df['accuracy_of_median'].mean()
average_vote_count = df['votes_float'].apply(len).mean()
print(f"{overall_mean_accuracy:.3f}, {overall_median_accuracy:.3f}, {average_vote_count:.3f}")

In [None]:
# Accuracy of the mean / median prediction when only the first n votes are used (n = 1..5).
for n_votes in range(1, 6):
    mean_accuracy_n = df[f'accuracy_of_mean_n={n_votes}'].mean()
    median_accuracy_n = df[f'accuracy_of_median_n={n_votes}'].mean()
    print(f"{mean_accuracy_n:.3f}, {median_accuracy_n:.3f}")

In [275]:
# Helper function for computing means and std errors
def mean_stderr(data, ddof=0):
    """Return ``(mean, standard_error)`` of a sequence of numbers.

    The standard error is ``std(data, ddof) / sqrt(len(data))``.

    Args:
        data: sized collection of numeric values (list, array, Series, ...).
        ddof: delta degrees of freedom forwarded to ``np.std``. The default 0
            (population standard deviation) reproduces the previous results;
            pass 1 for the sample-based standard error.

    Returns:
        Tuple ``(mean, stderr)``. Empty input yields ``(nan, nan)`` without
        emitting NumPy runtime warnings.
    """
    n = len(data)
    if n == 0:
        # Nothing to average; answer explicitly instead of dividing by zero.
        return np.nan, np.nan
    mean = np.mean(data)
    stderr = np.std(data, ddof=ddof) / np.sqrt(n)
    return mean, stderr

In [276]:
from collections import defaultdict
from itertools import product

def _create_line_plot(ax, df, primary_axis, other_axes, unique_values, score_title):
    """Helper function to create a line plot on given axis"""
    other_combinations = list(product(*(unique_values[ax_name] for ax_name in other_axes)))
    
    for combo in other_combinations:
        filtered_df = df.copy()
        for ax_name, value in zip(other_axes, combo):
            filtered_df = filtered_df[filtered_df[ax_name] == value]
        
        groups = filtered_df.groupby(primary_axis)
        xs = []
        ys = []
        
        for name, group in groups:
            xs.append(name)
            ys.append(group[score_title].mean())
        
        label = ", ".join(f"{ax}={val}" for ax, val in zip(other_axes, combo))
        ax.plot(xs, ys, marker='o', label=label, linewidth=2, markersize=8)

def _create_bar_plot(ax, df, primary_axis, other_axes, unique_values, score_title):
    """Helper function to create a bar plot on given axis"""
    other_combinations = list(product(*(unique_values[ax_name] for ax_name in other_axes)))
    primary_values = unique_values[primary_axis]
    
    x = np.arange(len(primary_values))
    total_bars = len(other_combinations)
    width = 0.8 / total_bars
    
    for combo_idx, combo in enumerate(other_combinations):
        filtered_df = df.copy()
        for ax_name, value in zip(other_axes, combo):
            filtered_df = filtered_df[filtered_df[ax_name] == value]
        
        groups = filtered_df.groupby(primary_axis)
        values = {}
        
        for name, group in groups:
            values[name] = group[score_title].mean()
        
        heights = [values.get(val, 0) for val in primary_values]
        offset = (combo_idx - total_bars/2 + 0.5) * width
        
        label = ", ".join(f"{ax}={val}" for ax, val in zip(other_axes, combo))
        ax.bar(x + offset, heights, width, label=label, alpha=0.8)
    
    ax.set_xticks(x)
    ax.set_xticklabels(primary_values, rotation=45, ha='right')

def _style_axis(ax, primary_axis, metric_name):
    """Helper function to apply consistent styling to an axis"""
    ax.set_title(f'{metric_name} Performance by {primary_axis}', fontsize=12, pad=15)
    ax.set_xlabel(primary_axis, fontsize=10)
    ax.set_ylabel('Accuracy', fontsize=10)
    ax.grid(True, alpha=0.3, linestyle='--')
    ax.axhline(y=0.5, color='red', linestyle='--', alpha=0.3, label='Random Chance')
    ax.set_ylim(0.4, 1.0)
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', title=f'{metric_name} Performance', title_fontsize=10)

def _rows_matching(df, columns, values):
    """Return the rows of `df` whose `columns` equal the paired `values`."""
    subset = df
    for column, value in zip(columns, values):
        subset = subset[subset[column] == value]
    return subset


def _metric_display_name(score_title):
    """Turn a score column name into a display name, e.g. 'accuracy_of_mean_n=5' -> 'Mean N=5'."""
    return score_title.replace('accuracy_of_', '').replace("_", " ").title()


def _combo_label(other_axes, combo):
    """Format a combination of axis values as 'axis=value, axis=value'."""
    return ", ".join(f"{axis_name}={value}" for axis_name, value in zip(other_axes, combo))


def _plot_combined_lines(ax, df, primary_axis, other_axes, unique_values, score_title_list):
    """Draw one line per (metric, combination of `other_axes` values) on a single axis."""
    for combo in product(*(unique_values[axis_name] for axis_name in other_axes)):
        subset = _rows_matching(df, other_axes, combo)
        xs = []
        ys = defaultdict(list)

        for name, group in subset.groupby(primary_axis):
            xs.append(name)
            for score_title in score_title_list:
                ys[score_title].append(group[score_title].mean())

        if not xs:
            # Combination absent from the data: no empty lines / legend entries.
            continue

        combo_label = _combo_label(other_axes, combo)
        for score_title in score_title_list:
            ax.plot(xs, ys[score_title], marker='o',
                    label=f'{_metric_display_name(score_title)} - {combo_label}',
                    linewidth=2, markersize=8)


def _plot_combined_bars(ax, df, primary_axis, other_axes, unique_values, score_title_list):
    """Draw grouped bars for every (combination, metric) pair on a single axis."""
    other_combinations = list(product(*(unique_values[axis_name] for axis_name in other_axes)))
    primary_values = unique_values[primary_axis]

    x = np.arange(len(primary_values))
    total_bars = len(other_combinations) * len(score_title_list)

    # Nothing to draw when there are no combinations; also avoids dividing by zero.
    if total_bars:
        width = 0.8 / total_bars
        bar_idx = 0
        for combo in other_combinations:
            subset = _rows_matching(df, other_axes, combo)
            values = defaultdict(dict)

            for name, group in subset.groupby(primary_axis):
                for score_title in score_title_list:
                    values[score_title][name] = group[score_title].mean()

            combo_label = _combo_label(other_axes, combo)

            for score_title in score_title_list:
                # Missing primary values are drawn as zero-height bars.
                heights = [values[score_title].get(val, 0) for val in primary_values]
                offset = (bar_idx - total_bars/2 + 0.5) * width
                ax.bar(x + offset, heights, width,
                       label=f'{_metric_display_name(score_title)} - {combo_label}', alpha=0.8)
                bar_idx += 1

    ax.set_xticks(x)
    ax.set_xticklabels(primary_values, rotation=45, ha='right')


def plot_along_axes(df, axis_names: list[str], score_title_list: list[str], chart_type='line'):
    """Main plotting function that creates individual metric plots and optionally a combined plot.

    One subplot is drawn per entry of `score_title_list`. When more than one
    metric is given, an extra "Combined" subplot overlays all of them.

    Args:
        df: DataFrame containing every column named in `axis_names` and
            `score_title_list`.
        axis_names: the first entry is the x axis; every combination of values
            of the remaining entries gets its own line or bar series.
        score_title_list: columns to average and plot.
        chart_type: 'line' or 'bar'.

    Raises:
        ValueError: if `axis_names` or `score_title_list` is empty, or if
            `chart_type` is not 'line' or 'bar'. These checks run before any
            figure is created; previously an unknown chart type silently
            produced empty, styled figures.
    """
    if not axis_names:
        raise ValueError("axis_names must contain at least one column name")
    if not score_title_list:
        raise ValueError("score_title_list must contain at least one column name")
    if chart_type not in ('line', 'bar'):
        raise ValueError(f"chart_type must be 'line' or 'bar', got {chart_type!r}")

    # Create figure with appropriate number of subplots
    n_metrics = len(score_title_list)
    n_plots = n_metrics + 1 if n_metrics > 1 else n_metrics  # Only add combined plot if multiple metrics
    # squeeze=False always returns a 2-D array of axes, even for a single subplot.
    fig, axes = plt.subplots(1, n_plots, figsize=(12 * n_plots, 6), squeeze=False)
    axes = axes.ravel()

    # Get unique values for each axis
    unique_values = {name: sorted(df[name].unique()) for name in axis_names}
    primary_axis = axis_names[0]
    other_axes = axis_names[1:]

    # Create individual metric plots
    draw_single = _create_line_plot if chart_type == 'line' else _create_bar_plot
    for ax, score_title in zip(axes[:n_metrics], score_title_list):
        draw_single(ax, df, primary_axis, other_axes, unique_values, score_title)
        _style_axis(ax, primary_axis, _metric_display_name(score_title))

    # Create combined plot only if there are multiple metrics
    if n_metrics > 1:
        draw_combined = _plot_combined_lines if chart_type == 'line' else _plot_combined_bars
        draw_combined(axes[-1], df, primary_axis, other_axes, unique_values, score_title_list)
        _style_axis(axes[-1], primary_axis, "Combined")

    plt.tight_layout()
    plt.show()

In [None]:
# Plot the 5-vote mean/median accuracy for several groupings, in this order:
# fps x vlm as lines, fps x vlm as bars, then per method and per task as bars.
five_vote_metrics = ['accuracy_of_mean_n=5', 'accuracy_of_median_n=5']
for grouping, kind in (
    (['fps', 'vlm'], 'line'),
    (['fps', 'vlm'], 'bar'),
    (['method'], 'bar'),
    (['task'], 'bar'),
):
    plot_along_axes(df,
                    axis_names=grouping,
                    score_title_list=five_vote_metrics,
                    chart_type=kind)


In [None]:
# Create melted dataframe for both mean and median accuracies.
# melt builds a new frame, so `df` itself is left untouched.
plot_df = pd.melt(df,
                  id_vars=['fps', 'vlm', 'method', 'task'],
                  value_vars=[f'accuracy_of_mean_n={i+1}' for i in range(5)] +
                            [f'accuracy_of_median_n={i+1}' for i in range(5)],
                  var_name='n_votes',
                  value_name='accuracy')

# Extract n value and type (mean/median) from the column name.
# Raw strings keep `\d` / `\w` as regex escapes rather than invalid Python
# string escapes; expand=False makes extract return a Series instead of a
# one-column DataFrame.
plot_df['n'] = plot_df['n_votes'].str.extract(r'(\d+)', expand=False).astype(int)
plot_df['type'] = plot_df['n_votes'].str.extract(r'accuracy_of_(\w+)_n=', expand=False)

# Pivot to create separate mean and median columns.
# pivot_table's default aggregation is the mean, so each cell holds the average
# accuracy of all rows sharing the same (fps, vlm, method, task, n).
# NOTE(review): anything that averages these cells again weights every cell
# equally regardless of how many rows it came from - confirm that is intended.
plot_df = plot_df.pivot_table(
    index=['fps', 'vlm', 'method', 'task', 'n'],
    columns='type',
    values='accuracy'
).reset_index()

# With n (number of votes) as the primary axis, draw the mean and median
# accuracy split by vlm (lines), split by fps (lines), and split by vlm again
# as bars.
vote_metrics = ['mean', 'median']
for split_by, kind in (('vlm', 'line'), ('fps', 'line'), ('vlm', 'bar')):
    plot_along_axes(plot_df,
                    axis_names=['n', split_by],
                    score_title_list=vote_metrics,
                    chart_type=kind)