In [15]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json

## Note: an FPS value of 0 means that only the first and last frames are used.

In [16]:
# potentially combine paths!
# Every results CSV lives in the same dump directory, so the directory is
# spelled once and joined with each file name below.
EVAL_DUMP_DIR = "/workspaces/ares/data/eval_dump"
RESULT_FILES = [
    "eval_results_gpt-4o-mini_2025-01-06_21-54-15_video.csv",
    "eval_results_gpt-4o_2025-01-06_22-00-33_video.csv",
    "eval_results_gemini-1.5-pro_2025-01-06_22-04-12_video.csv",
    "eval_results_gemini-1.5-pro_2025-01-06_23-07-26_frame_descriptions.csv",
    "eval_results_gpt-4o_2025-01-07_00-33-35_frame_descriptions.csv",
    "eval_results_gpt-4o-mini_2025-01-07_01-11-20_frame_descriptions.csv",
]
paths = [f"{EVAL_DUMP_DIR}/{file_name}" for file_name in RESULT_FILES]

In [17]:
# Load every results CSV into a single frame. ignore_index=True gives the
# combined frame a fresh 0..N-1 index; without it each file keeps its own
# row labels, so labels repeat across files and any label-based lookup on
# `df` (.loc, idxmax, ...) could match several rows.
df = pd.concat([pd.read_csv(path) for path in paths], ignore_index=True)

# Ground truth: 1 when the row is flagged 'success', 0 for anything else.
df['label'] = (df['success_flag'] == 'success').astype(int)

# A row is predicted "success" when its aggregated score exceeds 0.5; the
# accuracy columns are True where that prediction agrees with the label.
df['accuracy_of_mean'] = df['label'] == (df['mean_performance'] > 0.5)
df['accuracy_of_median'] = df['label'] == (df['median_performance'] > 0.5)

# 'performance' is JSON text: keep the raw string and a parsed copy
# (later cells treat the parsed value as a list of votes).
df['vote_str'] = df['performance']
df['votes_float'] = df['performance'].apply(json.loads)

In [None]:
# Print the number of rows available for each (vlm, method, fps) combination.
for combo_key, combo_rows in df.groupby(['vlm', 'method', 'fps']):
    print(combo_key, len(combo_rows))

In [None]:
# Headline numbers: accuracy of the mean-score prediction, accuracy of the
# median-score prediction, and the average number of votes per row.
overall_mean_acc = df['accuracy_of_mean'].mean()
overall_median_acc = df['accuracy_of_median'].mean()
avg_vote_count = df['votes_float'].apply(len).mean()
print(f"{overall_mean_acc:.3f}, {overall_median_acc:.3f}, {avg_vote_count:.3f}")

# Preview of the combined frame (rendered as the cell output).
df.head()

In [20]:
# Helper function for computing means and std errors
def mean_stderr(data, ddof=0):
    """Return the mean of ``data`` and the standard error of that mean.

    Parameters
    ----------
    data : sequence or array-like of numbers
        Must support ``len()``.
    ddof : int, default 0
        Delta degrees of freedom forwarded to ``np.std``. The default keeps
        the population standard deviation this helper has always used. The
        groupby summaries in this notebook compute their error bars with
        pandas ``.std()``, which defaults to ``ddof=1``; pass ``ddof=1`` here
        to get numbers comparable with those.

    Returns
    -------
    (mean, stderr) : tuple of float
        ``stderr`` is ``std(data, ddof) / sqrt(len(data))``. Both values are
        NaN for empty input.
    """
    n = len(data)
    if n == 0:
        # Nothing to average: report NaN explicitly instead of letting numpy
        # emit "mean of empty slice" / invalid-divide warnings.
        return np.nan, np.nan
    mean = np.mean(data)
    stderr = np.std(data, ddof=ddof) / np.sqrt(n)
    return mean, stderr

In [None]:
# Group by task, VLM, FPS, and method to calculate mean accuracy.
# std_err is the standard deviation of the per-row correctness flags
# (pandas default, ddof=1) divided by sqrt(group size).
task_model_fps_results = df.groupby(['task', 'vlm', 'fps', 'method']).apply(
    lambda x: pd.Series({
        'accuracy': x['accuracy_of_mean'].mean(),
        'std_err': x['accuracy_of_mean'].std() / np.sqrt(len(x))
    })
).reset_index()

# Get unique values
tasks = task_model_fps_results['task'].unique()
vlms = task_model_fps_results['vlm'].unique()
fps_values = sorted(task_model_fps_results['fps'].unique())
methods = task_model_fps_results['method'].unique()
n_series = len(vlms) * len(methods)  # number of bars inside each FPS group

# Create subplot for each task. squeeze=False always returns a 2-D array of
# axes, so the single-task case needs no special handling.
fig, axes = plt.subplots(len(tasks), 1, figsize=(12, 5*len(tasks)), squeeze=False)
axes = axes[:, 0]

for task, ax in zip(tasks, axes):
    task_data = task_model_fps_results[task_model_fps_results['task'] == task]
    x = np.arange(len(fps_values))
    width = 0.8 / n_series  # each FPS group occupies 0.8 of its unit slot

    # Plot bars for each VLM and method combination
    bar_idx = 0
    for vlm in vlms:
        for method in methods:
            # Offset that centres the whole group of bars on the FPS tick
            offset = (bar_idx - n_series/2 + 0.5) * width

            accuracies = []
            errors = []
            positions = []

            for j, fps in enumerate(fps_values):
                combo_data = task_data[
                    (task_data['vlm'] == vlm) &
                    (task_data['fps'] == fps) &
                    (task_data['method'] == method)
                ]
                if not combo_data.empty:
                    positions.append(j + offset)
                    accuracies.append(combo_data['accuracy'].iloc[0])
                    errors.append(combo_data['std_err'].iloc[0])

            if positions:  # Only plot if we have data
                ax.bar(positions, accuracies, width,
                       label=f'{vlm} ({method})',
                       alpha=0.7)

                ax.errorbar(positions, accuracies,
                            yerr=errors, fmt='none', color='black', capsize=3)

            # Advance even when nothing was drawn so every (vlm, method)
            # series keeps the same slot in every subplot.
            bar_idx += 1

    # Customize each subplot
    ax.set_ylabel('Accuracy')
    ax.set_title(f'Task: {task}')
    ax.set_xticks(x)
    ax.set_xticklabels([f'{fps} FPS' for fps in fps_values])
    ax.grid(True, alpha=0.3)

    # Add horizontal line at 0.5 for random chance. It must exist before the
    # legend is built: legend() only collects artists already on the axes, so
    # drawing the line afterwards left 'Random Chance' out of the legend.
    ax.axhline(y=0.5, color='red', linestyle='--', alpha=0.3, label='Random Chance')
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()

# Print best combinations
print("\nBest performing combinations for each task:")
for task in tasks:
    task_data = task_model_fps_results[task_model_fps_results['task'] == task]
    # idxmax returns the row label of the highest accuracy for this task
    best_row = task_data.loc[task_data['accuracy'].idxmax()]
    print(f"\n{task}:")
    print(f"  VLM: {best_row['vlm']}")
    print(f"  FPS: {best_row['fps']}")
    print(f"  Method: {best_row['method']}")
    print(f"  Accuracy: {best_row['accuracy']:.3f} ± {best_row['std_err']:.3f}")


In [None]:
# Vote analysis: how does accuracy change when only the first 1, 3 or 5
# votes of each row are averaged?
vote_counts = [1, 3, 5]

# Collect one plain dict per (row, n_votes) and build the frame once at the
# end. Growing a DataFrame with pd.concat inside the loop re-copies every
# previously added row on each iteration.
vote_records = []
for _, row in df.iterrows():
    perf = row['votes_float']
    if not isinstance(perf, list):
        continue  # only list-valued votes can be sliced
    for n_votes in vote_counts:
        # Take first n_votes; skip rows that have fewer votes than requested
        votes = perf[:n_votes]
        if len(votes) < n_votes:
            continue
        vote_records.append({
            'vlm': row['vlm'],
            'task': row['task'],
            'method': row['method'],
            'n_votes': n_votes,
            # Mean performance using only this many votes
            'mean_performance': np.mean(votes),
            'true_success': 1 if row['success_flag'] == 'success' else 0,
        })

# Explicit columns keep the frame well-formed even if no row qualified.
vote_analysis = pd.DataFrame(
    vote_records,
    columns=['vlm', 'task', 'method', 'n_votes', 'mean_performance', 'true_success'],
)
# Same 0.5 decision threshold as the full-vote accuracy columns on df.
vote_analysis['accuracy_of_mean'] = vote_analysis['true_success'] == (vote_analysis['mean_performance'] > 0.5)

# Calculate accuracy for each VLM, method, and number of votes
vote_results = vote_analysis.groupby(['vlm', 'method', 'n_votes']).apply(
    lambda x: pd.Series({
        'accuracy': x['accuracy_of_mean'].mean(),
        'std_err': x['accuracy_of_mean'].std() / np.sqrt(len(x))
    })
).reset_index()


# Create separate plots for each method. squeeze=False always returns a 2-D
# array of axes, so a single method needs no special handling.
methods = vote_results['method'].unique()
fig, axes = plt.subplots(len(methods), 1, figsize=(12, 5*len(methods)), squeeze=False)
axes = axes[:, 0]

for method, ax in zip(methods, axes):
    # Filter data for this method
    method_data = vote_results[vote_results['method'] == method]

    # Plot each VLM for this method
    for vlm in method_data['vlm'].unique():
        vlm_data = method_data[method_data['vlm'] == vlm]
        if not vlm_data.empty:
            ax.errorbar(vlm_data['n_votes'],
                        vlm_data['accuracy'],
                        yerr=vlm_data['std_err'],
                        label=vlm,
                        marker='o',
                        capsize=5)

    # Customize subplot
    ax.set_xlabel('Number of Votes')
    ax.set_ylabel('Accuracy')
    ax.set_title(f'Accuracy vs Number of Votes by Model ({method})')
    ax.grid(True, alpha=0.3)
    # Draw the chance line before building the legend: legend() only picks up
    # artists that already exist, so the old order omitted 'Random Chance'.
    ax.axhline(y=0.5, color='red', linestyle='--', alpha=0.3, label='Random Chance')
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()

In [None]:
def _accuracy_with_stderr(group):
    """Summarise one group: mean of 'accuracy_of_mean' and its standard error."""
    correct = group['accuracy_of_mean']
    return pd.Series({
        'accuracy': correct.mean(),
        'std_err': correct.std() / np.sqrt(len(group)),
    })


# The (vlm, fps, method) grouping feeds three of the frames below.
by_vlm_fps_method = df.groupby(['vlm', 'fps', 'method'])

# For task_results
task_results = (
    df.groupby(['task', 'method'])
    .apply(_accuracy_with_stderr)
    .reset_index()
)

# For fps_results and related dataframes
fps_results = by_vlm_fps_method.apply(_accuracy_with_stderr).reset_index()

# Accuracy only (no error column), once for the mean-based prediction and
# once for the median-based prediction.
fps_model_mean_accs = by_vlm_fps_method.apply(
    lambda group: group['accuracy_of_mean'].mean()
).reset_index(name='accuracy')

fps_model_median_accs = by_vlm_fps_method.apply(
    lambda group: group['accuracy_of_median'].mean()
).reset_index(name='accuracy')

# For task_model_results
task_model_results = (
    df.groupby(['task', 'vlm', 'method'])
    .apply(_accuracy_with_stderr)
    .reset_index()
)


In [None]:
# Accuracy vs FPS, one subplot per method, comparing the mean-based and
# median-based predictions for every VLM.
# Get unique methods
methods = fps_model_mean_accs['method'].unique()

# Create subplot for each method. squeeze=False always returns a 2-D array of
# axes, so a single method needs no special handling.
fig, axes = plt.subplots(len(methods), 1, figsize=(10, 6*len(methods)), squeeze=False)
axes = axes[:, 0]

# Plot for each method
for method, ax in zip(methods, axes):
    # Filter data for this method
    method_mean_data = fps_model_mean_accs[fps_model_mean_accs['method'] == method]
    method_median_data = fps_model_median_accs[fps_model_median_accs['method'] == method]

    # Plot for each VLM within this method
    for vlm in method_mean_data['vlm'].unique():
        # Plot mean accuracy (solid line, round markers)
        vlm_data = method_mean_data[method_mean_data['vlm'] == vlm]
        ax.plot(vlm_data['fps'], vlm_data['accuracy'],
                marker='o', label=f'{vlm} (mean)', linestyle='-')

        # Plot median accuracy (dashed line, square markers)
        vlm_data = method_median_data[method_median_data['vlm'] == vlm]
        ax.plot(vlm_data['fps'], vlm_data['accuracy'],
                marker='s', label=f'{vlm} (median)', linestyle='--')

    # Customize subplot
    ax.set_xlabel('FPS')
    ax.set_ylabel('Accuracy')
    ax.set_title(f'Performance Accuracy vs FPS ({method})')
    ax.grid(True, alpha=0.3)
    # The chance line has to be on the axes before legend() runs, otherwise
    # its 'Random Chance' label never makes it into the legend.
    ax.axhline(y=0.5, color='red', linestyle='--', alpha=0.3, label='Random Chance')
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()

In [None]:
# per task performance
# Accuracy of the mean-score prediction for each task, with its standard error.
task_results = (
    df.groupby(['task'])
    .apply(lambda grp: pd.Series({
        'accuracy': grp['accuracy_of_mean'].mean(),
        'std_err': grp['accuracy_of_mean'].std() / np.sqrt(len(grp)),
    }))
    .reset_index()
)

task_names = task_results['task']
task_accuracy = task_results['accuracy']

# Bars with black error whiskers on top
plt.figure(figsize=(12, 6))
bars = plt.bar(task_names, task_accuracy)
plt.errorbar(task_names, task_accuracy, yerr=task_results['std_err'],
             fmt='none', color='black', capsize=5)

# Reference line at 0.5 (random chance)
plt.axhline(y=0.5, color='red', linestyle='--', label='Random Chance')

# Titles, labels and tick styling
plt.title('Performance Accuracy by Task')
plt.xlabel('Task')
plt.ylabel('Accuracy')
plt.xticks(rotation=45, ha='right')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.legend()
plt.show()

# Same numbers as a plain-text table
print("\nTask-wise Performance:")
print(task_results.to_string(index=False))


In [None]:
# Accuracy (and its standard error) for every (task, VLM) pair.
task_model_results = (
    df.groupby(['task', 'vlm'])
    .apply(lambda grp: pd.Series({
        'accuracy': grp['accuracy_of_mean'].mean(),
        'std_err': grp['accuracy_of_mean'].std() / np.sqrt(len(grp)),
    }))
    .reset_index()
)

plt.figure(figsize=(12, 6))

# Get unique tasks and VLMs
tasks = task_model_results['task'].unique()
vlms = task_model_results['vlm'].unique()
x = np.arange(len(tasks))
width = 0.8 / len(vlms)  # Width of bars with spacing

# One bar series per VLM, placed side by side around each task tick
for vlm_idx, vlm in enumerate(vlms):
    vlm_rows = task_model_results[task_model_results['vlm'] == vlm]
    # Shift that centres the group of VLM bars on the tick
    offset = (vlm_idx - len(vlms)/2 + 0.5) * width

    positions, accuracies, errors = [], [], []
    for task_pos, task_name in enumerate(tasks):
        matching = vlm_rows[vlm_rows['task'] == task_name]
        if matching.empty:
            continue  # this VLM has no result for the task
        positions.append(task_pos + offset)
        accuracies.append(matching['accuracy'].iloc[0])
        errors.append(matching['std_err'].iloc[0])

    plt.bar(positions, accuracies, width, label=vlm)
    plt.errorbar(positions, accuracies,
                 yerr=errors, fmt='none', color='black', capsize=3)

# Titles, labels and tick styling
plt.title('Performance Accuracy by Task and Model')
plt.xlabel('Task')
plt.ylabel('Accuracy')
plt.xticks(x, tasks, rotation=45, ha='right')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()


In [None]:
# Group by task, VLM, and FPS to calculate mean accuracy (methods pooled).
# std_err is the standard deviation of the per-row correctness flags
# (pandas default, ddof=1) divided by sqrt(group size).
task_model_fps_results = df.groupby(['task', 'vlm', 'fps']).apply(
    lambda x: pd.Series({
        'accuracy': x['accuracy_of_mean'].mean(),
        'std_err': x['accuracy_of_mean'].std() / np.sqrt(len(x))
    })
).reset_index()

# Get unique values
tasks = task_model_fps_results['task'].unique()
vlms = task_model_fps_results['vlm'].unique()
fps_values = sorted(task_model_fps_results['fps'].unique())

# Create subplot for each task. squeeze=False always returns a 2-D array of
# axes, so the single-task case needs no special handling.
fig, axes = plt.subplots(len(tasks), 1, figsize=(12, 5*len(tasks)), squeeze=False)
axes = axes[:, 0]

for task, ax in zip(tasks, axes):
    task_data = task_model_fps_results[task_model_fps_results['task'] == task]
    x = np.arange(len(fps_values))
    width = 0.8 / len(vlms)  # Width of bars with spacing

    # Plot bars for each VLM
    for i, vlm in enumerate(vlms):
        # Offset that centres the group of VLM bars on the FPS tick
        offset = (i - len(vlms)/2 + 0.5) * width

        accuracies = []
        errors = []
        positions = []

        for j, fps in enumerate(fps_values):
            vlm_data = task_data[(task_data['vlm'] == vlm) & (task_data['fps'] == fps)]
            if not vlm_data.empty:
                positions.append(j + offset)
                accuracies.append(vlm_data['accuracy'].iloc[0])
                errors.append(vlm_data['std_err'].iloc[0])

        ax.bar(positions, accuracies, width,
               label=vlm,
               alpha=0.7)

        ax.errorbar(positions, accuracies,
                    yerr=errors, fmt='none', color='black', capsize=3)

    # Customize each subplot
    ax.set_ylabel('Accuracy')
    ax.set_title(f'Task: {task}')
    ax.set_xticks(x)
    ax.set_xticklabels([f'{fps} FPS' for fps in fps_values])
    ax.grid(True, alpha=0.3)

    # Add horizontal line at 0.5 for random chance. It is drawn before the
    # legend is built because legend() only collects artists that already
    # exist; the old order dropped 'Random Chance' from the legend.
    ax.axhline(y=0.5, color='red', linestyle='--', alpha=0.3, label='Random Chance')
    ax.legend()

plt.tight_layout()
plt.show()

# Print best combinations
print("\nBest performing combinations for each task:")
for task in tasks:
    task_data = task_model_fps_results[task_model_fps_results['task'] == task]
    # idxmax returns the row label of the highest accuracy for this task
    best_row = task_data.loc[task_data['accuracy'].idxmax()]
    print(f"\n{task}:")
    print(f"  VLM: {best_row['vlm']}")
    print(f"  FPS: {best_row['fps']}")
    print(f"  Accuracy: {best_row['accuracy']:.3f} ± {best_row['std_err']:.3f}")

In [None]:
# Overall method performance: mean accuracy and its standard error
# (std of the per-row correctness flags / sqrt(group size)).
method_results = df.groupby(['method']).agg({
    'accuracy_of_mean': ['mean', lambda x: x.std() / np.sqrt(len(x))]
}).reset_index()
# Flatten the two-level column index produced by agg()
method_results.columns = ['method', 'accuracy', 'std_err']

# Method performance by model
model_method_results = df.groupby(['vlm', 'method']).agg({
    'accuracy_of_mean': ['mean', lambda x: x.std() / np.sqrt(len(x))]
}).reset_index()
model_method_results.columns = ['vlm', 'method', 'accuracy', 'std_err']

# Create two subplots side by side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Plot 1: Overall method performance
ax1.bar(method_results['method'], method_results['accuracy'])
ax1.errorbar(method_results['method'], method_results['accuracy'],
             yerr=method_results['std_err'], fmt='none', color='black', capsize=5)
ax1.set_title('Overall Method Performance')
ax1.set_ylabel('Accuracy')
ax1.grid(True, alpha=0.3)
ax1.axhline(y=0.5, color='red', linestyle='--', alpha=0.3, label='Random Chance')
# The chance line carries a label, so show it; without a legend the label
# was never displayed on this panel.
ax1.legend()

# Plot 2: Method performance by model
vlms = model_method_results['vlm'].unique()
methods = model_method_results['method'].unique()
x = np.arange(len(vlms))
# Keep the 0.35 bar width for up to two methods, but shrink it when there are
# more so that one model's bars never spill into the neighbouring model.
width = min(0.35, 0.8 / max(len(methods), 1))

for i, method in enumerate(methods):
    method_data = model_method_results[model_method_results['method'] == method]
    offset = (i - len(methods)/2 + 0.5) * width

    # Ensure data aligns with x-axis positions: one entry per VLM, in order
    accuracies = []
    errors = []
    for vlm in vlms:
        vlm_data = method_data[method_data['vlm'] == vlm]
        if not vlm_data.empty:
            accuracies.append(vlm_data['accuracy'].iloc[0])
            errors.append(vlm_data['std_err'].iloc[0])
        else:
            # No runs for this (vlm, method): NaN keeps the slot aligned but
            # draws nothing, whereas 0 would read as a real 0% accuracy.
            accuracies.append(np.nan)
            errors.append(np.nan)

    ax2.bar(x + offset, accuracies, width, label=method)
    ax2.errorbar(x + offset, accuracies,
                 yerr=errors, fmt='none', color='black', capsize=3)

ax2.set_ylabel('Accuracy')
ax2.set_title('Method Performance by Model')
ax2.set_xticks(x)
ax2.set_xticklabels(vlms, rotation=45)
ax2.grid(True, alpha=0.3)
# Draw the chance line before building the legend: legend() only collects
# artists that already exist, so the old order omitted 'Random Chance'.
ax2.axhline(y=0.5, color='red', linestyle='--', alpha=0.3, label='Random Chance')
ax2.legend()

plt.tight_layout()
plt.show()

# Print numerical results
print("\nOverall Method Performance:")
print(method_results.to_string(index=False))
print("\nMethod Performance by Model:")
print(model_method_results.to_string(index=False))