# Import Libraries

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import ttest_rel
from itertools import combinations
import seaborn as sns


# Load saved metrics

In [6]:
# Read the saved metrics CSV into `df`; the relative path is resolved against
# the notebook's current working directory.
df = pd.read_csv(
    filepath_or_buffer="../../metrics/model_performance_across_20k_datasets.csv"
)

# Generate summary statistics for all models

For each metric (`weighted_f1_score`, `accuracy_1`, `accuracy_2`), compute the mean, the standard deviation (std), and the coefficient of variation (cv = std / mean) for each model.

In [7]:


# Metrics that are summarised for every model
metrics = ['weighted_f1_score', 'accuracy_1', 'accuracy_2']

# Mean and standard deviation of each metric, one row per model
summary = df.groupby('model')[metrics].agg(['mean', 'std'])

# Coefficient of variation (std / mean). The columns are appended one metric
# at a time, so they land after all mean/std columns in `metrics` order.
for metric_name in metrics:
    summary[metric_name, 'cv'] = (
        summary[metric_name, 'std'] / summary[metric_name, 'mean']
    )

# Collapse the two-level (metric, statistic) columns into "<metric>_<statistic>"
summary.columns = [
    '_'.join(level_pair).strip() for level_pair in summary.columns.values
]
summary


Unnamed: 0_level_0,weighted_f1_score_mean,weighted_f1_score_std,accuracy_1_mean,accuracy_1_std,accuracy_2_mean,accuracy_2_std,weighted_f1_score_cv,accuracy_1_cv,accuracy_2_cv
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
neural_networks,0.798299,0.009982,0.86135,0.007147,0.98125,0.004471,0.012504,0.008298,0.004556
ordinalgbt,0.543285,0.013245,0.4392,0.014692,0.7295,0.013161,0.02438,0.033452,0.018041
random_forest,0.621418,0.012381,0.52855,0.014428,0.7532,0.016366,0.019923,0.027297,0.021729
svm,0.192938,0.015345,0.12855,0.010526,0.4335,0.015398,0.079531,0.081879,0.035521
xgboost,0.672371,0.012739,0.58645,0.015703,0.782,0.017625,0.018947,0.026776,0.022538


# Paired t-test for F1 score

The paired t-test checks whether the mean difference between two models' weighted F1 scores, measured on the same datasets, is significantly different from zero. For every pair of models we:
1. Perform the t-test:
   - Compare the two models' weighted F1 scores on the same datasets.
2. Interpret the p-value:
   - If p-value < 0.05, there is a statistically significant difference in F1 scores between the two models.
   - If p-value ≥ 0.05, the difference is not statistically significant: the data do not show that one model outperforms the other (this is not proof that the two models perform identically).



In [8]:


# Pairwise paired t-tests on the weighted F1 score.
#
# A paired test is only meaningful when the i-th score of one model and the
# i-th score of the other were measured on the same dataset. Filtering the rows
# of each model and pairing them by position silently produces a wrong
# t-statistic if the models' rows are stored in a different dataset order.
# When a usable dataset-id column is available the scores are therefore matched
# on it; otherwise row order is used, but only after checking that every model
# has the same number of rows.
#
# NOTE(review): every pair is tested at ALPHA with no multiple-comparison
# correction (e.g. Bonferroni/Holm) - revisit if p-values get close to ALPHA.

ALPHA = 0.05                     # per-comparison significance level
F1_COLUMN = "weighted_f1_score"
PAIR_KEY = "dataset"             # assumed name of the dataset-id column - TODO confirm against the CSV

results = []
models = df['model'].unique()

# Step 1: Build one column of scores per model, with rows matched across models
can_pair_on_key = (
    PAIR_KEY in df.columns
    and not df.duplicated(subset=['model', PAIR_KEY]).any()
)
if can_pair_on_key:
    # One row per dataset id; a dataset a model has no row for becomes NaN.
    scores_wide = df.pivot(index=PAIR_KEY, columns='model', values=F1_COLUMN)
else:
    # No usable dataset id: fall back to pairing by row order within each model.
    scores_by_model = {
        model: df.loc[df['model'] == model, F1_COLUMN].reset_index(drop=True)
        for model in models
    }
    row_counts = {model: len(scores) for model, scores in scores_by_model.items()}
    if len(set(row_counts.values())) > 1:
        raise ValueError(
            "Cannot pair scores by row order: models have different numbers "
            f"of rows: {row_counts}"
        )
    scores_wide = pd.DataFrame(scores_by_model)

# Step 2: Run pairwise t-tests
for model_a, model_b in combinations(models, 2):
    # Keep only the rows where both models have a score, so the pairs stay matched.
    paired = scores_wide[[model_a, model_b]].dropna()
    scores_a = paired[model_a].to_numpy()
    scores_b = paired[model_b].to_numpy()

    stat, p = ttest_rel(scores_a, scores_b)

    mean_a = np.mean(scores_a)
    mean_b = np.mean(scores_b)

    # Who's better (an exact tie in means is attributed to model_b)
    better = model_a if mean_a > mean_b else model_b

    results.append({
        'model_a': model_a,
        'model_b': model_b,
        'mean_a': mean_a,
        'mean_b': mean_b,
        'better_model': better,
        't_stat': stat,
        'p_value': p,
        'significant': p < ALPHA
    })

df_compare = pd.DataFrame(results)
df_compare

Unnamed: 0,model_a,model_b,mean_a,mean_b,better_model,t_stat,p_value,significant
0,xgboost,random_forest,0.672371,0.621418,xgboost,21.667593,7.391096e-15,True
1,xgboost,neural_networks,0.672371,0.798299,neural_networks,-33.907568,1.840637e-18,True
2,xgboost,svm,0.672371,0.192938,xgboost,116.814994,1.309393e-28,True
3,xgboost,ordinalgbt,0.672371,0.543285,xgboost,43.598021,1.644595e-20,True
4,random_forest,neural_networks,0.621418,0.798299,neural_networks,-45.478021,7.427598e-21,True
5,random_forest,svm,0.621418,0.192938,random_forest,109.393788,4.549006e-28,True
6,random_forest,ordinalgbt,0.621418,0.543285,random_forest,28.024802,6.423494000000001e-17,True
7,neural_networks,svm,0.798299,0.192938,neural_networks,129.955427,1.731803e-29,True
8,neural_networks,ordinalgbt,0.798299,0.543285,neural_networks,58.470974,6.476908000000001e-23,True
9,svm,ordinalgbt,0.192938,0.543285,ordinalgbt,-74.011369,7.494359e-25,True


# Pick the best model

In [9]:
# Rank models by how many pairwise comparisons they won with a statistically
# significant difference, then report the mean F1 score per model.

# Filter only significant comparisons
significant = df_compare[df_compare['significant']]

# Count how many times each model was significantly better
win_counts = significant['better_model'].value_counts()

# Mean F1 score per model across all datasets (also used to break ties below)
mean_f1 = df.groupby("model")["weighted_f1_score"].mean().sort_values(ascending=False)

# Show ranking
print("Model Ranking by Significant Wins:")
print(win_counts)

# Pick the top model
if win_counts.empty:
    # No comparison was significant: idxmax() would raise on the empty Series,
    # and no model can be called the best on this evidence.
    best_model = None
    print("\nBest overall model: none (no statistically significant differences)")
else:
    leaders = win_counts.index[win_counts == win_counts.max()]
    if len(leaders) > 1:
        # Several models share the top win count: prefer the highest mean F1
        # instead of whichever happens to come first in the counts.
        best_model = mean_f1.reindex(leaders).idxmax()
    else:
        best_model = leaders[0]
    print(f"\nBest overall model: {best_model}")

print("\n Mean F1 Score per Model:")
print(mean_f1)


Model Ranking by Significant Wins:
better_model
neural_networks    4
xgboost            3
random_forest      2
ordinalgbt         1
Name: count, dtype: int64

Best overall model: neural_networks

 Mean F1 Score per Model:
model
neural_networks    0.798299
xgboost            0.672371
random_forest      0.621418
ordinalgbt         0.543285
svm                0.192938
Name: weighted_f1_score, dtype: float64
