In [None]:
# Model Performance Analysis Notebook

# Import Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide display configuration
# pandas: use 4 digits of precision when rendering floats in tables.
pd.options.display.precision = 4
# seaborn: white-grid theme with fonts scaled to 1.1x the default size.
sns.set(font_scale=1.1, style="whitegrid")

# Load All CSV Files
# Paths of the metrics CSVs to analyse (presumably one file per model family;
# verify against the contents of ./results/metrics).
files = [
    "./results/metrics/decision_tree.csv",
    "./results/metrics/knn.csv",
    "./results/metrics/logistic_regression.csv",
    "./results/metrics/mlp.csv",
    "./results/metrics/random_forest.csv",
    "./results/metrics/xgboost.csv"
]

# Parse every CSV into its own DataFrame, keeping the order of `files`.
dataframes = [pd.read_csv(csv_path) for csv_path in files]

# Stack the individual frames into one table; ignore_index=True discards the
# per-file row labels and renumbers the rows 0..n-1.
combined_df = pd.concat(dataframes, ignore_index=True)
combined_df

In [None]:
# Data Cleaning / Preparation
# Final column layout used by the rest of the notebook.
cols = ['Model_Name', 'Version', 'Accuracy', 'Precision', 'Recall', 'F1_Score', 'AUC_ROC']

# Split the 'Model' label on ' - ' into a model name (first piece) and a
# version (second piece).
# The guard keeps this cell re-runnable: after its first execution
# `combined_df` no longer carries a 'Model' column (it is dropped by the
# column selection below), so an unguarded second run would raise KeyError.
if 'Model' in combined_df.columns:
    label_parts = combined_df['Model'].str.split(' - ')

    # Fail early, naming the offending labels, instead of hitting a bare
    # IndexError halfway through the column.
    malformed = combined_df.loc[label_parts.str.len().fillna(0) < 2, 'Model']
    if not malformed.empty:
        raise ValueError(
            "'Model' labels must look like '<name> - <version>'; "
            f"offending values: {malformed.tolist()}"
        )

    combined_df = combined_df.assign(
        Model_Name=label_parts.str[0],
        Version=label_parts.str[1],
    )

# Keep only the identifying columns and the metrics, in a fixed order.
combined_df = combined_df[cols]

combined_df

In [None]:
# Find Best Models for each metric
# Metric columns compared throughout the rest of the notebook.
metrics = ['Accuracy', 'Precision', 'Recall', 'F1_Score', 'AUC_ROC']

# For every metric, keep the row (model name, version, value) with the highest
# value. idxmax() returns the label of the first maximum, so ties resolve to
# the earliest row.
best_models = {}
for metric in metrics:
    idx = combined_df[metric].idxmax()
    best_models[metric] = combined_df.loc[idx, ['Model_Name', 'Version', metric]]

# Build a tidy summary with one row per metric.
# Each winner Series is labelled with its own metric name; feeding them to
# DataFrame() as-is aligns on the union of labels and yields a sparse table
# (one column per metric, NaN everywhere off the diagonal). Renaming the metric
# label to a shared 'Score' gives every Series the same index instead.
best_models_df = pd.DataFrame(
    {metric: winner.rename({metric: 'Score'}) for metric, winner in best_models.items()}
).T
best_models_df.index.name = 'Metric'
# The transpose leaves object dtype; cast back to float so the notebook's
# display precision setting applies to the scores.
best_models_df['Score'] = best_models_df['Score'].astype(float)
best_models_df

In [None]:
# Visualization

# -- a. Bar Plot for Each Metric (showing all 4 versions per model) --
# One figure per metric: models on the x-axis, one bar per version (hue).
for metric in metrics:
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.barplot(
        data=combined_df,
        x='Model_Name',
        y=metric,
        hue='Version',
        palette='viridis',
        ax=ax,
    )
    ax.set(
        title=f'{metric} Comparison Across All Model Versions',
        ylabel=metric,
        xlabel='Model',
    )
    # Tilt the model names so long labels do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45)
    ax.legend(title='Version')
    fig.tight_layout()
    plt.show()

# -- b. Heatmap of all model performances --
# Row label for the heatmap: "<Model_Name> - <Version>".
# Note: this adds a 'Model_Version' column to combined_df itself.
combined_df['Model_Version'] = combined_df['Model_Name'] + ' - ' + combined_df['Version']
heatmap_data = combined_df.set_index('Model_Version')[metrics]

# One row per model version, one column per metric, each cell annotated with
# its value to four decimals.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    heatmap_data,
    annot=True,
    cmap='YlGnBu',
    fmt='.4f',
    linewidths=.5,
    ax=ax,
)
ax.set_title('Model Performance Heatmap')
fig.tight_layout()
plt.show()

In [None]:
# Best Model for each metric
# Print one line per metric naming the top-scoring model/version and its
# value rounded to four decimals.
for metric in metrics:
    top_row_label = combined_df[metric].idxmax()
    best_model = combined_df.loc[top_row_label]
    print(f"Best Model for {metric}: {best_model['Model_Name']} - {best_model['Version']} ({best_model[metric]:.4f})")

In [None]:
# Overall Best Model (All Metrics Combined)

# Collapse to one row per (Model_Name, Version): if a pair occurs more than
# once in combined_df, each metric is averaged over its duplicate rows.
overall_df = (
    combined_df.groupby(['Model_Name', 'Version'])
    [metrics].mean()
    .reset_index()
)

# Min-max normalize every metric to [0, 1] so each metric contributes on the
# same scale: 0 is the worst observed value, 1 the best.
normalized_df = overall_df.copy()
for metric in metrics:
    min_val = normalized_df[metric].min()
    max_val = normalized_df[metric].max()
    value_range = max_val - min_val
    if value_range == 0:
        # Constant metric (every model ties, or there is only one model):
        # dividing by zero would turn the whole column into NaN and, if all
        # metrics are constant, break the idxmax() lookup below. Dividing by 1
        # maps the column to 0.0 instead, which leaves the ranking unaffected.
        value_range = 1.0
    normalized_df[metric] = (normalized_df[metric] - min_val) / value_range

# Compute an overall score (unweighted average of all normalized metrics)
normalized_df['Overall_Score'] = normalized_df[metrics].mean(axis=1)

# Find the best model overall (first row with the highest overall score)
best_overall = normalized_df.loc[normalized_df['Overall_Score'].idxmax()]
print("Best Overall Model (Across All Metrics):")
print(f"Model: {best_overall['Model_Name']} - {best_overall['Version']}")
print(f"Overall Score: {best_overall['Overall_Score']:.4f}")

# Plot overall scores
# Bar label "<Model_Name> - <Version>"; stored as a new column on normalized_df.
normalized_df['Model_Version'] = normalized_df['Model_Name'] + ' - ' + normalized_df['Version']

# Highest overall score first, so the bars read best-to-worst from the left.
ranked_scores = normalized_df.sort_values('Overall_Score', ascending=False)

fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(
    data=ranked_scores,
    x='Model_Version',
    y='Overall_Score',
    palette='coolwarm',
    ax=ax,
)
ax.set(
    title='Overall Performance Across All Metrics',
    ylabel='Normalized Average Score (0-1)',
    xlabel='Model Version',
)
# Rotate and right-align the labels so each one ends under its own bar.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

# Display top 5 models
# Rank by overall score (best first) and keep the first five rows.
ranked_by_score = normalized_df.sort_values(by='Overall_Score', ascending=False)
top5 = ranked_by_score.head(5)

print("\nTop 5 Models:")
# The row label is not needed for the printout, hence the throwaway `_`.
for _, row in top5.iterrows():
    print(f"{row['Model_Name']} - {row['Version']}: {row['Overall_Score']:.4f}")