In [8]:
import pandas as pd
# Load the indicator table from the working directory. Judging by the file
# name it holds one record per country -- TODO confirm the schema.
df_indicators = pd.read_csv(filepath_or_buffer='country_indicators.csv')

In [9]:
# Load the model predictions from the working directory. Later cells read
# `y_true_<model>` / `y_pred_<model>` columns from the merged frame, so those
# are presumably supplied by this file -- TODO confirm.
df_preds = pd.read_csv(filepath_or_buffer='test_predictions.csv')

In [10]:
# Attach the indicator columns to every prediction row. Both tables are keyed
# on the same `iso3` column (presumably an ISO 3166-1 alpha-3 country code),
# and the inner join keeps only keys present in both tables.
df = pd.merge(df_preds, df_indicators, on='iso3', how='inner')

In [11]:
import numpy as np
from sklearn import metrics
import seaborn as sns

# Compute one classification metric per (group, model) pair, keep the values
# both as a tidy list (for the summary table) and broadcast back onto the rows
# of `df` (for later per-row plots).

column_defining_groups = 'fsi_category'
# Group countries by the category stored in `fsi_category` (presumably the
# Fragile States Index tier -- TODO confirm against country_indicators.csv).
# Could also group by continent, language, bins of under-5 mortality, etc. --
# any column of `df` that you use or define as a grouping.
chosen_metric = 'accuracy_score'
# sklearn.metrics has many metrics appropriate for binary classification tasks.
# nice explanations here: https://neptune.ai/blog/evaluation-metrics-binary-classification
# Could also choose, e.g., 'recall_score' (which is sensitivity; it becomes
# specificity if `pos_label=False` is added as noted below), 'f1_score',
# 'cohen_kappa_score', etc. -- any function called as f(y_true, y_pred).
# NOTE: for ROC AUC use 'roc_auc_score', not 'auc': `metrics.auc` expects the
# x/y coordinates of a curve. ROC AUC is only threshold-independent when the
# `y_pred_*` columns hold scores/probabilities rather than hard labels.
metric_function = getattr(metrics, chosen_metric)

# Single source of truth for the model names; each one must have matching
# `y_true_<model>` and `y_pred_<model>` columns in `df`.
models = ('ffnn', 'xgboost', 'transformer')

chosen_metric_for_chosen_groups = []
# One float Series per model, aligned to `df.index` and initially all-NaN.
# Rows whose group key is missing are skipped by `groupby` (pandas drops NaN
# keys by default) and therefore stay NaN.
chosen_metric_back_into_original_data = {
    model: pd.Series(index=df.index, dtype=np.float64) for model in models
}

for g, rows in df.groupby(column_defining_groups):
    for model in models:
        chosen_metric_value = metric_function(rows[f"y_true_{model}"], rows[f"y_pred_{model}"])
        # Use the commented version below when using 'recall_score' to make it "specificity"
        # rather than sensitivity, which is what 'recall_score' gives if `pos_label=False`
        # isn't used (assumes the labels are boolean -- TODO confirm).
        #chosen_metric_value = metric_function(rows[f"y_true_{model}"],
        #                                      rows[f"y_pred_{model}"], pos_label=False) # specificity

        # Explicit label-based write: every row of the group gets the group's value.
        chosen_metric_back_into_original_data[model].loc[rows.index] = chosen_metric_value
        chosen_metric_for_chosen_groups.append((model, g, chosen_metric_value))

# Green gradient: darker cells mark larger values.
cm_bigger_better = sns.light_palette("green", as_cmap=True)
# https://pandas.pydata.org/pandas-docs/stable/user_guide/style.html#Styler-Object-and-Customising-the-Display
styler = (
    pd.DataFrame(chosen_metric_for_chosen_groups,
                 columns=('model', column_defining_groups, chosen_metric))
    .sort_values(column_defining_groups)
    .style
    .background_gradient(cmap=cm_bigger_better)
    .format(precision=3)
    .hide(axis="index")
)
styler

model,fsi_category,accuracy_score
ffnn,Alert,0.386
xgboost,Alert,0.3
transformer,Alert,0.471
ffnn,Stable,0.898
xgboost,Stable,0.75
transformer,Stable,0.716
ffnn,Sustainable,0.943
xgboost,Sustainable,0.829
transformer,Sustainable,0.943
ffnn,Warning,0.737


In [16]:
# Pull the per-row metric Series (one per model) out of the lookup dict.
ffnn, xgboost, transformer = (
    chosen_metric_back_into_original_data[name]
    for name in ('ffnn', 'xgboost', 'transformer')
)

# Histogram of the per-row metric values for the FFNN model. The xgboost and
# transformer Series are extracted above but are not plotted by this figure.
import plotly.express as px
fig = px.histogram(data_frame=df, x=ffnn, title='FFNN')
fig.show()