In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from numpy.random import uniform, normal, seed
import random
from sklearn.metrics import roc_auc_score, balanced_accuracy_score

import seaborn as sns
import matplotlib.pyplot as plt

import joblib

def optim_thresh(X_test, y_test, model, start=0.05, stop=0.8, step=0.05):
    """Return the probability cut-off that maximizes balanced accuracy.

    Scans an evenly spaced grid of thresholds ``start, start + step, ...``
    up to and including ``stop`` and scores the thresholded positive-class
    probabilities against ``y_test`` with ``balanced_accuracy_score``.

    Parameters
    ----------
    X_test : passed unchanged to ``model.predict_proba``.
    y_test : true labels, passed unchanged to ``balanced_accuracy_score``.
    model : fitted classifier exposing ``predict_proba``; column 1 of its
        output is used as the positive-class probability.
    start, stop, step : float, optional
        Inclusive bounds and spacing of the threshold grid. The defaults
        reproduce the original hard-coded 0.05 .. 0.8 grid in steps of 0.05.

    Returns
    -------
    float
        The lowest threshold that attains the best balanced accuracy. If no
        threshold yields a comparable score, the first grid value is returned.

    Raises
    ------
    ValueError
        If ``step`` is not positive or ``stop`` is smaller than ``start``.
    """
    if step <= 0:
        raise ValueError("step must be positive")
    if stop < start:
        raise ValueError("stop must not be smaller than start")

    # Probabilities do not depend on the threshold, so score the data once.
    positive_proba = model.predict_proba(X_test)[:, 1]

    # Derive every threshold from an integer step count instead of repeatedly
    # adding `step`: accumulated float error turns the last value into
    # 0.8000000000000002, which silently drops the upper bound from the scan.
    n_steps = int((stop - start) / step + 1e-9)
    thresholds = [float(round(start + k * step, 10)) for k in range(n_steps + 1)]

    # Start below any attainable score so a result is always defined, even
    # when every threshold scores 0.
    best_threshold = thresholds[0]
    best_score = float('-inf')
    for threshold in thresholds:
        predicted = (positive_proba >= threshold).astype('int')  # move the class boundary
        score = balanced_accuracy_score(y_test, predicted)
        # Strict comparison keeps the lowest threshold when several tie.
        if score > best_score:
            best_score = score
            best_threshold = threshold

    return best_threshold

## Load Data and Model

In [None]:
# Name of the label column in the outcome workbook.
outcome = 'EPDS_risk2'

# Read the held-out features and labels, discarding the first column of each
# sheet (presumably the index written when the files were exported — confirm).
X_test = pd.read_excel("EPDS_X_test.xlsx")
X_test = X_test.drop(columns=X_test.columns[0])

y_test = pd.read_excel("EPDS_y_test.xlsx")
y_test = y_test.drop(columns=y_test.columns[0])

In [None]:
# Read the fitted estimators back from joblib pickles, one pair per algorithm
# (first file: "no_reweigh", second file: "reweigh").
# NOTE(review): joblib.load unpickles arbitrary objects — only load files from a trusted source.

# GLM pair (labelled 'LR' in the bootstrap cells)
best_glm_epds_no_reweigh, best_glm_epds_reweigh = (
    joblib.load(pkl) for pkl in ('best_glm_epds_no_reweigh.pkl', 'best_glm_epds_reweigh.pkl')
)

# Random-forest pair
best_rf_epds_no_reweigh, best_rf_epds_reweigh = (
    joblib.load(pkl) for pkl in ('best_rf_epds_no_reweigh.pkl', 'best_rf_epds_reweigh.pkl')
)

# XGBoost pair; the second variable is spelled "xbg" and is referenced under
# that spelling by the reweighing bootstrap cell, so the name is kept as-is.
best_xgb_epds_no_reweigh, best_xbg_epds_reweigh = (
    joblib.load(pkl) for pkl in ('best_xgb_epds_no_reweigh.pkl', 'best_xgb_epds_reweigh.pkl')
)

## Bootstrap Models with No Reweighing

In [None]:
# Collects one dict per (model, bootstrap replicate, race); converted to a DataFrame in a later cell.
no_reweigh_results = []
no_reweigh_models = [best_glm_epds_no_reweigh,best_rf_epds_no_reweigh,best_xgb_epds_no_reweigh]
no_reweighs_labs = ['LR','RF','XGB']

# Base group is non-Hispanic White
reference_group = 'MOM_RACE_White'
races = ['MOM_RACE_Asian or Native Hawaiian or Other Pacific Islander',
         'MOM_RACE_Black or African American',
         'MOM_RACE_Hispanic White',
         'MOM_RACE_Multiracial',
         'MOM_RACE_Other',
         'MOM_RACE_Unknown']

n_bootstraps = 100

# Labels and features side by side; built once because it is identical for every replicate.
full_test_set = pd.concat([y_test, X_test], axis=1)

for model, label in zip(no_reweigh_models, no_reweighs_labs):
    # One threshold per model, chosen on the un-resampled test set.
    # NOTE(review): the same test data is then resampled for evaluation, so BA may be optimistic.
    optim_threshold = optim_thresh(X_test=X_test, y_test=y_test, model=model)
    for j in range(n_bootstraps):
        # Resample rows with replacement; seeding with j gives every model the same replicate j.
        test_set = full_test_set.sample(n=len(X_test), replace=True, random_state=j, ignore_index=True)
        x = test_set.drop([outcome], axis=1)
        y = test_set[outcome]

        # Score the replicate once and reuse the probabilities for the
        # thresholded predictions and for the AUC.
        proba = model.predict_proba(x)[:, 1]
        y_pred = pd.DataFrame((proba >= optim_threshold).astype('int'), columns=['y_pred'])
        test_set = pd.concat([y_pred, test_set], axis=1)

        # Overall metrics do not depend on race, so compute them outside the race loop.
        ba = balanced_accuracy_score(y, y_pred)
        auc = roc_auc_score(y, proba)

        is_positive = test_set[outcome] == 1
        is_reference = test_set[reference_group] == 1

        # Demographic parity: share of predicted positives in the reference group
        p_white = test_set.loc[is_reference, 'y_pred'].mean()

        # TP and FN rates among reference-group rows whose true label is 1
        # (read-only selections, so no column is assigned onto a filtered slice)
        pred_pos_white = test_set.loc[is_positive & is_reference, 'y_pred']
        fn_white = (pred_pos_white == 0).mean()
        tp_white = (pred_pos_white == 1).mean()

        for r in races:
            in_group = test_set[r] == 1
            pred_pos_group = test_set.loc[is_positive & in_group, 'y_pred']
            # Fairness columns are differences relative to the reference group;
            # an empty subgroup yields NaN.
            no_reweigh_results.append({'Model':label,
                                       'BA':ba,
                                       'AUC':auc,
                                       'Race': r,
                                       'DP':test_set.loc[in_group, 'y_pred'].mean()-p_white,
                                       'FN':(pred_pos_group == 0).mean()-fn_white,
                                       'TP':(pred_pos_group == 1).mean()-tp_white})

In [None]:
# Turn the list of per-(model, replicate, race) dicts into a DataFrame.
# The name is rebound from list to DataFrame; the plotting and export cells below rely on the DataFrame form.
no_reweigh_results = pd.DataFrame(no_reweigh_results)

In [None]:
# Balanced accuracy per model over the bootstrap replicates (no reweighing).
# Each replicate's BA is repeated once per race row in the results table.
# The error bar is spelled out to match the reweighing plot, and the axis limits
# are set on the returned grid rather than through pyplot's current-axes state.
g = sns.catplot(data=no_reweigh_results[no_reweigh_results['BA'].notnull()], x='Model', y='BA', hue='Model', kind='bar', errorbar=('ci', 95))
g.set(ylim=(0.5, 0.67), title='Balanced accuracy across bootstrap replicates (no reweighing)');

## Bootstrap Models with Reweighing

In [None]:
# Collects one dict per (model, bootstrap replicate, race); converted to a DataFrame in a later cell.
reweigh_results = []
reweigh_models = [best_glm_epds_reweigh,best_rf_epds_reweigh,best_xbg_epds_reweigh]
reweigh_models_labs = ['LR','RF','XGB']

# Base group is non-Hispanic White
reference_group = 'MOM_RACE_White'
races = ['MOM_RACE_Asian or Native Hawaiian or Other Pacific Islander',
         'MOM_RACE_Black or African American',
         'MOM_RACE_Hispanic White',
         'MOM_RACE_Multiracial',
         'MOM_RACE_Other',
         'MOM_RACE_Unknown']

n_bootstraps = 100

# Labels and features side by side; built once because it is identical for every replicate.
full_test_set = pd.concat([y_test, X_test], axis=1)

for model, label in zip(reweigh_models, reweigh_models_labs):
    # One threshold per model, chosen on the un-resampled test set.
    # NOTE(review): the same test data is then resampled for evaluation, so BA may be optimistic.
    optim_threshold = optim_thresh(X_test=X_test, y_test=y_test, model=model)
    for j in range(n_bootstraps):
        # Resample rows with replacement; seeding with j gives every model the same replicate j.
        test_set = full_test_set.sample(n=len(X_test), replace=True, random_state=j, ignore_index=True)
        x = test_set.drop([outcome], axis=1)
        y = test_set[outcome]

        # Score the replicate once and reuse the probabilities for the
        # thresholded predictions and for the AUC.
        proba = model.predict_proba(x)[:, 1]
        y_pred = pd.DataFrame((proba >= optim_threshold).astype('int'), columns=['y_pred'])
        test_set = pd.concat([y_pred, test_set], axis=1)

        # Overall metrics do not depend on race, so compute them outside the race loop.
        ba = balanced_accuracy_score(y, y_pred)
        auc = roc_auc_score(y, proba)

        is_positive = test_set[outcome] == 1
        is_reference = test_set[reference_group] == 1

        # Demographic parity: share of predicted positives in the reference group
        p_white = test_set.loc[is_reference, 'y_pred'].mean()

        # TP and FN rates among reference-group rows whose true label is 1
        # (read-only selections, so no column is assigned onto a filtered slice)
        pred_pos_white = test_set.loc[is_positive & is_reference, 'y_pred']
        fn_white = (pred_pos_white == 0).mean()
        tp_white = (pred_pos_white == 1).mean()

        for r in races:
            in_group = test_set[r] == 1
            pred_pos_group = test_set.loc[is_positive & in_group, 'y_pred']
            # Fairness columns are differences relative to the reference group;
            # an empty subgroup yields NaN.
            reweigh_results.append({'Model':label,
                                    'BA':ba,
                                    'AUC':auc,
                                    'Race': r,
                                    'DP':test_set.loc[in_group, 'y_pred'].mean()-p_white,
                                    'FN':(pred_pos_group == 0).mean()-fn_white,
                                    'TP':(pred_pos_group == 1).mean()-tp_white})

In [None]:
# Turn the list of per-(model, replicate, race) dicts into a DataFrame.
# The name is rebound from list to DataFrame; the plotting and export cells below rely on the DataFrame form.
reweigh_results = pd.DataFrame(reweigh_results)

In [None]:
# Balanced accuracy per model over the bootstrap replicates (reweighing).
# Each replicate's BA is repeated once per race row in the results table.
# Axis limits and title are set on the returned grid rather than through
# pyplot's current-axes state, so the figure stands alone when skimmed.
g = sns.catplot(data=reweigh_results[reweigh_results['BA'].notnull()], x='Model', y='BA', hue='Model', kind='bar', errorbar=('ci', 95))
g.set(ylim=(0.5, 0.67), title='Balanced accuracy across bootstrap replicates (reweighing)');

In [None]:
# Persist both bootstrap result tables as Excel workbooks in the working directory.
for results_frame, workbook in ((no_reweigh_results, "EPDS_no_reweigh_results_boot.xlsx"),
                                (reweigh_results, "EPDS_reweigh_results_boot.xlsx")):
    results_frame.to_excel(workbook)