Author: Emily Wong \
February 16, 2023

# Resources

https://www.analyticsvidhya.com/blog/2018/06/comprehensive-guide-for-ensemble-models/#4

https://towardsdatascience.com/having-an-imbalanced-dataset-here-is-how-you-can-solve-it-1640568947eb

https://neptune.ai/blog/cross-validation-mistakes

# 1. Import libraries, methods, and data

## 1.1. Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

# Data wrangling
import pandas as pd
import numpy as np
from numpy.random import uniform, normal, seed

# Machine learning
import sklearn
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay, roc_auc_score, balanced_accuracy_score
from sklearn.model_selection import RepeatedKFold, RepeatedStratifiedKFold, RandomizedSearchCV, GridSearchCV, train_test_split
import scipy
from scipy.stats import randint
import xgboost as xgb
from imblearn.over_sampling import SMOTENC, RandomOverSampler, SMOTE
from imblearn.under_sampling import TomekLinks, NeighbourhoodCleaningRule, EditedNearestNeighbours, RandomUnderSampler
from imblearn.combine import SMOTEENN, SMOTETomek
import optuna

# Tree Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz
import matplotlib.pyplot as plt
import seaborn as sns # for kernel density plots

# for nested dictionary
import collections
def makehash():
    """Return an auto-vivifying dictionary.

    Looking up a missing key creates another dictionary of the same kind,
    so nested entries such as ``d[a][b] = v`` need no explicit setup.
    """
    nested = collections.defaultdict(makehash)
    return nested

# Fairness
import aif360
import fairlearn
from fairlearn.metrics import demographic_parity_difference, demographic_parity_ratio, equalized_odds_difference, equalized_odds_ratio, false_negative_rate

A __demographic parity difference__ of 0 means that all groups have the same selection rate. When there are more than two groups, fairlearn reports the difference between the largest and the smallest group selection rate. It ranges between 0 and 1.

The __demographic parity ratio__ of 1 means that all groups have the same selection rate.

The __equalized odds difference__ of 0 means that all groups have the same true positive, true negative, false positive, and false negative rates.

The __equalized odds ratio__ of 1 means that all groups have the same true positive, true negative, false positive, and false negative rates.

## 1.2 Reweighing Method

In [2]:
def calc_weights(df, sens_features_name, outcome_name):
    """Calculate reweighing sample weights.

    Follows the calculation given in
        F. Kamiran and T. Calders, "Data Preprocessing Techniques for
        Classification without Discrimination," Knowledge and Information
        Systems, 2012.

    The weight of a row with sensitive group ``g`` and outcome ``c`` is
    ``(#rows in g * #rows with c) / (n * #rows with both g and c)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding both columns.
    sens_features_name : str
        Name of the sensitive-attribute column.
    outcome_name : str
        Name of the outcome (label) column.

    Returns
    -------
    list of float
        One weight per row of ``df``, in row order.

    Raises
    ------
    ValueError
        If either column contains missing values (such rows are excluded
        from the contingency table, so no weight is defined for them).
    """
    sens_features = df[sens_features_name]
    outcome = df[outcome_name]
    if sens_features.isna().any() or outcome.isna().any():
        raise ValueError(
            "calc_weights requires complete data: '%s' and '%s' must not "
            "contain missing values" % (sens_features_name, outcome_name)
        )

    # Contingency table: rows are sensitive groups, columns are outcome values
    tab = pd.crosstab(index=sens_features, columns=outcome)
    n = len(df)
    group_totals = tab.sum(axis=1)
    label_totals = tab.sum(axis=0)

    # Key the lookup table by the raw (group, label) values. Going through
    # str() on one side only made the lookup miss for non-string groups.
    weights = {}
    for group in tab.index:
        for label in tab.columns:
            n_combo = tab.loc[group, label]
            if n_combo == 0:
                # No row has this combination, so its weight is never looked up
                continue
            weights[(group, label)] = (
                group_totals.loc[group] * label_totals.loc[label]
            ) / (n * n_combo)

    # Instance weights, in row order
    return [weights[pair] for pair in zip(sens_features, outcome)]

In [3]:
def display_performance(X_train, y_train, X_test, y_test, model):
    """Report how a fitted model performs on the training and test splits.

    For the training split and then the test split, prints accuracy and
    balanced accuracy (rounded to 5 decimals) and the confusion matrix, and
    plots the confusion matrix with the green colour map.
    """

    def _report(features, labels, acc_title, bal_title, cm_title):
        # Score one split and emit its three printed lines plus the plot
        predictions = model.predict(features)
        print(acc_title, np.round(accuracy_score(labels, predictions),5))
        print(bal_title,np.round(sklearn.metrics.balanced_accuracy_score(labels, predictions),5))
        matrix = confusion_matrix(labels, predictions)
        print(cm_title)
        print(matrix)
        ConfusionMatrixDisplay(confusion_matrix=matrix).plot(cmap=plt.cm.Greens)

    _report(X_train, y_train, "Train Accuracy:", "Train Balanced Acc:", "Train Confusion Matrix:")

    print("")

    _report(X_test, y_test, "Test Accuracy:", "Test Balanced Acc:", "Test Confusion Matrix:")

In [None]:
def optim_thresh(X_test, y_test, model, start=0.05, stop=0.8, step=0.05):
    """Find the probability cut-off that gives the best ROC AUC score.

    Every threshold ``start, start + step, ..., stop`` is applied to the
    positive-class column of ``model.predict_proba(X_test)``; the resulting
    0/1 predictions are scored with ``roc_auc_score`` and the first threshold
    reaching the highest score is kept. Metrics and the confusion matrix at
    that threshold are printed and plotted.

    NOTE(review): the threshold is chosen on the same data it is then
    evaluated on, so the printed "optimal" metrics are optimistic.

    Parameters
    ----------
    X_test, y_test
        Features and true binary labels used to choose and evaluate the threshold.
    model
        Fitted classifier exposing ``predict_proba``.
    start, stop, step : float
        Inclusive threshold grid; the defaults reproduce 0.05, 0.10, ..., 0.80.

    Returns
    -------
    float
        The selected threshold.

    Raises
    ------
    ValueError
        If the grid is empty (``step <= 0`` or ``stop < start``).
    """
    if step <= 0 or stop < start:
        raise ValueError("threshold grid is empty: need step > 0 and stop >= start")

    # Build the grid from integer multiples of the step. Repeatedly adding
    # 0.05 accumulates rounding error and can step past the upper bound
    # before it has been tested.
    n_steps = int(np.floor((stop - start) / step + 1e-9))
    thresholds = [round(start + i * step, 10) for i in range(n_steps + 1)]

    positive_proba = model.predict_proba(X_test)[:, 1]  # probability of the positive class

    # Start below any attainable score so that a threshold is always selected
    roc_score = -np.inf
    thrsh_score = None
    for threshold_value in thresholds:
        predicted = (positive_proba >= threshold_value).astype('int')  # move the class boundary
        score = roc_auc_score(y_test, predicted)
        if score > roc_score:  # strict comparison: ties keep the lowest threshold
            roc_score = score
            thrsh_score = threshold_value
    print('---Optimum Threshold ---',np.round(thrsh_score,5),'--ROC--',np.round(roc_score,5))

    y_pred_optim = (positive_proba >= thrsh_score).astype('int')
    print("Optimal Test Accuracy:",np.round(accuracy_score(y_test, y_pred_optim),5))
    print("Optimal Test Balanced Accuracy:",np.round(balanced_accuracy_score(y_test, y_pred_optim),5))
    print("Optimal Test AUC:",np.round(roc_auc_score(y_test, y_pred_optim),5))

    # Create the confusion matrix
    cm = confusion_matrix(y_test, y_pred_optim)
    print("Test Confusion Matrix w/ Optimal Threshold:")
    print(cm)
    ConfusionMatrixDisplay(confusion_matrix=cm).plot(cmap=plt.cm.Greens)

    return thrsh_score
    

## 1.3. Data

In [4]:
all_data = pd.read_excel("Eynav cleaned data.xlsx")

In [None]:
print(all_data.shape)

In [None]:
print(all_data.columns)

The chunk below is only for reporting demographic information in the manuscript:

In [None]:
demo = all_data[['MOM_AGE','MOM_RACE','ETHNIC_GROUP','ZIP','MARITAL_STATUS','FINANCIAL_CLASS',
                 'LBW','PTB',
                 'DELIVERY_METHOD','NICU_ADMIT','MFCU_ADMIT',
                 'PREE','GDM','GHTN',
                 'MOM_BMI','MOM_LOS','CHILD_LOS',
                 'HIST_ANXIETY','HIST_DEPRESS','HIST_BIPOLAR','HIST_PMAD','MENTAL_HEALTH_DX_CUTOFF',
                 'MED_PSYCH','MED_CARDIO','PMAD_risk']]
demo = demo.dropna()            # keep only complete data (for now)
demo = demo.sample(len(demo))   # randomly shuffle rows
demo.shape

print("Min Age:",min(demo['MOM_AGE']))
print("Max Age:",max(demo['MOM_AGE']))
print("Mean Age:",np.mean(demo['MOM_AGE']))
print("SD Age:",np.std(demo['MOM_AGE']))

print("------------RACE/ETHNIC COUNTS------------")
race = demo['MOM_RACE']
ethnic = demo['ETHNIC_GROUP']
print(pd.DataFrame(pd.crosstab(index=race, columns=ethnic)))

In [None]:
print(demo.shape)

Extract relevant variables for model fitting:

In [None]:
outcome = 'PHQ9_risk2'

data = all_data[['MOM_AGE','MOM_RACE','ETHNIC_GROUP','ZIP','MARITAL_STATUS','FINANCIAL_CLASS',
                 'LBW','PTB',
                 'DELIVERY_METHOD','NICU_ADMIT','MFCU_ADMIT',
                 'PREE','GDM','GHTN',
                 'MOM_BMI','MOM_LOS','CHILD_LOS',
                 'HIST_ANXIETY','HIST_DEPRESS','HIST_BIPOLAR','HIST_PMAD','MENTAL_HEALTH_DX_CUTOFF',
                 'MED_PSYCH','MED_CARDIO',
                 outcome,'PHQ9_VALUE']]

## 1.3.3. Curate Data

In [None]:
data = data.dropna()            # keep only complete data (for now)
# Shuffle the rows with a fixed seed: the later train/test split uses
# random_state=0, which is only reproducible if the row order it starts
# from is the same on every run.
data = data.sample(len(data), random_state=0)
data.shape

In [None]:
scale_data = data[['MOM_RACE','ETHNIC_GROUP','PHQ9_VALUE','PHQ9_risk2']]

In [None]:
scale_data2 = scale_data[scale_data.PHQ9_risk2==1]

# create a grid 
g = sns.FacetGrid(scale_data2, col='MOM_RACE', hue='MOM_RACE', col_wrap=3)

# draw density plots
g = g.map(sns.kdeplot,"PHQ9_VALUE", cut=0, fill=True, common_norm=False, alpha=1, legend=False)

# control the title of each facet
g = g.set_titles("{col_name}")

# show the graph
#plt.show()

plt.savefig('Figure 1.png',dpi=600)

In [None]:
data = data.drop(['PHQ9_VALUE'], axis=1)

In [None]:
race = data['MOM_RACE']
ethnic = data['ETHNIC_GROUP']
out = data[outcome]

print("------------MEDIAN AGE------------")
print(pd.crosstab(index=race, columns=ethnic, values=data['MOM_AGE'], aggfunc=np.median))
print("Overall median age:",np.median(data[['MOM_AGE']]))

print("------------RACE/ETHNIC COUNTS------------")
print(pd.DataFrame(pd.crosstab(index=race, columns=ethnic)))

print("------------RACE/ETHNIC PMAD------------")
print(pd.crosstab(index=[ethnic,race], columns=out, normalize='index'))

print("Overall PMAD:",np.mean(data[[outcome]]))

In [None]:
# binary-class
# value_counts() orders by frequency, not by label, so look the two classes
# up explicitly; a class that is absent is reported as 0.
class_counts = data[outcome].value_counts().reindex([0, 1], fill_value=0)
count0, count1 = class_counts
print(count0, count1)

x = ['0','1']
y = [count0, count1]
plt.bar(x, y)

In [None]:
print("N:",data.shape)

## 1.3.4. Weight Data

In [None]:
data['w'] = calc_weights(df=data, sens_features_name="MOM_RACE", outcome_name=outcome)

In [None]:
data[['w',outcome,'MOM_RACE']].drop_duplicates()

## 1.3.5. Get Dummies and Split

In [None]:
# get dummy variables
data = pd.get_dummies(data)

Split the data into training and test sets. The split can optionally use stratified sampling.

In [None]:
# Separate the outcome column from the predictors
y = data[[outcome]]
X = data.drop(columns=[outcome])

# Dummy-coded race indicators, used for stratification and as the
# sensitive features of the test set
race_columns = ['MOM_RACE_Asian or Native Hawaiian or Other Pacific Islander',
                'MOM_RACE_Black or African American',
                'MOM_RACE_Multiracial',
                'MOM_RACE_Other',
                'MOM_RACE_Unknown',
                'MOM_RACE_White',
                'MOM_RACE_Hispanic White']
race = data[race_columns]
strat_df = pd.concat([y, race], axis=1)

# 85/15 split, stratified jointly on outcome and race
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    train_size=0.85, test_size=0.15,
    shuffle=True, stratify=strat_df, random_state=0,
)
# The weight column is removed from the test features only; X_train keeps it for now
X_test = X_test.drop(columns=['w'])

# Sensitive features
race_feature = X_test[race_columns]

In [None]:
# binary-class
# Look the classes up by label: value_counts() orders by frequency, so
# unpacking it directly only gives (class 0, class 1) when class 0 is the
# majority in that split.
train_counts = y_train[outcome].value_counts().reindex([0, 1], fill_value=0)
count0_train, count1_train = train_counts
print(count0_train, count1_train)

test_counts = y_test[outcome].value_counts().reindex([0, 1], fill_value=0)
count0_test, count1_test = test_counts
print(count0_test, count1_test)

In [None]:
X_test

# 2. Handle imbalanced data

## 2.1. Simple Over Sampling Minority (PMAD)

In [None]:
# Randomly over-sample the training data with a fixed seed. X_train still
# contains the reweighing column 'w' at this point, so every resampled row
# carries its own weight along.
# NOTE(review): X_train_ros / y_train_ros / weights_ros are not referenced
# again in the visible part of this notebook (the models use the *_rus data).
ros = RandomOverSampler(sampling_strategy = "auto",random_state=0)
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)
weights_ros = X_train_ros['w']                  # weights aligned with the resampled rows
X_train_ros = X_train_ros.drop(['w'], axis=1)   # remove 'w' from the feature columns
y_train_ros.value_counts()                      # class counts after resampling

## 2.2. Simple Under Sampling Majority (PMAD)

In [None]:
# Randomly under-sample the training data with a fixed seed. X_train still
# contains the reweighing column 'w' at this point, so the weights of the
# retained rows can be read back from the resampled frame.
# This under-sampled set (X_train_rus, y_train_rus, weights_rus) is what the
# modeling cells fit on.
rus = RandomUnderSampler(sampling_strategy = "auto", random_state=0)
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)
weights_rus = X_train_rus['w']                  # weights aligned with the retained rows
X_train_rus = X_train_rus.drop(['w'], axis=1)   # remove 'w' from the feature columns
y_train_rus.value_counts()                      # class counts after resampling

# 3. Modeling

In [None]:
# Extract the reweighing weights from the training frame and drop the 'w'
# column from it (only X_train is handled here; X_test is not touched).
# NOTE(review): `weights` is not referenced again in the visible part of this
# notebook -- the reweighed models use `weights_rus`.
weights = X_train['w']
X_train = X_train.drop(['w'], axis=1)

In [None]:
# Base group is non-Hispanic White
races = ['MOM_RACE_Asian or Native Hawaiian or Other Pacific Islander',
         'MOM_RACE_Black or African American',
         'MOM_RACE_Hispanic White',
         'MOM_RACE_Multiracial',
         'MOM_RACE_Other',
         'MOM_RACE_Unknown']

reweigh_results = []
no_reweigh_results = []

## XG Boost Classifier

https://www.kaggle.com/code/stuarthallows/using-xgboost-with-scikit-learn

### Finetune XG Boost Classifier without Reweighing

In [None]:
x = X_train_rus
y = y_train_rus.values.ravel()

def objective(trial):
    """Optuna objective: cross-validated balanced accuracy of an XGBoost model.

    Each trial is scored on stratified folds of the module-level training
    data ``x`` / ``y``. Scoring trials on ``X_test`` / ``y_test`` would tune
    the hyperparameters to the same data that is later used to report
    performance and fairness.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean balanced accuracy over the validation folds.
    """
    params = {
        "seed":0,
        "objective": "binary:hinge",
        "n_estimators": 1000,
        "verbosity": 0,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
    }

    # Fixed seed: every trial is scored on the same folds
    folds = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=0)
    fold_scores = []
    for fit_idx, val_idx in folds.split(x, y):
        model = xgb.XGBClassifier(**params,random_state=0)
        model.fit(x.iloc[fit_idx], y[fit_idx], verbose=False)
        predictions = model.predict(x.iloc[val_idx])
        fold_scores.append(balanced_accuracy_score(y[val_idx], predictions))
    return float(np.mean(fold_scores))

#optuna.logging.set_verbosity(optuna.logging.WARNING)
sampler = optuna.samplers.TPESampler(seed=0) 
study = optuna.create_study(direction='maximize',sampler=sampler)
study.optimize(objective, n_trials=250)

best_xgb = xgb.XGBClassifier(objective='binary:hinge',n_estimators=1000, verbosity=0, seed=0, **study.best_params)
best_xgb.fit(x,y,verbose=False)
y_pred = best_xgb.predict(X_test)
test_balanced_acc = sklearn.metrics.balanced_accuracy_score(y_test,y_pred)
print('Test Balanced Accuracy:', np.round(test_balanced_acc,3))

optim_threshold = optim_thresh(X_test=X_test, y_test=y_test, model=best_xgb)
y_pred_optim = pd.DataFrame((best_xgb.predict_proba(X_test)[:,1] >= optim_threshold).astype('int'),columns=['y_pred'])

test_set = pd.concat([y_pred_optim,y_test.reset_index(drop=True),X_test.reset_index(drop=True)],axis=1)

# Row masks shared by the metrics below
is_positive = test_set[outcome] == 1
is_white = test_set['MOM_RACE_White'] == 1

# Demographic parity: selection rate of the reference group (White)
p_white = np.mean(test_set['y_pred'][is_white])

# TP and FN rates of the reference group among truly positive cases.
# The rates are taken directly from boolean comparisons instead of writing
# new columns onto a filtered slice of test_set (chained assignment).
pos_lab_set_white = test_set[is_positive & is_white]
fn_white = np.mean(pos_lab_set_white['y_pred'] == 0)
tp_white = np.mean(pos_lab_set_white['y_pred'] == 1)

# Difference of every other group relative to the reference group
for r in races:
    pos_lab = test_set[is_positive & (test_set[r] == 1)]
    no_reweigh_results.append({'Model':'XGB',
                               'Race': r,
                               'DP':np.mean(test_set['y_pred'][test_set[r]==1])-p_white,
                               'FN':np.mean(pos_lab['y_pred'] == 0)-fn_white,
                               'TP':np.mean(pos_lab['y_pred'] == 1)-tp_white})

In [None]:
import joblib
# Persist the fitted XGBoost model (trained without reweighing) to disk
joblib.dump(best_xgb, 'best_xgb_phq9_no_reweigh.pkl') 

# to use later:
# NOTE(review): the 7-value input row below looks like a placeholder; predict
# needs one value per training feature -- confirm before using.
#best_xgb_phq9_no_reweigh = joblib.load('best_xgb_phq9_no_reweigh.pkl') 
#best_xgb_phq9_no_reweigh.predict([[300,85,5,5,5,8,1]])

In [None]:
optim_threshold = optim_thresh(X_test=X_test, y_test=y_test, model=best_xgb)
y_pred_optim = pd.DataFrame((best_xgb.predict_proba(X_test)[:,1] >= optim_threshold).astype('int'),columns=['y_pred'])

print('AUCROC:',roc_auc_score(y_test,y_pred_optim))

In [None]:
gain = best_xgb.get_booster().get_score(importance_type='gain')
gain_sorted = dict(sorted(gain.items(), key=lambda x: x[1], reverse=False))

In [None]:
features = list(gain_sorted.keys())
values = list(gain_sorted.values())

fig, ax = plt.subplots()
ax.barh(features,values)
plt.yticks(fontsize=5.5)
ax.set_xlabel("Gain")
plt.savefig('PHQ-9 XGB Feature Importance.png',dpi=600, bbox_inches='tight')

In [None]:
pd.DataFrame(no_reweigh_results)

### Finetune XG Boost Classifier with Reweighing

In [None]:
x = X_train_rus
y = y_train_rus.values.ravel()
w = weights_rus

def objective(trial):
    """Optuna objective: cross-validated balanced accuracy of a reweighed XGBoost model.

    Each trial is scored on stratified folds of the module-level training
    data ``x`` / ``y``, fitting with the matching slice of the sample weights
    ``w``. Scoring trials on ``X_test`` / ``y_test`` would tune the
    hyperparameters to the same data that is later used to report
    performance and fairness.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean (unweighted) balanced accuracy over the validation folds.
    """
    params = {
        "seed":0,
        "objective": "binary:hinge",
        "n_estimators": 1000,
        "verbosity": 0,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
    }

    # Fixed seed: every trial is scored on the same folds
    folds = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=0)
    fold_scores = []
    for fit_idx, val_idx in folds.split(x, y):
        model = xgb.XGBClassifier(**params)
        model.fit(x.iloc[fit_idx], y[fit_idx], sample_weight=w.iloc[fit_idx], verbose=False)
        predictions = model.predict(x.iloc[val_idx])
        fold_scores.append(balanced_accuracy_score(y[val_idx], predictions))
    return float(np.mean(fold_scores))

#optuna.logging.set_verbosity(optuna.logging.WARNING)
sampler = optuna.samplers.TPESampler(seed=0) 
study = optuna.create_study(direction='maximize',sampler=sampler)
study.optimize(objective, n_trials=250)

best_xgb2 = xgb.XGBClassifier(objective='binary:hinge',n_estimators=1000, verbosity=0, seed=0, **study.best_params)
best_xgb2.fit(x,y,sample_weight=w,verbose=False)
y_pred = best_xgb2.predict(X_test)
test_balanced_acc = sklearn.metrics.balanced_accuracy_score(y_test,y_pred)
print('Test Balanced Accuracy:', np.round(test_balanced_acc,3))

optim_threshold = optim_thresh(X_test=X_test, y_test=y_test, model=best_xgb2)
y_pred_optim = pd.DataFrame((best_xgb2.predict_proba(X_test)[:,1] >= optim_threshold).astype('int'),columns=['y_pred'])

test_set = pd.concat([y_pred_optim,y_test.reset_index(drop=True),X_test.reset_index(drop=True)],axis=1)

# Row masks shared by the metrics below
is_positive = test_set[outcome] == 1
is_white = test_set['MOM_RACE_White'] == 1

# Demographic parity: selection rate of the reference group (White)
p_white = np.mean(test_set['y_pred'][is_white])

# TP and FN rates of the reference group among truly positive cases.
# The rates are taken directly from boolean comparisons instead of writing
# new columns onto a filtered slice of test_set (chained assignment).
pos_lab_set_white = test_set[is_positive & is_white]
fn_white = np.mean(pos_lab_set_white['y_pred'] == 0)
tp_white = np.mean(pos_lab_set_white['y_pred'] == 1)

# Difference of every other group relative to the reference group
for r in races:
    pos_lab = test_set[is_positive & (test_set[r] == 1)]
    reweigh_results.append({'Model':'XGB',
                               'Race': r,
                               'DP':np.mean(test_set['y_pred'][test_set[r]==1])-p_white,
                               'FN':np.mean(pos_lab['y_pred'] == 0)-fn_white,
                               'TP':np.mean(pos_lab['y_pred'] == 1)-tp_white})

In [None]:
joblib.dump(best_xgb2, 'best_xgb_phq9_reweigh.pkl') 

# to use later:
#best_xgb_phq9_reweigh = joblib.load('best_xgb_phq9_reweigh.pkl') 
#best_xgb_phq9_reweigh.predict([[300,85,5,5,5,8,1]])

In [None]:
gain = best_xgb2.get_booster().get_score(importance_type='gain')
gain_sorted = dict(sorted(gain.items(), key=lambda x: x[1], reverse=False))

features = list(gain_sorted.keys())
values = list(gain_sorted.values())

fig, ax = plt.subplots()
ax.barh(features,values)
plt.yticks(fontsize=5.5)
ax.set_xlabel("Gain")
plt.savefig('PHQ-9 XGB Feature Importance Reweigh.png',dpi=600, bbox_inches='tight')

In [None]:
pd.DataFrame(reweigh_results)

## Random Forest

### Finetune Random Forest without Reweighing

In [None]:
x = X_train_rus
y = y_train_rus.values.ravel()

def objective(trial):
    """Optuna objective: cross-validated balanced accuracy of a random forest.

    Each trial is scored on stratified folds of the module-level training
    data ``x`` / ``y``. Scoring trials on ``X_test`` / ``y_test`` would tune
    the hyperparameters to the same data that is later used to report
    performance and fairness.

    NOTE(review): ``random_state`` is searched like a hyperparameter. It is
    kept because the final model is built from ``study.best_params``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean balanced accuracy over the validation folds.
    """
    params = {'random_state':trial.suggest_int('random_state', 0, 50),
             'max_features':'sqrt',
             'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 50),
             'min_samples_split': trial.suggest_int('min_samples_split', 2, 50),
             'n_estimators': trial.suggest_int('n_estimators', 2, 20),
             'max_depth': trial.suggest_int('max_depth', 1, 32)
             }

    # Fixed seed: every trial is scored on the same folds
    folds = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=0)
    fold_scores = []
    for fit_idx, val_idx in folds.split(x, y):
        model = RandomForestClassifier(**params)
        model.fit(x.iloc[fit_idx], y[fit_idx])
        predictions = model.predict(x.iloc[val_idx])
        fold_scores.append(balanced_accuracy_score(y[val_idx], predictions))
    return float(np.mean(fold_scores))

sampler = optuna.samplers.TPESampler(seed=0) 
study = optuna.create_study(direction='maximize',sampler=sampler)
study.optimize(objective, n_trials=250)

print('Best hyperparameters:', study.best_params)
print('Best Balanced Accuracy:', study.best_value)

best_rf = RandomForestClassifier(max_features='sqrt',**study.best_params)
best_rf.fit(x,y)
y_pred = best_rf.predict(X_test)
test_balanced_acc = sklearn.metrics.balanced_accuracy_score(y_test,y_pred)
print('Test Balanced Accuracy:', np.round(test_balanced_acc,3))

optim_threshold = optim_thresh(X_test=X_test, y_test=y_test, model=best_rf)
y_pred_optim = pd.DataFrame((best_rf.predict_proba(X_test)[:,1] >= optim_threshold).astype('int'),columns=['y_pred'])
test_set = pd.concat([pd.DataFrame(y_pred_optim),y_test.reset_index(drop=True),X_test.reset_index(drop=True)],axis=1)

# Row masks shared by the metrics below
is_positive = test_set[outcome] == 1
is_white = test_set['MOM_RACE_White'] == 1

# Demographic parity: selection rate of the reference group (White)
p_white = np.mean(test_set['y_pred'][is_white])

# Equalized odds: FN and TP rates of the reference group among truly
# positive cases. The rates are taken directly from boolean comparisons
# instead of writing new columns onto a filtered slice of test_set
# (chained assignment).
pos_lab_set_white = test_set[is_positive & is_white]
fn_white = np.mean(pos_lab_set_white['y_pred'] == 0)
tp_white = np.mean(pos_lab_set_white['y_pred'] == 1)

# Difference of every other group relative to the reference group
for r in races:
    pos_lab = test_set[is_positive & (test_set[r] == 1)]
    no_reweigh_results.append({'Model':'RF',
                               'Race': r,
                               'DP':np.mean(test_set['y_pred'][test_set[r]==1])-p_white,
                               'FN':np.mean(pos_lab['y_pred'] == 0)-fn_white,
                               'TP':np.mean(pos_lab['y_pred'] == 1)-tp_white})
    
print('AUCROC:',roc_auc_score(y_test,y_pred_optim))

In [None]:
joblib.dump(best_rf, 'best_rf_phq9_no_reweigh.pkl') 

# to use later:
#best_rf_phq9_no_reweigh = joblib.load('best_rf_phq9_no_reweigh.pkl') 
#best_rf_phq9_no_reweigh.predict([[300,85,5,5,5,8,1]])

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance of the fitted random forest on the test set
result = permutation_importance(
    best_rf, X_test, y_test, n_repeats=10, random_state=2024, n_jobs=2
)

importance_mean = result.importances_mean
importance_sd = result.importances_std

# Ten most important features in ascending order, so the bar chart is sorted
# with the largest bar on top. argsort also works when there are fewer than
# ten features, where argpartition(..., -10) raises.
ind = np.argsort(importance_mean)[-10:]
top_feat = X_test.columns[ind]
top_vals = importance_mean[ind]
top_std = importance_sd[ind]

fig, ax = plt.subplots()
ax.barh(top_feat,top_vals,xerr=top_std)
ax.set_xlabel("Mean accuracy decrease")
plt.savefig('PHQ-9 RF Feature Importance.png',dpi=600, bbox_inches='tight')

In [None]:
pd.DataFrame(no_reweigh_results)

### Finetune Random Forest with Reweighing

In [None]:
x = X_train_rus
y = y_train_rus.values.ravel()
w = weights_rus

def objective(trial):
    """Optuna objective: cross-validated balanced accuracy of a reweighed random forest.

    Each trial is scored on stratified folds of the module-level training
    data ``x`` / ``y``, fitting with the matching slice of the sample weights
    ``w``. Scoring trials on ``X_test`` / ``y_test`` would tune the
    hyperparameters to the same data that is later used to report
    performance and fairness.

    NOTE(review): ``random_state`` is searched like a hyperparameter. It is
    kept because the final model is built from ``study.best_params``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean (unweighted) balanced accuracy over the validation folds.
    """
    params = {'random_state':trial.suggest_int('random_state', 0, 50),
             'max_features':'sqrt',
             'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 50),
             'min_samples_split': trial.suggest_int('min_samples_split', 2, 50),
             'n_estimators': trial.suggest_int('n_estimators', 2, 20),
             'max_depth': trial.suggest_int('max_depth', 1, 32)
             }

    # Fixed seed: every trial is scored on the same folds
    folds = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=0)
    fold_scores = []
    for fit_idx, val_idx in folds.split(x, y):
        model = RandomForestClassifier(**params)
        model.fit(x.iloc[fit_idx], y[fit_idx], sample_weight=w.iloc[fit_idx])
        predictions = model.predict(x.iloc[val_idx])
        fold_scores.append(balanced_accuracy_score(y[val_idx], predictions))
    return float(np.mean(fold_scores))

sampler = optuna.samplers.TPESampler(seed=0) 
study = optuna.create_study(direction='maximize',sampler=sampler)
study.optimize(objective, n_trials=250)

print('Best hyperparameters:', study.best_params)
print('Best Balanced Accuracy:', study.best_value)

best_rf = RandomForestClassifier(max_features='sqrt',**study.best_params)
best_rf.fit(x,y,sample_weight=w)
y_pred = best_rf.predict(X_test)
test_balanced_acc = sklearn.metrics.balanced_accuracy_score(y_test,y_pred)
print('Test Balanced Accuracy:', np.round(test_balanced_acc,3))

optim_threshold = optim_thresh(X_test=X_test, y_test=y_test, model=best_rf)
y_pred_optim = pd.DataFrame((best_rf.predict_proba(X_test)[:,1] >= optim_threshold).astype('int'),columns=['y_pred'])
test_set = pd.concat([pd.DataFrame(y_pred_optim),y_test.reset_index(drop=True),X_test.reset_index(drop=True)],axis=1)

# Row masks shared by the metrics below
is_positive = test_set[outcome] == 1
is_white = test_set['MOM_RACE_White'] == 1

# Demographic parity: selection rate of the reference group (White)
p_white = np.mean(test_set['y_pred'][is_white])

# Equalized odds: FN and TP rates of the reference group among truly
# positive cases. The rates are taken directly from boolean comparisons
# instead of writing new columns onto a filtered slice of test_set
# (chained assignment).
pos_lab_set_white = test_set[is_positive & is_white]
fn_white = np.mean(pos_lab_set_white['y_pred'] == 0)
tp_white = np.mean(pos_lab_set_white['y_pred'] == 1)

# Difference of every other group relative to the reference group
for r in races:
    pos_lab = test_set[is_positive & (test_set[r] == 1)]
    reweigh_results.append({'Model':'RF',
                               'Race': r,
                               'DP':np.mean(test_set['y_pred'][test_set[r]==1])-p_white,
                               'FN':np.mean(pos_lab['y_pred'] == 0)-fn_white,
                               'TP':np.mean(pos_lab['y_pred'] == 1)-tp_white})
print('AUCROC:',roc_auc_score(y_test,y_pred_optim))

In [None]:
joblib.dump(best_rf, 'best_rf_phq9_reweigh.pkl') 

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance of the fitted (reweighed) random forest on the test set
result = permutation_importance(
    best_rf, X_test, y_test, n_repeats=10, random_state=2024, n_jobs=2
)

importance_mean = result.importances_mean
importance_sd = result.importances_std

# Ten most important features in ascending order, so the bar chart is sorted
# with the largest bar on top. argsort also works when there are fewer than
# ten features, where argpartition(..., -10) raises.
ind = np.argsort(importance_mean)[-10:]
top_feat = X_test.columns[ind]
top_vals = importance_mean[ind]
top_std = importance_sd[ind]

fig, ax = plt.subplots()
ax.barh(top_feat,top_vals,xerr=top_std)
ax.set_xlabel("Mean accuracy decrease")
plt.savefig('PHQ-9 RF Feature Importance Reweigh.png',dpi=600, bbox_inches='tight')

In [None]:
pd.DataFrame(reweigh_results)

## Logistic Regression

### Finetune Logistic Regression without Reweighing

In [None]:
x = X_train_rus
y = y_train_rus.values.ravel()

def objective(trial):
    """Optuna objective: cross-validated balanced accuracy of an L2 logistic regression.

    Each trial is scored on stratified folds of the module-level training
    data ``x`` / ``y``. Scoring trials on ``X_test`` / ``y_test`` would tune
    the hyperparameters to the same data that is later used to report
    performance and fairness.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``C`` (log scale) and ``tol`` (linear scale).

    Returns
    -------
    float
        Mean balanced accuracy over the validation folds.
    """
    # suggest_float replaces the deprecated suggest_loguniform / suggest_uniform
    # and keeps the same parameter names and ranges
    params = {'penalty':'l2',
             'C':trial.suggest_float("C", 1e-2, 1, log=True),
             'tol':trial.suggest_float('tol' , 1e-6 , 1e-3)
             }

    # Fixed seed: every trial is scored on the same folds
    folds = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=0)
    fold_scores = []
    for fit_idx, val_idx in folds.split(x, y):
        model = LogisticRegression(**params)
        model.fit(x.iloc[fit_idx], y[fit_idx])
        predictions = model.predict(x.iloc[val_idx])
        fold_scores.append(balanced_accuracy_score(y[val_idx], predictions))
    return float(np.mean(fold_scores))

sampler = optuna.samplers.TPESampler(seed=0) 
study = optuna.create_study(direction='maximize',sampler=sampler)
study.optimize(objective, n_trials=250)

print('Best hyperparameters:', study.best_params)
print('Best Balanced Accuracy:', study.best_value)

best_glm = LogisticRegression(penalty='l2',**study.best_params)
best_glm.fit(x,y)
y_pred = best_glm.predict(X_test)
test_balanced_acc = sklearn.metrics.balanced_accuracy_score(y_test,y_pred)
print('Test Balanced Accuracy:', np.round(test_balanced_acc,3))

optim_threshold = optim_thresh(X_test=X_test, y_test=y_test, model=best_glm)
y_pred_optim = pd.DataFrame((best_glm.predict_proba(X_test)[:,1] >= optim_threshold).astype('int'),columns=['y_pred'])
test_set = pd.concat([pd.DataFrame(y_pred_optim),y_test.reset_index(drop=True),X_test.reset_index(drop=True)],axis=1)

# Row masks shared by the metrics below
is_positive = test_set[outcome] == 1
is_white = test_set['MOM_RACE_White'] == 1

# Demographic parity: selection rate of the reference group (White)
p_white = np.mean(test_set['y_pred'][is_white])

# Equalized odds: FN and TP rates of the reference group among truly
# positive cases. The rates are taken directly from boolean comparisons
# instead of writing new columns onto a filtered slice of test_set
# (chained assignment).
pos_lab_set_white = test_set[is_positive & is_white]
fn_white = np.mean(pos_lab_set_white['y_pred'] == 0)
tp_white = np.mean(pos_lab_set_white['y_pred'] == 1)

# Difference of every other group relative to the reference group
for r in races:
    pos_lab = test_set[is_positive & (test_set[r] == 1)]
    no_reweigh_results.append({'Model':'LR',
                               'Race': r,
                               'DP':np.mean(test_set['y_pred'][test_set[r]==1])-p_white,
                               'FN':np.mean(pos_lab['y_pred'] == 0)-fn_white,
                               'TP':np.mean(pos_lab['y_pred'] == 1)-tp_white})
print('AUCROC:',roc_auc_score(y_test,y_pred_optim))

In [None]:
joblib.dump(best_glm, 'best_glm_phq9_no_reweigh.pkl') 

In [None]:
pd.DataFrame(no_reweigh_results)

### Finetune Logistic Regression with Reweighing

In [None]:
x = X_train_rus
y = y_train_rus.values.ravel()
w = weights_rus

def objective(trial):
    """Optuna objective: cross-validated balanced accuracy of a reweighed L2 logistic regression.

    Each trial is scored on stratified folds of the module-level training
    data ``x`` / ``y``, fitting with the matching slice of the sample weights
    ``w``. Scoring trials on ``X_test`` / ``y_test`` would tune the
    hyperparameters to the same data that is later used to report
    performance and fairness.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``C`` (log scale) and ``tol`` (linear scale).

    Returns
    -------
    float
        Mean (unweighted) balanced accuracy over the validation folds.
    """
    # suggest_float replaces the deprecated suggest_loguniform / suggest_uniform
    # and keeps the same parameter names and ranges
    params = {'penalty':'l2',
             'C':trial.suggest_float("C", 1e-2, 1, log=True),
             'tol':trial.suggest_float('tol' , 1e-6 , 1e-3)
             }

    # Fixed seed: every trial is scored on the same folds
    folds = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=0)
    fold_scores = []
    for fit_idx, val_idx in folds.split(x, y):
        model = LogisticRegression(**params)
        model.fit(x.iloc[fit_idx], y[fit_idx], sample_weight=w.iloc[fit_idx])
        predictions = model.predict(x.iloc[val_idx])
        fold_scores.append(balanced_accuracy_score(y[val_idx], predictions))
    return float(np.mean(fold_scores))

sampler = optuna.samplers.TPESampler(seed=0) 
study = optuna.create_study(direction='maximize',sampler=sampler)
study.optimize(objective, n_trials=250)

print('Best hyperparameters:', study.best_params)
print('Best Balanced Accuracy:', study.best_value)

best_glm = LogisticRegression(penalty='l2',**study.best_params)
best_glm.fit(x,y,sample_weight=w)
y_pred = best_glm.predict(X_test)
test_balanced_acc = sklearn.metrics.balanced_accuracy_score(y_test,y_pred)
print('Test Balanced Accuracy:', np.round(test_balanced_acc,3))

optim_threshold = optim_thresh(X_test=X_test, y_test=y_test, model=best_glm)
y_pred_optim = pd.DataFrame((best_glm.predict_proba(X_test)[:,1] >= optim_threshold).astype('int'),columns=['y_pred'])
test_set = pd.concat([pd.DataFrame(y_pred_optim),y_test.reset_index(drop=True),X_test.reset_index(drop=True)],axis=1)

# Row masks shared by the metrics below
is_positive = test_set[outcome] == 1
is_white = test_set['MOM_RACE_White'] == 1

# Demographic parity: selection rate of the reference group (White)
p_white = np.mean(test_set['y_pred'][is_white])

# Equalized odds: FN and TP rates of the reference group among truly
# positive cases. The rates are taken directly from boolean comparisons
# instead of writing new columns onto a filtered slice of test_set
# (chained assignment).
pos_lab_set_white = test_set[is_positive & is_white]
fn_white = np.mean(pos_lab_set_white['y_pred'] == 0)
tp_white = np.mean(pos_lab_set_white['y_pred'] == 1)

# Difference of every other group relative to the reference group
for r in races:
    pos_lab = test_set[is_positive & (test_set[r] == 1)]
    reweigh_results.append({'Model':'LR',
                               'Race': r,
                               'DP':np.mean(test_set['y_pred'][test_set[r]==1])-p_white,
                               'FN':np.mean(pos_lab['y_pred'] == 0)-fn_white,
                               'TP':np.mean(pos_lab['y_pred'] == 1)-tp_white})
print('AUCROC:',roc_auc_score(y_test,y_pred_optim))

In [None]:
joblib.dump(best_glm, 'best_glm_phq9_reweigh.pkl') 

In [None]:
pd.DataFrame(reweigh_results)

# Plotting

In [None]:
no_reweigh_results = pd.DataFrame(no_reweigh_results)
reweigh_results = pd.DataFrame(reweigh_results)

In [None]:
no_reweigh_results.to_excel("PHQ9_no_reweigh_results.xlsx")
reweigh_results.to_excel("PHQ9_reweigh_results.xlsx")

In [None]:
no_reweigh_results = pd.read_excel("PHQ9_no_reweigh_results.xlsx")
reweigh_results = pd.read_excel("PHQ9_reweigh_results.xlsx")

In [None]:
# Remove the dummy-variable prefix by name rather than by position. Slicing
# off the first nine characters cuts into the label itself if this cell is
# run a second time.
no_reweigh_results['Race'] = no_reweigh_results['Race'].str.replace('^MOM_RACE_', '', regex=True)
reweigh_results['Race'] = reweigh_results['Race'].str.replace('^MOM_RACE_', '', regex=True)

In [None]:
# Short display names for the plots; labels not listed here are kept as they are
short_race_names = {
    'Asian or Native Hawaiian or Other Pacific Islander': 'AAPI',
    'Black or African American': 'Black',
    'Multiracial': 'Multi',
    'Hispanic White': 'Hispanic',
}
no_reweigh_results['Race'] = no_reweigh_results['Race'].replace(short_race_names)
reweigh_results['Race'] = reweigh_results['Race'].replace(short_race_names)

## No Reweighing

In [None]:
sns.barplot(data=no_reweigh_results, x='Race', y='DP', hue='Model')
plt.ylim((-0.8,0.8))
plt.title('PHQ-9 Disparate Impact Before Reweighing')
plt.savefig('No Reweigh DI.png',dpi=600)

In [None]:
sns.barplot(data=no_reweigh_results, x='Race', y='FN', hue='Model')
plt.ylim((-0.8,0.8))
plt.title('PHQ-9 False Negatives Before Reweighing')
plt.savefig('No Reweigh FN.png',dpi=600)

In [None]:
sns.barplot(data=no_reweigh_results, x='Race', y='TP', hue='Model')
plt.ylim((-0.8,0.8))
plt.title('PHQ-9 True Positives Before Reweighing')
plt.savefig('No Reweigh TP.png',dpi=600)

## With Reweighing

In [None]:
sns.barplot(data=reweigh_results, x='Race', y='DP', hue='Model')
plt.ylim((-0.8,0.8))
plt.title('PHQ-9 Disparate Impact After Reweighing')
plt.savefig('Reweigh DI.png',dpi=600)

In [None]:
sns.barplot(data=reweigh_results, x='Race', y='FN', hue='Model')
plt.ylim((-0.8,0.8))
plt.title('PHQ-9 False Negatives After Reweighing')
plt.savefig('Reweigh FN.png',dpi=600)

In [None]:
sns.barplot(data=reweigh_results, x='Race', y='TP', hue='Model')
plt.ylim((-0.8,0.8))
plt.title('PHQ-9 True Positives After Reweighing')
plt.savefig('Reweigh TP.png',dpi=600)

In [None]:
# save this file and output as html
import os
os.system('jupyter nbconvert --to html data_analysis_PHQ9.ipynb')

In [None]:
X_train.to_excel("X_train.xlsx")
y_train.to_excel("y_train.xlsx")
X_test.to_excel("X_test.xlsx")
y_test.to_excel("y_test.xlsx")