In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.metrics import recall_score, make_scorer, roc_auc_score
import os
import pickle
from dotenv import load_dotenv

# Load variables from a .env file into the process environment; override=True lets
# the .env values replace variables that are already set in the environment.
load_dotenv(override=True)
# Specificity is the recall of the negative class, so score recall with pos_label=0.
specificity = make_scorer(recall_score, pos_label=0)

In [2]:
# Load the training table; TRAIN_PARQUET (environment / .env) holds the parquet path.
# Index os.environ directly so that a missing variable fails right here with a
# KeyError naming it, instead of handing None to read_parquet.
dat = pd.read_parquet(os.environ['TRAIN_PARQUET'])
dat.shape

(11467, 432)

In [3]:
from preprocess import preprocess

# preprocess() logs each cleaning step it applies (see the output below). Its return
# value is not used and dat.shape changes afterwards, so it appears to modify `dat`
# in place.
# NOTE(review): re-running this cell without reloading `dat` would apply the
# preprocessing a second time -- confirm that is safe, or reload first.
preprocess(dat)
dat.shape

parity is NaN 4022; will be filled with 0
dropping 26 of census tract columns
found columns with NaN: bmi, NaN number = 337, dropping these patients
found columns with NaN: mother_age, NaN number = 39, dropping these patients
found columns with NaN: mother_height, NaN number = 495, dropping these patients
found columns with NaN: smoking, NaN number = 7, dropping these patients
found columns with NaN: tobacco, NaN number = 7, dropping these patients
removing patients without prenatal and postpartum visit 1055
dropping columns: counts_of_visits_3m_after_delivery, counts_of_visits_6m_after_delivery
removing patients without screening using edinburgh or phq9 2636
removing columns: 'F53_label','edinburgh_max','phq9_total_max', 'PPD_delete_label'


(7183, 402)

In [4]:
# check to make sure there is no NaN before continuing:
# fail loudly (naming the offending columns) instead of relying on someone reading
# the table, then still display the per-column NaN fraction as before
nan_fraction = dat.isna().mean().sort_values(ascending=False)
assert not (nan_fraction > 0).any(), \
    f"columns still containing NaN: {list(nan_fraction[nan_fraction > 0].index)}"
nan_fraction

label                             0.0
birthid                           0.0
phq_or_edinburgh_21012953_isna    0.0
phq_or_edinburgh_21012953_max     0.0
phq_or_edinburgh_21012951_isna    0.0
                                 ... 
gest_age_in_days                  0.0
mother_is_white                   0.0
mother_is_hispanic                0.0
mother_is_black                   0.0
mother_age                        0.0
Length: 402, dtype: float64

In [5]:
# Separate the target vector from the predictors used for model fitting;
# `birthid` is removed from the predictors together with the label itself
y = dat['label']
X = dat.drop(['birthid', 'label'], axis=1)
(y.mean(), X.shape)

(np.float64(0.21857162745371014), (7183, 400))

In [6]:
# set up the model: grid search over min_samples_split for a class-weighted random forest
# the param range should be okay, but check the best_params from get_many_aucs to make sure
# Several metrics are recorded for every candidate; refit='roc_auc' selects the best
# candidate by mean cross-validated ROC-AUC and refits it on the whole training input.
# NUM_CPUS may be unset: fall back to a single worker instead of crashing on int(None).
clf = GridSearchCV(RandomForestClassifier(class_weight='balanced', random_state=42, n_estimators=200),
                   param_grid={'min_samples_split': [10,15,20,25,30,50,70,100,120,150,170,200]},
                   verbose=3,
                   scoring={'f1':'f1',
                            'roc_auc':'roc_auc',
                            'sensitivity':'recall',
                            'precision':'precision',
                            'specificity': specificity},
                   cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
                   refit='roc_auc',
                   n_jobs=int(os.getenv('NUM_CPUS', '1')))

In [7]:
from collections import defaultdict

# Metric stores filled by get_many_aucs: group name -> list with one value per split
aucs, specs, sens = (defaultdict(list) for _ in range(3))

# 'all' stands for the full test set; the other entries are indicator columns of X
race_cols = ['all', 'mother_is_black', 'mother_is_hispanic', 'mother_is_white']

def get_many_aucs(X, y, nseeds=1):
    """Repeat the train/test evaluation over several random splits and print the spread.

    For each seed in ``range(nseeds)`` the data is split 80/20, the module-level
    grid search ``clf`` is refit on the training part, and AUC, specificity and
    sensitivity are computed on the held-out part -- once for the whole test set
    ('all') and once for each subgroup selected by the other columns in ``race_cols``.

    The per-split values are stored in the module-level ``aucs``, ``specs`` and
    ``sens`` dictionaries (keyed by the entries of ``race_cols``). They are emptied
    at the start of every call so that calling the function again does not mix in
    the results of an earlier run.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; must contain the columns listed in ``race_cols`` (except 'all').
    y : array-like
        Labels aligned row by row with ``X``.
    nseeds : int, default 1
        Number of random splits; ``random_state`` takes the values 0 .. nseeds-1.

    Returns
    -------
    None
        The best params / CV score of each split and the mean +- std of every
        metric per group are printed.
    """
    # start from a clean slate -- otherwise a second call would append to (and
    # average over) whatever the previous call left behind
    for store in (aucs, specs, sens):
        store.clear()

    # silence the per-candidate grid-search log (loop-invariant, so set it once)
    clf.verbose = 0

    for seed in range(nseeds):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

        clf.fit(X_train, y_train)
        print(clf.best_params_, clf.best_score_)

        y_pred_prob = clf.predict_proba(X_test)
        y_pred = clf.predict(X_test)

        # also calculate the performance among different races here
        for col in race_cols:
            if col == 'all':
                y_test_selected = y_test
                y_pred_selected = y_pred
                y_pred_prob_selected = y_pred_prob
            else:
                # plain boolean ndarray: selects the same rows positionally from the
                # test labels and from the numpy prediction arrays
                mask = (X_test[col] == 1).to_numpy()
                y_test_selected = y_test[mask]
                y_pred_selected = y_pred[mask]
                y_pred_prob_selected = y_pred_prob[mask]
            # column 1 of predict_proba is the probability of the positive class
            aucs[col].append(roc_auc_score(y_test_selected, y_pred_prob_selected[:, 1]))
            # specificity = recall of the negative class
            specs[col].append(recall_score(y_test_selected, y_pred_selected, pos_label=0))
            sens[col].append(recall_score(y_test_selected, y_pred_selected))

    for col in race_cols:
        print(f"====== for {col}: ======")
        print('auc= {0:.3f}+-{1:.3f}'.format(np.mean(aucs[col]), np.std(aucs[col])))
        print('specificity= {0:.3f}+-{1:.3f}'.format(np.mean(specs[col]), np.std(specs[col])))
        print('sensitivity= {0:.3f}+-{1:.3f}'.format(np.mean(sens[col]), np.std(sens[col])))

In [8]:
# Run the evaluation over 10 random train/test splits (the grid search is refit on each)
get_many_aucs(X=X, y=y, nseeds=10)

{'min_samples_split': 20} 0.7251588035477364
{'min_samples_split': 50} 0.7310275754737272
{'min_samples_split': 30} 0.7209901110743074
{'min_samples_split': 25} 0.720260990978146
{'min_samples_split': 15} 0.7155565515076668
{'min_samples_split': 50} 0.70821064993776
{'min_samples_split': 25} 0.717721507074607
{'min_samples_split': 25} 0.7197969006156624
{'min_samples_split': 20} 0.7271484387620932
{'min_samples_split': 50} 0.7158569194154472
auc= 0.726+-0.017
specificity= 0.885+-0.038
sensitivity= 0.381+-0.067
auc= 0.706+-0.036
specificity= 0.868+-0.046
sensitivity= 0.397+-0.088
auc= 0.689+-0.043
specificity= 0.934+-0.023
sensitivity= 0.257+-0.094
auc= 0.722+-0.019
specificity= 0.866+-0.047
sensitivity= 0.407+-0.072


In [9]:
# Inspect the raw per-split AUC values that get_many_aucs collected for each group
aucs

defaultdict(list,
            {'all': [np.float64(0.7131024524461927),
              np.float64(0.6926555299539171),
              np.float64(0.7341739443489571),
              np.float64(0.7343500166795118),
              np.float64(0.7445080267301636),
              np.float64(0.7547588931979972),
              np.float64(0.7253317356275505),
              np.float64(0.7266985673517626),
              np.float64(0.7114137049319443),
              np.float64(0.7241036768229322)],
             'mother_is_black': [np.float64(0.6565499926155663),
              np.float64(0.6470140737923165),
              np.float64(0.7282213184931507),
              np.float64(0.7052753623188406),
              np.float64(0.7609126984126984),
              np.float64(0.7581662162802826),
              np.float64(0.7177753411306043),
              np.float64(0.7042386185243328),
              np.float64(0.6805963406370003),
              np.float64(0.7020754932147337)],
             'mother_is_hispanic':

In [10]:
# Check the output from get_many_aucs to make sure what the best param is for your own data.
# min_samples_split=25 is used for now to get the final model, which is fit on ALL of
# X, y (nothing is held out here). The other settings repeat the base estimator that
# the grid search was built around.
# NOTE: this rebinds `clf` from the GridSearchCV to a plain RandomForestClassifier, so
# get_many_aucs (which reads clf.best_params_) cannot be re-run after this cell without
# re-creating the grid search first.
clf = RandomForestClassifier(class_weight='balanced', random_state=42, n_estimators=200, min_samples_split=25)
clf.fit(X, y)

In [11]:
# Persist the fitted final model with pickle; the destination path is read from
# the ML_MODEL environment variable
model_path = os.getenv('ML_MODEL')
with open(model_path, 'wb') as out_file:
    pickle.dump(clf, out_file)