In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.metrics import recall_score, make_scorer, roc_auc_score
import os
import pickle
from dotenv import load_dotenv

# Read variables from a .env file into the process environment before any
# os.getenv lookup below; override=True asks python-dotenv to let .env values
# replace variables that are already set.
load_dotenv(override=True)
# Scorer for specificity: recall computed with class 0 as the positive label.
specificity = make_scorer(recall_score, pos_label=0)

In [2]:
# Load the training table from the parquet file named by the TRAIN_PARQUET
# environment variable. os.environ[...] is used so that an unset variable fails
# right here with KeyError('TRAIN_PARQUET') instead of handing None to pandas.
dat = pd.read_parquet(os.environ['TRAIN_PARQUET'])
dat.shape

(11467, 432)

In [3]:
from preprocess import preprocess

# Run the project's preprocessing on the loaded frame. The return value is
# discarded and the shape is read from `dat` right afterwards, so the helper
# presumably filters rows/columns of `dat` in place -- TODO confirm in
# preprocess.py.
# NOTE(review): if it does mutate in place, re-running this cell without
# reloading `dat` applies the preprocessing a second time.
preprocess(dat)
dat.shape

dropping 26 of census tract columns
found columns with NaN: bmi, NaN number = 318, dropping these patients
found columns with NaN: mother_age, NaN number = 36, dropping these patients
found columns with NaN: mother_height, NaN number = 502, dropping these patients
found columns with NaN: smoking, NaN number = 6, dropping these patients
found columns with NaN: tobacco, NaN number = 6, dropping these patients
removing patients without prenatal and postpartum visit 1056
dropping columns: counts_of_visits_3m_after_delivery, counts_of_visits_6m_after_delivery
removing patients without screening using edinburgh or phq9 2616
removing columns: 'F53_label','edinburgh_max','phq9_total_max', 'PPD_delete_label'


(7208, 402)

In [4]:
# check to make sure there is no NaN before continuing
# The assertion enforces the check, so a fresh "Run All" stops here if any NaN
# is left; the per-column NaN fraction is still displayed for inspection.
nan_fraction = dat.isna().mean().sort_values(ascending=False)
assert not (nan_fraction > 0).any(), (
    f"NaN values remain in columns: {nan_fraction[nan_fraction > 0].index.tolist()}"
)
nan_fraction

birthid      0.0
rx_597959    0.0
rx_759717    0.0
rx_755109    0.0
rx_754766    0.0
            ... 
dx_R10.2     0.0
dx_R10.1     0.0
dx_R07.9     0.0
dx_R06.0     0.0
label        0.0
Length: 402, dtype: float64

In [5]:
# Model inputs: the target is the `label` column; the features are every
# other column except `birthid`.
y = dat['label']
X = dat.drop(['birthid', 'label'], axis=1)
# Display the mean of the label column and the feature-matrix shape.
(y.mean(), X.shape)

(0.22017203107658156, (7208, 400))

In [6]:
# set up the model
# Grid search over min_samples_split for a class-weighted random forest, scored
# with several metrics on a shuffled, stratified 5-fold split; refit='roc_auc'
# makes ROC AUC the metric used to pick and refit the best estimator.
# the param range should be okay, but check the best_params from get_many_aucs to make sure
# NUM_CPUS sets the number of parallel jobs; when it is unset or empty the
# search falls back to a single job instead of failing inside int().
clf = GridSearchCV(RandomForestClassifier(class_weight='balanced', random_state=42, n_estimators=200),
                   param_grid={'min_samples_split': [10,15,20,25,30,50,70,100,120,150,170,200]},
                   verbose=3,
                   scoring={'f1':'f1',
                            'roc_auc':'roc_auc',
                            'sensitivity':'recall',
                            'precision':'precision',
                            'specificity': specificity},
                   cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
                   refit='roc_auc',
                   n_jobs=int(os.getenv('NUM_CPUS') or 1))

In [7]:
from collections import defaultdict

# Per-group metric accumulators: each maps a group name to the list of values
# that get_many_aucs appends, one entry per train/test split.
aucs, specs, sens = (defaultdict(list) for _ in range(3))
# 'all' selects the whole test set; the remaining entries are used by
# get_many_aucs as column names of X (rows where the column equals 1).
race_cols = [
    'all',
    'mother_is_black',
    'mother_is_hispanic',
    'mother_is_white',
]

def get_many_aucs(X, y, nseeds=1, test_size=0.2):
    """Evaluate the grid-searched model over several random train/test splits.

    For each seed in ``range(nseeds)`` the data is split with
    ``train_test_split(..., random_state=seed)``, the module-level ``clf`` is
    fitted on the training part, and AUC, specificity and sensitivity are
    computed on the held-out part -- once for the whole test set ('all') and
    once for each subgroup column listed in ``race_cols`` (rows where the
    column equals 1).

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; must contain the subgroup columns named in ``race_cols``.
    y : pandas.Series
        Binary target with the same index as ``X``.
    nseeds : int, default 1
        Number of train/test splits; seeds ``0 .. nseeds-1`` are used.
    test_size : float or int, default 0.2
        Passed to ``train_test_split`` as the held-out size.

    Returns
    -------
    None
        Results are stored in the module-level ``aucs``, ``specs`` and ``sens``
        dictionaries (group name -> list with one value per split) and a
        mean/std summary per group is printed.

    Notes
    -----
    Side effects: refits the module-level ``clf`` and sets ``clf.verbose = 0``.
    ``clf`` must be the grid-search object when this runs, because
    ``best_params_`` and ``best_score_`` are read from it.
    """
    # Start from empty accumulators so that calling the function again (for
    # example by re-running the cell) does not mix in values from an earlier
    # call and distort the reported mean and standard deviation.
    for store in (aucs, specs, sens):
        store.clear()

    for seed in range(nseeds):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=seed)

        clf.verbose = 0  # silence the per-fit grid-search log
        clf.fit(X_train, y_train)
        print(clf.best_params_, clf.best_score_)

        y_pred_prob = clf.predict_proba(X_test)
        y_pred = clf.predict(X_test)

        # also calculate the performance among different races here
        for col in race_cols:
            if col == 'all':
                y_test_selected = y_test
                y_pred_selected = y_pred
                y_pred_prob_selected = y_pred_prob
            else:
                # Plain NumPy boolean mask: the predictions are NumPy arrays,
                # so rows are selected by position rather than by index label.
                mask = (X_test[col] == 1).to_numpy()
                y_test_selected = y_test[mask]
                y_pred_selected = y_pred[mask]
                y_pred_prob_selected = y_pred_prob[mask]
            # Column 1 of predict_proba is the probability of the positive class.
            aucs[col].append(roc_auc_score(y_test_selected, y_pred_prob_selected[:, 1]))
            specs[col].append(recall_score(y_test_selected, y_pred_selected, pos_label=0))
            sens[col].append(recall_score(y_test_selected, y_pred_selected))

    for col in race_cols:
        print(f"====== for {col}: ======")
        print('auc= {0:.3f}+-{1:.3f}'.format(np.mean(aucs[col]), np.std(aucs[col])))
        print('specificity= {0:.3f}+-{1:.3f}'.format(np.mean(specs[col]), np.std(specs[col])))
        print('sensitivity= {0:.3f}+-{1:.3f}'.format(np.mean(sens[col]), np.std(sens[col])))

In [8]:
# do the training
# Runs the grid search and the held-out evaluation on 10 different train/test
# splits (seeds 0-9); each fit prints its best params and best CV score.
get_many_aucs(X, y, nseeds=10)

{'min_samples_split': 30} 0.7233366392003362
{'min_samples_split': 50} 0.7288837467058835
{'min_samples_split': 50} 0.7191993697036743
{'min_samples_split': 25} 0.7170535844033878
{'min_samples_split': 25} 0.7198113720726553
{'min_samples_split': 15} 0.7236762363540022
{'min_samples_split': 25} 0.7272578099676492
{'min_samples_split': 15} 0.7166446348345686
{'min_samples_split': 10} 0.7216655059009448
{'min_samples_split': 20} 0.7215106199123251
auc= 0.723+-0.018
specificity= 0.896+-0.040
sensitivity= 0.349+-0.088
auc= 0.714+-0.029
specificity= 0.878+-0.046
sensitivity= 0.385+-0.090
auc= 0.669+-0.040
specificity= 0.934+-0.026
sensitivity= 0.237+-0.095
auc= 0.718+-0.022
specificity= 0.879+-0.049
sensitivity= 0.368+-0.103


In [9]:
# Show the raw AUC values collected by get_many_aucs, keyed by group name.
aucs

defaultdict(list,
            {'all': [0.7234039470672265,
              0.6921123106876037,
              0.7328724128220773,
              0.7438189845474613,
              0.7413518598928932,
              0.7154088050314465,
              0.6958961845607808,
              0.7476169272379445,
              0.7226042143042439,
              0.7147635726795096],
             'mother_is_black': [0.7089420995670995,
              0.6667240587695134,
              0.7097337407245744,
              0.744982290436836,
              0.7244674902140207,
              0.7304801485542487,
              0.7302477183833115,
              0.7306122448979593,
              0.7406903257170636,
              0.6537704918032786],
             'mother_is_hispanic': [0.60575,
              0.6196587649101616,
              0.656952380952381,
              0.6715126978284872,
              0.6600929708409636,
              0.6772325510515087,
              0.696030136192408,
              0.759460635058

In [10]:
# Final model: one random forest fitted on all of X, y (no hold-out).
# min_samples_split is fixed at 25 for now -- check the best params printed by
# get_many_aucs to decide the right value for your own data.
# NOTE(review): this rebinds `clf`, replacing the GridSearchCV object that
# get_many_aucs reads; re-create the grid search before evaluating again.
clf = RandomForestClassifier(
    n_estimators=200,
    min_samples_split=25,
    class_weight='balanced',
    random_state=42,
)
clf.fit(X, y)

In [11]:
# save the file
# Pickle the fitted model to the path named by the ML_MODEL environment
# variable. os.environ[...] raises KeyError('ML_MODEL') when the variable is
# unset instead of handing None to open().
with open(os.environ['ML_MODEL'], 'wb') as fid:
    pickle.dump(clf, fid)