In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.metrics import recall_score, make_scorer, roc_auc_score
import os
import pickle
from dotenv import load_dotenv

load_dotenv(override=True)
specificity = make_scorer(recall_score, pos_label=0)

In [2]:
dat = pd.read_parquet(os.getenv('TRAIN_PARQUET'))
dat.shape

(11467, 432)

In [3]:
from preprocess import preprocess

preprocess(dat)
dat.shape

dropping 26 of census tract columns
found columns with NaN: bmi, NaN number = 318, dropping these patients
found columns with NaN: mother_age, NaN number = 36, dropping these patients
found columns with NaN: mother_height, NaN number = 502, dropping these patients
found columns with NaN: smoking, NaN number = 6, dropping these patients
found columns with NaN: tobacco, NaN number = 6, dropping these patients
removing patients without prenatal and postpartum visit 1056
dropping columns: counts_of_visits_3m_after_delivery, counts_of_visits_6m_after_delivery


(9824, 405)

In [4]:
# check to make sure there is not too many NaN before continuing
dat.isna().mean().sort_values(ascending=False)

phq9_total_max    0.924674
edinburgh_max     0.280639
birthid           0.000000
rx_628530         0.000000
rx_763484         0.000000
                    ...   
dx_R10.3          0.000000
dx_R10.2          0.000000
dx_R10.1          0.000000
dx_R07.9          0.000000
parity_isna       0.000000
Length: 405, dtype: float64

In [6]:
# fix to use F53 or edinburgh_max or phq9

# use only patients been screened
dat_selected = dat[ (dat['edinburgh_max'].isna() == False) | (dat['phq9_total_max'].isna() == False)].copy()

# set up the X, y for model fitting
X = dat_selected.drop(columns=['F53_label','edinburgh_max','phq9_total_max', 'birthid', 'PPD_delete_label'])
y = np.where( (dat_selected['edinburgh_max'] >= 10) | 
              (dat_selected['phq9_total_max'] >= 10) | 
              (dat_selected['F53_label'] == 1), 1, 0)

In [7]:
# set up the model
# the param range should be okay, but check the best_params from get_many_acus to make sure
clf = GridSearchCV(RandomForestClassifier(class_weight='balanced', random_state=42, n_estimators=200),
                   param_grid={'min_samples_split': [10,15,20,25,30,50,70,100,120,150,170,200]},
                   verbose=3,
                   scoring={'f1':'f1',
                            'roc_auc':'roc_auc',
                            'sensitivity':'recall',
                            'precision':'precision',
                            'specificity': specificity},
                   cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
                   refit='roc_auc')

In [8]:
def get_many_aucs(X, y, nseeds=1):
    aucs, specs, sens = [], [], []
    best_params = []
    for seed in range(nseeds):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
        
        clf.verbose = 0
        clf.fit(X_train, y_train)
        best_params.append(clf.best_params_)

        y_pred_prob = clf.predict_proba(X_test)
        y_pred = clf.predict(X_test)
        aucs.append(roc_auc_score(y_test, y_pred_prob[:, 1]))
        specs.append(recall_score(y_test, y_pred, pos_label=0))
        sens.append(recall_score(y_test, y_pred))
    
    print('auc= {0:.3f}+-{1:.3f}'.format(np.mean(aucs), np.std(aucs)))
    print('specificity= {0:.3f}+-{1:.3f}'.format(np.mean(specs), np.std(specs)))
    print('sensitivity= {0:.3f}+-{1:.3f}'.format(np.mean(sens), np.std(sens)))
    return best_params

In [9]:
# do the training
best_params = get_many_aucs(X, y, nseeds=10)

auc= 0.723+-0.018
specificity= 0.896+-0.040
sensitivity= 0.349+-0.088


In [10]:
best_params

[{'min_samples_split': 30},
 {'min_samples_split': 50},
 {'min_samples_split': 50},
 {'min_samples_split': 25},
 {'min_samples_split': 25},
 {'min_samples_split': 15},
 {'min_samples_split': 25},
 {'min_samples_split': 15},
 {'min_samples_split': 10},
 {'min_samples_split': 20}]

In [11]:
# use 30 for now to get the final model
clf = RandomForestClassifier(class_weight='balanced', random_state=42, n_estimators=200, min_samples_split=30)
clf.fit(X, y)

In [12]:
# save the file
with open(os.getenv('ML_MODEL'), 'wb') as fid:
    pickle.dump(clf, fid)