In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.metrics import recall_score, make_scorer, roc_auc_score
import os
import pickle
from dotenv import load_dotenv

# Load settings from a .env file into the process environment; later cells read
# TRAIN_PARQUET and ML_MODEL through os.getenv. Per the python-dotenv docs,
# override=True lets values from .env replace variables that are already set.
load_dotenv(override=True)
# Scorer for specificity: recall computed with class 0 treated as the positive
# label, i.e. the fraction of true class-0 samples predicted as class 0.
specificity = make_scorer(recall_score, pos_label=0)

In [2]:
# Read the training table from the parquet file named by the TRAIN_PARQUET
# environment variable, then show its (rows, columns) shape.
train_parquet_path = os.getenv('TRAIN_PARQUET')
dat = pd.read_parquet(train_parquet_path)
dat.shape

(14334, 432)

In [3]:
from preprocess import preprocess

# preprocess() is called only for its side effects (its return value is ignored).
# The shapes displayed before and after this cell, (14334, 432) -> (8994, 402),
# indicate that it edits `dat` in place, so re-running this cell without
# reloading `dat` operates on already-filtered data.
preprocess(dat)
dat.shape

dropping 26 of census tract columns
found columns with NaN: bmi, NaN number = 407, dropping these patients
found columns with NaN: mother_age, NaN number = 43, dropping these patients
found columns with NaN: mother_height, NaN number = 623, dropping these patients
found columns with NaN: smoking, NaN number = 8, dropping these patients
found columns with NaN: tobacco, NaN number = 8, dropping these patients
removing patients without prenatal and postpartum visit 1316
dropping columns: counts_of_visits_3m_after_delivery, counts_of_visits_6m_after_delivery
removing patients without screening using edinburgh or phq9 3290
removing columns: 'F53_label','edinburgh_max','phq9_total_max', 'PPD_delete_label'


(8994, 402)

In [None]:
# Sanity check before continuing: fraction of missing values per column,
# largest first. No column should report any NaN.
nan_fraction_by_column = dat.isna().mean()
nan_fraction_by_column.sort_values(ascending=False)

In [None]:
# fix to use F53 or edinburgh_max or phq9
# NOTE(review): the preprocess log shown earlier reports that 'F53_label',
# 'edinburgh_max', 'phq9_total_max' and 'PPD_delete_label' were removed --
# confirm these columns still exist in `dat` when this cell runs.

# keep only patients that were screened with at least one instrument
has_edinburgh = dat['edinburgh_max'].notna()
has_phq9 = dat['phq9_total_max'].notna()
dat_selected = dat[has_edinburgh | has_phq9].copy()

# set up the X, y for model fitting
non_feature_cols = ['F53_label','edinburgh_max','phq9_total_max', 'birthid', 'PPD_delete_label']
X = dat_selected.drop(columns=non_feature_cols)

# positive label: either screening score >= 10, or F53_label == 1
is_positive = (
    (dat_selected['edinburgh_max'] >= 10)
    | (dat_selected['phq9_total_max'] >= 10)
    | (dat_selected['F53_label'] == 1)
)
y = np.where(is_positive, 1, 0)

In [None]:
# set up the model
# the param range should be okay, but check the best_params from get_many_aucs to make sure
base_forest = RandomForestClassifier(class_weight='balanced', random_state=42, n_estimators=200)

# candidate values for the only tuned hyper-parameter
min_samples_split_grid = [10,15,20,25,30,50,70,100,120,150,170,200]

# metrics reported for every candidate; the model is refit on 'roc_auc'
scoring_metrics = {
    'f1': 'f1',
    'roc_auc': 'roc_auc',
    'sensitivity': 'recall',
    'precision': 'precision',
    'specificity': specificity,
}

cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

clf = GridSearchCV(
    estimator=base_forest,
    param_grid={'min_samples_split': min_samples_split_grid},
    scoring=scoring_metrics,
    cv=cv_splitter,
    refit='roc_auc',
    verbose=3,
)

In [None]:
from collections import defaultdict

# Metric accumulators: group name -> list with one value per train/test split.
aucs, specs, sens = defaultdict(list), defaultdict(list), defaultdict(list)

# Groups to report on: 'all' is the whole test set, the other entries are
# column names looked up in the feature table.
race_cols = [
    'all',
    'mother_is_black',
    'mother_is_hispanic',
    'mother_is_white',
]

def get_many_aucs(X, y, nseeds=1):
    """Evaluate the module-level grid search `clf` over repeated train/test splits.

    For each seed in range(nseeds) the data is split 80/20 with
    random_state=seed, `clf` is refit on the training part, and AUC,
    specificity and sensitivity are computed on the test part:

    * once for the whole test set, stored under the key 'all';
    * once for every other entry of `race_cols`, restricted to the test rows
      where that column of X equals 1.

    Each value is appended to the module-level `aucs`, `specs` and `sens`
    dicts, and a mean +- std summary per entry of `race_cols` is printed at
    the end.

    Parameters
    ----------
    X : feature table supporting column lookup by name (``X[col]``) for every
        non-'all' entry of `race_cols` -- presumably a pandas DataFrame.
    y : binary labels aligned with the rows of X.
    nseeds : int
        Number of splits to run; seeds 0 .. nseeds-1 are used.

    Notes
    -----
    * Side effects: sets ``clf.verbose = 0``, refits `clf`, and prints.
    * The accumulators are not cleared here, so calling this again without
      re-creating them mixes the new results with the earlier ones.
    * NOTE(review): a subgroup whose test rows contain a single class can make
      roc_auc_score fail (or yield NaN, depending on the scikit-learn
      version) -- confirm for small subgroups.
    """
    for seed in range(nseeds):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

        clf.verbose = 0
        clf.fit(X_train, y_train)
        print(clf.best_params_, clf.best_score_)

        # Plain array so the boolean masks below select by position.
        y_true = np.asarray(y_test)
        y_pred_prob = clf.predict_proba(X_test)
        y_pred = clf.predict(X_test)

        # Whole test set.
        aucs['all'].append(roc_auc_score(y_true, y_pred_prob[:, 1]))
        specs['all'].append(recall_score(y_true, y_pred, pos_label=0))
        sens['all'].append(recall_score(y_true, y_pred))

        # also calculate the performance among different races here
        for col in race_cols:
            if col == 'all':
                # Already recorded above. The previous code re-entered here and
                # read `idx` before it was ever assigned (UnboundLocalError on
                # the first pass) and would otherwise have appended a second,
                # wrongly masked value under 'all'.
                continue
            idx = np.asarray(X_test[col] == 1)
            y_test_selected = y_true[idx]
            y_pred_selected = y_pred[idx]
            y_pred_prob_selected = y_pred_prob[idx]
            aucs[col].append(roc_auc_score(y_test_selected, y_pred_prob_selected[:, 1]))
            specs[col].append(recall_score(y_test_selected, y_pred_selected, pos_label=0))
            sens[col].append(recall_score(y_test_selected, y_pred_selected))

    for col in race_cols:
        print(f"====== for {col}: ======")
        print('auc= {0:.3f}+-{1:.3f}'.format(np.mean(aucs[col]), np.std(aucs[col])))
        print('specificity= {0:.3f}+-{1:.3f}'.format(np.mean(specs[col]), np.std(specs[col])))
        print('sensitivity= {0:.3f}+-{1:.3f}'.format(np.mean(sens[col]), np.std(sens[col])))

In [None]:
# do the training
# Runs 10 train/test splits (seeds 0-9); each split refits the grid search `clf`.
# Results are appended to the module-level aucs/specs/sens dicts, so re-running
# this cell without re-creating those dicts accumulates on top of earlier runs.
get_many_aucs(X, y, nseeds=10)

In [None]:
# Show the AUC values collected by get_many_aucs, keyed by group name.
aucs

In [None]:
# check the output from get_many_aucs to make sure what the best param is for your own data
# use 25 for now to get the final model at WFU
# NOTE: this rebinds `clf` from the GridSearchCV object to a plain
# RandomForestClassifier trained on all of X, y (no held-out split).
clf = RandomForestClassifier(
    n_estimators=200,
    min_samples_split=25,
    class_weight='balanced',
    random_state=42,
)
clf.fit(X, y)

In [None]:
# Serialize the fitted model to the path named by the ML_MODEL environment variable.
model_path = os.getenv('ML_MODEL')
with open(model_path, 'wb') as model_file:
    pickle.dump(clf, model_file)