In [None]:
import pandas as pd
import numpy as np
import sqlalchemy
from sqlqueries_unc_preeclampsia import *
import os
import dotenv

### 1. load and massage the dataset

In [None]:
dotenv.load_dotenv()


def _require_env(name):
    """Return the value of environment variable `name`; raise if it is unset or empty."""
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"environment variable {name!r} is not set; define it in the shell or in the .env file"
        )
    return value


# Resolve each setting once so that a missing value fails here with a clear
# message instead of being formatted into the URL / query as the text "None".
db_host = _require_env('HOSTNAME')
db_name = _require_env('DATABASE')
db_schema = _require_env('SCHEMA')

# No user/password appears in the URL (nothing before the "@").
connection_string = f"mssql+pymssql://@{db_host}/{db_name}"
engine = sqlalchemy.create_engine(connection_string)
sql_string = f'''
select
    *
from {db_name}.{db_schema}.rpt_preeclampsia
'''
# NOTE(review): `dat` is rebuilt from `dat_bp` when the label is merged in a
# later cell, so this result does not appear to be used by the cells that
# follow -- confirm, then consider dropping this query.
dat = pd.read_sql(sql_string, con=engine)

In [None]:
# Pull every feature table through the same engine. The *_sql_string names are
# not defined in this notebook; presumably they come from the star import of
# sqlqueries_unc_preeclampsia.
def _read_table(query):
    """Run `query` against `engine` and return the result as a DataFrame."""
    return pd.read_sql(query, con=engine)


dat_bp = _read_table(blood_pressure_sql_string)
dat_vital = _read_table(vital_sql_string)
dat_obs = _read_table(obs_sql_string)
dat_preeclampsia = _read_table(preeclampsia_sql_string)
dat_med = _read_table(get_rx_sql_string)
# gest_age_in_days is removed on purpose: Prof Page found a problem with it
# (the details are not recorded in this notebook).
dat_age = _read_table(age_sql_string).drop(columns=['gest_age_in_days'])
dat_race = _read_table(race_sql_string)

In [None]:
# Sanity checks before merging on BIRTHID.
# 1. BIRTHID must be unique in the base table: the merges that follow are keyed
#    on BIRTHID, and a repeated key would silently duplicate rows.
# 2. Each per-birth table must list exactly the same BIRTHIDs, in the same
#    order, as the blood-pressure table.
assert dat_bp.BIRTHID.is_unique, 'dat_bp contains duplicated BIRTHID values'

for table_name, table in [
    ('dat_vital', dat_vital),
    ('dat_preeclampsia', dat_preeclampsia),
    ('dat_age', dat_age),
    ('dat_race', dat_race),
]:
    # Check the length first so a size mismatch produces a readable message
    # rather than a pandas comparison error.
    assert len(table) == len(dat_bp), (
        f'{table_name} has {len(table)} rows, expected {len(dat_bp)} (same as dat_bp)'
    )
    # Positional comparison of the key columns.
    assert (dat_bp.BIRTHID.to_numpy() == table.BIRTHID.to_numpy()).all(), (
        f'{table_name}.BIRTHID does not match dat_bp.BIRTHID row for row'
    )

In [None]:
# The label is taken from dat_preeclampsia (column `preeclampsia_label`):
# attach it to the blood-pressure rows by BIRTHID and expose it as `diagnosis`.
label_cols = dat_preeclampsia[['BIRTHID', 'preeclampsia_label']]
dat = (
    dat_bp
    .merge(label_cols, on='BIRTHID', how='left')
    .rename(columns={'preeclampsia_label': 'diagnosis'})
)
dat

In [None]:
# Spearman rank correlation of every column with the label.
dat.corrwith(other=dat.diagnosis, method='spearman')

In [None]:
# Class balance of the label.
dat['diagnosis'].value_counts()

In [None]:
# Reshape the observations from long to wide: one row per BIRTHID and one
# column per (statistic, RAW_OBSCLIN_NAME) pair.
stat_cols = ['min_VALUE', 'max_VALUE', 'mean_VALUE', 'median_VALUE']
dat_obs = dat_obs.pivot(index='BIRTHID', columns=['RAW_OBSCLIN_NAME'], values=stat_cols)

# Flatten the two-level header: the observation name takes the place of the
# literal "VALUE" in the statistic name, i.e. ('min_VALUE', 'X') becomes 'min_X'.
flat_names = []
for stat, obs_name in dat_obs.columns.to_flat_index():
    flat_names.append(stat.replace('VALUE', obs_name))
dat_obs.columns = flat_names
dat_obs

In [None]:
# Fraction of missing values in each vitals column.
dat_vital.isnull().mean()

In [None]:
# Display the vitals table as loaded.
dat_vital

In [None]:
# Back-fill the few percent of BMI values that are NaN from the available
# weight and height data.
#
# BMI from pounds and inches is 703 * weight / height**2
# (kg/m^2 = 703.07 * lb/in^2). The factor previously used here, 705, slightly
# overstated every back-filled BMI.
# NOTE(review): this assumes *_WEIGHT is in pounds and mean_HEIGHT in inches --
# confirm against the vitals query.
BMI_IMPERIAL_FACTOR = 703

for measure in ['max', 'min', 'mean', 'median']:
    # Every weight statistic is paired with the mean height.
    bmi_from_weight = (
        dat_vital[f'{measure}_WEIGHT'] / dat_vital['mean_HEIGHT'] ** 2 * BMI_IMPERIAL_FACTOR
    )
    # fillna only touches rows whose recorded BMI is missing; recorded values win.
    dat_vital[f'{measure}_BMI'] = dat_vital[f'{measure}_BMI'].fillna(bmi_from_weight)

In [None]:
# Display the vitals table again to inspect the BMI columns after the back-fill.
dat_vital

In [None]:
# Attach the observation, vital-sign, age and race features to the labelled
# rows, in that order. Left joins keep every row of `dat`; a birth that is
# absent from a table gets NaN in that table's columns.
for feature_table in (dat_obs, dat_vital, dat_age, dat_race):
    dat = dat.merge(feature_table, on='BIRTHID', how='left')
dat.shape

In [None]:
# Preview the first rows of the merged table.
dat.head(5)

In [None]:
# Finally the medication table: remember its feature columns (everything
# except the BIRTHID key) so their NaNs can be filled after the merge.
med_col = dat_med.columns.to_list()
med_col.remove('BIRTHID')
dat_med

In [None]:
# Left join: births without a medication row get NaN in the medication columns.
dat = dat.merge(dat_med, on='BIRTHID', how='left')
dat

In [None]:
# Replace missing values in the medication columns with 0, all columns at once.
dat[med_col] = dat[med_col].fillna(0)

In [None]:
# Diagnosis rates for is_black == 1 and is_black == 0, in that order;
# the original note here says the two rates are different now.
tuple(
    dat.loc[dat['is_black'] == flag, 'diagnosis'].value_counts(normalize=True)
    for flag in (1, 0)
)

### 2. take a look at the Nans

In [None]:
# (column name, fraction of missing values) for every column of the merged table.
list(zip(dat.columns.to_list(), dat.isna().mean().to_list()))

In [None]:
# Spearman correlation of every column with the label; show the 50 largest
# values (ascending sort, so the strongest positive correlations come last).
label_corr = dat.corrwith(dat['diagnosis'], method='spearman')
label_corr.sort_values().tail(50)

### 3. build the model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Model pipeline: standardise -> mean-impute -> random forest.
# The step names ('scale', 'imputer', 'clf') are part of the interface: the
# hyper-parameter grid addresses them as '<step>__<param>'.
#
# The forest is seeded so that cross-validation scores, the refit model and the
# SHAP values computed from it are reproducible across kernel restarts; without
# a seed every run trains a different forest.
RANDOM_STATE = 42

pipeline = Pipeline([
    ('scale', StandardScaler()),
    ('imputer', SimpleImputer(strategy='mean')),
    # class_weight='balanced' re-weights classes inversely to their frequency.
    ('clf', RandomForestClassifier(class_weight='balanced', random_state=RANDOM_STATE)),
])

In [None]:
from copy import deepcopy
from sklearn.metrics import make_scorer, recall_score

# Specificity scorer: recall_score with label 0 treated as the positive label,
# i.e. recall computed on the class labelled 0.
specificity = make_scorer(recall_score, pos_label=0)

In [None]:
# Grid search over the forest's min_samples_split. Several metrics are
# recorded for every candidate; the final model is refit using ROC AUC.
#
# An earlier version of the grid also tuned 'imputer__n_neighbors' over
# [5,10,30,50,70,100]; that parameter only exists when the 'imputer' step is a
# KNNImputer, so it is not part of the grid used with the SimpleImputer pipeline.
search_grid = {'clf__min_samples_split': [20, 50, 100, 120, 150]}
search_metrics = {
    'f1': 'f1',
    'roc_auc': 'roc_auc',
    'sensitivity': 'recall',
    'precision': 'precision',
    'specificity': specificity,
}
clf = GridSearchCV(
    estimator=pipeline,
    param_grid=search_grid,
    scoring=search_metrics,
    refit='roc_auc',
    verbose=3,
)

In [None]:
# Features are every column except the identifier and the label.
X = dat.drop(columns=['BIRTHID', 'diagnosis'])

# Discard any feature that is missing in more than 30% of the rows.
missing_share = X.isna().mean()
cols_todrop = missing_share[missing_share > 0.3].index.to_list()
print('dropping these columns:', cols_todrop)
X = X.drop(columns=cols_todrop)

y = dat['diagnosis']
clf.fit(X, y)
# Keep an independent copy of the fitted search as the baseline, so that `clf`
# can be refit later without losing this result.
clf_baseline = deepcopy(clf)

In [None]:
# Winning hyper-parameters and their mean cross-validated score for the refit
# metric (ROC AUC).
(clf.best_params_, clf.best_score_)

### 4. initial look at feature importance

In [None]:
import shap

In [None]:
# Display the refit pipeline selected by the baseline grid search.
clf_baseline.best_estimator_

In [None]:
# Run X through the first two fitted steps of the best pipeline (scaling and
# imputation) so the classifier step can be explained on the inputs it sees.
preprocessing_steps = clf_baseline.best_estimator_[:2]
X_processed = preprocessing_steps.transform(X)

In [None]:
# Explain the fitted classifier step (index 2 of the best pipeline) on the
# preprocessed feature matrix.
fitted_forest = clf_baseline.best_estimator_[2]
explainer = shap.TreeExplainer(fitted_forest)
shap_values = explainer(X_processed, check_additivity=True)

In [None]:
# Keep only the SHAP values for output index 1 (presumably the positive
# diagnosis class -- confirm against the fitted forest's classes_).
# The slice is applied only while `values` is still 3-D, so re-running this
# cell is safe: an unconditional in-place slice fails on the second run because
# the array has already been reduced to 2-D.
if shap_values.values.ndim == 3:
    shap_values.values = shap_values.values[:, :, 1]
# X_processed carries no column names, so restore them from X for the plot.
shap_values.feature_names = X.columns.to_list()
shap.plots.beeswarm(shap_values=shap_values, max_display=30)