In [None]:
%matplotlib inline
from B00_util import *
%reload_ext autoreload
%autoreload 2

# Get  trigger positive data for ML4

In [None]:
dataset = extractDataset("B00_ML4TrgPos_Y2016", {  "AllMed",  "HF","NonVAMed", "DispensedDrug","Only10daysPrior30DaysAfter"})

#  Cohorts

In [None]:
cohorts = dataset['cohort']
dizzy_cohort_df, abdpain_cohort_df = separate_cohorts(cohorts)

#  Demographic data
- for dizzy df, there are four labeled patients for which we do not have demo or cohort records

In [None]:
demog = dataset['Demorgraphics']
dizzy_demo_coded, abdpain_demo_coded = separate_demog(demog,dizzy_cohort_df,abdpain_cohort_df)
print('Dizzy demo df = ', dizzy_demo_coded.shape, ' Abdpain demo df = ', abdpain_demo_coded.shape)    

#  ED vitals
- vitals during ED visit
    - systolic, diastolic (per visit)
    - pulse, respiration, pulse oximetry, pain, temperature (count, min, max, first for multiple readings)


In [None]:
vitals = dataset['Vital']

dizzy_EDvitals_df,abdpain_EDvitals_df = separate_cohorts_EDvitals(vitals,dizzy_cohort_df,abdpain_cohort_df)
print(dizzy_EDvitals_df.shape,abdpain_EDvitals_df.shape)

In [None]:
print(dizzy_EDvitals_df.isna().sum())

# Vitals during the hospitalizations subsequent to ED visits

In [None]:
# get vitals from hospital visits
dizzy_hosp_vitals_df,abdpain_hosp_vitals_df = separate_cohorts_hosp_vitals(vitals,dizzy_cohort_df,abdpain_cohort_df)
print(dizzy_hosp_vitals_df.shape,abdpain_hosp_vitals_df.shape)
dizzy_hosp_vitals_df.isna().sum()

# Consults  ordered during ED visit
- count up top three consult depts for dizzy
- count up top seven consult depts for abdpain

- Cardiology consult count is useful for dizzy
- EKG consult count is useful for abdpain

In [None]:
consults = dataset['Consult']
dizzy_consults, abdpain_consults = separate_cohorts_consults(consults,dizzy_cohort_df,abdpain_cohort_df)

# look at which departments are being consulted (pick top N)
dizzy_topN_consult_counts = topN_consult_counts(dizzy_consults,dizzy_demo_coded,3)
abdpain_topN_consult_counts = topN_consult_counts(abdpain_consults,abdpain_demo_coded,7)
print(dizzy_topN_consult_counts.shape, abdpain_topN_consult_counts.shape)
print(dizzy_topN_consult_counts.isna().sum())

# Imaging features
- ct images: how many ordered, how many w/contrast, how many abnormal
- xr images: how many ordered, how many abnormal
- us images: how many ordered, how many abnormal


In [None]:
# get imaging records for dizzy and abdpain
images = dataset['Rad']

dizzy_images, abdpain_images = separate_cohorts_images(images,dizzy_cohort_df,abdpain_cohort_df)
print(dizzy_images.shape, abdpain_images.shape)
print(dizzy_images.isna().sum())


# Labs: ED visit only
- for select labs, get count, min, max, abnormal_count

In [None]:
# Raw lab records, split into the dizziness and abdominal-pain cohorts.
labs = dataset['Lab']
dizzy_labs, abdpain_labs = separate_cohorts_labs(labs, dizzy_cohort_df, abdpain_cohort_df)
print(dizzy_labs.shape, abdpain_labs.shape)

# NOTE: from here on `labs` is rebound from the raw DataFrame to the list of
# lab short names; later cells iterate over this list.
labs = [
    'WBC', 'glucose', 'albumin', 'potassium', 'calcium', 'lact', 'chloride',
    'bun', 'creat', 'troponin', 'CO2', 'ast', 'alt',
    'alkphos', 'lipase', 'amylase', 'hgb',
]
# Name of the matcher callable for each lab (matches_<lab>_loinc); the names
# are resolved with eval below, so they must already exist in the namespace
# (presumably via the B00_util star import -- TODO confirm).
lab_fns = [f'matches_{name}_loinc' for name in labs]

# collect all labs and then filter later on
dizzy_lab_dict, abdpain_lab_dict = {}, {}
for lab, labfn in zip(labs, lab_fns):
    matcher = eval(labfn)
    dizzy_lab_dict[lab] = get_labs_data(dizzy_labs, lab, matcher, dizzy_demo_coded)
    abdpain_lab_dict[lab] = get_labs_data(abdpain_labs, lab, matcher, abdpain_demo_coded)


In [None]:
# merge lab dataframes  

# Start each merged frame with one row per unique PtSSN in the cohort, then
# join every per-lab frame onto it by PtSSN, one lab at a time.
# NOTE(review): pd.merge defaults to how='inner', so a patient missing from
# any single per-lab frame is dropped from the merged result -- confirm that
# every frame in the lab dicts carries a row for every patient.
dizzy_merged_labs = pd.DataFrame(dizzy_cohort_df.PtSSN.unique(),columns=['PtSSN'])
abdpain_merged_labs = pd.DataFrame(abdpain_cohort_df.PtSSN.unique(),columns=['PtSSN'])
for lab in labs:
    dizzy_merged_labs = pd.merge(dizzy_merged_labs,dizzy_lab_dict[lab],on='PtSSN')
    abdpain_merged_labs = pd.merge(abdpain_merged_labs,abdpain_lab_dict[lab],on='PtSSN')
print(dizzy_merged_labs.shape, abdpain_merged_labs.shape)

# drop cols with more than 10% NA
def check_nas(df,cols,thresh):
    """Return the columns whose share of missing values exceeds a threshold.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    cols : iterable
        Column labels of ``df`` to check.
    thresh : float
        Percentage in the range 0-100. A column is flagged when strictly
        more than ``thresh`` percent of its rows are NA.

    Returns
    -------
    list
        The flagged column labels, in the order they appear in ``cols``.
    """
    n_rows = df.shape[0]
    if n_rows == 0:
        # An empty frame has nothing missing; returning early also avoids
        # the 0/0 division (and its RuntimeWarning) below.
        return []
    return [col for col in cols if df[col].isna().sum() / n_rows * 100 > thresh]

def clean_lab_df(merged_df,cols,thresh):
    """Drop lab columns that are too sparse and return the remaining frame.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Merged lab feature frame.
    cols : iterable
        Column labels that are candidates for removal.
    thresh : float
        Percentage (0-100) of NA values above which a candidate is dropped
        (see ``check_nas``).

    Returns
    -------
    pandas.DataFrame
        ``merged_df`` restricted to the surviving columns, in their original
        left-to-right order. Rows are not filtered. The shape of the result
        and the shape it would have after ``dropna()`` are printed.
    """
    drop_labs = set(check_nas(merged_df,cols,thresh))
    # Keep the survivors in their original order. Building this list through a
    # set difference yields an arbitrary order that can change between runs,
    # which breaks any later positional slicing of the columns.
    lab_retain = [col for col in merged_df.columns if col not in drop_labs]
    retained = merged_df[lab_retain]
    print(retained.shape,retained.dropna().shape)
    return retained

dizzy_clean_labs = clean_lab_df(dizzy_merged_labs,dizzy_merged_labs.columns[1:],10)
abdpain_clean_labs = clean_lab_df(abdpain_merged_labs,abdpain_merged_labs.columns[1:],10)
print(dizzy_clean_labs.shape,abdpain_clean_labs.shape)
print(dizzy_clean_labs.isna().sum())

# History
- for dizzy, use Viral's ICD list
- for abdpain, use Adel's ICD list refined by Andy Z

In [None]:
icds = dataset['ICD']
dizzy_icds,abdpain_icds = separate_cohorts_icds(icds,dizzy_cohort_df,abdpain_cohort_df)
print(dizzy_icds.shape,abdpain_icds.shape)

# get dizzy risk factors
dizzy_rf_df = get_dizzy_rf(dizzy_icds,dizzy_cohort_df)
bool_dizzy_rf_df = pd.concat([dizzy_rf_df.PtSSN, dizzy_rf_df.iloc[:,1:].astype(bool).astype(int)],axis=1 )
print(dizzy_rf_df.shape, bool_dizzy_rf_df.shape)

# get abdpain risk factors
abdpain_rf_df = get_abdpain_rf(abdpain_icds,abdpain_cohort_df)
bool_abdpain_rf_df = pd.concat([abdpain_rf_df.PtSSN, abdpain_rf_df.iloc[:,1:].astype(bool).astype(int)],axis=1 )
print(abdpain_rf_df.shape, bool_abdpain_rf_df.shape)

# add a column which is the number of risk factors
bool_dizzy_rf_df['total_rf'] = bool_dizzy_rf_df.iloc[:,1:].sum(axis=1)
bool_abdpain_rf_df['total_rf'] = bool_abdpain_rf_df.iloc[:,1:].sum(axis=1)

# Get labeled data

In [None]:
dizzy_df, dizzy_labels_df = retrieveLabels_dizzy(dizzy_fname)
abdpain_df, abdpain_labels_df = retrieveLabels_abdpain(abdpain_fname)
print(dizzy_labels_df.shape,abdpain_labels_df.shape)

print(dizzy_labels_df.label.value_counts())
print(abdpain_labels_df.label.value_counts())
print('\nPPV for dizzy = ', dizzy_labels_df[dizzy_labels_df.label=='MOD'].shape[0]/dizzy_labels_df.shape[0])
print('PPV for abdpain = ', np.round(abdpain_labels_df[abdpain_labels_df.label=='MOD'].shape[0]/abdpain_labels_df.shape[0],3))

# Data for Table 1

In [None]:
dizzy_cohort_df_labeled = pd.merge(dizzy_cohort_df,dizzy_labels_df,on='PtSSN')
dizzy_demo_coded_labeled = pd.merge(dizzy_demo_coded,dizzy_labels_df,on='PtSSN')
print(dizzy_demo_coded_labeled.columns)
dizzy_subset = dizzy_demo_coded_labeled[dizzy_demo_coded_labeled.label.isin(['MOD','NoMOD'])]

# age analysis
display(dizzy_subset['age_at_index_visit'].describe().T.loc[["mean","std"]])
display(dizzy_subset.groupby('label')['age_at_index_visit'].describe().T.loc[["mean","std"],:])

# gender analysis
display(dizzy_subset.groupby('label')['Gender'].value_counts())

from scipy.stats import fisher_exact
# from scipy.stats.contingency import crosstab   
    

ttest_fields(dizzy_subset,
             dizzy_subset.columns[1:-1],
            ['c','d','d','d','d','d','d','d'],show=True)

# Labeled versions of all dataframes

In [None]:
# labeled versions of the cohort datasets
dizzy_cohort_df_labeled = pd.merge(dizzy_cohort_df,dizzy_labels_df,on='PtSSN')
abdpain_cohort_df_labeled = pd.merge(abdpain_cohort_df,abdpain_labels_df,on='PtSSN')
print('Cohort:',dizzy_cohort_df_labeled.shape, abdpain_cohort_df_labeled.shape)

# labeled versions of demog datasets
dizzy_demo_coded_labeled = pd.merge(dizzy_demo_coded,dizzy_labels_df,on='PtSSN')
abdpain_demo_coded_labeled = pd.merge(abdpain_demo_coded,abdpain_labels_df,on='PtSSN')
print('Demo:', dizzy_demo_coded_labeled.shape,abdpain_demo_coded_labeled.shape)

# get labeled version of ED vitals
dizzy_EDvitals_labeled = pd.merge(dizzy_EDvitals_df,dizzy_labels_df,on='PtSSN')
abdpain_EDvitals_labeled = pd.merge(abdpain_EDvitals_df,abdpain_labels_df,on='PtSSN')
print('EDVitals:', dizzy_EDvitals_labeled.shape,abdpain_EDvitals_labeled.shape)

# get labeled versions of hosp vitals
dizzy_hosp_vitals_labeled = pd.merge(dizzy_hosp_vitals_df,dizzy_labels_df,on='PtSSN')
abdpain_hosp_vitals_labeled = pd.merge(abdpain_hosp_vitals_df,abdpain_labels_df,on='PtSSN')
print('Hosp vitals:', dizzy_hosp_vitals_labeled.shape,abdpain_hosp_vitals_labeled.shape)

# get labeled versions of consults
dizzy_topN_consult_counts_labeled = pd.merge(dizzy_topN_consult_counts,dizzy_labels_df,on='PtSSN').fillna(0)
abdpain_topN_consult_counts_labeled = pd.merge(abdpain_topN_consult_counts,abdpain_labels_df,on='PtSSN').fillna(0)
print('Consults: ', dizzy_topN_consult_counts_labeled.shape,abdpain_topN_consult_counts_labeled.shape)

# get labeled version of images
dizzy_images_labeled = pd.merge(dizzy_images,dizzy_labels_df,on='PtSSN')
abdpain_images_labeled = pd.merge(abdpain_images,abdpain_labels_df,on='PtSSN')
print('Imaging:', dizzy_images_labeled.shape,abdpain_images_labeled.shape)

# get labeled versions of risk factors (raw values)
dizzy_rf_df_labeled = pd.merge(dizzy_rf_df,dizzy_labels_df,on='PtSSN')
abdpain_rf_df_labeled = pd.merge(abdpain_rf_df,abdpain_labels_df,on='PtSSN')
print('Risk Factors:', dizzy_rf_df_labeled.shape,abdpain_rf_df_labeled.shape)

# get labeled versions of the 0/1-coded risk factors
bool_dizzy_rf_df_labeled = pd.merge(bool_dizzy_rf_df,dizzy_labels_df,on='PtSSN')
bool_abdpain_rf_df_labeled = pd.merge(bool_abdpain_rf_df,abdpain_labels_df,on='PtSSN')
# Report the boolean frames themselves; this line previously re-printed the
# shapes of the raw risk-factor frames.
print('Boolean Risk Factors:', bool_dizzy_rf_df_labeled.shape,bool_abdpain_rf_df_labeled.shape)

# get labeled versions of clean labs
dizzy_clean_labs_labeled = pd.merge(dizzy_clean_labs,dizzy_labels_df,on='PtSSN')
abdpain_clean_labs_labeled = pd.merge(abdpain_clean_labs,abdpain_labels_df,on='PtSSN')
print('Labs: ', dizzy_clean_labs_labeled.shape, abdpain_clean_labs_labeled.shape)


# Check if any of the features  are useful for MOD prediction

In [None]:
# cohort fields ttest
cohort_fields = ['ed_duration', 'ed_first_inp_delta','sum_hosp_stay', 'num_hosp','num_ED_visits']
cohort_ftypes = ['c','c','c','c','c']
dizzy_cohort_sig = ttest_fields(dizzy_cohort_df_labeled[dizzy_cohort_df_labeled.label.isin(['MOD','NoMOD'])],cohort_fields,cohort_ftypes)
print('Dizziness cohort:', dizzy_cohort_sig)
abdpain_cohort_sig = ttest_fields(abdpain_cohort_df_labeled,cohort_fields,cohort_ftypes)
print('Abdpain cohort:',abdpain_cohort_sig)

# demo field ttest
demo_fields = dizzy_demo_coded.columns[1:]
demo_ftypes = ['c','d','d','d','d','d','d','d']
dizzy_demo_sig = ttest_fields(dizzy_demo_coded_labeled[dizzy_demo_coded_labeled.label.isin(['MOD','NoMOD'])],demo_fields,demo_ftypes,show=False)
print('Dizziness demo:',dizzy_demo_sig)
abdpain_demo_sig = ttest_fields(abdpain_demo_coded_labeled,demo_fields,demo_ftypes,show=False)
print('Abdpain demo:',abdpain_demo_sig)

# ED Vitals ttest
# do a ttest with all fields with respect to MOD
ED_vitals_fields= dizzy_EDvitals_labeled.columns[3:-1]
ED_vitals_ftypes = len(ED_vitals_fields)*['c']
dizzy_vitals_sig = ttest_fields(dizzy_EDvitals_labeled[dizzy_EDvitals_labeled.label.isin(['MOD','NoMOD'])],
                                ED_vitals_fields,ED_vitals_ftypes)
print('Dizziness ED vitals:',dizzy_vitals_sig)
abdpain_vitals_sig = ttest_fields(abdpain_EDvitals_labeled,ED_vitals_fields,ED_vitals_ftypes)
print('Abdpain ED vitals:',abdpain_vitals_sig)

# Hosp vitals ttest
# do a ttest with all fields with respect to MOD
hosp_vitals_fields = dizzy_hosp_vitals_df.columns[3:]
hosp_vitals_ftypes = len(hosp_vitals_fields) * ['c']
dizzy_hosp_vitals_sig = ttest_fields(dizzy_hosp_vitals_labeled[dizzy_hosp_vitals_labeled.label.isin(['MOD','NoMOD'])],
                                     hosp_vitals_fields,hosp_vitals_ftypes)
print('Dizziness hosp vitals:',dizzy_hosp_vitals_sig)
abdpain_hosp_vitals_sig = ttest_fields(abdpain_hosp_vitals_labeled,hosp_vitals_fields,hosp_vitals_ftypes)
print('Abdpain hosp vitals:',abdpain_hosp_vitals_sig)

# Consult ttest
# Fields are every column between the leading PtSSN and the trailing label.
dizzy_ccounts_fields = dizzy_topN_consult_counts_labeled.columns[1:-1]
# One 'd' ftype per field, derived from the field list (same as the abdpain
# branch below) so it stays in sync if the top-N consult count is changed.
dizzy_ccounts_ftypes = len(dizzy_ccounts_fields) * ['d']
# NOTE(review): unlike the cohort/demo/vitals/imaging tests, the dizzy frame
# is not restricted to labels 'MOD'/'NoMOD' here -- confirm that is intended.
dizzy_consults_sig = ttest_fields(dizzy_topN_consult_counts_labeled,dizzy_ccounts_fields,dizzy_ccounts_ftypes)
print('Dizziness consults: ',dizzy_consults_sig)
abdpain_ccounts_fields = abdpain_topN_consult_counts_labeled.columns[1:-1]
abdpain_ccounts_ftypes = len(abdpain_ccounts_fields) * ['d']
abdpain_consults_sig = ttest_fields(abdpain_topN_consult_counts_labeled,abdpain_ccounts_fields,abdpain_ccounts_ftypes)
print('Abdpain consults: ',abdpain_consults_sig)

# Imaging ttest
dizzy_images_fields = dizzy_images_labeled.columns[2:-1]
dizzy_images_ftypes = len(dizzy_images_fields) * ['c']
dizzy_images_sig = ttest_fields(dizzy_images_labeled[dizzy_images_labeled.label.isin(['MOD','NoMOD'])],dizzy_images_fields,dizzy_images_ftypes)
print('Dizziness imaging: ', dizzy_images_sig)
abdpain_images_fields = abdpain_images_labeled.columns[2:-1]
abdpain_images_ftypes = len(abdpain_images_fields) * ['c']
abdpain_images_sig = ttest_fields(abdpain_images_labeled,abdpain_images_fields,abdpain_images_ftypes)
print('Abdpain imaging: ', abdpain_images_sig)

# ttest for risk factors
dizzy_rf_sig = ttest_fields(bool_dizzy_rf_df_labeled,bool_dizzy_rf_df.columns[1:],len(bool_dizzy_rf_df.columns[1:])*['d'],show=False)
print('Dizzy RF: ', dizzy_rf_sig)
abdpain_rf_sig = ttest_fields(bool_abdpain_rf_df_labeled,bool_abdpain_rf_df.columns[1:],len(bool_abdpain_rf_df.columns[1:])*['d'],show=False)
print('Abdpain RF: ', abdpain_rf_sig)

# ttest for labs
# Build each field list once and size the ftype list from it, so there is
# exactly one ftype per tested field (the ftype list used to be one longer).
# NOTE(review): columns[1:-1] assumes PtSSN is the first column and label the
# last -- this depends on the column order of the cleaned lab frames.
# NOTE(review): every lab field is passed as type 'd', including min/max
# columns -- confirm that is the intended ftype.
dizzy_lab_fields = dizzy_clean_labs_labeled.columns[1:-1]
dizzy_labs_sig = ttest_fields(dizzy_clean_labs_labeled,dizzy_lab_fields,
                             len(dizzy_lab_fields)*['d'],show=False)
abdpain_lab_fields = abdpain_clean_labs_labeled.columns[1:-1]
abdpain_labs_sig = ttest_fields(abdpain_clean_labs_labeled,abdpain_lab_fields,
                             len(abdpain_lab_fields)*['d'],show=False)


    
print('Dizzy Lab: ',dizzy_labs_sig)
print('Abdpain Lab: ',abdpain_labs_sig)

# Every dizzy feature flagged by any of the tests above, concatenated in test
# order. The parentheses are required: without them the statement ends on the
# first line and the indented "+ ..." continuation is an IndentationError.
dizzy_good_cols = (dizzy_cohort_sig + dizzy_demo_sig + dizzy_vitals_sig + dizzy_hosp_vitals_sig + dizzy_consults_sig
                   + dizzy_images_sig + dizzy_rf_sig + dizzy_labs_sig)

# Merge different dataframes

In [None]:
# start with cohort, then demo, then vitals, then consults, imaging, labs, history

dizzy_cohort_cols = ['TriggerType', 'PtSSN', 'EDStartDateTime', 'EDEndDateTime',
       'EDVisitReason', 'AdmitDateTime', 'DischargeDateTime','hosp_stay',
       'ed_duration', 'FirstAdmission', 'ed_first_inp_delta', 'num_ED_visits',
       'num_hosp', 'sum_hosp_stay']

dizzy_all = pd.DataFrame(dizzy_demo_coded.PtSSN,columns=['PtSSN'])
dizzy_all = pd.merge(dizzy_all,dizzy_cohort_df[dizzy_cohort_cols],on='PtSSN')
dizzy_all = pd.merge(dizzy_all,dizzy_demo_coded,on='PtSSN')
dizzy_all = pd.merge(dizzy_all,dizzy_EDvitals_df,on=['PtSSN','EDStartDateTime'],how='left')
dizzy_all = pd.merge(dizzy_all,dizzy_hosp_vitals_df,on=['PtSSN','AdmitDateTime'],how='left')
dizzy_all = pd.merge(dizzy_all,dizzy_topN_consult_counts,on=['PtSSN'])
dizzy_all = pd.merge(dizzy_all,dizzy_images,on=['PtSSN','EDStartDateTime'])
dizzy_all = pd.merge(dizzy_all,dizzy_clean_labs,on=['PtSSN'],how='left')
dizzy_all = pd.merge(dizzy_all,bool_dizzy_rf_df,on='PtSSN')
print(dizzy_all.shape)

In [None]:
# handle missing values
# List every column of dizzy_all that still has NA entries, with its NA count.
na_counts = dizzy_all.isna().sum()
na_counts = na_counts[na_counts > 0]

cols_with_missing = list(na_counts.index)
for col, missing in na_counts.items():
    print(col, missing)

In [None]:
# NOTE(review): within this notebook `combo_label_df` is first assigned in the
# following cell, so on a fresh kernel (Restart & Run All) this cell raises
# NameError unless the name is supplied from elsewhere.
combo_label_df.columns

In [None]:
# find a way to plot the labeled data in the combined dataframe


# Attach labels to the combined dizzy frame; patients without a label are
# kept (left join) and marked 'unknown'.
combo_label_df = pd.merge(dizzy_all,dizzy_labels_df,on='PtSSN',how='left')
combo_label_df.loc[combo_label_df.label.isna(),'label'] = 'unknown'
# Feature columns: everything after the first seven columns and before the
# trailing label, minus FirstAdmission. Built as an ordered list (not through
# a set) so the feature order is identical on every run.
useful_cols = [col for col in combo_label_df.columns[7:-1] if col != 'FirstAdmission']
# dropna() discards any row with an NA in any column, not just the features.
reduced_combo = combo_label_df[combo_label_df.label.isin(['MOD','NoMOD','unknown'])].dropna()
X = reduced_combo[useful_cols]
y = reduced_combo['label']
print(X.shape,y.shape)
label_list = list(reduced_combo.label.value_counts().index)
u = umap_plot_label(X,y,1,3,label_list,'upper left');


In [None]:
# analyze the resulting clusters
clust = cluster_umap(u,5,reduced_combo[useful_cols+['label']],'upper left')
analyze_clusters(clust,reduced_combo[useful_cols+['label']],useful_cols) 

In [None]:
imp_cols = ['ed_first_inp_delta','age_at_index_visit','new_race_WHITE','new_race_BLACK OR AFRICAN AMERICAN','glucose_count',
           'glucose_min','glucose_max','glucose_abnormal_count','CO2_count','hgb_abnormal_count','Systolic_max','Systolic_first',
           'Diastolic_max','Diastolic_first','PULSE_min','PULSE_first','HOSP_Systolic_max','HOSP_Diastolic_max','HOSP_PULSE_min',
           'HOSP_PULSE_first','ct_count','ct_abnormal_count','Diabetes','Hypertension','Coronary artery disease (CAD)']

X = reduced_combo[imp_cols]
y = reduced_combo['label']
label_list = list(reduced_combo.label.value_counts().index)
u = umap_plot_label(X,y,1,7,label_list,'upper right');


In [None]:
# analyze the resulting clusters
clust = cluster_umap(u,3,reduced_combo[imp_cols+['label']],'upper right')
analyze_clusters(clust,reduced_combo[imp_cols+['label']],imp_cols) 

# Need to get red flags to recreate Paarth classifier on our data
- headache
- diplopia


In [None]:
# NOTE(review): within this notebook `notes` is first assigned two cells
# further down (from dataset['withRole']), so on a fresh kernel this cell
# raises NameError unless that cell has already been run.
notes.columns

In [None]:
# Print every note for patient `ptssn` entered between the start of the ED
# visit and the hospital discharge (both bounds inclusive).
# Relies on `notes` and `ptssn`, which this notebook assigns in later cells.
tmp = notes[notes.PatientSSN==ptssn]

# First cohort row for this patient supplies the time window.
pt_visits = dizzy_cohort_df[dizzy_cohort_df.PtSSN==ptssn]
start_time = pt_visits.EDStartDateTime.values[0]
end_time = pt_visits.DischargeDateTime.values[0]

after_start = tmp.EntryDateTime >= start_time
before_end = tmp.EntryDateTime <= end_time
tmp1 = tmp[after_start & before_end]

for i, (title, text) in enumerate(zip(tmp1.TIUStandardTitle, tmp1.ReportText)):
    print(i,'---------------------------------------------')
    print(title)
    print(text)


In [None]:
# Normalise the note table in place: integer patient id, datetime entry time.
notes = dataset['withRole']
notes['PatientSSN'] = notes['PatientSSN'].astype('int64')
notes['EntryDateTime'] = pd.to_datetime(notes['EntryDateTime'])

# Emergency-department notes only, re-keyed to PtSSN so they can be joined
# with the other frames in this notebook.
is_ed_note = notes['TIUStandardTitle'] == 'EMERGENCY DEPT NOTE'
ed_notes = (
    notes.loc[is_ed_note]
    .rename(columns={'PatientSSN': 'PtSSN'})
    .astype({'PtSSN': 'int64'})
)

# ED notes for patients that have a dizzy label (inner join on PtSSN).
ed_notes_mod = pd.merge(ed_notes, dizzy_labels_df, on='PtSSN')


In [None]:
# Take the patient of the first labeled ED note and look up that patient's
# case summary and cohort rows.
ptssn = ed_notes_mod.iloc[0].PtSSN
#print(ed_notes_mod.ReportText.iloc[0])
display(dizzy_df[dizzy_df.PtSSN==ptssn].CaseSummaryER.values)
# NOTE(review): `combo_df` is not assigned anywhere in the visible notebook
# (the merged frame built earlier is named `combo_label_df`) -- confirm the
# intended name. As a bare expression that is not the last line of the cell,
# its value is never displayed either.
combo_df[combo_df.PtSSN==ptssn]
# Last expression of the cell: this is the value the cell displays.
dizzy_cohort_df[dizzy_cohort_df.PtSSN==ptssn]

In [None]:
# UMAP the lab dataframe for abdpain

# Complete cases only: rows with any NA are dropped.
XX = abdpain_clean_labs.dropna()
# Remove the patient identifier so only lab features are embedded. drop()
# keeps the remaining columns in their original order; indexing with a set
# ordered them arbitrarily and is rejected outright by pandas 2.x.
X = XX.drop(columns=['PtSSN'], errors='ignore')
u = umap_plot_nolabel(X,1,7)

# analyze the resulting clusters
clust = cluster_umap_nolabel(u,3,X,'upper left')
analyze_clusters_nolabel(clust,X,X.columns)

# create a tapestry plot to visualize the clusters according to median values


In [None]:
# start merging all dataframes and UMAP them for dizzy and abdpain


# Make predictive model for dizzy with all the ttest relevant fields

In [None]:
dizzy_cohort_tmp = dizzy_cohort_df_labeled[['PtSSN'] + dizzy_cohort_sig]
dizzy_demo_tmp = dizzy_demo_coded_labeled[['PtSSN'] + dizzy_demo_sig]
dizzy_vitals_tmp = dizzy_EDvitals_labeled[['PtSSN'] + dizzy_vitals_sig]
dizzy_hosp_vitals_tmp = dizzy_hosp_vitals_labeled[['PtSSN'] + dizzy_hosp_vitals_sig]
#dizzy_consults_tmp = dizzy_topN_consult_counts_labeled[['PtSSN'] + dizzy_consults_sig]

# images
dizzy_images_xr_count_labeled = pd.merge(dizzy_images_xr_count,dizzy_labels_df[dizzy_labels_df.label.isin(['MOD','NoMOD'])],on='PtSSN')
dizzy_images_ct_count_labeled = pd.merge(dizzy_images_ct_count,dizzy_labels_df[dizzy_labels_df.label.isin(['MOD','NoMOD'])],on='PtSSN')
dizzy_images_ct_abnormal_count_labeled = pd.merge(dizzy_images_ct_abnormal_count,dizzy_labels_df[dizzy_labels_df.label.isin(['MOD','NoMOD'])],on='PtSSN')

# convert image count fields into int
dizzy_images_xr_count_labeled.xr_count = dizzy_images_xr_count_labeled.xr_count.astype(int)
dizzy_images_ct_count_labeled.ct_count = dizzy_images_ct_count_labeled.ct_count.astype(int)
dizzy_images_ct_abnormal_count_labeled.ct_abnormal_count = dizzy_images_ct_abnormal_count_labeled.ct_abnormal_count.astype(int)

dizzy_images_xr_count_tmp = dizzy_images_xr_count_labeled[['PtSSN','xr_count']].drop_duplicates()
dizzy_images_ct_count_tmp = dizzy_images_ct_count_labeled[['PtSSN','ct_count']].drop_duplicates()
dizzy_images_ct_abnormal_count_tmp = dizzy_images_ct_abnormal_count_labeled[['PtSSN','ct_abnormal_count']].drop_duplicates()


# labs is rel_dizzy_labs
# risk factors is dizzy_rf_df_all_labeled

# merge them all

# Labeled patients (MOD / NoMOD only) form the base of the feature frame.
labels_tmp = dizzy_labels_df[dizzy_labels_df.label.isin(['MOD','NoMOD'])][['PtSSN','label']]

# Feature frames joined onto the labels, in order, by PtSSN (inner joins).
# Consult counts are deliberately left out: `dizzy_consults_tmp` is never
# assigned (its definition in this cell is commented out), so merging it
# raised NameError, and the whole-cohort frame `all_dizzy_all` omits consults
# as well -- the two frames must share the same feature columns.
feature_frames = [
    dizzy_cohort_tmp,
    dizzy_demo_tmp,
    dizzy_vitals_tmp,
    dizzy_hosp_vitals_tmp,
    rel_dizzy_labs,
    dizzy_images_xr_count_tmp,
    dizzy_images_ct_count_tmp,
    dizzy_images_ct_abnormal_count_tmp,
    dizzy_rf_df_all_labeled[['PtSSN','Hx aneurysm']],
]
dizzy_all = labels_tmp
for frame in feature_frames:
    dizzy_all = pd.merge(dizzy_all,frame,on='PtSSN')
print(dizzy_all.shape)
# Columns present here but not in the whole-cohort frame (expected: label).
# NOTE(review): `all_dizzy_all` is assigned in the next cell, so this line
# needs that cell to have been run first.
set(dizzy_all.columns).difference(set(all_dizzy_all.columns))

# make feature dataframe for entire data set

In [None]:
# make dataset for all of dizzy (not just the labeled data)
all_dizzy_cohort_tmp = dizzy_cohort_df[['PtSSN'] + dizzy_cohort_sig]
#print(all_dizzy_cohort_tmp.shape,all_dizzy_cohort_tmp.columns)
all_dizzy_demo_tmp = dizzy_demo_coded[['PtSSN'] + dizzy_demo_sig]
#print(all_dizzy_demo_tmp.shape,all_dizzy_demo_tmp.columns)
all_dizzy_vitals_tmp = dizzy_EDvitals_df[['PtSSN'] + dizzy_vitals_sig]
#print(all_dizzy_vitals_tmp.shape,all_dizzy_vitals_tmp.columns)
all_dizzy_hosp_vitals_tmp = dizzy_hosp_vitals_df[['PtSSN'] + dizzy_hosp_vitals_sig]
#all_dizzy_consults_tmp = dizzy_topN_consult_counts[['PtSSN'] + dizzy_consults_sig]
#print(all_dizzy_hosp_vitals_tmp.shape,all_dizzy_hosp_vitals_tmp.columns)

# convert image count fields into int
dizzy_images_xr_count.xr_count = dizzy_images_xr_count.xr_count.astype(int)
dizzy_images_ct_count.ct_count = dizzy_images_ct_count.ct_count.astype(int)
dizzy_images_ct_abnormal_count.ct_abnormal_count = dizzy_images_ct_abnormal_count.ct_abnormal_count.astype(int)

all_dizzy_images_xr_count_tmp = dizzy_images_xr_count[['PtSSN','xr_count']].drop_duplicates()
all_dizzy_images_ct_count_tmp = dizzy_images_ct_count[['PtSSN','ct_count']].drop_duplicates()
all_dizzy_images_ct_abnormal_count_tmp = dizzy_images_ct_abnormal_count[['PtSSN','ct_abnormal_count']].drop_duplicates()


# labs is rel_dizzy_labs
# risk factors is dizzy_rf_df_all_labeled

# merge them all


all_dizzy_all = pd.merge(all_dizzy_cohort_tmp,all_dizzy_demo_tmp,on='PtSSN')

all_dizzy_all = pd.merge(all_dizzy_all,all_dizzy_vitals_tmp,on='PtSSN')

all_dizzy_all = pd.merge(all_dizzy_all,all_dizzy_hosp_vitals_tmp,on='PtSSN')

#all_dizzy_all = pd.merge(all_dizzy_all,dizzy_consults_tmp,on='PtSSN')
all_dizzy_all = pd.merge(all_dizzy_all,rel_dizzy_labs,on='PtSSN')

all_dizzy_all = pd.merge(all_dizzy_all,all_dizzy_images_xr_count_tmp,on='PtSSN')

all_dizzy_all = pd.merge(all_dizzy_all,all_dizzy_images_ct_count_tmp,on='PtSSN')

all_dizzy_all = pd.merge(all_dizzy_all,all_dizzy_images_ct_abnormal_count_tmp,on='PtSSN')

all_dizzy_all = pd.merge(all_dizzy_all,dizzy_rf_df_all[['PtSSN','Hx aneurysm']],on=['PtSSN'])
print(all_dizzy_all.shape)
print(all_dizzy_all.columns)

In [None]:
# which ones have NAs in them?
# make all  count columns default to 0

limit_dizzy_all = dizzy_all[dizzy_all.label.isin(['MOD','NoMOD'])].copy()

abcols = ['WBC_abnormal_count','glucose_count','glucose_abnormal_count','albumin_abnormal_count','CO2_count',
        'alkphos_abnormal_count','hgb_abnormal_count']
for abcol in abcols:
    limit_dizzy_all[abcol] = limit_dizzy_all[abcol].fillna(0)



for col in limit_dizzy_all.columns[2:]:
    v = limit_dizzy_all[col].isna().sum()
    if v > 0:
        print(col,v )
    
print(limit_dizzy_all.shape)

In [None]:
# for glucose_min and glucose_max use class_specific medians
# Prints, per glucose column, the median for label MOD followed by NoMOD.

glucose_cols = ['glucose_min','glucose_max']
for gcol in glucose_cols:
    # describe() is indexed by label; its '50%' column is the median.
    class_medians = limit_dizzy_all.groupby('label')[gcol].describe()['50%']
    mod_val = class_medians.loc['MOD']
    nomod_val = class_medians.loc['NoMOD']
    print(mod_val,nomod_val)
    

In [None]:
# drop highly correlated columns to support logistic regression
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(limit_dizzy_all.iloc[:,2:].corr(),ax=ax,annot=False)

In [None]:
tmp = limit_dizzy_all.dropna()
print(tmp.shape)
X = tmp.iloc[:,2:]
yy = tmp.label
y = np.array([1 if (x=='MOD') else 0 for x in yy])
print(X.shape,y.shape)
clf = tune_model(X,y)
print('Best C = ',clf.best_params_['C'])

In [None]:
#clf,select_feats = build_L1_model(X,y,clf.best_params_['C'])
clf,select_feats = build_L1_model(X,y,0.0045)
visualize_model(clf,select_feats)

In [None]:
# umap it all
u = umap_plot_label(X,yy,1,7,['NoMOD','MOD'],'upper right')

In [None]:
c = cluster_umap(u,4,tmp,'upper right')

In [None]:
#stat_cols = featimp.iloc[:5].index
analyze_clusters(c,tmp,select_feats)

In [None]:
# find those rows in limit_dizzy_all that have nulls
null_val_rows = limit_dizzy_all[limit_dizzy_all.isnull().any(axis=1)].copy()
null_val_rows.shape

In [None]:
for gcol in ['glucose_min','glucose_max']:
    vals = limit_dizzy_all.groupby('label')[gcol].describe()[['50%']].reset_index()
    mod_val = vals[vals.label=='MOD']['50%'].values[0]
    nomod_val = vals[vals.label=='NoMOD']['50%'].values[0]
    print(gcol,mod_val,nomod_val)                                     

In [None]:
# Impute missing glucose readings with the median of the row's own label
# class. The medians are computed from limit_dizzy_all instead of being pasted
# in by hand, and only NA cells are filled, so observed values in rows that
# are incomplete for another reason are left untouched.
# NOTE(review): imputing from the label puts target information into the
# features of rows that are then scored by the classifier -- acceptable for a
# sanity check, not for an unbiased evaluation.
for gcol in ['glucose_min','glucose_max']:
    class_medians = limit_dizzy_all.groupby('label')[gcol].median()
    for cls in ['MOD','NoMOD']:
        in_class = null_val_rows.label == cls
        null_val_rows.loc[in_class, gcol] = null_val_rows.loc[in_class, gcol].fillna(class_medians[cls])

In [None]:
# predict on null_val_rows
print(clf.predict(null_val_rows.iloc[:,2:]))
print(null_val_rows['label'])

In [None]:
print(all_dizzy_all.shape)
abcols = ['WBC_abnormal_count','glucose_count','glucose_abnormal_count','albumin_abnormal_count','CO2_count',
        'alkphos_abnormal_count','hgb_abnormal_count']
for abcol in abcols:
    all_dizzy_all[abcol] = all_dizzy_all[abcol].fillna(0)



for col in all_dizzy_all.columns[1:]:
    v = all_dizzy_all[col].isna().sum()
    if v > 0:
        print(col,v )
    
print(all_dizzy_all.shape)

# find those rows in all_dizzy_all that have null
all_null_val_rows = all_dizzy_all[all_dizzy_all.isnull().any(axis=1)].copy()
print(all_null_val_rows.shape)

# Fill missing glucose readings with the median across the entire set.
# The median is computed here rather than hard-coded, and only NA cells are
# filled, so rows that are incomplete for another reason keep their observed
# glucose values.
for gcol in ['glucose_min','glucose_max']:
    g_val = all_dizzy_all[gcol].median()
    print(gcol,g_val)
    all_null_val_rows[gcol] = all_null_val_rows[gcol].fillna(g_val)


tmp1 = pd.concat([all_dizzy_all.dropna(),all_null_val_rows])
print(tmp1.iloc[:,1:].shape)
for col in tmp1.columns[1:]:
    v = tmp1[col].isna().sum()
    if v > 0:
        print(col,v )

print('Logistic regression prediction:')
ypred1 = clf.predict(tmp1.iloc[:,1:])
display(pd.merge(tmp1[ypred1==1].PtSSN,dizzy_labels_df,on='PtSSN')['label'].value_counts())
display(pd.merge(tmp1[ypred1==0].PtSSN,dizzy_labels_df,on='PtSSN')['label'].value_counts())

print('Random forest prediction:')
ypred1 = rf.predict(tmp1.iloc[:,1:])
display(pd.merge(tmp1[ypred1==1].PtSSN,dizzy_labels_df,on='PtSSN')['label'].value_counts())
display(pd.merge(tmp1[ypred1==0].PtSSN,dizzy_labels_df,on='PtSSN')['label'].value_counts())

In [None]:
ypred1 =rf.predict(tmp1.iloc[:,1:])
display(pd.merge(tmp1[ypred1==1].PtSSN,dizzy_labels_df,on='PtSSN')['label'].value_counts())
display(pd.merge(tmp1[ypred1==0].PtSSN,dizzy_labels_df,on='PtSSN')['label'].value_counts())

In [None]:
from sklearn.tree import DecisionTreeClassifier


dt = DecisionTreeClassifier(max_depth=7,criterion='entropy')
scores = cross_val_score(dt,tmp.iloc[:,2:],y)
print(np.mean(scores),np.std(scores))

In [None]:
# train and test on all 75 labeled records
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=10,max_depth=3)
y = np.array([1 if x=='MOD' else 0 for x in tmp.label])
rf.fit(tmp.iloc[:,2:],y)
ypred = rf.predict(tmp.iloc[:,2:])
metrics.confusion_matrix(y,ypred)

In [None]:
rf.predict(null_val_rows.iloc[:,2:])

In [None]:
# Feature importances of the current random forest, largest first.
featimp = pd.Series(rf.feature_importances_,index=tmp.columns[2:]).sort_values(ascending=False)
import seaborn as sns

plt.figure(figsize=(8,6))
# x and y must be passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
myplot = sns.barplot(x=featimp.index,y=featimp.values)
myplot.set(xlabel='feature',ylabel='importance')
# Rotate the feature names so they do not overlap.
myplot.tick_params(axis='x',labelrotation=90);

In [None]:
# now do a 90-10 stratified split (test_size=0.1) and run train/test
# NOTE(review): this comment used to say 80-20 while the code held out 10% --
# set test_size=0.2 if an 80-20 split is what is wanted.
# random_state is fixed on both the split and the forest so that the reported
# metrics are reproducible from run to run.
Xtrain,Xtest,ytrain,ytest = train_test_split(tmp.iloc[:,2:],y,stratify=y,test_size=0.1,random_state=0)
print(Xtrain.shape,Xtest.shape,ytrain.shape,ytest.shape)
rf = RandomForestClassifier(n_estimators=7,max_depth=4,random_state=0)
rf.fit(Xtrain,ytrain)
ypred = rf.predict(Xtest)
# ROC AUC needs a ranking score: use the predicted probability of the positive
# class (label 1) rather than the thresholded 0/1 predictions.
yscore = rf.predict_proba(Xtest)[:,1]
print(metrics.confusion_matrix(ytest,ypred))
print('AUC = ',metrics.roc_auc_score(ytest,yscore))
print('F1 = ',metrics.f1_score(ytest,ypred))
print('Accuracy = ',metrics.accuracy_score(ytest,ypred))

In [None]:
ypred_all = rf.predict(pd.concat([tmp.iloc[:,2:],null_val_rows.iloc[:,2:]]))
y_all = [1 if x =='MOD' else 0 for x in tmp.label] + [1 if x=='MOD' else 0 for x in null_val_rows.label]
metrics.confusion_matrix(y_all,ypred_all)

In [None]:
# Feature importances of the current random forest, largest first.
featimp = pd.Series(rf.feature_importances_,index=tmp.columns[2:]).sort_values(ascending=False)
import seaborn as sns

plt.figure(figsize=(8,6))
# x and y must be passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
myplot = sns.barplot(x=featimp.index,y=featimp.values)
myplot.set(xlabel='feature',ylabel='importance')
# Rotate the feature names so they do not overlap.
myplot.tick_params(axis='x',labelrotation=90);

# Classify unlabeled records in dizzy

In [None]:
tmp1.shape

In [None]:
for col in tmp1.columns[1:]:
    v = tmp1[col].isna().sum()
    if v > 0:
        print(col,v )


In [None]:
# NOTE(review): within this notebook `tmp2` is first assigned in the following
# cell, so on a fresh kernel this cell raises NameError unless that cell has
# already been run.
tmp2.columns

In [None]:
# merge to create temp labels
# Left join keeps every row of tmp1; rows without a label get NaN here and
# are renamed 'unknown' below.
tmp2 = pd.merge(tmp1,dizzy_labels_df,on='PtSSN',how='left')
print(tmp2.shape)
#display(tmp2.label.value_counts())
# Number of labeled patients that have no row in the feature frame. This used
# to be a bare expression in the middle of the cell, so it was never shown.
print('Labeled patients missing from tmp2:', len(set(dizzy_labels_df.PtSSN).difference(tmp2.PtSSN)))
tmp2.label = tmp2.label.fillna('unknown')
tmp3 = tmp2[tmp2.label.isin(['MOD','NoMOD','unknown'])]


# Features are every column between the leading PtSSN and the trailing label.
u = umap_plot_label(tmp3.iloc[:,1:-1],tmp3.label,4,7,['NoMOD','unknown','MOD'],'upper right');


In [None]:
# Cluster the UMAP result `u` with the project helper cluster_umap and keep the
# result in `c`. The 3 is presumably the number of clusters and 'upper right'
# the legend position -- TODO confirm against B00_util.
c = cluster_umap(u,3,tmp3,'upper right')

In [None]:
# Run the project helper analyze_clusters on the clustering result, passing
# the first ten entries of featimp's index (featimp is sorted by descending
# importance, so these are the ten top-ranked features).
# NOTE(review): featimp was derived from the columns of `tmp`; confirm those
# feature names also exist in tmp3.
analyze_clusters(c,tmp3,list(featimp.index)[:10])

In [None]:
# TODO: plot the predicted labels for the unknown records (not implemented yet)


# Assemble the abdpain_all dataframe

In [None]:
# Build one de-duplicated (one row per PtSSN) frame per feature source for the
# labeled abdominal-pain patients. Each frame keeps PtSSN plus that source's
# feature columns; the trailing column of every *_labeled frame is sliced off
# with [...:-1] (presumably the label column added by the label merge --
# TODO confirm).
abdpain_cohort_tmp = abdpain_cohort_df_labeled[['PtSSN'] + cohort_fields].drop_duplicates(subset=['PtSSN'])
abdpain_demo_tmp = abdpain_demo_coded_labeled[['PtSSN'] + list(abdpain_demo_coded_labeled.columns[1:-1])].drop_duplicates(subset=['PtSSN'])
abdpain_vitals_tmp = abdpain_EDvitals_labeled[['PtSSN'] + list(abdpain_EDvitals_labeled.columns[2:-1])].drop_duplicates(subset=['PtSSN'])
abdpain_hosp_vitals_tmp = abdpain_hosp_vitals_labeled[['PtSSN'] + list(abdpain_hosp_vitals_labeled.columns[2:-1])].drop_duplicates(subset=['PtSSN'])
# The column list is taken from the *labeled* consult frame, like every other
# source above. Slicing the unlabeled frame's columns with [1:-1] would cut off
# its last consult department instead of the trailing label column.
abdpain_consults_tmp = abdpain_topN_consult_counts_labeled[['PtSSN'] + list(abdpain_topN_consult_counts_labeled.columns[1:-1])].drop_duplicates(subset=['PtSSN'])

# Imaging counts: inner-join each count table with the labels so only labeled
# patients remain, one row per patient.
abdpain_images_xr_count_labeled = pd.merge(abdpain_images_xr_count,abdpain_labels_df,on='PtSSN').drop_duplicates(subset=['PtSSN'])
abdpain_images_ct_count_labeled = pd.merge(abdpain_images_ct_count,abdpain_labels_df,on='PtSSN').drop_duplicates(subset=['PtSSN'])
abdpain_images_ct_abnormal_count_labeled = pd.merge(abdpain_images_ct_abnormal_count,abdpain_labels_df,on='PtSSN').drop_duplicates(subset=['PtSSN'])

# Store the image counts as integers.
abdpain_images_xr_count_labeled['xr_count'] = abdpain_images_xr_count_labeled['xr_count'].astype(int)
abdpain_images_ct_count_labeled['ct_count'] = abdpain_images_ct_count_labeled['ct_count'].astype(int)
abdpain_images_ct_abnormal_count_labeled['ct_abnormal_count'] = abdpain_images_ct_abnormal_count_labeled['ct_abnormal_count'].astype(int)

# Keep only the id and the count column from each imaging frame.
abdpain_images_xr_count_tmp = abdpain_images_xr_count_labeled[['PtSSN','xr_count']].drop_duplicates()
abdpain_images_ct_count_tmp = abdpain_images_ct_count_labeled[['PtSSN','ct_count']].drop_duplicates()
abdpain_images_ct_abnormal_count_tmp = abdpain_images_ct_abnormal_count_labeled[['PtSSN','ct_abnormal_count']].drop_duplicates()


In [None]:
# Sources that are not built in the previous cell:
#   rel_abdpain_labs           -> lab features
#   abdpain_rf_df_all_labeled  -> risk factors (joined on PtSSN and label)
#
# Join every source onto the label table one at a time and print the shape
# after each join, so a join that drops or duplicates rows is visible at once.
merge_steps = [
    (abdpain_cohort_tmp, {'on': 'PtSSN'}),
    (abdpain_demo_tmp, {'on': 'PtSSN'}),
    (abdpain_consults_tmp, {'on': 'PtSSN'}),
    (rel_abdpain_labs, {'on': 'PtSSN'}),
    (abdpain_images_xr_count_tmp, {'on': 'PtSSN'}),
    (abdpain_images_ct_count_tmp, {'on': 'PtSSN'}),
    (abdpain_images_ct_abnormal_count_tmp, {'on': 'PtSSN'}),
    (abdpain_rf_df_all_labeled, {'on': ['PtSSN', 'label']}),
    # Left joins: patients without vitals keep their row, with NaN vitals.
    (abdpain_vitals_tmp, {'on': 'PtSSN', 'how': 'left'}),
    (abdpain_hosp_vitals_tmp, {'on': 'PtSSN', 'how': 'left'}),
]

abdpain_all = abdpain_labels_df
for right_frame, merge_kwargs in merge_steps:
    abdpain_all = pd.merge(abdpain_all, right_frame, **merge_kwargs)
    print(abdpain_all.shape)

In [None]:
# Which labeled abdominal-pain patients have no ED-vitals rows at all?
# tmp1 holds the number of ED-vitals rows per PtSSN; any cohort PtSSN that is
# absent from its index has no vitals. The matching cohort rows are displayed.
tmp1 = abdpain_EDvitals_labeled.groupby('PtSSN')['PtSSN'].count()
missing = set(abdpain_cohort_df_labeled.PtSSN) - set(tmp1.index)
print(missing)
abdpain_cohort_df_labeled.loc[abdpain_cohort_df_labeled.PtSSN.isin(missing)]

In [None]:
# Which labeled abdominal-pain patients have no hospital-vitals rows at all?
# tmp2 holds the number of hospital-vitals rows per PtSSN; any cohort PtSSN
# that is absent from its index has none. The matching cohort rows are displayed.
tmp2 = abdpain_hosp_vitals_labeled.groupby('PtSSN')['PtSSN'].count()
missing1 = set(abdpain_cohort_df_labeled.PtSSN) - set(tmp2.index)
print(missing1)
abdpain_cohort_df_labeled.loc[abdpain_cohort_df_labeled.PtSSN.isin(missing1)]

In [None]:
# Number of rows per label in the merged table.
abdpain_all.label.value_counts()

In [None]:
# Number of rows per label in the original label table, for comparison with
# the counts in the merged table.
abdpain_labels_df.label.value_counts()

In [None]:
# List every column of abdpain_all (from position 2 onward) that contains
# missing values, together with how many values are missing.
na_counts = abdpain_all.iloc[:, 2:].isna().sum()
for col, v in na_counts[na_counts > 0].items():
    print(col, v)
    

In [None]:
# Show the column names of the merged table.
abdpain_all.columns

In [None]:
# A missing abnormal-lab count is treated as zero abnormal results.
count_fields = ['bun_abnormal_count','lact_abnormal_count','amylase_abnormal_count']
abdpain_all[count_fields] = abdpain_all[count_fields].fillna(0)

print(abdpain_all.shape)

In [None]:
# Rows that have ED vitals (Systolic_count present) and the PtSSNs that have
# no such row; the latter are imputed in the next cell.
abdpain_tmp_clean = abdpain_all[abdpain_all['Systolic_count'].notna()]
missing = list(set(abdpain_all.PtSSN) - set(abdpain_tmp_clean.PtSSN))

In [None]:
# Impute the ED-vitals columns for the patients listed in `missing`, using the
# median of each column among rows that share the patient's label (MOD / NoMOD).
# NOTE(review): the fill value depends on the target label, so the label leaks
# into these features -- confirm this is intended before evaluating a model on
# the imputed rows.

ed_vitals_cols = [
    'Systolic_count', 'Systolic_max', 'Systolic_min', 'Systolic_first',
    'Diastolic_count', 'Diastolic_max', 'Diastolic_min', 'Diastolic_first',
    'PULSE_count', 'PULSE_max', 'PULSE_min', 'PULSE_first',
    'RESPIRATION_count', 'RESPIRATION_max', 'RESPIRATION_min', 'RESPIRATION_first',
    'PAIN_count', 'PAIN_max', 'PAIN_min', 'PAIN_first',
    'TEMPERATURE_count', 'TEMPERATURE_max', 'TEMPERATURE_min', 'TEMPERATURE_first',
]

# impval_dict[column][label] -> median of that column among rows with ED vitals.
impval_dict = {}
for gcol in ed_vitals_cols:
    label_medians = abdpain_tmp_clean.groupby('label')[gcol].median()
    mod_val, nomod_val = label_medians['MOD'], label_medians['NoMOD']
    print(gcol, mod_val, nomod_val)
    impval_dict[gcol] = {'MOD': mod_val, 'NoMOD': nomod_val}

for ssn in missing:
    ssn_rows = abdpain_all.PtSSN == ssn
    # The label of the first matching row decides which medians are used;
    # rows with any other label are left untouched.
    ssn_label = abdpain_all.loc[ssn_rows, 'label'].values[0]
    if ssn_label in ('MOD', 'NoMOD'):
        for gcol in ed_vitals_cols:
            abdpain_all.loc[ssn_rows, gcol] = impval_dict[gcol][ssn_label]
    

In [None]:
# Same imputation as for the ED vitals, applied to the hospital-stay vitals
# (HOSP_-prefixed columns): patients with no HOSP_Systolic_count get the
# median of each column among rows that share their label (MOD / NoMOD).
# NOTE(review): the fill value depends on the target label -- confirm this
# leakage is intended.
abdpain_tmp_clean = abdpain_all[abdpain_all['HOSP_Systolic_count'].notna()]
missing = list(set(abdpain_all.PtSSN) - set(abdpain_tmp_clean.PtSSN))

hosp_vitals_cols = ['HOSP_' + name for name in ed_vitals_cols]

# impval_dict[column][label] -> median of that column among rows with hospital vitals.
impval_dict = {}
for gcol in hosp_vitals_cols:
    label_medians = abdpain_tmp_clean.groupby('label')[gcol].median()
    mod_val, nomod_val = label_medians['MOD'], label_medians['NoMOD']
    print(gcol, mod_val, nomod_val)
    impval_dict[gcol] = {'MOD': mod_val, 'NoMOD': nomod_val}

for ssn in missing:
    ssn_rows = abdpain_all.PtSSN == ssn
    # The label of the first matching row decides which medians are used;
    # rows with any other label are left untouched.
    ssn_label = abdpain_all.loc[ssn_rows, 'label'].values[0]
    if ssn_label in ('MOD', 'NoMOD'):
        for gcol in hosp_vitals_cols:
            abdpain_all.loc[ssn_rows, gcol] = impval_dict[gcol][ssn_label]
    

In [None]:
# After the vitals imputation: list the columns of abdpain_all (from position
# 2 onward) that still contain missing values, with the number missing.
na_counts = abdpain_all.iloc[:, 2:].isna().sum()
for col, v in na_counts[na_counts > 0].items():
    print(col, v)
    

In [None]:
# Impute missing lab values column by column: a patient with no value for a
# lab column receives the median of that column among rows that share the
# patient's label (MOD / NoMOD).
# NOTE(review): the fill value depends on the target label -- confirm this
# leakage is intended.
labval_cols = ['WBC_max','potassium_max','potassium_min','chloride_max','amylase_max']

for col in labval_cols:
    # Rows that have this lab value, and the PtSSNs that have no such row.
    abdpain_tmp_clean = abdpain_all[abdpain_all[col].notna()]
    missing = list(set(abdpain_all.PtSSN) - set(abdpain_tmp_clean.PtSSN))

    label_medians = abdpain_tmp_clean.groupby('label')[col].median()
    mod_val, nomod_val = label_medians['MOD'], label_medians['NoMOD']
    print(col, mod_val, nomod_val)
    fill_by_label = {'MOD': mod_val, 'NoMOD': nomod_val}

    for ssn in missing:
        ssn_rows = abdpain_all.PtSSN == ssn
        # The label of the first matching row picks the median; rows with any
        # other label are left untouched.
        ssn_label = abdpain_all.loc[ssn_rows, 'label'].values[0]
        if ssn_label in fill_by_label:
            abdpain_all.loc[ssn_rows, col] = fill_by_label[ssn_label]
    


In [None]:
# After the lab imputation: list the columns of abdpain_all (from position 2
# onward) that still contain missing values, with the number missing.
na_counts = abdpain_all.iloc[:, 2:].isna().sum()
for col, v in na_counts[na_counts > 0].items():
    print(col, v)
    

In [None]:
# Predictor columns: everything except the patient id and the target.
pred_cols = set(abdpain_all.columns).difference(set(['PtSSN','label']))
# Lay the predictors out in abdpain_all's own column order instead of the
# set's iteration order. String hashing is randomised per interpreter session,
# so iterating the set would give a different column order (and therefore a
# different positional feature order for the .iloc[:, 2:] cells below) after
# every kernel restart.
ordered_pred_cols = [c for c in abdpain_all.columns if c in pred_cols]
abdpain_all_clean = abdpain_all[['PtSSN','label'] + ordered_pred_cols].copy()
abdpain_all_clean.columns

In [None]:
# Cast the abnormal-lab counts and the risk-factor flags to int.
# Only the columns in cfields are cast. The other count-like columns (the
# vitals *_count and HOSP_*_count fields, the consult counts, xr_count,
# ct_count, ct_abnormal_count, num_ED_visits, num_hosp) keep their dtype --
# TODO decide whether they should be cast as well.
cfields = ['bun_abnormal_count','lact_abnormal_count','amylase_abnormal_count',
           'cirrhosis','crohns_uc','diverticulitis','appendicitis','gallbladder']
for col in cfields:
    abdpain_all_clean[col] = abdpain_all_clean[col].astype(int)

abdpain_all_clean.dtypes

In [None]:
# Complete cases only: drop every row that still has a missing value.
tmp = abdpain_all_clean.dropna()
# pred_cols is a set; pandas >= 2.0 rejects a set as an indexer, so it is
# converted to a list before selecting the predictor columns.
X = tmp.loc[:,list(pred_cols)]
yy = tmp.label
# Binary target: 1 where the label is 'MOD', 0 for every other label.
y = np.array([1 if (x=='MOD') else 0 for x in yy])
print(X.shape,y.shape)
# Project helper; the 5 is presumably the number of cross-validation folds --
# TODO confirm against B00_util.
clf = tune_model(X,y,5)

In [None]:
# Build the L1 model with the project helper build_L1_model and visualise it.
# The third argument is hard-coded to 1; the commented-out line below shows the
# alternative of passing the tuned value clf.best_params_['C'] from the
# previous cell instead.
# NOTE(review): this rebinds clf, so the tuning result is no longer available
# afterwards.
#clf,select_feats = build_L1_model(X,y,clf.best_params_['C'])
clf,select_feats = build_L1_model(X,y,1)
visualize_model(clf,select_feats)

In [None]:
# Fit a small random forest on the complete-case rows and score it on those
# same rows, so the confusion matrix below measures training fit only.
from sklearn.ensemble import RandomForestClassifier

# With only 4 trees the forest changes noticeably between fits; random_state
# pins the sampling so the importances and the feature ranking derived from
# them below are reproducible.
rf = RandomForestClassifier(n_estimators=4,max_depth=5,random_state=0)
# Binary target: 1 where the label is 'MOD', 0 for every other label.
y = np.array([1 if x=='MOD' else 0 for x in tmp.label])
# Columns from position 2 onward are the predictors (PtSSN and label come first).
rf.fit(tmp.iloc[:,2:],y)
ypred = rf.predict(tmp.iloc[:,2:])
metrics.confusion_matrix(y,ypred)

In [None]:
# Importance of each feature in the fitted forest, highest first; only the
# features with non-zero importance are plotted.
featimp = pd.Series(rf.feature_importances_,index=tmp.columns[2:]).sort_values(ascending=False)
featimp_nz = featimp[featimp > 0]
import seaborn as sns

plt.figure(figsize=(8,6))
# x / y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally, and keywords also work on older releases.
myplot = sns.barplot(x=featimp_nz.index,y=featimp_nz.values)
# Rotate the feature names so they do not overlap; the trailing ';' hides the repr.
myplot.set_xticklabels(myplot.get_xticklabels(),rotation = 90);

In [None]:
# Names of the features whose importance is at least the mean importance
# across all features.
good_feats = featimp[featimp >= np.mean(featimp)].index

In [None]:
# find those rows in abdpain_clean_all that are not in tmp
null_val_rows = abdpain_all_clean[abdpain_all_clean.isnull().any(axis=1)].copy()
null_val_rows

In [None]:
# Overwrite potassium_max, chloride_max and WBC_max in null_val_rows with the
# label-specific median of each column, computed over all of abdpain_all_clean.
# Every MOD / NoMOD row of null_val_rows receives the median, not only rows
# where the value is missing. The amylase columns are not handled here.
# NOTE(review): the fill value depends on the target label -- confirm this
# leakage is intended before scoring the model on these rows.
impval_dict = {}
for gcol in ['potassium_max','chloride_max','WBC_max']:
    label_medians = abdpain_all_clean.groupby('label')[gcol].median()
    mod_val, nomod_val = label_medians['MOD'], label_medians['NoMOD']
    print(gcol, mod_val, nomod_val)
    impval_dict[gcol] = {'MOD': mod_val, 'NoMOD': nomod_val}
    for lab in ('MOD', 'NoMOD'):
        null_val_rows.loc[null_val_rows.label == lab, gcol] = impval_dict[gcol][lab]
    

In [None]:
# Confusion matrix of the forest on the imputed rows only (1 = 'MOD').
ytest = [int(lab == 'MOD') for lab in null_val_rows.label]
null_val_pred = rf.predict(null_val_rows.iloc[:, 2:])
metrics.confusion_matrix(ytest, null_val_pred)

In [None]:
# Predict for the complete-case rows followed by the imputed rows, and build
# the matching 0/1 targets in the same order (1 = 'MOD').
labeled_frames = (tmp, null_val_rows)
ypred_all = rf.predict(pd.concat([frame.iloc[:, 2:] for frame in labeled_frames]))
y_all = [1 if lab == 'MOD' else 0 for frame in labeled_frames for lab in frame.label]
metrics.confusion_matrix(y_all, ypred_all)

In [None]:
# Positive-class probabilities for the complete-case rows followed by the
# imputed rows (same order as y_all / ypred_all from the previous cell).
ypred_all_prob = rf.predict_proba(pd.concat([tmp.iloc[:,2:],null_val_rows.iloc[:,2:]]))[:,1]
print('AUC = ',metrics.roc_auc_score(y_all,ypred_all_prob))
print('F1 = ',metrics.f1_score(y_all,ypred_all))
# The area under the precision-recall curve is computed from scores with
# average_precision_score. precision_score on hard labels is a different
# quantity and is reported under its own name.
print('AUPRC = ', metrics.average_precision_score(y_all,ypred_all_prob))
print('Precision = ', metrics.precision_score(y_all,ypred_all))

In [None]:
# umap it all
# Run the project helper umap_plot_label on the complete-case predictors X
# with their labels yy, and keep the result in `u` for the clustering cell
# below. The meaning of the numeric arguments 1 and 5 is defined by the
# helper -- TODO confirm against B00_util.
u = umap_plot_label(X,yy,1,5,['NoMOD','MOD'],'upper left')

In [None]:
# Cluster the UMAP result `u` with the project helper cluster_umap and keep the
# result in `c`. The 3 is presumably the number of clusters and 'upper right'
# the legend position -- TODO confirm against B00_util.
c = cluster_umap(u,3,tmp,'upper right')

In [None]:
# Run the project helper analyze_clusters on the clustering result with the
# first 25 entries of featimp's index (featimp is sorted by descending
# importance, so these are the 25 top-ranked features).
stat_cols = featimp.iloc[:25].index
analyze_clusters(c,tmp,stat_cols)