In this notebook we prepare the Hoboken test dataset: the Hoboken tables are ingested; exclusion criteria are applied; the data is explored; vitals, comorbidities, drugs and labs are transformed and cleaned; missing values are imputed; numeric variables are winsorized, then centered and scaled with a robust scaler; Table 1 is produced; and image features are ingested, renamed and joined to the structured data.

# Environment

In [None]:
# Colab-only setup: install the third-party packages imported in the "Libraries" cell.
# NOTE(review): no versions are pinned, so a fresh runtime may resolve different releases than the original run.
!pip install google-colab -q
!pip install shap -q
!pip install seaborn
!pip install tableone -q
!pip install sqldf



In [None]:
# Register Google's apt repository for gcsfuse (the "gcsfuse-bionic" distribution) as a package source.
!echo "deb http://packages.cloud.google.com/apt gcsfuse-bionic main" > /etc/apt/sources.list.d/gcsfuse.list
# Download the repository signing key and add it to apt's trusted keys.
!curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
# Refresh the package index, then install gcsfuse (used below to mount GCS buckets as local directories).
!apt -qq update
!apt -qq install gcsfuse

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100  2537  100  2537    0     0  87482      0 --:--:-- --:--:-- --:--:-- 87482
OK
48 packages can be upgraded. Run 'apt list --upgradable' to see them.
gcsfuse is already the newest version (0.35.1).
0 upgraded, 0 newly installed, 0 to remove and 48 not upgraded.


In [None]:
# Colab authentication: google.colab is provided by the Colab runtime (it is not a standard-library module).
from google.colab import auth
# Interactive login; presumably supplies the credentials used by the gcsfuse mounts in the next cell — TODO confirm.
auth.authenticate_user()

In [None]:
!mkdir data
!gcsfuse hoboken_structured_data data

!mkdir features
!gcsfuse hoboken_cxr_features features

!mkdir final_validation_datasets
!gcsfuse final_validation_datasets final_validation_datasets

mkdir: cannot create directory ‘data’: File exists
2021/06/08 14:37:22.794635 Using mount point: /content/data
2021/06/08 14:37:22.801705 Opening GCS connection...
2021/06/08 14:37:22.934557 Mounting file system "hoboken_structured_data"...
2021/06/08 14:37:22.935867 File system has been successfully mounted.
mkdir: cannot create directory ‘features’: File exists
2021/06/08 14:37:23.145211 Using mount point: /content/features
2021/06/08 14:37:23.153506 Opening GCS connection...
2021/06/08 14:37:23.272864 Mounting file system "hoboken_cxr_features"...
2021/06/08 14:37:23.273552 File system has been successfully mounted.
mkdir: cannot create directory ‘final_validation_datasets’: File exists
2021/06/08 14:37:23.494324 Using mount point: /content/final_validation_datasets
2021/06/08 14:37:23.501494 Opening GCS connection...
2021/06/08 14:37:23.626064 Mounting file system "final_validation_datasets"...
2021/06/08 14:37:23.626900 File system has been successfully mounted.


# Libraries

In [None]:
import os, sys, math
from tensorflow.python.lib.io import file_io
import glob
import warnings


#Third party library imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tableone import TableOne
from scipy.stats import uniform, randint
from scipy.stats.mstats import winsorize
import seaborn as sns
import shap
import sqldf as sql
import math
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, precision_recall_curve, plot_precision_recall_curve, average_precision_score, brier_score_loss, roc_curve
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.utils.class_weight import compute_sample_weight

#Global configuration
# Disables pandas' SettingWithCopyWarning for the whole notebook.
pd.options.mode.chained_assignment = None
# Single seed for NumPy's global random generator.
seed = 2020
np.random.seed(seed)

# Use fully-qualified option names: the abbreviated 'max_rows' is matched as a pattern and is
# ambiguous on pandas versions that also define 'styler.render.max_rows', which makes
# set_option raise OptionError.
pd.set_option('display.max_rows', 99999)
pd.set_option('display.max_colwidth', 400)
# Prints the option's description together with its default and current value.
pd.describe_option('display.max_colwidth')

display.max_colwidth : int or None
    The maximum width in characters of a column in the repr of
    a pandas data structure. When the column overflows, a "..."
    placeholder is embedded in the output. A 'None' value means unlimited.
    [default: 50] [currently: 400]


# Tables ingestion

In [None]:
# the only files that contain the variables we are interested in are the following
hoboken_structured_data = pd.read_excel('data/hoboken_structured_data.xlsx', sheet_name='Sheet1')
# index the structured table by patient so it can be filtered by patient id below
hoboken_structured_data = hoboken_structured_data.set_index('patient ID')

# one image path per row, file has no header row
hoboken_image_names = pd.read_csv('features/test-hoboken_image_names.csv', header=None)
#extracting patient id with cxr from path: the first run of digits found in each path
# (raw string so that \d reaches the regex engine without an "invalid escape sequence" warning)
# NOTE(review): assumes the first digits in a path are the patient id — confirm against the file layout
hoboken_image_names = hoboken_image_names[0].str.extract(r'(\d+)').astype('int32')
hoboken_image_names=pd.to_numeric(hoboken_image_names[0])

invalid escape sequence \d
invalid escape sequence \d
invalid escape sequence \d


# Data Wrangling

# Exclusion criteria

In [None]:
print(':::Exclusion criteria:::')
print(f'Initial number of cases (hoboken_structured_data): {len(hoboken_structured_data)}')

# Keep patients aged 16 or older (rows with a missing age fail the comparison and are dropped too).
hoboken_structured_data = hoboken_structured_data[hoboken_structured_data['age'] >= 16]
print(f'After excluding those with <16 Age: {len(hoboken_structured_data)}')

# Keep only patients whose id appears among the chest X-ray image names.
# .copy() gives an independent frame, so the column assignment below is not a write to a slice.
has_cxr = hoboken_structured_data.index.isin(hoboken_image_names)
hoboken_structured_data = hoboken_structured_data[has_cxr].copy()
print(f'After excluding those missing cxr: {len(hoboken_structured_data)}')

# hospital_outcome is empty when =0 in this dataset, so removing empty ones would remove survivors.
# Rows with a missing outcome are therefore kept and the missing cells are replaced with 0.
hoboken_structured_data['hospital_outcome'] = hoboken_structured_data['hospital_outcome'].fillna(0)

# Outcome distribution
print(':::Outcome distribution:::')
## inspect outcome distribution
print('Breakdown of hospital_outcome:')
print(hoboken_structured_data['hospital_outcome'].value_counts())


:::Exclusion criteria:::
Initial number of cases (hoboken_structured_data): 242
After excluding those with <16 Age: 242
After excluding those missing cxr: 201
:::Outcome distribution:::
Breakdown of hospital_outcome:
0.0    114
1.0     87
Name: hospital_outcome, dtype: int64


# Data Cleansing

In [None]:
#Vital signs
features_list = [name for name in list(hoboken_structured_data) if 'vitals' in name]

#Apply some clinical heuristics for valid ranges
# keys are substrings of column names; values are [lowest valid, highest valid]
limits = {'_sbp_':[20,240],
          '_hr_':[20,300],
          '_spo2_':[1,100],
          '_temp_':[30,45]}
for substr, (lower, upper) in limits.items():
    for name in list(hoboken_structured_data):
        if substr in name:
            # Values outside the range become NaN. Series.mask + column assignment replaces the
            # chained indexing `df[name][mask] = nan`, which is not guaranteed to write back to
            # the frame (and is a no-op under pandas copy-on-write).
            out_of_range = (hoboken_structured_data[name] < lower) | (hoboken_structured_data[name] > upper)
            hoboken_structured_data[name] = hoboken_structured_data[name].mask(out_of_range)

# drop every column whose name contains 'last'
keep = [name for name in hoboken_structured_data.columns if 'last' not in name]
hoboken_structured_data = hoboken_structured_data.loc[:,keep]

# encoding sex the same way it is encoded in the reference dataset (HM COVIDDSL dataset)
print(hoboken_structured_data.sex.value_counts())

dictionary = {0: "MALE", 1: "FEMALE"}
hoboken_structured_data = hoboken_structured_data.replace({"sex": dictionary})

# printed so the recoded counts can be compared with the counts above
# (as a bare mid-cell expression the result was discarded)
print(hoboken_structured_data.sex.value_counts())

hoboken_structured_data.head()

0    118
1     83
Name: sex, dtype: int64


Unnamed: 0_level_0,age,sex,admission_datetime,ed_diagnosis,mechvent_flag,vitals_temp_ed_first,vitals_sbp_ed_first,vitals_dbp_ed_first,vitals_hr_ed_first,vitals_spo2_ed_first,hospital_outcome,pmhx_diabetes,pmhx_hld,pmhx_htn,pmhx_ihd,pmhx_ckd,pmhx_copd,pmhx_asthma,pmhx_activecancer,pmhx_chronicliver,pmhx_stroke,pmhx_chf,pmhx_dementia,lab_sodium,lab_leukocyte,lab_mean_platelet_volume,lab_neutrophil,lab_ddimer,lab_inr,lab_mch,lab_creatinine,lab_mcv,lab_aptt,lab_platelet,lab_lymphocyte_percentage,lab_glucose,lab_neutrophil_percentage,lab_ldh,lab_prothrombin_activity,lab_urea,lab_lymphocyte,lab_crp,lab_rdw,lab_hemoglobin,lab_rbc,lab_hct,lab_potassium,lab_alt,lab_ast
patient ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1
1,86,FEMALE,2020-04-05,,0,37.333333,123,66,120,99.0,0.0,1.0,,1.0,,,,,,,,,,139.0,7.8,10.7,4.6,274.0,1.1,31.1,1.8,93.8,35.6,96.0,12.820513,272.0,58.974359,921.0,12.9,57.0,1.0,156.9,13.6,13.7,4.42,41.4,4.3,15.0,39.0
2,97,MALE,2020-04-05,,0,38.444444,99,65,118,100.0,1.0,,1.0,1.0,,1.0,1.0,,,,,,1.0,150.0,5.7,8.0,4.2,,1.5,28.2,1.8,87.2,40.7,142.0,14.035088,110.0,73.684211,3059.0,17.7,37.0,0.8,44.0,15.4,16.2,5.72,49.9,4.1,104.0,83.0
3,41,MALE,2020-04-04,,0,37.388889,130,83,91,94.0,0.0,,,,,,,,,,,,,137.0,5.0,7.7,2.4,200.0,1.1,33.2,0.9,94.4,32.8,191.0,42.0,112.0,48.0,714.0,12.7,14.0,2.1,24.1,12.8,17.9,5.38,50.8,3.7,110.0,83.0
4,71,FEMALE,2020-04-04,,0,37.0,143,77,89,99.0,0.0,,,,,,,,,,,,,137.0,4.1,8.2,2.9,209.0,1.2,28.6,0.7,84.6,31.8,217.0,19.512195,103.0,70.731707,528.0,14.3,20.0,0.8,17.9,13.6,13.3,4.66,39.4,3.8,28.0,39.0
8,87,MALE,2020-04-03,,0,36.777778,168,59,105,100.0,0.0,1.0,1.0,1.0,1.0,,,,,,,1.0,,135.0,6.8,8.8,6.0,,1.3,31.1,1.4,94.7,30.8,244.0,5.882353,261.0,88.235294,1279.0,15.3,33.0,0.4,181.0,14.3,12.8,4.12,39.0,4.6,27.0,53.0


# Initial data exploration

In [None]:
# Inspect the column names that remain after the cleansing cell.
hoboken_structured_data.columns

Index(['age', 'sex', 'admission_datetime', 'ed_diagnosis', 'mechvent_flag',
       'vitals_temp_ed_first', 'vitals_sbp_ed_first', 'vitals_dbp_ed_first',
       'vitals_hr_ed_first', 'vitals_spo2_ed_first', 'hospital_outcome',
       'pmhx_diabetes', 'pmhx_hld', 'pmhx_htn', 'pmhx_ihd', 'pmhx_ckd',
       'pmhx_copd', 'pmhx_asthma', 'pmhx_activecancer', 'pmhx_chronicliver',
       'pmhx_stroke', 'pmhx_chf', 'pmhx_dementia', 'lab_sodium',
       'lab_leukocyte', 'lab_mean_platelet_volume', 'lab_neutrophil',
       'lab_ddimer', 'lab_inr', 'lab_mch', 'lab_creatinine', 'lab_mcv',
       'lab_aptt', 'lab_platelet', 'lab_lymphocyte_percentage', 'lab_glucose',
       'lab_neutrophil_percentage', 'lab_ldh', 'lab_prothrombin_activity',
       'lab_urea', 'lab_lymphocyte', 'lab_crp', 'lab_rdw', 'lab_hemoglobin',
       'lab_rbc', 'lab_hct', 'lab_potassium', 'lab_alt', 'lab_ast'],
      dtype='object')

In [None]:
#remove labs that have >50% missing values

# Work on the laboratory columns only (names starting with 'lab_').
lab_values = hoboken_structured_data.loc[:, hoboken_structured_data.columns.str.startswith('lab_')]
# Share of rows with a missing value, per lab column.
lab_missing_share = lab_values.isnull().sum() / len(lab_values)
# Labs missing in more than half of the rows are dropped from the table.
remove_lab = lab_missing_share.index[lab_missing_share > 0.50]
hoboken_structured_data = hoboken_structured_data.drop(columns=remove_lab)
# Preview the lab columns that are left.
hoboken_structured_data.loc[:, hoboken_structured_data.columns.str.startswith('lab_')].head()

Unnamed: 0_level_0,lab_sodium,lab_leukocyte,lab_mean_platelet_volume,lab_neutrophil,lab_inr,lab_mch,lab_creatinine,lab_mcv,lab_aptt,lab_platelet,lab_lymphocyte_percentage,lab_glucose,lab_neutrophil_percentage,lab_ldh,lab_prothrombin_activity,lab_urea,lab_lymphocyte,lab_crp,lab_rdw,lab_hemoglobin,lab_rbc,lab_hct,lab_potassium,lab_alt,lab_ast
patient ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1,139.0,7.8,10.7,4.6,1.1,31.1,1.8,93.8,35.6,96.0,12.820513,272.0,58.974359,921.0,12.9,57.0,1.0,156.9,13.6,13.7,4.42,41.4,4.3,15.0,39.0
2,150.0,5.7,8.0,4.2,1.5,28.2,1.8,87.2,40.7,142.0,14.035088,110.0,73.684211,3059.0,17.7,37.0,0.8,44.0,15.4,16.2,5.72,49.9,4.1,104.0,83.0
3,137.0,5.0,7.7,2.4,1.1,33.2,0.9,94.4,32.8,191.0,42.0,112.0,48.0,714.0,12.7,14.0,2.1,24.1,12.8,17.9,5.38,50.8,3.7,110.0,83.0
4,137.0,4.1,8.2,2.9,1.2,28.6,0.7,84.6,31.8,217.0,19.512195,103.0,70.731707,528.0,14.3,20.0,0.8,17.9,13.6,13.3,4.66,39.4,3.8,28.0,39.0
8,135.0,6.8,8.8,6.0,1.3,31.1,1.4,94.7,30.8,244.0,5.882353,261.0,88.235294,1279.0,15.3,33.0,0.4,181.0,14.3,12.8,4.12,39.0,4.6,27.0,53.0


In [None]:
# Summary statistics per column; the `count` row shows how many non-missing values each column has.
hoboken_structured_data.describe()

Unnamed: 0,age,ed_diagnosis,mechvent_flag,vitals_temp_ed_first,vitals_sbp_ed_first,vitals_dbp_ed_first,vitals_hr_ed_first,vitals_spo2_ed_first,hospital_outcome,pmhx_diabetes,pmhx_hld,pmhx_htn,pmhx_ihd,pmhx_ckd,pmhx_copd,pmhx_asthma,pmhx_activecancer,pmhx_chronicliver,pmhx_stroke,pmhx_chf,pmhx_dementia,lab_sodium,lab_leukocyte,lab_mean_platelet_volume,lab_neutrophil,lab_inr,lab_mch,lab_creatinine,lab_mcv,lab_aptt,lab_platelet,lab_lymphocyte_percentage,lab_glucose,lab_neutrophil_percentage,lab_ldh,lab_prothrombin_activity,lab_urea,lab_lymphocyte,lab_crp,lab_rdw,lab_hemoglobin,lab_rbc,lab_hct,lab_potassium,lab_alt,lab_ast
count,201.0,0.0,201.0,201.0,201.0,201.0,201.0,198.0,201.0,76.0,67.0,110.0,32.0,26.0,20.0,25.0,10.0,1.0,7.0,32.0,28.0,193.0,193.0,201.0,193.0,165.0,201.0,193.0,201.0,163.0,193.0,193.0,192.0,193.0,179.0,137.0,192.0,193.0,166.0,201.0,193.0,201.0,201.0,193.0,190.0,192.0
mean,65.0199,,0.318408,37.989773,129.587065,73.875622,105.004975,86.732323,0.432836,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,136.725389,9.963316,8.48607,8.413731,1.193939,28.837313,1.95285,87.545771,33.310429,242.321244,12.542318,165.854167,82.058988,1324.441341,13.80365,31.765625,1.311917,166.449217,14.755224,14.037306,4.628458,40.421891,4.691192,66.142105,102.130208
std,16.852584,,0.467022,0.937657,24.013613,14.053806,19.71332,12.495037,0.496706,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,6.959872,8.640239,0.9612,8.73506,0.183694,2.214803,3.746082,5.573454,5.726585,96.562212,10.762167,90.06879,30.911122,1756.083119,2.247299,29.672069,5.04832,120.303454,1.971772,10.379264,0.670641,5.734032,6.638028,149.063126,251.998481
min,16.0,,0.0,36.277778,67.0,31.0,22.0,20.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,111.0,1.0,6.7,0.45,0.9,20.9,0.3,68.6,14.2,52.0,0.814111,53.0,6.923077,319.0,10.2,4.0,0.1,5.0,12.4,7.8,2.83,24.2,2.7,10.0,19.0
25%,53.0,,0.0,37.222222,115.0,65.0,92.0,84.25,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,133.0,6.1,7.8,4.5,1.1,27.7,0.7,84.5,29.8,181.0,6.349206,110.0,75.510204,827.0,12.7,13.0,0.6,67.275,13.6,12.2,4.24,36.9,3.8,23.0,46.0
50%,66.0,,0.0,37.888889,127.0,73.0,104.0,90.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,136.0,7.9,8.4,6.4,1.2,29.1,0.9,88.2,32.3,224.0,9.638554,132.0,83.168317,1007.0,13.4,21.0,0.8,162.7,14.2,13.2,4.64,40.3,4.1,35.5,65.0
75%,79.0,,1.0,38.833333,143.0,82.0,115.0,94.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,139.0,11.5,9.0,9.9,1.2,30.3,1.7,91.4,36.45,292.0,16.326531,186.25,88.392857,1450.0,14.4,41.25,1.2,245.5,15.3,14.5,5.06,44.3,4.5,63.0,94.0
max,97.0,,1.0,40.0,205.0,131.0,170.0,100.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,164.0,84.0,11.4,87.5,2.2,33.9,41.4,99.5,60.2,605.0,83.333333,521.0,455.729167,23097.0,24.8,197.0,69.9,535.5,24.9,155.0,7.37,60.9,96.0,1929.0,3401.0


In [None]:
# since in the end we are not running several models, but only one with the hm_hospitales variables
# we are filling with 0 the variables that contain 100% missing values
# NOTE(review): ed_diagnosis is not part of the column selection in the next cell, so this value
# does not reach the exported dataset.

# column assignment (not attribute assignment) so the intent "set this column" is explicit
hoboken_structured_data['ed_diagnosis'] = 0

In [None]:
# The table must expose the same columns as the hm hospitales dataset;
# columns that are missing here (lab_ddimer) are created and filled with 0.
hoboken_structured_data['lab_ddimer'] = 0

# Column groups, in the order in which they appear in the selected table.
demographic_columns = ['age', 'hospital_outcome', 'sex']
vital_columns = [
    'vitals_temp_ed_first', 'vitals_sbp_ed_first', 'vitals_dbp_ed_first',
    'vitals_hr_ed_first', 'vitals_spo2_ed_first',
]
comorbidity_columns = [
    'pmhx_diabetes', 'pmhx_hld', 'pmhx_htn', 'pmhx_ihd', 'pmhx_ckd', 'pmhx_copd',
    'pmhx_asthma', 'pmhx_activecancer', 'pmhx_chronicliver', 'pmhx_stroke',
    'pmhx_chf', 'pmhx_dementia',
]
lab_columns = [
    'lab_sodium', 'lab_leukocyte', 'lab_mean_platelet_volume', 'lab_neutrophil',
    'lab_inr', 'lab_mch', 'lab_creatinine', 'lab_mcv', 'lab_aptt', 'lab_platelet',
    'lab_lymphocyte_percentage', 'lab_glucose', 'lab_neutrophil_percentage',
    'lab_ldh', 'lab_prothrombin_activity', 'lab_urea', 'lab_lymphocyte', 'lab_crp',
    'lab_rdw', 'lab_hemoglobin', 'lab_rbc', 'lab_hct', 'lab_potassium', 'lab_alt',
    'lab_ast', 'lab_ddimer',
]
hoboken_structured_data = hoboken_structured_data[
    demographic_columns + vital_columns + comorbidity_columns + lab_columns
]

# Table 1 of the selected variables, stratified by outcome; the listed columns are summarised as categorical.
categorical = ['sex', 'hospital_outcome'] + comorbidity_columns
print(
    TableOne(
        hoboken_structured_data,
        columns=hoboken_structured_data.columns.tolist(),
        categorical=categorical,
        groupby='hospital_outcome',
        pval=True,
    )
)

                                            Grouped by hospital_outcome                                                         
                                                                Missing          Overall            0.0              1.0 P-Value
n                                                                                    201            114               87        
age, mean (SD)                                                        0      65.0 (16.9)    61.9 (16.5)      69.1 (16.5)   0.003
hospital_outcome, n (%)              0.0                              0       114 (56.7)    114 (100.0)                   <0.001
                                     1.0                                       87 (43.3)                      87 (100.0)        
sex, n (%)                           FEMALE                           0        83 (41.3)      55 (48.2)        28 (32.2)   0.032
                                     MALE                                     118 (58.7)      59 

# Data standardization

## Imputation and scale

In [None]:
def impute_external(col_name,imputer_list,data):
    """
    Impute one column with an already-fitted imputer looked up by column name.

    Parameters
    ----------
    col_name: string
        name of the column; selects the imputer and names the returned series
    imputer_list: list of (string, imputer) tuples
        fitted imputers keyed by column name; the imputer's ``transform`` is
        called with a 2-D array of shape (n, 1)
    data: pd.Series
        the values to impute

    Returns
    -------
    pd.Series
        the imputed values, named ``col_name``. The series gets a fresh default
        index; the index of ``data`` is not carried over.

    Raises
    ------
    KeyError
        if ``imputer_list`` has no entry for ``col_name`` (previously this
        surfaced as an UnboundLocalError on the return statement)
    """
    matches = [imputer for name, imputer in imputer_list if name == col_name]
    if not matches:
        raise KeyError(f'No fitted imputer available for column: {col_name}')
    # if a column is listed more than once, the last entry wins (same as the original loop)
    imputer = matches[-1]
    try:
        imputed_data = imputer.transform(data.to_numpy().reshape(-1, 1))
    except Exception as e:
        # keep the original log line, then surface the real error instead of masking it
        print(f'Column: {col_name} encountered exception {e}')
        raise
    return pd.Series(imputed_data.ravel(),name=col_name)

def scale_external(col_name,scaler_list,data):
    """
    Scale one column with an already-fitted scaler looked up by column name.

    Parameters
    ----------
    col_name: string
        name of the column; selects the scaler and names the returned series
    scaler_list: list of (string, scaler) tuples
        fitted scalers keyed by column name; the scaler's ``transform`` is
        called with a 2-D array of shape (n, 1)
    data: pd.Series
        the values to scale

    Returns
    -------
    pd.Series
        the scaled values, named ``col_name``. The series gets a fresh default
        index; the index of ``data`` is not carried over.

    Raises
    ------
    KeyError
        if ``scaler_list`` has no entry for ``col_name`` (previously this
        surfaced as an UnboundLocalError on the return statement)
    """
    matches = [scaler for name, scaler in scaler_list if name == col_name]
    if not matches:
        raise KeyError(f'No fitted scaler available for column: {col_name}')
    # if a column is listed more than once, the last entry wins (same as the original loop)
    scaler = matches[-1]
    try:
        scaled_data = scaler.transform(data.to_numpy().reshape(-1, 1))
    except Exception as e:
        # keep the original log line, then surface the real error instead of masking it
        print(f'Column: {col_name} encountered exception {e}')
        raise
    return pd.Series(scaled_data.ravel(),name=col_name)

In [None]:
#in the case of comorbidities, empty means 0.

# Past-medical-history flags: a missing cell is treated as "condition absent".
comorbidity_flags = [
    'pmhx_diabetes', 'pmhx_hld', 'pmhx_htn', 'pmhx_ihd',
    'pmhx_ckd', 'pmhx_copd', 'pmhx_asthma', 'pmhx_activecancer',
    'pmhx_chronicliver', 'pmhx_stroke', 'pmhx_chf', 'pmhx_dementia',
]
hoboken_structured_data[comorbidity_flags] = hoboken_structured_data[comorbidity_flags].fillna(0)

In [None]:
# Working copy for imputation/scaling: the patient index is moved back into a column so the
# frame has a default 0..n-1 index, and every column whose name contains 'drug' is left out.
#dropping all drug columns, there are zero patients with 'vasopressor' and 'intubation' - this needs to be verified with data owners
hoboken_structured_data_ = (
    hoboken_structured_data
    .reset_index()
    .loc[:, lambda frame: [name for name in frame.columns if 'drug' not in name]]
)

def impute(series,method=None,missing_indicator=False):
    """
    Wrapper function for sklearn's SimpleImputer

    Fits a new SimpleImputer on the series and returns both the imputed
    values and the fitted imputer.

    Parameters
    ----------
    series: pd.Series
        a pd.Series to impute
    method: string
        passed to SimpleImputer's strategy parameter
        (NOTE(review): the default None is passed through unchanged; every
        call in this notebook supplies an explicit strategy)
    missing_indicator: bool
        logical flag to indicate if a missing value indicator column should be added

    Returns
    -------
    output_df: pd.DataFrame
        a pd.DataFrame containing the imputed series + missing indicator column
        (named ``<name>_missing``) when requested. The frame has a default
        0..n-1 index, not the index of ``series``.
    si: SimpleImputer
        the fitted imputer
    """
    name = series.name
    si = SimpleImputer(strategy=method,add_indicator=missing_indicator)
    array = si.fit_transform(series.values.reshape(-1, 1))
    if missing_indicator:
        if array.shape[1] == 1:
            # SimpleImputer only emits an indicator for features that had missing values
            # at fit time; add an all-zero indicator so the output always has two columns
            # instead of failing on the column-name mismatch.
            array = np.hstack([array, np.zeros((array.shape[0], 1))])
        output_df = pd.DataFrame(array,columns=[name,name+'_missing'])
    else:
        output_df = pd.DataFrame(array,columns=[name])
    return output_df, si

def encode(series,drop_first=True):
    """
    One-hot encode a categorical series via pandas' get_dummies.

    Parameters
    ----------
    series: pd.Series
        the categorical values to encode
    drop_first: bool
        forwarded to get_dummies; when True the first category's column is
        left out of the result

    Returns
    -------
    pd.DataFrame
        one indicator column per remaining category
    """
    return pd.get_dummies(series, drop_first=drop_first)

def scale_center(series):
    """
    Center and scale a pd.Series with a newly fitted sklearn RobustScaler.

    Parameters
    ----------
    series: pd.Series
        the numeric values to transform

    Returns
    -------
    series: pd.Series
        the transformed values with a default 0..n-1 index (the name and index
        of the input are not carried over)
    rs: RobustScaler
        the fitted scaler
    """
    robust_scaler = RobustScaler()
    column_vector = series.values.reshape(-1,1)
    transformed = robust_scaler.fit_transform(column_vector).flatten()
    return pd.Series(transformed), robust_scaler

# Names of the columns that contain at least one missing value.
any_missing = hoboken_structured_data_.columns[hoboken_structured_data_.isnull().sum() > 0].tolist()

# Fitted transformers are collected as (column name, transformer) pairs.
# NOTE(review): every imputer and scaler below is fitted on this dataset itself;
# impute_external / scale_external are not called here — confirm whether
# transformers fitted on the reference dataset were meant to be applied instead.
imputer_list = []
scaler_list = []

# Iterates over a snapshot of the column names taken before the loop, so columns added inside
# the loop (e.g. one-hot columns) are not visited. Columns matching neither branch
# (e.g. 'patient ID', 'hospital_outcome') are left unchanged.
for column in hoboken_structured_data_.columns.tolist():
    #Categorical features
    if ('pmhx' in column) or ('drug' in column) or (column in ['ed_diagnosis','sex']):
        if column in any_missing:
            #Mode imputation
            imputed,imputer = impute(hoboken_structured_data_[column],method='most_frequent')
            imputer_list += [(column,imputer)]
            # Replace the column by its imputed version (it ends up as the last column).
            # The index-on-index merge relies on both frames having the same default 0..n-1 index.
            hoboken_structured_data_ = hoboken_structured_data_.drop(columns=column).merge(imputed, left_index=True, right_index=True)
        # One-hot encode when the column has more than two distinct values or holds strings (object dtype).
        if (len(hoboken_structured_data_[column].value_counts()) > 2) or (hoboken_structured_data_[column].dtype=='O'):
            #One hot encoding
            onehot_df = encode(hoboken_structured_data_[column])
            # The original column is replaced by its indicator columns, which are named after the categories.
            hoboken_structured_data_ = hoboken_structured_data_.drop(columns=column).merge(onehot_df, left_index=True, right_index=True)
    #Numeric features - vital signs, laboratory values
    elif ('age' in column) or ('vitals' in column) or ('lab' in column):
        if column in any_missing:
            #Median imputation
            imputed,imputer = impute(hoboken_structured_data_[column],method='median')
            imputer_list += [(column,imputer)]
            hoboken_structured_data_ = hoboken_structured_data_.drop(columns=column).merge(imputed, left_index=True, right_index=True)
        if 'spo2' not in column:
            #Winsorize to 1st and 99th percentile - excluding SpO2 which can normally take a value of 100%
            hoboken_structured_data_[column] = winsorize(hoboken_structured_data_[column],limits=(0.01,0.01))
        #Scale and center numeric columns
        hoboken_structured_data_[column],scaler = scale_center(hoboken_structured_data_[column])
        scaler_list += [(column,scaler)]
    
# Restore the patient id as the index of the processed table.
hoboken_structured_data_ = hoboken_structured_data_.set_index('patient ID')

# Final Table 1

In [None]:
# Columns summarised as categorical in the final Table 1 ('MALE' is the one-hot column derived from sex).
categorical = [
    'hospital_outcome',
    'MALE',
    'pmhx_chronicliver',
    'pmhx_diabetes',
    'pmhx_hld',
    'pmhx_htn',
    'pmhx_ihd',
    'pmhx_ckd',
    'pmhx_copd',
    'pmhx_asthma',
    'pmhx_activecancer',
    'pmhx_stroke',
    'pmhx_chf',
    'pmhx_dementia',
]

# Table 1 of the processed (imputed, scaled) table, stratified by outcome.
print(
    TableOne(
        hoboken_structured_data_,
        columns=list(hoboken_structured_data_.columns),
        categorical=categorical,
        groupby='hospital_outcome',
        pval=True,
    )
)

                                         Grouped by hospital_outcome                                                  
                                                             Missing       Overall           0.0           1.0 P-Value
n                                                                              242           138           104        
index, mean (SD)                                                   0  120.5 (70.0)  101.9 (71.9)  145.2 (59.3)  <0.001
age, mean (SD)                                                     0    -0.0 (0.6)    -0.2 (0.6)     0.1 (0.6)  <0.001
hospital_outcome, n (%)              0.0                           0    138 (57.0)   138 (100.0)                <0.001
                                     1.0                                104 (43.0)                 104 (100.0)        
vitals_temp_ed_first, mean (SD)                                    0     0.1 (0.6)     0.0 (0.6)     0.1 (0.6)   0.727
vitals_sbp_ed_first, mean (SD)                  

# Append Image features

## Features ingestion

In [None]:
# Image features, read without a header row (the columns are named in the "Naming all features" cell).
hoboken_image_feature = pd.read_csv('features/test-hoboken_image_feature.csv', header=None)
# Image paths, re-read as a DataFrame. This rebinds hoboken_image_names, which until now held
# the patient ids extracted in the "Tables ingestion" cell.
hoboken_image_names = pd.read_csv('features/test-hoboken_image_names.csv', header=None)

## Appending files

In [None]:
# Put the image-path column next to the feature columns (axis=1); rows are paired through their default index.
# NOTE(review): assumes both files list the images in the same order — TODO confirm.
hoboken_image_feature = pd.concat([hoboken_image_feature.reset_index(drop=True), hoboken_image_names], axis=1)

## Naming all features

In [None]:
# Column labels: feat_0001 ... feat_0064 for the 64 feature columns, then 'path' for the appended image path.
hoboken_image_feature_names = np.array(
    [f'feat_{position:04d}' for position in range(1, 65)] + ['path']
)
hoboken_image_feature.columns = hoboken_image_feature_names

## Extracting patient id from feature dataframe

In [None]:
# Patient id = first run of digits in the image path, stored as int32.
# Raw string so that \d reaches the regex engine without an "invalid escape sequence" warning.
# NOTE(review): assumes the first digits in a path are the patient id — confirm against the file layout.
hoboken_image_feature['patient ID']=hoboken_image_feature.path.str.extract(r'(\d+)').astype('int32')

invalid escape sequence \d
invalid escape sequence \d


In [None]:
# Sanity check: number of distinct patient ids found among the image features.
print('Number of patients with CXR available')
hoboken_image_feature['patient ID'].nunique()

Number of patients with CXR available


201

# Final Join

In [None]:
# Inner join: keep only patients present in both the processed structured table and the image features.
hoboken_image_feature_data= hoboken_structured_data_.merge(hoboken_image_feature, how='inner', on='patient ID')
print('Final number of patients in the combined dataset (containing both images and structured data)')
print(hoboken_image_feature_data['patient ID'].nunique())
# dropping index column to avoid confusions
# errors='ignore': an 'index' column only exists if an unnamed index was reset earlier; when it is
# absent `del ...['index']` raises KeyError and stops the notebook.
hoboken_image_feature_data = hoboken_image_feature_data.drop(columns='index', errors='ignore')

Final number of patients in the combined dataset (containing both images and structured data)
201


# Columns renaming and final export



In [None]:
# Rename the id and outcome columns to the names used in the exported file.
# NOTE(review): hospital_outcome is exported as "expired_30_days" — confirm that the two definitions match.
hoboken_image_feature_data = hoboken_image_feature_data.rename(
    columns={
        "patient ID": "PATIENT ID",
        "hospital_outcome": "expired_30_days",
    }
)

# Write the combined table into the mounted output directory, without the row index.
hoboken_image_feature_data.to_csv(
    'final_validation_datasets/hoboken_image_feature_data.csv',
    index=False,
)