This notebook cleans structured data and appends it to cxr features.

# Environment

In [2]:
!pip install google-colab -q
!pip install shap -q
!pip install seaborn
!pip install tableone -q
!pip install sqldf

[?25l[K     |█                               | 10kB 18.8MB/s eta 0:00:01[K     |█▉                              | 20kB 10.6MB/s eta 0:00:01[K     |██▊                             | 30kB 6.2MB/s eta 0:00:01[K     |███▊                            | 40kB 5.6MB/s eta 0:00:01[K     |████▋                           | 51kB 2.9MB/s eta 0:00:01[K     |█████▌                          | 61kB 3.3MB/s eta 0:00:01[K     |██████▍                         | 71kB 3.5MB/s eta 0:00:01[K     |███████▍                        | 81kB 3.7MB/s eta 0:00:01[K     |████████▎                       | 92kB 3.8MB/s eta 0:00:01[K     |█████████▏                      | 102kB 4.0MB/s eta 0:00:01[K     |██████████▏                     | 112kB 4.0MB/s eta 0:00:01[K     |███████████                     | 122kB 4.0MB/s eta 0:00:01[K     |████████████                    | 133kB 4.0MB/s eta 0:00:01[K     |████████████▉                   | 143kB 4.0MB/s eta 0:00:01[K     |█████████████▉           

In [3]:
!echo "deb http://packages.cloud.google.com/apt gcsfuse-bionic main" > /etc/apt/sources.list.d/gcsfuse.list
!curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
!apt -qq update
!apt -qq install gcsfuse

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100  2537  100  2537    0     0  84566      0 --:--:-- --:--:-- --:--:-- 84566
OK
65 packages can be upgraded. Run 'apt list --upgradable' to see them.
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
The following NEW packages will be installed:
  gcsfuse
0 upgraded, 1 newly installed, 0 to remove and 65 not upgraded.
Need to get 10.8 MB of archives.
After this operation, 23.1 MB of additional disk space will be used.
Selecting previously unselected package gcsfuse.
(Reading database ... 160706 files and directories currently installed.)
Preparing to unpack .../gcsfuse_0.35.0_amd64.deb ...
Unpacking gcsfuse (0.35.0) ...
Setting up gcsfuse (0.35.0) ...


In [4]:
#Standard library imports
from google.colab import auth
auth.authenticate_user()

In [5]:
!mkdir data
!gcsfuse hoboken_structured_data data

!mkdir features
!gcsfuse hoboken_cxr_features features

!mkdir final_validation_datasets
!gcsfuse final_validation_datasets final_validation_datasets

2021/05/17 20:17:36.316502 Using mount point: /content/data
2021/05/17 20:17:36.324421 Opening GCS connection...
2021/05/17 20:17:36.495434 Mounting file system "hoboken_structured_data"...
2021/05/17 20:17:36.527538 File system has been successfully mounted.
2021/05/17 20:17:36.759766 Using mount point: /content/features
2021/05/17 20:17:36.766863 Opening GCS connection...
2021/05/17 20:17:37.018734 Mounting file system "hoboken_cxr_features"...
2021/05/17 20:17:37.019103 File system has been successfully mounted.
2021/05/17 20:17:37.188249 Using mount point: /content/final_validation_datasets
2021/05/17 20:17:37.197428 Opening GCS connection...
2021/05/17 20:17:37.379628 Mounting file system "final_validation_datasets"...
2021/05/17 20:17:37.380018 File system has been successfully mounted.


# Libraries

In [6]:
import os, sys, math
from tensorflow.python.lib.io import file_io
import glob
import warnings


#Third party library imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tableone import TableOne
from scipy.stats import uniform, randint
from scipy.stats.mstats import winsorize
import seaborn as sns
import shap
import sqldf as sql
import math
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, precision_recall_curve, plot_precision_recall_curve, average_precision_score, brier_score_loss, roc_curve
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.utils.class_weight import compute_sample_weight

#Global configuration
pd.options.mode.chained_assignment = None
seed = 2020
np.random.seed(seed)

pd.set_option('max_rows', 99999)
pd.set_option('max_colwidth', 400)
pd.describe_option('max_colwidth')

  import pandas.util.testing as tm


display.max_colwidth : int or None
    The maximum width in characters of a column in the repr of
    a pandas data structure. When the column overflows, a "..."
    placeholder is embedded in the output. A 'None' value means unlimited.
    [default: 50] [currently: 400]


# Tables ingestion

In [7]:
# the only files that contain the variables we are interested in are the following
hoboken_structured_data = pd.read_excel('data/hoboken_structured_data.xlsx', sheet_name='Sheet1')
hoboken_image_names = pd.read_csv('features/test-hoboken_image_names.csv', header=None)
#extracting patient id with cxr from path
hoboken_image_names = hoboken_image_names[0].str.extract('(\d+)').astype('int32')
hoboken_image_names=pd.to_numeric(hoboken_image_names[0])

invalid escape sequence \d
invalid escape sequence \d
invalid escape sequence \d


# Data Wrangling

# Exclusion criteria

In [8]:
print(':::Exclusion criteria:::')
print(f'Initial number of cases (hoboken_structured_data): {len(hoboken_structured_data)}')

hoboken_structured_data = hoboken_structured_data[hoboken_structured_data['age'] >= 16]
print(f'After excluding those with <16 Age: {len(hoboken_structured_data)}')



hoboken_structured_data =  hoboken_structured_data[hoboken_structured_data.index.isin(hoboken_image_names)]
print(f'After excluding those missing cxr: {len(hoboken_structured_data)}')

# we don't have admission_datetime in snuh_combined
#snuh_combined = snuh_combined[snuh_combined["admission_datetime"].isnull() != True]
#print(f'After excluding those with missing admission time: {len(snuh_combined)}')

# hospital_outcome is empty when =0 in this dataset, so removing empty ones would remove survivors. 
#hoboken_structured_data = hoboken_structured_data[hoboken_structured_data['hospital_outcome'].isnull()==False]
#print(f'After excluding those with missing hospital_outcome: {len(hoboken_structured_data)}')

# we are replacing missing cells with 0
hoboken_structured_data.hospital_outcome=hoboken_structured_data.hospital_outcome.fillna(0)

# Outcome distribution
print(':::Outcome distribution:::')
## inspect outcome distribution
print('Breakdown of hospital_outcome:')
print(hoboken_structured_data['hospital_outcome'].value_counts())


:::Exclusion criteria:::
Initial number of cases (hoboken_structured_data): 242
After excluding those with <16 Age: 242
After excluding those missing cxr: 200
:::Outcome distribution:::
Breakdown of hospital_outcome:
0.0    117
1.0     83
Name: hospital_outcome, dtype: int64


# Data Cleansing

In [9]:
#Vital signs
features_list = [name for name in list(hoboken_structured_data) if 'vitals' in name]

#Apply some clinical heuristics for valid ranges
limits = {'_sbp_':[20,240],
          '_hr_':[20,300],
          '_spo2_':[1,100],
          '_temp_':[30,45]}
for substr in limits.keys():
    for name in list(hoboken_structured_data):
        if substr in name:
            hoboken_structured_data[name][hoboken_structured_data[name] < limits[substr][0]] = np.nan
            hoboken_structured_data[name][hoboken_structured_data[name] > limits[substr][1]] = np.nan 

keep = [name for name in hoboken_structured_data.columns if 'last' not in name]
hoboken_structured_data = hoboken_structured_data.loc[:,keep]

# encoding sex the same way it is encoded in the reference dataset (HM COVIDDSL dataset)
print(hoboken_structured_data.sex.value_counts())

dictionary = {0: "MALE", 1: "FEMALE"}
hoboken_structured_data = hoboken_structured_data.replace({"sex": dictionary})

hoboken_structured_data.sex.value_counts()

hoboken_structured_data.head()

0    116
1     84
Name: sex, dtype: int64


Unnamed: 0,patient ID,age,sex,admission_datetime,ed_diagnosis,mechvent_flag,vitals_temp_ed_first,vitals_sbp_ed_first,vitals_dbp_ed_first,vitals_hr_ed_first,vitals_spo2_ed_first,hospital_outcome,pmhx_diabetes,pmhx_hld,pmhx_htn,pmhx_ihd,pmhx_ckd,pmhx_copd,pmhx_asthma,pmhx_activecancer,pmhx_chronicliver,pmhx_stroke,pmhx_chf,pmhx_dementia,lab_sodium,lab_leukocyte,lab_mean_platelet_volume,lab_neutrophil,lab_ddimer,lab_inr,lab_mch,lab_creatinine,lab_mcv,lab_aptt,lab_platelet,lab_lymphocyte_percentage,lab_glucose,lab_neutrophil_percentage,lab_ldh,lab_prothrombin_activity,lab_urea,lab_lymphocyte,lab_crp,lab_rdw,lab_hemoglobin,lab_rbc,lab_hct,lab_potassium,lab_alt,lab_ast
1,2,97,MALE,2020-04-05,,0,38.444444,99,65,118,100.0,1.0,,1.0,1.0,,1.0,1.0,,,,,,1.0,150.0,5.7,8.0,4.2,,1.5,28.2,1.8,87.2,40.7,142.0,14.035088,110.0,73.684211,3059.0,17.7,37.0,0.8,44.0,15.4,16.2,5.72,49.9,4.1,104.0,83.0
2,3,41,MALE,2020-04-04,,0,37.388889,130,83,91,94.0,0.0,,,,,,,,,,,,,137.0,5.0,7.7,2.4,200.0,1.1,33.2,0.9,94.4,32.8,191.0,42.0,112.0,48.0,714.0,12.7,14.0,2.1,24.1,12.8,17.9,5.38,50.8,3.7,110.0,83.0
3,4,71,FEMALE,2020-04-04,,0,37.0,143,77,89,99.0,0.0,,,,,,,,,,,,,137.0,4.1,8.2,2.9,209.0,1.2,28.6,0.7,84.6,31.8,217.0,19.512195,103.0,70.731707,528.0,14.3,20.0,0.8,17.9,13.6,13.3,4.66,39.4,3.8,28.0,39.0
4,5,72,FEMALE,2020-04-03,,0,36.666667,154,99,74,94.0,0.0,1.0,,1.0,,,,,,,,,,135.0,4.8,7.2,2.6,,1.1,29.2,0.5,86.4,37.0,202.0,22.916667,125.0,54.166667,688.0,12.6,8.0,1.1,85.1,12.9,14.0,4.8,41.4,4.0,42.0,56.0
8,9,69,FEMALE,2020-04-03,,0,37.333333,115,71,97,95.0,0.0,1.0,1.0,1.0,,,,,,,,,,136.0,7.8,9.2,6.2,393.0,1.1,29.6,0.5,89.5,32.1,295.0,12.820513,116.0,79.487179,1183.0,12.8,9.0,1.0,185.6,13.7,12.6,4.26,38.1,3.6,53.0,71.0


# Initial data exploration

In [10]:
hoboken_structured_data.columns

Index(['patient ID', 'age', 'sex', 'admission_datetime', 'ed_diagnosis',
       'mechvent_flag', 'vitals_temp_ed_first', 'vitals_sbp_ed_first',
       'vitals_dbp_ed_first', 'vitals_hr_ed_first', 'vitals_spo2_ed_first',
       'hospital_outcome', 'pmhx_diabetes', 'pmhx_hld', 'pmhx_htn', 'pmhx_ihd',
       'pmhx_ckd', 'pmhx_copd', 'pmhx_asthma', 'pmhx_activecancer',
       'pmhx_chronicliver', 'pmhx_stroke', 'pmhx_chf', 'pmhx_dementia',
       'lab_sodium', 'lab_leukocyte', 'lab_mean_platelet_volume',
       'lab_neutrophil', 'lab_ddimer', 'lab_inr', 'lab_mch', 'lab_creatinine',
       'lab_mcv', 'lab_aptt', 'lab_platelet', 'lab_lymphocyte_percentage',
       'lab_glucose', 'lab_neutrophil_percentage', 'lab_ldh',
       'lab_prothrombin_activity', 'lab_urea', 'lab_lymphocyte', 'lab_crp',
       'lab_rdw', 'lab_hemoglobin', 'lab_rbc', 'lab_hct', 'lab_potassium',
       'lab_alt', 'lab_ast'],
      dtype='object')

In [11]:
#remove labs that have >50% missing values

remove_lab = hoboken_structured_data.loc[:, hoboken_structured_data.columns.str.startswith('lab_')].isnull().sum().index[(hoboken_structured_data.loc[:, hoboken_structured_data.columns.str.startswith('lab_')].isnull().sum()/len(hoboken_structured_data.loc[:, hoboken_structured_data.columns.str.startswith('lab_')]) > 0.50)]
hoboken_structured_data=hoboken_structured_data.drop(columns=remove_lab)
hoboken_structured_data.loc[:, hoboken_structured_data.columns.str.startswith('lab_')].head()

Unnamed: 0,lab_sodium,lab_leukocyte,lab_mean_platelet_volume,lab_neutrophil,lab_inr,lab_mch,lab_creatinine,lab_mcv,lab_aptt,lab_platelet,lab_lymphocyte_percentage,lab_glucose,lab_neutrophil_percentage,lab_ldh,lab_prothrombin_activity,lab_urea,lab_lymphocyte,lab_crp,lab_rdw,lab_hemoglobin,lab_rbc,lab_hct,lab_potassium,lab_alt,lab_ast
1,150.0,5.7,8.0,4.2,1.5,28.2,1.8,87.2,40.7,142.0,14.035088,110.0,73.684211,3059.0,17.7,37.0,0.8,44.0,15.4,16.2,5.72,49.9,4.1,104.0,83.0
2,137.0,5.0,7.7,2.4,1.1,33.2,0.9,94.4,32.8,191.0,42.0,112.0,48.0,714.0,12.7,14.0,2.1,24.1,12.8,17.9,5.38,50.8,3.7,110.0,83.0
3,137.0,4.1,8.2,2.9,1.2,28.6,0.7,84.6,31.8,217.0,19.512195,103.0,70.731707,528.0,14.3,20.0,0.8,17.9,13.6,13.3,4.66,39.4,3.8,28.0,39.0
4,135.0,4.8,7.2,2.6,1.1,29.2,0.5,86.4,37.0,202.0,22.916667,125.0,54.166667,688.0,12.6,8.0,1.1,85.1,12.9,14.0,4.8,41.4,4.0,42.0,56.0
8,136.0,7.8,9.2,6.2,1.1,29.6,0.5,89.5,32.1,295.0,12.820513,116.0,79.487179,1183.0,12.8,9.0,1.0,185.6,13.7,12.6,4.26,38.1,3.6,53.0,71.0


In [12]:
hoboken_structured_data.describe()

Unnamed: 0,patient ID,age,ed_diagnosis,mechvent_flag,vitals_temp_ed_first,vitals_sbp_ed_first,vitals_dbp_ed_first,vitals_hr_ed_first,vitals_spo2_ed_first,hospital_outcome,pmhx_diabetes,pmhx_hld,pmhx_htn,pmhx_ihd,pmhx_ckd,pmhx_copd,pmhx_asthma,pmhx_activecancer,pmhx_chronicliver,pmhx_stroke,pmhx_chf,pmhx_dementia,lab_sodium,lab_leukocyte,lab_mean_platelet_volume,lab_neutrophil,lab_inr,lab_mch,lab_creatinine,lab_mcv,lab_aptt,lab_platelet,lab_lymphocyte_percentage,lab_glucose,lab_neutrophil_percentage,lab_ldh,lab_prothrombin_activity,lab_urea,lab_lymphocyte,lab_crp,lab_rdw,lab_hemoglobin,lab_rbc,lab_hct,lab_potassium,lab_alt,lab_ast
count,200.0,200.0,0.0,200.0,200.0,200.0,200.0,200.0,197.0,200.0,69.0,68.0,107.0,33.0,24.0,19.0,24.0,8.0,1.0,7.0,33.0,27.0,192.0,192.0,200.0,192.0,162.0,200.0,192.0,200.0,160.0,192.0,192.0,191.0,192.0,177.0,137.0,191.0,192.0,164.0,200.0,192.0,200.0,200.0,192.0,190.0,192.0
mean,123.795,64.535,,0.3,38.018889,129.21,73.92,104.305,86.84264,0.415,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,136.463542,9.831354,8.5115,8.288021,1.195679,28.9295,1.845833,87.7295,33.335,239.1875,12.681291,159.486911,81.7061,1324.548023,13.807299,28.973822,1.290625,161.275427,14.58,14.143229,4.64295,40.6545,4.664583,65.836842,100.317708
std,70.489266,16.925161,,0.459408,0.906297,24.163511,14.333908,17.882418,12.152123,0.493958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,6.796883,8.62328,0.971409,8.728122,0.188933,2.127938,3.733601,5.378104,5.348346,92.035426,10.001863,85.592662,30.807665,1764.638309,2.317188,26.441681,5.031005,115.649612,1.827209,10.403169,0.662401,5.806553,6.657696,148.530188,249.939298
min,2.0,16.0,,0.0,36.277778,67.0,29.0,63.0,20.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,111.0,1.0,6.7,0.5,0.8,20.9,0.3,68.6,14.2,52.0,0.814111,40.0,14.047619,319.0,9.6,4.0,0.1,5.0,12.4,7.8,2.83,24.2,2.7,10.0,19.0
25%,63.75,52.0,,0.0,37.277778,114.75,65.0,91.0,85.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,133.0,5.9,7.8,4.4,1.1,28.0,0.7,85.0,30.1,182.0,6.514681,110.0,74.519152,815.0,12.6,12.5,0.6,64.65,13.6,12.3,4.2375,36.975,3.8,25.0,46.75
50%,124.5,65.0,,0.0,37.944444,127.0,73.0,103.5,90.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,136.0,7.85,8.4,6.4,1.2,29.2,0.9,88.35,32.6,220.0,9.789205,129.0,82.462366,1011.0,13.4,18.0,0.8,160.45,14.1,13.35,4.655,40.6,4.1,37.5,66.0
75%,183.25,77.0,,1.0,38.833333,142.25,81.25,113.25,94.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,138.0,11.4,9.125,9.75,1.3,30.3,1.5,91.225,36.55,289.25,16.549546,177.0,88.317726,1453.0,14.4,32.5,1.2,237.325,15.1,14.625,5.08,44.5,4.5,62.0,98.0
max,243.0,97.0,,1.0,40.0,205.0,131.0,169.0,100.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,164.0,84.0,11.4,87.5,2.2,33.9,41.4,99.5,60.2,605.0,83.214286,521.0,455.729167,23097.0,24.8,147.0,69.9,535.5,24.9,155.0,7.37,60.9,96.0,1929.0,3401.0


In [13]:
# since in the end we are not running several models, but only one with the hm_hospitales variables
# we are filling with 0 variables that containg 100% missing values

hoboken_structured_data.ed_diagnosis=0

In [14]:
# we need the same columns available in this dataset that we have in the hm hospitales dataset
# so we are going to fill the missing ones with 0
hoboken_structured_data['lab_ddimer']=0

hoboken_structured_data=hoboken_structured_data[['patient ID','age','hospital_outcome','sex'
,'vitals_temp_ed_first','vitals_sbp_ed_first','vitals_dbp_ed_first','vitals_hr_ed_first','vitals_spo2_ed_first'
,'pmhx_diabetes','pmhx_hld','pmhx_htn','pmhx_ihd','pmhx_ckd','pmhx_copd','pmhx_asthma'
,'pmhx_activecancer','pmhx_chronicliver','pmhx_stroke','pmhx_chf','pmhx_dementia'
,'lab_sodium', 'lab_leukocyte', 'lab_mean_platelet_volume',
'lab_neutrophil', 'lab_inr', 'lab_mch', 'lab_creatinine', 'lab_mcv',
'lab_aptt', 'lab_platelet', 'lab_lymphocyte_percentage', 'lab_glucose',
'lab_neutrophil_percentage', 'lab_ldh', 'lab_prothrombin_activity',
'lab_urea', 'lab_lymphocyte', 'lab_crp', 'lab_rdw', 'lab_hemoglobin',
'lab_rbc', 'lab_hct', 'lab_potassium', 'lab_alt', 'lab_ast',
'lab_ddimer']]

categorical = ['sex','hospital_outcome', 'pmhx_diabetes', 'pmhx_hld', 'pmhx_htn', 'pmhx_ihd',
       'pmhx_ckd', 'pmhx_copd', 'pmhx_asthma', 'pmhx_activecancer',
       'pmhx_chronicliver', 'pmhx_stroke', 'pmhx_chf', 'pmhx_dementia']
print(TableOne(hoboken_structured_data,columns=hoboken_structured_data.columns.tolist(),categorical=categorical,groupby='hospital_outcome',pval=True))

                                            Grouped by hospital_outcome                                                         
                                                                Missing          Overall            0.0              1.0 P-Value
n                                                                                    200            117               83        
patient ID, mean (SD)                                                 0     123.8 (70.5)   106.8 (73.7)     147.7 (58.2)  <0.001
age, mean (SD)                                                        0      64.5 (16.9)    60.7 (16.1)      70.0 (16.7)  <0.001
hospital_outcome, n (%)              0.0                              0       117 (58.5)    117 (100.0)                   <0.001
                                     1.0                                       83 (41.5)                      83 (100.0)        
sex, n (%)                           FEMALE                           0        84 (42.0)      55 

# Data standardization

## Imputation and scale

In [None]:
def impute_external(col_name,imputer_list,data):
    try:
        for name,imputer in imputer_list:
            if name == col_name:
                imputed_data = imputer.transform(data.to_numpy().reshape(-1, 1))
                imputed_data = imputed_data.ravel()
    except BaseException as e:
        print(f'Column: {col_name} encountered exception {e}')
    return pd.Series(imputed_data,name=col_name)

def scale_external(col_name,scaler_list,data):
    try:
        for name,scaler in scaler_list:
            if name == col_name:
                scaled_data = scaler.transform(data.to_numpy().reshape(-1, 1))
                scaled_data = scaled_data.ravel()
    except BaseException as e:
        print(f'Column: {col_name} encountered exception {e}')
    return pd.Series(scaled_data,name=col_name)

In [None]:
#in the case of comorbidities, empty means 0.

hoboken_structured_data[['pmhx_diabetes', 'pmhx_hld', 'pmhx_htn', 'pmhx_ihd',
'pmhx_ckd', 'pmhx_copd', 'pmhx_asthma', 'pmhx_activecancer',
'pmhx_chronicliver', 'pmhx_stroke', 'pmhx_chf', 'pmhx_dementia']] = hoboken_structured_data[['pmhx_diabetes', 'pmhx_hld', 'pmhx_htn', 'pmhx_ihd',
'pmhx_ckd', 'pmhx_copd', 'pmhx_asthma', 'pmhx_activecancer',
'pmhx_chronicliver', 'pmhx_stroke', 'pmhx_chf', 'pmhx_dementia']].fillna(0)

In [None]:
hoboken_structured_data_ = hoboken_structured_data.reset_index()
#hoboken_structured_data = hoboken_structured_data.drop(columns='admission_datetime')

#dropping all drug columns, there are zero patients with 'vasopressor' and 'intubation' - this needs to be verified with data owners
hoboken_structured_data_ = hoboken_structured_data_[[name for name in hoboken_structured_data_.columns if 'drug' not in name]]

def impute(series,method=None,missing_indicator=False): 
    """
    Wrapper function for sklearn's SimpleImputer

    Parameters
    ----------
    series: pd.Series
        a pd.Series to impute
    method: string
        passed to SimpleImputer's strategy parameter
    missing_indicator: bool
        logical flag to indicate if a missing value indicator column should be added

    Returns
    -------
    output_df: pd.DataFrame
        a pd.DataFrame containing the imputed series + missing indicator column
    """
    name = series.name
    si = SimpleImputer(strategy=method,add_indicator=missing_indicator)
    array = si.fit_transform(series.values.reshape(-1, 1))
    if missing_indicator:
        output_df = pd.DataFrame(array,columns=[name,name+'_missing'])
    else:
        output_df = pd.DataFrame(array,columns=[name])
    return output_df, si

def encode(series,drop_first=True):
    """
    Onehot encodes a categorical dataframe, dropping the first column

    Parameters
    ----------
    series: pd.Series
        a categorical pandas series
    drop_first: bool
        logical flag for whether the first category should be dropped

    Returns
    -------
    onehot_df: pd.DataFrame
        a pd.DataFrame containing the onehot encoded columns
    """
    onehot_df = pd.get_dummies(series,drop_first=drop_first)
    return onehot_df

def scale_center(series):
    """
    Wrapper function to scale and center a pd.Series using sklearn's Robust Scaler
    """
    rs = RobustScaler()
    array = rs.fit_transform(series.values.reshape(-1,1))
    array = array.flatten()
    series = pd.Series(array)
    return series, rs

any_missing = hoboken_structured_data_.columns[hoboken_structured_data_.isnull().sum() > 0].tolist()

imputer_list = []
scaler_list = []

for column in hoboken_structured_data_.columns.tolist():
    #Categorical features
    if ('pmhx' in column) or ('drug' in column) or (column in ['ed_diagnosis','sex']):
        if column in any_missing:
            #Mode imputation
            imputed,imputer = impute(hoboken_structured_data_[column],method='most_frequent')
            imputer_list += [(column,imputer)]
            hoboken_structured_data_ = hoboken_structured_data_.drop(columns=column).merge(imputed, left_index=True, right_index=True)
        if (len(hoboken_structured_data_[column].value_counts()) > 2) or (hoboken_structured_data_[column].dtype=='O'):
            #One hot encoding
            onehot_df = encode(hoboken_structured_data_[column])
            hoboken_structured_data_ = hoboken_structured_data_.drop(columns=column).merge(onehot_df, left_index=True, right_index=True)
    #Numeric features - vital signs, laboratory values
    elif ('age' in column) or ('vitals' in column) or ('lab' in column):
        if column in any_missing:
            #Median imputation
            imputed,imputer = impute(hoboken_structured_data_[column],method='median')
            imputer_list += [(column,imputer)]
            hoboken_structured_data_ = hoboken_structured_data_.drop(columns=column).merge(imputed, left_index=True, right_index=True)
        if 'spo2' not in column:
            #Winsorize to 1st and 99th percentile - excluding SpO2 which can normally take a value of 100%
            hoboken_structured_data_[column] = winsorize(hoboken_structured_data_[column],limits=(0.01,0.01))
        #Scale and center numeric columns
        hoboken_structured_data_[column],scaler = scale_center(hoboken_structured_data_[column])
        scaler_list += [(column,scaler)]
    
hoboken_structured_data_ = hoboken_structured_data_.set_index('patient ID')

# Final Table 1

In [None]:
categorical = ['hospital_outcome', 'MALE',
'pmhx_chronicliver', 'pmhx_diabetes', 'pmhx_hld', 'pmhx_htn', 'pmhx_ihd', 'pmhx_ckd', 'pmhx_copd', 'pmhx_asthma', 'pmhx_activecancer', 'pmhx_stroke', 'pmhx_chf', 'pmhx_dementia',]

print(TableOne(hoboken_structured_data_,columns=hoboken_structured_data_.columns.tolist(),categorical=categorical,groupby='hospital_outcome',pval=True))

                                         Grouped by hospital_outcome                                                  
                                                             Missing       Overall           0.0           1.0 P-Value
n                                                                              242           138           104        
index, mean (SD)                                                   0  120.5 (70.0)  101.9 (71.9)  145.2 (59.3)  <0.001
age, mean (SD)                                                     0    -0.0 (0.6)    -0.2 (0.6)     0.1 (0.6)  <0.001
hospital_outcome, n (%)              0.0                           0    138 (57.0)   138 (100.0)                <0.001
                                     1.0                                104 (43.0)                 104 (100.0)        
vitals_temp_ed_first, mean (SD)                                    0     0.1 (0.6)     0.0 (0.6)     0.1 (0.6)   0.727
vitals_sbp_ed_first, mean (SD)                  

# Append Image features

## Features ingestion

In [None]:
hoboken_image_feature = pd.read_csv('features/test-hoboken_image_feature.csv', header=None)
hoboken_image_names = pd.read_csv('features/test-hoboken_image_names.csv', header=None)

## Appending files

In [None]:
hoboken_image_feature = pd.concat([hoboken_image_feature.reset_index(drop=True), hoboken_image_names], axis=1)

## Naming all features

In [None]:
hoboken_image_feature_names = ['feat_' + s for s in [str(i).zfill(4) for i in range(1,65)] ]
hoboken_image_feature_names = np.append(hoboken_image_feature_names,['path'])
hoboken_image_feature.columns = hoboken_image_feature_names

## Extracting patient id from feature dataframe

In [None]:
hoboken_image_feature['patient ID']=hoboken_image_feature.path.str.extract('(\d+)').astype('int32')

invalid escape sequence \d
invalid escape sequence \d


In [None]:
print('Number of patients with CXR available')
hoboken_image_feature['patient ID'].nunique()

Number of patients with CXR available


201

# Final Join

In [None]:
hoboken_image_feature_data= hoboken_structured_data_.merge(hoboken_image_feature, how='inner', on='patient ID')
print('Final number of patients in the combined dataset (containing both images and structured data)')
print(hoboken_image_feature_data['patient ID'].nunique())
# dropping index column to avoid confusions
del hoboken_image_feature_data['index']

Final number of patients in the combined dataset (containing both images and structured data)
201


# Columns renaming and final export



In [None]:
hoboken_image_feature_data=hoboken_image_feature_data.rename(columns={"patient ID": "PATIENT ID"})
hoboken_image_feature_data=hoboken_image_feature_data.rename(columns={"hospital_outcome": "expired_30_days"})

hoboken_image_feature_data.to_csv('final_validation_datasets/hoboken_image_feature_data.csv',index=False)