In this notebook we are preparing the Seoul test dataset:
Seoul tables are ingested; exclusion criteria are applied; data is explored; vitals, comorbidities, drugs and labs are appropriately transformed and cleaned; missing values are imputed; numeric variables are centered and scaled with a robust scaler (median and interquartile range); table 1 is produced; image features are appended and renamed; and the tables are joined.


# Environment

In [None]:
# Install the third-party packages that later cells import
# (google.colab, shap, seaborn, tableone, sqldf). -q reduces pip's output.
!pip install google-colab -q
!pip install shap -q
!pip install seaborn
!pip install tableone -q
!pip install sqldf

[?25l[K     |█                               | 10kB 26.0MB/s eta 0:00:01[K     |█▉                              | 20kB 33.9MB/s eta 0:00:01[K     |██▊                             | 30kB 37.1MB/s eta 0:00:01[K     |███▊                            | 40kB 23.7MB/s eta 0:00:01[K     |████▋                           | 51kB 16.5MB/s eta 0:00:01[K     |█████▌                          | 61kB 13.1MB/s eta 0:00:01[K     |██████▍                         | 71kB 14.7MB/s eta 0:00:01[K     |███████▍                        | 81kB 14.3MB/s eta 0:00:01[K     |████████▎                       | 92kB 12.6MB/s eta 0:00:01[K     |█████████▏                      | 102kB 13.7MB/s eta 0:00:01[K     |██████████▏                     | 112kB 13.7MB/s eta 0:00:01[K     |███████████                     | 122kB 13.7MB/s eta 0:00:01[K     |████████████                    | 133kB 13.7MB/s eta 0:00:01[K     |████████████▉                   | 143kB 13.7MB/s eta 0:00:01[K     |█████████████

In [None]:
# Register the gcsfuse apt repository and its signing key, refresh the package
# index and install gcsfuse. gcsfuse is used further down to mount the
# storage buckets as local folders.
!echo "deb http://packages.cloud.google.com/apt gcsfuse-bionic main" > /etc/apt/sources.list.d/gcsfuse.list
!curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
!apt -qq update
!apt -qq install gcsfuse

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100  2537  100  2537    0     0   107k      0 --:--:-- --:--:-- --:--:--  107k
OK
48 packages can be upgraded. Run 'apt list --upgradable' to see them.
The following NEW packages will be installed:
  gcsfuse
0 upgraded, 1 newly installed, 0 to remove and 48 not upgraded.
Need to get 10.8 MB of archives.
After this operation, 23.1 MB of additional disk space will be used.
Selecting previously unselected package gcsfuse.
(Reading database ... 160772 files and directories currently installed.)
Preparing to unpack .../gcsfuse_0.35.1_amd64.deb ...
Unpacking gcsfuse (0.35.1) ...
Setting up gcsfuse (0.35.1) ...


In [None]:
# Colab-specific import (not part of the Python standard library).
# Authenticates the current Colab user; presumably required before the
# gcsfuse bucket mounts below can succeed - TODO confirm.
from google.colab import auth
auth.authenticate_user()

In [None]:
!mkdir data
!gcsfuse snuh_covid data

!mkdir features
!gcsfuse snuh_cxr_features features

!mkdir final_validation_datasets
!gcsfuse final_validation_datasets final_validation_datasets

2021/06/08 15:47:40.875820 Using mount point: /content/data
2021/06/08 15:47:40.882064 Opening GCS connection...
2021/06/08 15:47:41.288431 Mounting file system "snuh_covid"...
2021/06/08 15:47:41.322079 File system has been successfully mounted.
2021/06/08 15:47:41.500166 Using mount point: /content/features
2021/06/08 15:47:41.505386 Opening GCS connection...
2021/06/08 15:47:41.919756 Mounting file system "snuh_cxr_features"...
2021/06/08 15:47:41.920251 File system has been successfully mounted.
2021/06/08 15:47:42.118233 Using mount point: /content/final_validation_datasets
2021/06/08 15:47:42.124113 Opening GCS connection...
2021/06/08 15:47:42.476651 Mounting file system "final_validation_datasets"...
2021/06/08 15:47:42.477439 File system has been successfully mounted.


# Libraries

In [None]:
import os, sys, math
from tensorflow.python.lib.io import file_io
import glob
import warnings
from pandas_profiling import ProfileReport 

#Third party library imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tableone import TableOne
from scipy.stats import uniform, randint
from scipy.stats.mstats import winsorize
import seaborn as sns
import shap
import sqldf as sql
import math
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, precision_recall_curve, plot_precision_recall_curve, average_precision_score, brier_score_loss, roc_curve
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.utils.class_weight import compute_sample_weight
from xgboost import XGBClassifier

#Global configuration
# Disable pandas' chained-assignment (SettingWithCopy) warning notebook-wide.
pd.options.mode.chained_assignment = None
# Fixed seed for NumPy's global random number generator.
seed = 2020
np.random.seed(seed)
# Use fully-qualified option names: abbreviated keys such as 'max_rows' are
# resolved by pattern matching and raise an OptionError on pandas versions
# where more than one option matches (e.g. 'styler.render.max_rows').
pd.set_option('display.max_rows', 99999)
pd.set_option('display.max_colwidth', 400)
# Print the option's description together with its current value.
pd.describe_option('display.max_colwidth')


  import pandas.util.testing as tm


display.max_colwidth : int or None
    The maximum width in characters of a column in the repr of
    a pandas data structure. When the column overflows, a "..."
    placeholder is embedded in the output. A 'None' value means unlimited.
    [default: 50] [currently: 400]


# Tables ingestion

In [None]:
# The only files that contain the variables we are interested in are the following
COVID_FINAL_ICU_HID_TABLE = pd.read_excel('data/COVID_FINAL_ICU_HID_TABLE.xlsx', sheet_name='Sheet1')
COVID_FINAL_NO_ICU_HID_TABLE = pd.read_excel('data/COVID_FINAL_NO_ICU_HID_TABLE.xlsx', sheet_name='Sheet1')

# One image path per row, no header line
snuh_image_names = pd.read_csv('features/snuh_image_names.csv', header=None)
# Extract the id of each patient with a CXR: the first run of digits in the path.
# The pattern is a raw string so that \d is not treated as an (invalid) string
# escape sequence, which is what triggered the warnings in this cell's output.
snuh_image_names = snuh_image_names[0].str.extract(r'(\d+)').astype('int32')
snuh_image_names = pd.to_numeric(snuh_image_names[0])

invalid escape sequence \d
invalid escape sequence \d
invalid escape sequence \d


# Data Wrangling

## Data append

In [None]:
# We have two datasets, one with ICU patients and another one without them.
# Stack them row-wise with a fresh 0..n-1 index. pd.concat is used because
# DataFrame.append is deprecated and no longer exists in pandas 2.x.
snuh_combined = pd.concat(
    [COVID_FINAL_ICU_HID_TABLE, COVID_FINAL_NO_ICU_HID_TABLE],
    ignore_index=True,
)
# subject_id becomes the index (and stops being a regular column)
snuh_combined = snuh_combined.set_index('subject_id')

## Outcome creation 

In [None]:
# Binary outcome: 1 when the recorded outcome is exactly 'Expire', 0 for anything
# else. Patients without mortality information are therefore counted as survivors.
snuh_combined['hospital_outcome'] = snuh_combined['hospital_outcome'].eq('Expire').astype(int)

# Exclusion criteria

In [None]:
print(':::Exclusion criteria:::')
print(f'Initial number of cases (snuh_combined): {len(snuh_combined)}')

# Keep patients aged 16 or older
snuh_combined = snuh_combined.loc[snuh_combined['age'] >= 16]
print(f'After excluding those with <16 Age: {len(snuh_combined)}')

# snuh_combined has no admission_datetime column, so the
# "missing admission time" exclusion is not applied to this dataset.

# Keep patients whose outcome is not null
snuh_combined = snuh_combined.loc[snuh_combined['hospital_outcome'].notna()]
print(f'After excluding those with missing hospital_outcome: {len(snuh_combined)}')

# Keep patients whose index value appears among the ids extracted from the CXR image paths
snuh_combined = snuh_combined.loc[snuh_combined.index.isin(snuh_image_names)]
print(f'After excluding those missing cxr: {len(snuh_combined)}')

print()

# Outcome distribution
print(':::Outcome distribution:::')
## inspect outcome distribution
print('Breakdown of hospital_outcome:')
print(snuh_combined['hospital_outcome'].value_counts())


:::Exclusion criteria:::
Initial number of cases (snuh_combined): 336
After excluding those with <16 Age: 320
After excluding those with missing hospital_outcome: 320
After excluding those missing cxr: 315

:::Outcome distribution:::
Breakdown of hospital_outcome:
0    310
1      5
Name: hospital_outcome, dtype: int64


# Data Cleansing

In [None]:
#Vital signs
features_list = [name for name in list(snuh_combined) if 'vitals' in name]

#Apply some clinical heuristics for valid ranges: [lowest, highest] accepted value,
#keyed by the substring that identifies the affected columns.
# NOTE(review): no range is defined for '_dbp_' columns - confirm this is intended.
limits = {'_sbp_':[20,240],
          '_hr_':[20,300],
          '_spo2_':[1,100],
          '_temp_':[30,45]}
for substr, (lower, upper) in limits.items():
    for name in list(snuh_combined):
        if substr in name:
            # Values outside [lower, upper] become NaN; in-range values and existing
            # NaNs are left as they are. The whole column is reassigned instead of
            # using chained indexing (df[name][mask] = ...), which may write to a
            # temporary copy and leave the frame unchanged.
            snuh_combined[name] = snuh_combined[name].where(snuh_combined[name].between(lower, upper))

# Drop every column whose name contains 'last'
keep = [name for name in snuh_combined.columns if 'last' not in name]
snuh_combined = snuh_combined.loc[:,keep]

# encoding sex the same way it is encoded in the reference dataset (HM COVIDDSL dataset)
print(snuh_combined.sex.value_counts())

dictionary = {"M": "MALE", "F": "FEMALE"}
snuh_combined = snuh_combined.replace({"sex": dictionary})

# Printed explicitly: a bare expression in the middle of a cell displays nothing
print(snuh_combined.sex.value_counts())

snuh_combined.head()

M    165
F    150
Name: sex, dtype: int64


Unnamed: 0,subject_id,age,sex,ed_diagnosis,vitals_temp_ed_first,vitals_sbp_ed_first,vitals_dbp_ed_first,vitals_hr_ed_first,vitals_spo2_ed_first,hospital_outcome,pmhx_diabetes,pmhx_hld,pmhx_htn,pmhx_ihd,pmhx_ckd,pmhx_copd,pmhx_asthma,pmhx_activecancer,pmhx_chronicliver,pmhx_stroke,pmhx_chf,pmhx_dementia,lab_sodium,lab_leukocyte,lab_mean_platelet_volume,lab_glucose,lab_hct,lab_hemoglobin,lab_creatinine,lab_lymphocyte_percentage,lab_urea,lab_inr,lab_lymphocyte,lab_ddimer,lab_crp,lab_ldh,lab_neutrophil_percentage,lab_rbc,lab_neutrophil,lab_alt,lab_potassium,lab_rdw,lab_mcv,lab_aptt,lab_mch,lab_prothrombin_activity,lab_ast,lab_platelet,pmhx_activecancer.1
1,2,71,MALE,"Coronavirus disease (COVID-20), virus identified",37.8,131.0,82.0,108.0,91.0,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,139.0,28.65,11.3,,35.0,11.7,0.79,3.0,30.0,1.37,85.95,0.0,18.94,552.0,89.0,3.89,2549.85,13.0,4.3,12.5,90.0,33.2,30.1,62.0,30.0,68.0,
2,3,65,MALE,"Coronavirus disease (COVID-21), virus identified",39.8,138.0,68.0,97.0,97.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,135.0,2.77,11.0,118.0,35.7,12.4,0.69,4.7,11.0,1.03,13.019,0.0,19.01,518.0,63.0,3.91,174.51,107.0,3.6,13.5,91.3,31.5,31.7,96.0,149.0,142.0,
3,4,62,MALE,"Coronavirus disease (COVID-22), virus identified",36.8,114.0,72.0,64.0,93.0,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,138.0,4.72,10.3,140.0,39.5,14.0,0.81,25.2,14.0,1.02,118.944,0.0,9.56,431.0,,4.39,,42.0,3.7,11.5,90.0,29.7,31.9,97.0,63.0,120.0,
4,5,73,MALE,"Coronavirus disease (COVID-23), virus identified",37.0,144.0,72.0,64.0,98.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,136.0,10.11,12.3,90.0,38.6,13.1,0.55,8.3,14.0,1.31,83.913,0.0,9.99,319.0,93.0,4.17,940.23,15.0,3.9,12.9,92.6,37.7,31.4,66.0,20.0,230.0,
5,6,72,FEMALE,"Coronavirus disease (COVID-24), virus identified",35.9,127.0,69.0,67.0,94.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,140.0,9.49,10.8,125.0,41.1,13.9,0.77,7.6,16.0,1.05,72.124,0.0,12.76,654.0,,4.43,,29.0,5.0,12.6,92.8,29.8,31.4,92.0,71.0,174.0,


# Initial data exploration

In [None]:
# Display the column names of the cleansed table
snuh_combined.columns

Index(['subject_id', 'age', 'sex', 'ed_diagnosis', 'vitals_temp_ed_first',
       'vitals_sbp_ed_first', 'vitals_dbp_ed_first', 'vitals_hr_ed_first',
       'vitals_spo2_ed_first', 'hospital_outcome', 'pmhx_diabetes', 'pmhx_hld',
       'pmhx_htn', 'pmhx_ihd', 'pmhx_ckd', 'pmhx_copd', 'pmhx_asthma',
       'pmhx_activecancer', 'pmhx_chronicliver', 'pmhx_stroke', 'pmhx_chf',
       'pmhx_dementia', 'lab_sodium', 'lab_leukocyte',
       'lab_mean_platelet_volume', 'lab_glucose', 'lab_hct', 'lab_hemoglobin',
       'lab_creatinine', 'lab_lymphocyte_percentage', 'lab_urea', 'lab_inr',
       'lab_lymphocyte', 'lab_ddimer', 'lab_crp', 'lab_ldh',
       'lab_neutrophil_percentage', 'lab_rbc', 'lab_neutrophil', 'lab_alt',
       'lab_potassium', 'lab_rdw', 'lab_mcv', 'lab_aptt', 'lab_mch',
       'lab_prothrombin_activity', 'lab_ast', 'lab_platelet',
       ' pmhx_activecancer'],
      dtype='object')

In [None]:
# Remove labs that have >50% missing values

# Share of null rows per lab_* column
lab_missing_share = snuh_combined.loc[:, snuh_combined.columns.str.startswith('lab_')].isnull().mean()
remove_lab = lab_missing_share.index[lab_missing_share > 0.50]
snuh_combined = snuh_combined.drop(columns=remove_lab)
# Preview whichever lab_* columns are left
snuh_combined.loc[:, snuh_combined.columns.str.startswith('lab_')].head()

1
2
3
4
5


All labs have >50% missing values in the SNUH dataset.

In [None]:
# Since in the end we are not running several models, but only one with the
# hm_hospitales variables, we fill with 0 the variables that contain 100% missing values.

# ed_diagnosis is expected to always contain 'Coronavirus disease (COVID-19), virus identified'.
# NOTE(review): the table preview earlier in this notebook shows incrementing values
# (COVID-20, COVID-21, ...), which looks like a spreadsheet auto-fill artefact - confirm.
# Bracket assignment always writes a column; attribute-style assignment
# (snuh_combined.ed_diagnosis = 0) would only set a plain Python attribute
# if the column were absent.
snuh_combined['ed_diagnosis'] = 0

In [None]:
# This dataset must expose the same columns as the hm hospitales dataset.
# Every lab column listed here is set to the constant 0: it is created when
# absent and overwritten when present.
for lab_column in [
    'lab_hct', 'lab_urea', 'lab_rbc', 'lab_aptt', 'lab_glucose', 'lab_ddimer',
    'lab_platelet', 'lab_sodium', 'lab_crp', 'lab_mcv', 'lab_inr', 'lab_alt',
    'lab_lymphocyte_percentage', 'lab_neutrophil', 'lab_lymphocyte', 'lab_ast',
    'lab_hemoglobin', 'lab_ldh', 'lab_mch', 'lab_leukocyte', 'lab_neutrophil_percentage',
    'lab_potassium', 'lab_mean_platelet_volume', 'lab_creatinine', 'lab_rdw',
    'lab_prothrombin_activity',
]:
    snuh_combined[lab_column] = 0

# Variables that TableOne should summarise as counts/percentages
categorical = ['sex','hospital_outcome', 'pmhx_diabetes', 'pmhx_hld',
       'pmhx_htn', 'pmhx_ihd', 'pmhx_ckd', 'pmhx_copd', 'pmhx_asthma',
       'pmhx_activecancer', 'pmhx_stroke', 'pmhx_chf','pmhx_chronicliver',
       'pmhx_dementia']

# subject_id was moved into the index by set_index('subject_id') when the two
# tables were combined, so selecting it by name below would raise a KeyError.
# Bring it back as a regular column first; nothing happens if it already is one.
if 'subject_id' not in snuh_combined.columns:
    snuh_combined = snuh_combined.reset_index()

# Keep only the modelling columns, in a fixed order
snuh_combined = snuh_combined[[
    'subject_id', 'age', 'hospital_outcome', 'sex',
    'vitals_temp_ed_first', 'vitals_sbp_ed_first', 'vitals_dbp_ed_first',
    'vitals_hr_ed_first', 'vitals_spo2_ed_first',
    'pmhx_diabetes', 'pmhx_hld', 'pmhx_htn', 'pmhx_ihd', 'pmhx_ckd', 'pmhx_copd', 'pmhx_asthma',
    'pmhx_activecancer', 'pmhx_chronicliver', 'pmhx_stroke', 'pmhx_chf', 'pmhx_dementia',
    'lab_hct', 'lab_urea', 'lab_rbc', 'lab_aptt', 'lab_glucose', 'lab_ddimer',
    'lab_platelet', 'lab_sodium', 'lab_crp', 'lab_mcv', 'lab_inr', 'lab_alt',
    'lab_lymphocyte_percentage', 'lab_neutrophil', 'lab_lymphocyte', 'lab_ast',
    'lab_hemoglobin', 'lab_ldh', 'lab_mch', 'lab_leukocyte', 'lab_neutrophil_percentage',
    'lab_potassium', 'lab_mean_platelet_volume', 'lab_creatinine', 'lab_rdw',
    'lab_prothrombin_activity',
]]

# Table 1 of the selected columns, grouped by outcome, with p-values
print(TableOne(snuh_combined,columns=snuh_combined.columns.tolist(),categorical=categorical,groupby='hospital_outcome',pval=True))

                                            Grouped by hospital_outcome                                                  
                                                                Missing       Overall             0             1 P-Value
n                                                                                 315           310             5        
subject_id, mean (SD)                                                 0  165.0 (97.2)  166.3 (96.9)   84.2 (90.7)   0.113
age, mean (SD)                                                        0   46.0 (19.6)   45.7 (19.6)   64.0 (15.2)   0.053
hospital_outcome, n (%)              0                                0    310 (98.4)   310 (100.0)                <0.001
                                     1                                        5 (1.6)                   5 (100.0)        
sex, n (%)                           FEMALE                           0    150 (47.6)    150 (48.4)                 0.062
                        

# Data standardization

## Imputation and scale

In [None]:
def impute_external(col_name,imputer_list,data):
    """
    Apply an already-fitted imputer to one column of data.

    Parameters
    ----------
    col_name: string
        name of the column; selects the matching entry of imputer_list
    imputer_list: list of (string, imputer) tuples
        (column name, fitted imputer) pairs; the imputer must expose transform().
        If several entries match col_name, the last one determines the result.
    data: pd.Series
        the values to impute

    Returns
    -------
    pd.Series
        the imputed values, named col_name, with a default 0..n-1 index
        (the index of data is not carried over)

    Raises
    ------
    KeyError
        if imputer_list holds no imputer for col_name
    RuntimeError
        if the matching imputer fails to transform data
    """
    imputed_data = None
    for name,imputer in imputer_list:
        if name != col_name:
            continue
        try:
            # transform() is fed a 2-D single-column array; flatten its result back to 1-D
            imputed_data = imputer.transform(data.to_numpy().reshape(-1, 1)).ravel()
        except Exception as e:
            # Fail with the column name instead of printing and then crashing
            # on an unbound local variable at the return statement.
            raise RuntimeError(f'Column: {col_name} encountered exception {e}') from e
    if imputed_data is None:
        raise KeyError(f'No fitted imputer found for column {col_name}')
    return pd.Series(imputed_data,name=col_name)

def scale_external(col_name,scaler_list,data):
    """
    Apply an already-fitted scaler to one column of data.

    Parameters
    ----------
    col_name: string
        name of the column; selects the matching entry of scaler_list
    scaler_list: list of (string, scaler) tuples
        (column name, fitted scaler) pairs; the scaler must expose transform().
        If several entries match col_name, the last one determines the result.
    data: pd.Series
        the values to scale

    Returns
    -------
    pd.Series
        the scaled values, named col_name, with a default 0..n-1 index
        (the index of data is not carried over)

    Raises
    ------
    KeyError
        if scaler_list holds no scaler for col_name
    RuntimeError
        if the matching scaler fails to transform data
    """
    scaled_data = None
    for name,scaler in scaler_list:
        if name != col_name:
            continue
        try:
            # transform() is fed a 2-D single-column array; flatten its result back to 1-D
            scaled_data = scaler.transform(data.to_numpy().reshape(-1, 1)).ravel()
        except Exception as e:
            # Fail with the column name instead of printing and then crashing
            # on an unbound local variable at the return statement.
            raise RuntimeError(f'Column: {col_name} encountered exception {e}') from e
    if scaled_data is None:
        raise KeyError(f'No fitted scaler found for column {col_name}')
    return pd.Series(scaled_data,name=col_name)

In [None]:
# Working copy with the current index turned into a regular column and a
# fresh 0..n-1 index in its place.
snuh_combined_ = snuh_combined.reset_index()
# (No admission_datetime column is dropped here; that step is disabled.)

# Dropping all drug columns: there are zero patients with 'vasopressor' and
# 'intubation' - this needs to be verified with data owners.
snuh_combined_ = snuh_combined_.loc[:, [name for name in snuh_combined_.columns if 'drug' not in name]]

def impute(series,method=None,missing_indicator=False): 
    """
    Wrapper function for sklearn's SimpleImputer

    Parameters
    ----------
    series: pd.Series
        a pd.Series to impute
    method: string
        passed to SimpleImputer's strategy parameter
    missing_indicator: bool
        logical flag to indicate if a missing value indicator column should be added

    Returns
    -------
    output_df: pd.DataFrame
        a pd.DataFrame containing the imputed series + missing indicator column
        (named <series name>_missing); it carries a default 0..n-1 index,
        not the index of series
    si: SimpleImputer
        the imputer fitted on series
    """
    name = series.name
    si = SimpleImputer(strategy=method,add_indicator=missing_indicator)
    # SimpleImputer works on 2-D input, so the series is passed as a single column
    array = si.fit_transform(series.values.reshape(-1, 1))
    if missing_indicator:
        if array.shape[1] == 1 and not series.isnull().any():
            # SimpleImputer only emits an indicator column for features that had
            # missing values at fit time. Without any, add an all-zero indicator so
            # the output always has two columns instead of failing on a shape mismatch.
            array = np.column_stack([array, np.zeros(len(array))])
        output_df = pd.DataFrame(array,columns=[name,name+'_missing'])
    else:
        output_df = pd.DataFrame(array,columns=[name])
    return output_df, si

def encode(series,drop_first=True):
    """
    One-hot encode a categorical series with pd.get_dummies.

    Parameters
    ----------
    series: pd.Series
        the categorical values to encode
    drop_first: bool
        forwarded to pd.get_dummies; when True the first category's column is left out

    Returns
    -------
    onehot_df: pd.DataFrame
        one indicator column per remaining category
    """
    return pd.get_dummies(series, drop_first=drop_first)

def scale_center(series):
    """
    Fit sklearn's RobustScaler on a pd.Series and return the transformed values.

    Returns
    -------
    series: pd.Series
        the scaled values with a default 0..n-1 index (the input's index and
        name are not carried over)
    rs: RobustScaler
        the scaler fitted on the input
    """
    rs = RobustScaler()
    # The scaler works on 2-D input: pass one column, then flatten the result
    scaled = rs.fit_transform(series.values.reshape(-1,1)).flatten()
    return pd.Series(scaled), rs

# Names of the columns that contain at least one null value
any_missing = snuh_combined_.columns[snuh_combined_.isnull().sum() > 0].tolist()

# (column name, fitted object) pairs collected while processing the columns below
imputer_list = []
scaler_list = []

# Iterate over a snapshot of the column names: the frame itself is rebuilt inside the loop.
# Columns that match neither branch are left untouched.
# Note that every imputer and scaler is fitted on this dataset itself.
for column in snuh_combined_.columns.tolist():
    #Categorical features
    if ('pmhx' in column) or ('drug' in column) or (column in ['ed_diagnosis','sex']):
        if column in any_missing:
            #Mode imputation
            imputed,imputer = impute(snuh_combined_[column],method='most_frequent')
            imputer_list += [(column,imputer)]
            # Swap the original column for the imputed one, joining on the index; this
            # lines up because both frames carry a default 0..n-1 index.
            # The replaced column ends up as the last column.
            snuh_combined_ = snuh_combined_.drop(columns=column).merge(imputed, left_index=True, right_index=True)
        # Encode when the column has more than two distinct values or holds objects (e.g. strings)
        if (len(snuh_combined_[column].value_counts()) > 2) or (snuh_combined_[column].dtype=='O'):
            #One hot encoding
            onehot_df = encode(snuh_combined_[column])
            # The original column is replaced by its indicator columns, named after the categories
            snuh_combined_ = snuh_combined_.drop(columns=column).merge(onehot_df, left_index=True, right_index=True)
    #Numeric features - vital signs, laboratory values
    elif ('age' in column) or ('vitals' in column) or ('lab' in column):
        if column in any_missing:
            #Median imputation
            imputed,imputer = impute(snuh_combined_[column],method='median')
            imputer_list += [(column,imputer)]
            snuh_combined_ = snuh_combined_.drop(columns=column).merge(imputed, left_index=True, right_index=True)
        if 'spo2' not in column:
            #Winsorize to 1st and 99th percentile - excluding SpO2 which can normally take a value of 100%
            snuh_combined_[column] = winsorize(snuh_combined_[column],limits=(0.01,0.01))
        #Scale and center numeric columns
        snuh_combined_[column],scaler = scale_center(snuh_combined_[column])
        scaler_list += [(column,scaler)]
    
# Use the patient identifier as the index of the processed table
snuh_combined_ = snuh_combined_.set_index('subject_id')

# Final Table 1

In [None]:
# Variables summarised as counts/percentages; 'MALE' is the indicator column
# produced by one-hot encoding 'sex'.
categorical = [
    'hospital_outcome', 'MALE',
    'pmhx_chronicliver', 'pmhx_diabetes', 'pmhx_hld', 'pmhx_htn', 'pmhx_ihd',
    'pmhx_ckd', 'pmhx_copd', 'pmhx_asthma', 'pmhx_activecancer', 'pmhx_stroke',
    'pmhx_chf', 'pmhx_dementia',
]

# Table 1 of the processed data, grouped by outcome, with p-values
print(
    TableOne(
        snuh_combined_,
        columns=snuh_combined_.columns.tolist(),
        categorical=categorical,
        groupby='hospital_outcome',
        pval=True,
    )
)

                                         Grouped by hospital_outcome                                                 
                                                             Missing       Overall             0            1 P-Value
n                                                                              320           315            5        
index, mean (SD)                                                   0  163.9 (97.2)  165.2 (96.9)  83.2 (90.7)   0.113
age, mean (SD)                                                     0     0.0 (0.6)     0.0 (0.6)    0.6 (0.5)   0.054
hospital_outcome, n (%)              0                             0    315 (98.4)   315 (100.0)               <0.001
                                     1                                     5 (1.6)                  5 (100.0)        
lab_hct, mean (SD)                                                 0     0.0 (0.0)     0.0 (0.0)    0.0 (0.0)     nan
lab_urea, mean (SD)                                     

# Append Image features

## Features ingestion

In [None]:
# Both files are read without a header row, so their columns are numbered from 0.
# NOTE(review): the two files are presumably row-aligned (feature row i belongs to
# the image path in row i) - the column-wise concat further down relies on it; confirm.
korean_image_feature = pd.read_csv('features/snuh_image_feature.csv', header=None)
korean_image_names = pd.read_csv('features/snuh_image_names.csv', header=None)

## Appending files

In [None]:
# Attach the image path as an extra column next to the feature columns.
# Re-running this cell without reloading the features appends the path column again.
korean_image_feature = pd.concat([korean_image_feature.reset_index(drop=True), korean_image_names], axis=1)

## Naming all features

In [None]:
# feat_0001 ... feat_0064 for the 64 feature columns, followed by 'path'
korean_image_feature_names = np.append(
    [f'feat_{i:04d}' for i in range(1, 65)],
    ['path'],
)
korean_image_feature.columns = korean_image_feature_names

## Extracting patient id from feature

In [None]:
# The patient id is the first run of digits in the image path. The pattern is a raw
# string so that \d is not treated as an (invalid) string escape sequence.
korean_image_feature['subject_id']=korean_image_feature.path.str.extract(r'(\d+)').astype('int32')

invalid escape sequence \d
invalid escape sequence \d


In [None]:
# Number of distinct patient ids found among the image features
korean_image_feature['subject_id'].nunique()

330

# Final Join

In [None]:
# Inner join on subject_id: only patients present in both the structured table
# and the image features are kept.
snuh_image_feature_data = snuh_combined_.merge(korean_image_feature, how='inner', on='subject_id')
print('Final number of patients in the combined dataset (containing both images and structured data)')
print(snuh_image_feature_data['subject_id'].nunique())
# Dropping the index column to avoid confusions. errors='ignore' keeps the cell
# from failing with a KeyError when no 'index' column is present.
snuh_image_feature_data = snuh_image_feature_data.drop(columns='index', errors='ignore')

final number of patients in the combined dataset (containing both image and structured data)
315


# Columns renaming and final export



In [None]:
# Rename the identifier and outcome columns, then write the final table
# (without the row index) to the mounted output folder.
snuh_image_feature_data = snuh_image_feature_data.rename(
    columns={"subject_id": "PATIENT ID", "hospital_outcome": "expired_30_days"}
)

snuh_image_feature_data.to_csv('final_validation_datasets/snuh_image_feature_data.csv', index=False)