# Unifrom the column names of all the databases

because column names and units of databases are very different

In [1]:
import pandas as pd
import numpy as np

## load the data

In [2]:
data_mimic = pd.read_csv('../datasets/MIMIC-data.csv')
data_aumc = pd.read_csv('../datasets/AUMC-data.csv')
data_eicu = pd.read_csv('../datasets/eCRD-data.csv')
data_plagh = pd.read_csv('../datasets/PLAGH-data.csv')
data_HJ23 = pd.read_csv('../datasets/HJ23-data.csv')

In [3]:
age_bins = [17, 39, 49, 59, 69, 79, 300]
age_label = ['18-39', '40-49', '50-59', '60-69', '70-79', '80+']

In [4]:
#define the obvious outlier ranges
lab_outlier_ranges = {'ALBUMIN':[0.2, 7], 
                      'BICARBONATE': [1,80],
                      'CALCIUM': [0.1,100],
                      'CREATININE': [0,30],
                      'GLUCOSE':[7, 1000.0],
                      'HEMOGLOBIN': [0.1,30],
                      'LACTATE': [0,30],
                      'MAGNESIUM': [0,50],
                      'PHOSPHATE': [0,100],
                      'PLATELET': [1,1200],
                      'POTASSIUM': [0,20],
                      'SODIUM': [50,300],
                      'IONIZEDCALCIUM': [0, 15],
                      'WBC': [0, 400]
                     }

## MIMIC

In [5]:
data_mimic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38508 entries, 0 to 38507
Data columns (total 45 columns):
subject_id             38508 non-null int64
hadm_id                38508 non-null int64
icustay_id             38508 non-null int64
gender                 38508 non-null object
age                    38508 non-null float64
ethnicity              38508 non-null object
icu_los_hours          38506 non-null float64
icu_expire_flag        38508 non-null int64
albumin_min            9787 non-null float64
albumin_max            9787 non-null float64
bilirubin_min          12312 non-null float64
bilirubin_max          12312 non-null float64
bicarbonate_min        36856 non-null float64
bicarbonate_max        36856 non-null float64
creatinine_min         37059 non-null float64
creatinine_max         37059 non-null float64
glucose_min            37106 non-null float64
glucose_max            37106 non-null float64
hemoglobin_min         36733 non-null float64
hemoglobin_max         36733 

In [6]:
#male->1, female->0
data_mimic['gender'].replace('M',1,inplace=True)
data_mimic['gender'].replace('F',0,inplace=True)

In [7]:
# urgent -> emergency
data_mimic['admission_type']=data_mimic['admission_type'].str.upper()
data_mimic['admission_type'].replace('URGENT','EMERGENCY',inplace=True)

In [8]:
#the unit of 'hospital_los_hours' in current data is day, transmit to hour
data_mimic['hospital_los_hours']=data_mimic['hospital_los_hours']*24

In [9]:
#make 'vent_flag' and 'crrt_flag'
data_mimic.loc[data_mimic['vent_duration_hours'].notnull(),'vent_flag']=1
data_mimic.loc[data_mimic['crrt_duration_hours'].notnull(),'crrt_flag']=1

In [10]:
data_mimic['age'].describe()

count    38508.000000
mean        63.884242
std         17.582336
min         18.000000
25%         52.400000
50%         65.700000
75%         77.900000
max         91.400000
Name: age, dtype: float64

In [11]:
#divide into different age group
data_mimic['age_group']=pd.cut(data_mimic['age'], age_bins)

In [12]:
#divide into different outcome group, according to the quartiles
survival_df=data_mimic.loc[data_mimic['icu_expire_flag']==0,:]
q_1=survival_df['icu_los_hours'].describe()['25%']
q_2=survival_df['icu_los_hours'].describe()['50%']
q_3=survival_df['icu_los_hours'].describe()['75%']
data_mimic.loc[data_mimic['icu_los_hours'] <= q_1, 'outcome_group'] = 1
data_mimic.loc[(data_mimic['icu_los_hours'] > q_1) & (data_mimic['icu_los_hours'] <= q_2), 'outcome_group'] = 2
data_mimic.loc[(data_mimic['icu_los_hours'] > q_2) & (data_mimic['icu_los_hours'] <= q_3), 'outcome_group'] = 3
data_mimic.loc[(data_mimic['icu_los_hours'] > q_3), 'outcome_group'] = 4
#expired patients in group 5
data_mimic.loc[(data_mimic['icu_expire_flag'] == 1), 'outcome_group'] = 5

In [13]:
#cardiac surgery patients
data_mimic.loc[data_mimic['unittype']=='CSRU','cardiac_surgery_flag']=1
data_mimic.loc[data_mimic['unittype']!='CSRU','cardiac_surgery_flag']=0

In [14]:
data_mimic['cardiac_surgery_flag'].sum()

7606.0

In [15]:
data_mimic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38508 entries, 0 to 38507
Data columns (total 50 columns):
subject_id              38508 non-null int64
hadm_id                 38508 non-null int64
icustay_id              38508 non-null int64
gender                  38508 non-null int64
age                     38508 non-null float64
ethnicity               38508 non-null object
icu_los_hours           38506 non-null float64
icu_expire_flag         38508 non-null int64
albumin_min             9787 non-null float64
albumin_max             9787 non-null float64
bilirubin_min           12312 non-null float64
bilirubin_max           12312 non-null float64
bicarbonate_min         36856 non-null float64
bicarbonate_max         36856 non-null float64
creatinine_min          37059 non-null float64
creatinine_max          37059 non-null float64
glucose_min             37106 non-null float64
glucose_max             37106 non-null float64
hemoglobin_min          36733 non-null float64
hemoglobin_

### eICU

In [16]:
data_eicu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158358 entries, 0 to 158357
Data columns (total 21 columns):
uniquepid                    158358 non-null object
patienthealthsystemstayid    158358 non-null int64
patientunitstayid            158358 non-null int64
gender                       158276 non-null object
age                          158358 non-null float64
unittype                     158358 non-null object
unitadmitsource              157606 non-null object
unitvisitnumber              158358 non-null int64
icu_los_hours                158358 non-null int64
hospital_los_hours           158358 non-null int64
icu_expire_flag              158339 non-null float64
hospital_expire_flag         156934 non-null float64
bicarbonate_min              143629 non-null float64
creatinine_max               152848 non-null float64
glucose_max                  152019 non-null float64
hemoglobin_min               151904 non-null float64
lactate_max                  59183 non-null float64
sod

In [17]:
data_eicu['unitadmitsource'].unique()

array(['Operating Room', 'Floor', 'Emergency Department', nan,
       'Direct Admit', 'Other Hospital', 'Step-Down Unit (SDU)',
       'Other ICU', 'ICU to SDU', 'Recovery Room', 'Chest Pain Center',
       'PACU', 'Acute Care/Floor', 'ICU', 'Observation', 'Other'],
      dtype=object)

In [18]:
data_eicu['age'].describe()

count    158358.000000
mean         62.966399
std          17.409776
min           0.000000
25%          52.000000
50%          65.000000
75%          76.000000
max          91.400000
Name: age, dtype: float64

In [19]:
#select age >=18
data_eicu=data_eicu.loc[data_eicu['age']>=18,:]
data_eicu.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 157883 entries, 0 to 158357
Data columns (total 21 columns):
uniquepid                    157883 non-null object
patienthealthsystemstayid    157883 non-null int64
patientunitstayid            157883 non-null int64
gender                       157801 non-null object
age                          157883 non-null float64
unittype                     157883 non-null object
unitadmitsource              157137 non-null object
unitvisitnumber              157883 non-null int64
icu_los_hours                157883 non-null int64
hospital_los_hours           157883 non-null int64
icu_expire_flag              157864 non-null float64
hospital_expire_flag         156466 non-null float64
bicarbonate_min              143209 non-null float64
creatinine_max               152421 non-null float64
glucose_max                  151588 non-null float64
hemoglobin_min               151496 non-null float64
lactate_max                  59084 non-null float64
sod

In [20]:
#reserve the first row data of each patient
#from the offical website, 'There is no systematic method for chronologically ordering patientHealthSystemStayID for the same patient within the same year.'
#we can not find which one is first when a patient was admitted to ICU in different health system in one year
#use the first row instead(the data has been sorted by id & age, so the first line ought to be the minimum of age of each patient)
grouped=data_eicu.groupby(['uniquepid'])
data_eicu=grouped.first()
data_eicu.reset_index(inplace=True)
data_eicu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132994 entries, 0 to 132993
Data columns (total 21 columns):
uniquepid                    132994 non-null object
patienthealthsystemstayid    132994 non-null int64
patientunitstayid            132994 non-null int64
gender                       132925 non-null object
age                          132994 non-null float64
unittype                     132994 non-null object
unitadmitsource              132431 non-null object
unitvisitnumber              132994 non-null int64
icu_los_hours                132994 non-null int64
hospital_los_hours           132994 non-null int64
icu_expire_flag              132980 non-null float64
hospital_expire_flag         131936 non-null float64
bicarbonate_min              121307 non-null float64
creatinine_max               128790 non-null float64
glucose_max                  128079 non-null float64
hemoglobin_min               128137 non-null float64
lactate_max                  52412 non-null float64
sod

In [21]:
data_eicu['gender'].replace('female',0,inplace=True)
data_eicu['gender'].replace('male',1,inplace=True)

In [22]:
data_eicu['crrt_duration_hours']=np.nan
data_eicu.loc[data_eicu['vent_duration_hours'].notnull(),'vent_flag']=1

In [23]:
#age group
data_eicu['age_group']=pd.cut(data_eicu['age'], age_bins)

In [24]:
#divide into different outcome group, according to the quartiles
survival_df=data_eicu.loc[data_eicu['icu_expire_flag']==0,:]
q_1=survival_df['icu_los_hours'].describe()['25%']
q_2=survival_df['icu_los_hours'].describe()['50%']
q_3=survival_df['icu_los_hours'].describe()['75%']
data_eicu.loc[data_eicu['icu_los_hours'] <= q_1, 'outcome_group'] = 1
data_eicu.loc[(data_eicu['icu_los_hours'] > q_1) & (data_eicu['icu_los_hours'] <= q_2), 'outcome_group'] = 2
data_eicu.loc[(data_eicu['icu_los_hours'] > q_2) & (data_eicu['icu_los_hours'] <= q_3), 'outcome_group'] = 3
data_eicu.loc[(data_eicu['icu_los_hours'] > q_3), 'outcome_group'] = 4
#expired patients in group 5
data_eicu.loc[(data_eicu['icu_expire_flag'] == 1), 'outcome_group'] = 5

In [25]:
data_eicu['unittype'].replace('Neuro ICU', 'NICU',inplace=True)
data_eicu['unittype'].replace('Cardiac ICU', 'CCU',inplace=True)

In [26]:
#cardiac surgery patients, ccu-cticu & med-surg icu
data_eicu_cardiac_surgery = pd.read_csv('../datasets/eicu_cardiac surgery.csv')
data_eicu_cardiac_surgery['cardiac_surgery_flag']=1
data_eicu_cardiac_surgery.drop(['treatmentoffset','treatmentstring','unittype'],axis=1,inplace=True)
data_eicu_cardiac_surgery.drop_duplicates(inplace=True)
data_eicu=pd.merge(data_eicu,data_eicu_cardiac_surgery,on='patientunitstayid',how='left')
data_eicu

Unnamed: 0,uniquepid,patienthealthsystemstayid,patientunitstayid,gender,age,unittype,unitadmitsource,unitvisitnumber,icu_los_hours,hospital_los_hours,...,lactate_max,sodium_max,sodium_min,crrt_flag,vent_duration_hours,crrt_duration_hours,vent_flag,age_group,outcome_group,cardiac_surgery_flag
0,002-10009,193705,224606,0.0,76.0,Med-Surg ICU,Operating Room,1,69,139,...,1.9,143.0,134.0,,23.466667,,1.0,"(69, 79]",3.0,
1,002-10018,178200,204602,0.0,29.0,Med-Surg ICU,Floor,1,19,396,...,2.9,144.0,129.0,,,,,"(17, 39]",1.0,
2,002-10034,141169,157016,0.0,23.0,Med-Surg ICU,Floor,1,69,125,...,,141.0,140.0,,,,,"(17, 39]",3.0,
3,002-10050,183274,211144,0.0,67.0,Med-Surg ICU,Operating Room,1,92,162,...,,146.0,146.0,,49.450000,,1.0,"(59, 69]",4.0,
4,002-10052,137239,151900,0.0,66.0,MICU,Emergency Department,1,57,106,...,0.9,140.0,134.0,,15.666667,,1.0,"(59, 69]",3.0,
5,002-10063,189145,218742,0.0,69.0,NICU,Emergency Department,1,86,130,...,,136.0,136.0,,,,,"(59, 69]",4.0,
6,002-10066,185872,214497,0.0,42.0,Med-Surg ICU,Emergency Department,1,50,140,...,0.4,140.0,135.0,,,,,"(39, 49]",3.0,
7,002-10067,168546,192233,0.0,55.0,Med-Surg ICU,Operating Room,1,47,70,...,,,,,,,,"(49, 59]",3.0,
8,002-1007,178462,204935,0.0,83.0,Med-Surg ICU,Floor,1,12,62,...,,142.0,139.0,,,,,"(79, 300]",1.0,
9,002-10076,187781,216951,0.0,46.0,NICU,Emergency Department,1,142,722,...,,137.0,137.0,,,,,"(39, 49]",4.0,


In [27]:
data_eicu['cardiac_surgery_flag'].sum()

2775.0

In [28]:
((data_eicu['unittype']=='CTICU') | (data_eicu['unittype']=='CSICU')).sum()

9328

In [29]:
#cardiac surgery patients, csicu & cticu
data_eicu.loc[(data_eicu['unittype']=='CTICU') | (data_eicu['unittype']=='CSICU'),'cardiac_surgery_flag']=1
data_eicu.loc[data_eicu['cardiac_surgery_flag'].isnull(),'cardiac_surgery_flag']=0

In [30]:
data_eicu['cardiac_surgery_flag'].sum()

12103.0

In [31]:
#admission type, elective for surgical patients
data_eicu_elective = pd.read_csv('../datasets/eicu_admission type elective.csv')
data_eicu_elective['admission_type']=1
data_eicu_elective.drop('admitdxpath',axis=1,inplace=True)
data_eicu_elective.drop_duplicates(inplace=True)
data_eicu=pd.merge(data_eicu,data_eicu_elective,on='patientunitstayid',how='left')
data_eicu['admission_type'].sum()

24631.0

In [32]:
#other patients(emergency for surgical patients, all medical patients) as emergency
data_eicu.loc[data_eicu['admission_type'].isnull(),'admission_type']='EMERGENCY'
data_eicu.loc[data_eicu['admission_type']==1,'admission_type']='ELECTIVE'

In [33]:
data_eicu

Unnamed: 0,uniquepid,patienthealthsystemstayid,patientunitstayid,gender,age,unittype,unitadmitsource,unitvisitnumber,icu_los_hours,hospital_los_hours,...,sodium_max,sodium_min,crrt_flag,vent_duration_hours,crrt_duration_hours,vent_flag,age_group,outcome_group,cardiac_surgery_flag,admission_type
0,002-10009,193705,224606,0.0,76.0,Med-Surg ICU,Operating Room,1,69,139,...,143.0,134.0,,23.466667,,1.0,"(69, 79]",3.0,0.0,EMERGENCY
1,002-10018,178200,204602,0.0,29.0,Med-Surg ICU,Floor,1,19,396,...,144.0,129.0,,,,,"(17, 39]",1.0,0.0,EMERGENCY
2,002-10034,141169,157016,0.0,23.0,Med-Surg ICU,Floor,1,69,125,...,141.0,140.0,,,,,"(17, 39]",3.0,0.0,EMERGENCY
3,002-10050,183274,211144,0.0,67.0,Med-Surg ICU,Operating Room,1,92,162,...,146.0,146.0,,49.450000,,1.0,"(59, 69]",4.0,0.0,ELECTIVE
4,002-10052,137239,151900,0.0,66.0,MICU,Emergency Department,1,57,106,...,140.0,134.0,,15.666667,,1.0,"(59, 69]",3.0,0.0,EMERGENCY
5,002-10063,189145,218742,0.0,69.0,NICU,Emergency Department,1,86,130,...,136.0,136.0,,,,,"(59, 69]",4.0,0.0,EMERGENCY
6,002-10066,185872,214497,0.0,42.0,Med-Surg ICU,Emergency Department,1,50,140,...,140.0,135.0,,,,,"(39, 49]",3.0,0.0,EMERGENCY
7,002-10067,168546,192233,0.0,55.0,Med-Surg ICU,Operating Room,1,47,70,...,,,,,,,"(49, 59]",3.0,0.0,ELECTIVE
8,002-1007,178462,204935,0.0,83.0,Med-Surg ICU,Floor,1,12,62,...,142.0,139.0,,,,,"(79, 300]",1.0,0.0,EMERGENCY
9,002-10076,187781,216951,0.0,46.0,NICU,Emergency Department,1,142,722,...,137.0,137.0,,,,,"(39, 49]",4.0,0.0,EMERGENCY


### AUMC

lack of LOS_hospital, crrt, vent

In [34]:
data_aumc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20127 entries, 0 to 20126
Data columns (total 38 columns):
patientid             20127 non-null int64
gender                19656 non-null object
agegroup              20127 non-null object
icu_los_hours         20127 non-null int64
icu_expire_flag       20127 non-null int64
admissionid           19879 non-null float64
albumin_min           15061 non-null float64
albumin_max           15061 non-null float64
bilirubin_min         8413 non-null float64
bilirubin_max         8413 non-null float64
bicarbonate_min       19031 non-null float64
bicarbonate_max       19031 non-null float64
creatinine_min        19496 non-null float64
creatinine_max        19496 non-null float64
glucose_min           19752 non-null float64
glucose_max           19752 non-null float64
hemoglobin_min        19797 non-null float64
hemoglobin_max        19797 non-null float64
lactate_min           11115 non-null float64
lactate_max           11115 non-null float64
p

merge the extend info

In [35]:
aumc_extend=pd.read_csv('../datasets/AUMC_extend_admissiontype.csv')
aumc_extend.sort_values(by=['patientid','admissionid'],inplace=True)

In [36]:
aumc_extend.drop('admissionid',axis=1,inplace=True)
aumc_extend['urgency'].replace('planned','elective',inplace=True)
aumc_extend['urgency'].replace('unplanned','emergency',inplace=True)

In [37]:
data_aumc=pd.merge(data_aumc,aumc_extend,on='patientid',how='left')
data_aumc.rename(columns={'urgency':'admission_type'},inplace=True)
data_aumc['admission_type']=data_aumc['admission_type'].str.upper()

In [38]:
aumc_extend_2=pd.read_csv('../datasets/AUMC_extend_icutypes.csv')
aumc_extend_2=aumc_extend_2.loc[aumc_extend_2['admissioncount']==1,['patientid','location']]
aumc_extend_2

Unnamed: 0,patientid,location
0,12,MC
1,13,MC
3,40,MC
4,63,MC
5,91,MC
6,117,MC
7,156,IC
8,157,IC
9,367,MC
10,167,MC


In [39]:
data_aumc=pd.merge(data_aumc,aumc_extend_2,on='patientid',how='left')
data_aumc.rename(columns={'location':'unittype'},inplace=True)
data_aumc['unittype'].replace('IC','SICU',inplace=True)
data_aumc['unittype'].replace('MC','MICU',inplace=True)
data_aumc['unittype'].replace('IC&MC','SICU&MICU',inplace=True)
data_aumc['unittype'].replace('MC&IC','SICU&MICU',inplace=True)
data_aumc

Unnamed: 0,patientid,gender,agegroup,icu_los_hours,icu_expire_flag,admissionid,albumin_min,albumin_max,bilirubin_min,bilirubin_max,...,bun_min,bun_max,wbc_min,wbc_max,calcium_min,calcium_max,ionizedcalcium_min,ionizedcalcium_max,admission_type,unittype
0,0,female,80+,42,0,0.0,22.0,22.0,,,...,,,14.6,14.6,2.1,2.3,1.1,1.1,ELECTIVE,SICU
1,1,male,60-69,26,0,1.0,29.0,29.0,,,...,,,7.3,10.8,2.0,2.1,1.2,1.2,ELECTIVE,SICU
2,2,male,60-69,23,0,2.0,28.0,28.0,,,...,,,12.3,16.5,2.0,2.0,1.1,1.1,EMERGENCY,SICU
3,3,male,50-59,23,0,3.0,28.0,28.0,,,...,,,12.5,19.3,2.1,2.2,1.1,1.1,ELECTIVE,MICU
4,4,male,70-79,50,0,4.0,,,8.0,8.0,...,7.6,7.6,21.6,21.6,2.0,2.0,1.2,1.2,ELECTIVE,SICU&MICU
5,5,male,50-59,69,0,5.0,28.0,33.0,5.0,5.0,...,3.5,3.5,10.0,19.2,2.2,2.2,1.1,1.1,EMERGENCY,SICU
6,6,female,80+,48,0,6.0,27.0,32.0,18.0,18.0,...,5.3,5.3,11.5,12.9,1.9,2.1,1.1,1.1,EMERGENCY,SICU
7,7,male,70-79,42,0,7.0,26.0,26.0,11.0,11.0,...,7.9,8.0,8.9,12.6,1.8,1.8,,,ELECTIVE,MICU
8,8,male,50-59,24,0,8.0,27.0,29.0,,,...,,,14.1,14.6,1.9,2.0,,,ELECTIVE,SICU
9,9,female,70-79,67,0,9.0,21.0,26.0,,,...,,,18.3,20.4,2.0,2.1,1.1,1.1,ELECTIVE,SICU


In [40]:
#male->1, female->0
data_aumc['gender'].replace('male',1,inplace=True)
data_aumc['gender'].replace('female',0,inplace=True)

In [41]:
data_aumc.rename(columns={'agegroup':'age_group'},inplace=True)
data_aumc['age_group'].replace(['18-39'],'(17, 39]',inplace=True)
data_aumc['age_group'].replace(['40-49'],'(39, 49]',inplace=True)
data_aumc['age_group'].replace(['50-59'],'(49, 59]',inplace=True)
data_aumc['age_group'].replace(['60-69'],'(59, 69]',inplace=True)
data_aumc['age_group'].replace(['70-79'],'(69, 79]',inplace=True)
data_aumc['age_group'].replace(['80+'],'(79, 300]',inplace=True)

In [42]:
#divide into different outcome group, according to the quartiles
survival_df=data_aumc.loc[data_aumc['icu_expire_flag']==0,:]
q_1=survival_df['icu_los_hours'].describe()['25%']
q_2=survival_df['icu_los_hours'].describe()['50%']
q_3=survival_df['icu_los_hours'].describe()['75%']
data_aumc.loc[data_aumc['icu_los_hours'] <= q_1, 'outcome_group'] = 1
data_aumc.loc[(data_aumc['icu_los_hours'] > q_1) & (data_aumc['icu_los_hours'] <= q_2), 'outcome_group'] = 2
data_aumc.loc[(data_aumc['icu_los_hours'] > q_2) & (data_aumc['icu_los_hours'] <= q_3), 'outcome_group'] = 3
data_aumc.loc[(data_aumc['icu_los_hours'] > q_3), 'outcome_group'] = 4
#expired patients in group 5
data_aumc.loc[(data_aumc['icu_expire_flag'] == 1), 'outcome_group'] = 5

In [43]:
#cardiac surgery patients
data_aumc_cardiac_surgery = pd.read_csv('../datasets/AUMC_cardiac surgery.csv')
data_aumc_cardiac_surgery=data_aumc_cardiac_surgery.loc[data_aumc_cardiac_surgery['admissioncount']==1,['patientid','unitType']]
data_aumc_cardiac_surgery.loc[data_aumc_cardiac_surgery['unitType']=='Cardiac Surgery','cardiac_surgery_flag']=1
data_aumc_cardiac_surgery.loc[data_aumc_cardiac_surgery['unitType']!='Cardiac Surgery','cardiac_surgery_flag']=0
data_aumc_cardiac_surgery.drop('unitType',axis=1,inplace=True)
data_aumc=pd.merge(data_aumc,data_aumc_cardiac_surgery,on='patientid',how='left')

In [44]:
data_aumc

Unnamed: 0,patientid,gender,age_group,icu_los_hours,icu_expire_flag,admissionid,albumin_min,albumin_max,bilirubin_min,bilirubin_max,...,wbc_min,wbc_max,calcium_min,calcium_max,ionizedcalcium_min,ionizedcalcium_max,admission_type,unittype,outcome_group,cardiac_surgery_flag
0,0,0.0,"(79, 300]",42,0,0.0,22.0,22.0,,,...,14.6,14.6,2.1,2.3,1.1,1.1,ELECTIVE,SICU,3.0,1.0
1,1,1.0,"(59, 69]",26,0,1.0,29.0,29.0,,,...,7.3,10.8,2.0,2.1,1.2,1.2,ELECTIVE,SICU,3.0,1.0
2,2,1.0,"(59, 69]",23,0,2.0,28.0,28.0,,,...,12.3,16.5,2.0,2.0,1.1,1.1,EMERGENCY,SICU,2.0,1.0
3,3,1.0,"(49, 59]",23,0,3.0,28.0,28.0,,,...,12.5,19.3,2.1,2.2,1.1,1.1,ELECTIVE,MICU,2.0,1.0
4,4,1.0,"(69, 79]",50,0,4.0,,,8.0,8.0,...,21.6,21.6,2.0,2.0,1.2,1.2,ELECTIVE,SICU&MICU,3.0,1.0
5,5,1.0,"(49, 59]",69,0,5.0,28.0,33.0,5.0,5.0,...,10.0,19.2,2.2,2.2,1.1,1.1,EMERGENCY,SICU,4.0,0.0
6,6,0.0,"(79, 300]",48,0,6.0,27.0,32.0,18.0,18.0,...,11.5,12.9,1.9,2.1,1.1,1.1,EMERGENCY,SICU,3.0,0.0
7,7,1.0,"(69, 79]",42,0,7.0,26.0,26.0,11.0,11.0,...,8.9,12.6,1.8,1.8,,,ELECTIVE,MICU,3.0,0.0
8,8,1.0,"(49, 59]",24,0,8.0,27.0,29.0,,,...,14.1,14.6,1.9,2.0,,,ELECTIVE,SICU,2.0,1.0
9,9,0.0,"(69, 79]",67,0,9.0,21.0,26.0,,,...,18.3,20.4,2.0,2.1,1.1,1.1,ELECTIVE,SICU,3.0,1.0


In [45]:
(data_aumc['cardiac_surgery_flag']==1).sum()

7036

In [46]:
#adjust the unit of each labtest
#for AUMC
data_aumc['albumin_min']=data_aumc['albumin_min']/10
data_aumc['albumin_max']=data_aumc['albumin_max']/10
data_aumc['bilirubin_max']=data_aumc['bilirubin_max']/17.1
data_aumc['bilirubin_min']=data_aumc['bilirubin_min']/17.1
data_aumc['calcium_max']=data_aumc['calcium_max']/0.25
data_aumc['calcium_min']=data_aumc['calcium_min']/0.25
data_aumc['hemoglobin_max']=data_aumc['hemoglobin_max']/0.62
data_aumc['hemoglobin_min']=data_aumc['hemoglobin_min']/0.62
data_aumc['creatinine_max']=data_aumc['creatinine_max']/88.4
data_aumc['creatinine_min']=data_aumc['creatinine_min']/88.4
data_aumc['glucose_max']=data_aumc['glucose_max']*18.018
data_aumc['glucose_min']=data_aumc['glucose_min']*18.018
data_aumc['magnesium_max']=data_aumc['magnesium_max']/0.5
data_aumc['magnesium_min']=data_aumc['magnesium_min']/0.5
data_aumc['phosphate_max']=data_aumc['phosphate_max']/0.32
data_aumc['phosphate_min']=data_aumc['phosphate_min']/0.32

### HJ23

In [47]:
data_HJ23.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4840 entries, 0 to 4839
Data columns (total 46 columns):
subject_id             4840 non-null int64
hadm_id                4840 non-null object
icustay_id             4840 non-null int64
gender                 4840 non-null object
age                    4840 non-null int64
ethnicity              4840 non-null object
icu_los_hours          4840 non-null float64
icu_expire_flag        4840 non-null int64
albumin_min            842 non-null float64
albumin_max            842 non-null float64
bilirubin_min          3037 non-null float64
bilirubin_max          3037 non-null float64
bicarbonate_min        4424 non-null float64
bicarbonate_max        4424 non-null float64
creatinine_min         4721 non-null float64
creatinine_max         4721 non-null float64
glucose_min            4778 non-null float64
glucose_max            4778 non-null float64
hemoglobin_min         4724 non-null float64
hemoglobin_max         4724 non-null float64
lactat

In [48]:
#male->1, female->0
data_HJ23['gender'].replace('M',1,inplace=True)
data_HJ23['gender'].replace('F',0,inplace=True)

In [49]:
data_HJ23['crrt_duration_hours']=np.nan
data_HJ23.loc[data_mimic['vent_duration_hours'].notnull(),'vent_flag']=1

In [50]:
data_HJ23['age'].describe()

count    4840.000000
mean       60.075620
std        16.872008
min         9.000000
25%        50.000000
50%        63.000000
75%        73.000000
max        99.000000
Name: age, dtype: float64

In [51]:
#select age >=18
data_HJ23=data_HJ23.loc[data_HJ23['age']>=18,:]
data_HJ23.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4774 entries, 0 to 4839
Data columns (total 48 columns):
subject_id             4774 non-null int64
hadm_id                4774 non-null object
icustay_id             4774 non-null int64
gender                 4774 non-null object
age                    4774 non-null int64
ethnicity              4774 non-null object
icu_los_hours          4774 non-null float64
icu_expire_flag        4774 non-null int64
albumin_min            838 non-null float64
albumin_max            838 non-null float64
bilirubin_min          2997 non-null float64
bilirubin_max          2997 non-null float64
bicarbonate_min        4364 non-null float64
bicarbonate_max        4364 non-null float64
creatinine_min         4656 non-null float64
creatinine_max         4656 non-null float64
glucose_min            4712 non-null float64
glucose_max            4712 non-null float64
hemoglobin_min         4659 non-null float64
hemoglobin_max         4659 non-null float64
lactat

In [52]:
data_HJ23.sort_values(by=['subject_id','age','icustay_id'],ascending=True,inplace=True)

In [53]:
#reserve the first row data of each patient
grouped=data_HJ23.groupby(['subject_id'])
data_HJ23=grouped.first()
data_HJ23.reset_index(inplace=True)
data_HJ23.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4238 entries, 0 to 4237
Data columns (total 48 columns):
subject_id             4238 non-null int64
hadm_id                4238 non-null object
icustay_id             4238 non-null int64
gender                 4238 non-null object
age                    4238 non-null int64
ethnicity              4238 non-null object
icu_los_hours          4238 non-null float64
icu_expire_flag        4238 non-null int64
albumin_min            804 non-null float64
albumin_max            804 non-null float64
bilirubin_min          2748 non-null float64
bilirubin_max          2748 non-null float64
bicarbonate_min        3892 non-null float64
bicarbonate_max        3892 non-null float64
creatinine_min         4140 non-null float64
creatinine_max         4140 non-null float64
glucose_min            4186 non-null float64
glucose_max            4186 non-null float64
hemoglobin_min         4149 non-null float64
hemoglobin_max         4149 non-null float64
lactat

In [54]:
#age group
data_HJ23['age_group']=pd.cut(data_HJ23['age'], age_bins)

In [55]:
#divide into different outcome group, according to the quartiles
survival_df=data_HJ23.loc[data_HJ23['icu_expire_flag']==0,:]
q_1=survival_df['icu_los_hours'].describe()['25%']
q_2=survival_df['icu_los_hours'].describe()['50%']
q_3=survival_df['icu_los_hours'].describe()['75%']
data_HJ23.loc[data_HJ23['icu_los_hours'] <= q_1, 'outcome_group'] = 1
data_HJ23.loc[(data_HJ23['icu_los_hours'] > q_1) & (data_HJ23['icu_los_hours'] <= q_2), 'outcome_group'] = 2
data_HJ23.loc[(data_HJ23['icu_los_hours'] > q_2) & (data_HJ23['icu_los_hours'] <= q_3), 'outcome_group'] = 3
data_HJ23.loc[(data_HJ23['icu_los_hours'] > q_3), 'outcome_group'] = 4
#expired patients in group 5
data_HJ23.loc[(data_HJ23['icu_expire_flag'] == 1), 'outcome_group'] = 5

In [56]:
#all patients in HJ23 are not cardiac surgery
data_HJ23['cardiac_surgery_flag']=0

In [57]:
data_HJ23

Unnamed: 0,subject_id,hadm_id,icustay_id,gender,age,ethnicity,icu_los_hours,icu_expire_flag,albumin_min,albumin_max,...,hospital_los_hours,unittype,admission_type,crrt_flag,vent_duration_hours,crrt_duration_hours,vent_flag,age_group,outcome_group,cardiac_surgery_flag
0,4863,0012995174,3135,1,56,UNKNOWN/NOT SPECIFIED,16.6,0,,,...,53.1,SICU,EMERGENCY,0,,,1.0,"(49, 59]",1.0,0
1,22349,0013085973,3479,0,78,UNKNOWN/NOT SPECIFIED,130.0,1,,,...,135.8,MICU,EMERGENCY,1,,,1.0,"(69, 79]",5.0,0
2,44533,0014868934,9718,0,58,UNKNOWN/NOT SPECIFIED,100.9,0,3.38,3.38,...,192.9,MICU,EMERGENCY,0,,,,"(49, 59]",3.0,0
3,46643,0013863730,6267,1,51,UNKNOWN/NOT SPECIFIED,50.1,0,,,...,120.0,MICU,EMERGENCY,0,,,1.0,"(49, 59]",1.0,0
4,49078,0012818450,2472,0,70,UNKNOWN/NOT SPECIFIED,113.9,0,,,...,376.8,MICU,EMERGENCY,0,,,,"(69, 79]",3.0,0
5,52001,0013836099,6174,0,55,UNKNOWN/NOT SPECIFIED,44.8,0,,,...,56.8,MICU,EMERGENCY,0,,,,"(49, 59]",1.0,0
6,54047,0012650354,1882,1,81,UNKNOWN/NOT SPECIFIED,229.4,1,,,...,230.6,MICU,EMERGENCY,0,193.8,,,"(79, 300]",5.0,0
7,72697,0012930602,2867,1,55,UNKNOWN/NOT SPECIFIED,429.2,0,,,...,596.6,MICU,EMERGENCY,0,260.2,,1.0,"(49, 59]",4.0,0
8,89576,0013464725,4848,0,52,UNKNOWN/NOT SPECIFIED,206.3,0,,,...,550.7,SICU,ELECTIVE,0,72.8,,,"(49, 59]",4.0,0
9,90904,0013078158,3451,1,77,UNKNOWN/NOT SPECIFIED,48.8,0,,,...,238.5,MICU,EMERGENCY,0,42.0,,1.0,"(69, 79]",1.0,0


### PLAGH

no crrt nor vent information

In [58]:
data_plagh.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63515 entries, 0 to 63514
Data columns (total 37 columns):
patient_id            63515 non-null object
visit_id              63515 non-null int64
unittype              63515 non-null object
gender                63515 non-null object
age                   63515 non-null float64
body_height           61235 non-null float64
body_weight           61226 non-null float64
icu_los_hours         63515 non-null float64
icu_expire_flag       63515 non-null int64
admission_type        63515 non-null object
hospital_los_hours    63515 non-null float64
lactate_min           16 non-null float64
bicarbonate_min       1162 non-null float64
ionizedcalcium_min    8776 non-null float64
wbc_min               58991 non-null float64
creatinine_min        59017 non-null float64
glucose_min           55691 non-null float64
platelets_min         58465 non-null float64
albumin_min           57350 non-null float64
hemoglobin_min        58250 non-null float64
calc

In [59]:
#male->1, female->0
data_plagh['gender'].replace('M',1,inplace=True)
data_plagh['gender'].replace('F',0,inplace=True)

In [60]:
#transfer -> emergency, outpatient -> elective
data_plagh['admission_type']=data_plagh['admission_type'].str.upper()
data_plagh['admission_type'].replace('TRANSFER','EMERGENCY',inplace=True)
data_plagh['admission_type'].replace('OUTPATIENT','ELECTIVE',inplace=True)

In [61]:
#the unit of 'hospital_los_hours' and 'icu_los_hours' in current data is day, transmit to hour
data_plagh['hospital_los_hours']=data_plagh['hospital_los_hours']*24
data_plagh['icu_los_hours']=data_plagh['icu_los_hours']*24

In [62]:
data_plagh['age'].describe()

count    63515.000000
mean        55.785572
std         15.394890
min         18.000000
25%         45.890000
50%         56.890000
75%         66.770000
max        102.750000
Name: age, dtype: float64

In [63]:
#age group
data_plagh['age_group']=pd.cut(data_plagh['age'], age_bins)

In [64]:
#divide into different outcome group, according to the quartiles
survival_df=data_plagh.loc[data_plagh['icu_expire_flag']==0,:]
q_1=survival_df['icu_los_hours'].describe()['25%']
q_2=survival_df['icu_los_hours'].describe()['50%']
q_3=survival_df['icu_los_hours'].describe()['75%']
data_plagh.loc[data_plagh['icu_los_hours'] <= q_1, 'outcome_group'] = 1
data_plagh.loc[(data_plagh['icu_los_hours'] > q_1) & (data_plagh['icu_los_hours'] <= q_2), 'outcome_group'] = 2
data_plagh.loc[(data_plagh['icu_los_hours'] > q_2) & (data_plagh['icu_los_hours'] <= q_3), 'outcome_group'] = 3
data_plagh.loc[(data_plagh['icu_los_hours'] > q_3), 'outcome_group'] = 4
#expired patients in group 5
data_plagh.loc[(data_plagh['icu_expire_flag'] == 1), 'outcome_group'] = 5

In [65]:
#cardiac surgery patients
data_plagh.loc[data_plagh['unittype']=='CSICU','cardiac_surgery_flag']=1
data_plagh.loc[data_plagh['unittype']!='CSICU','cardiac_surgery_flag']=0

In [66]:
data_plagh

Unnamed: 0,patient_id,visit_id,unittype,gender,age,body_height,body_weight,icu_los_hours,icu_expire_flag,admission_type,...,platelets_max,albumin_max,hemoglobin_max,calcium_max,sodium_max,potassium_max,magnesium_max,age_group,outcome_group,cardiac_surgery_flag
0,S108413368,2,TICU,1,81.36,169.0,70.0,190.6872,0,ELECTIVE,...,262.0,30.7,91.0,2.06,133.8,3.56,0.73,"(79, 300]",4.0,0.0
1,S108415620,2,CCU,0,78.48,155.0,52.0,41.0304,0,EMERGENCY,...,251.0,36.8,124.0,2.07,143.5,3.51,0.85,"(69, 79]",1.0,0.0
2,S108417789,3,CCU,0,82.27,160.0,90.0,448.0008,0,EMERGENCY,...,106.0,35.3,105.0,2.22,139.5,3.89,1.08,"(79, 300]",4.0,0.0
3,S108418984,1,CCU,1,72.54,175.0,85.0,187.5000,0,EMERGENCY,...,242.0,40.5,155.0,2.28,141.9,4.11,0.90,"(69, 79]",4.0,0.0
4,S108419116,1,CCU,1,73.30,160.0,73.0,213.4992,0,EMERGENCY,...,147.0,37.3,165.0,2.27,138.2,3.93,1.09,"(69, 79]",4.0,0.0
5,S108420701,2,CCU,1,79.09,166.0,60.0,262.9008,0,EMERGENCY,...,203.0,40.7,142.0,2.29,133.7,5.24,0.93,"(79, 300]",4.0,0.0
6,S108420759,1,CCU,1,77.69,177.0,84.0,212.1096,0,EMERGENCY,...,161.0,38.2,151.0,2.17,144.7,3.61,0.98,"(69, 79]",4.0,0.0
7,S108422800,1,CCU,0,66.05,152.0,48.0,91.0008,0,EMERGENCY,...,310.0,40.3,110.0,2.21,143.2,3.05,0.92,"(59, 69]",2.0,0.0
8,S108423165,1,CCU,0,61.33,160.0,58.0,118.3344,0,EMERGENCY,...,170.0,43.0,139.0,2.26,144.2,3.95,0.89,"(59, 69]",3.0,0.0
9,S108423212,3,CCU,1,73.90,174.0,80.0,139.5840,0,ELECTIVE,...,187.0,40.0,122.0,2.19,144.1,4.00,0.94,"(69, 79]",3.0,0.0


In [67]:
(data_plagh['cardiac_surgery_flag']==1).sum()

7265

In [68]:
#adjust the unit of each labtest
#for plagh database
data_plagh['albumin_min']=data_plagh['albumin_min']/10
data_plagh['albumin_max']=data_plagh['albumin_max']/10
data_plagh['creatinine_max']=data_plagh['creatinine_max']/88.4
data_plagh['creatinine_min']=data_plagh['creatinine_min']/88.4
data_plagh['glucose_max']=data_plagh['glucose_max']*18.018
data_plagh['glucose_min']=data_plagh['glucose_min']*18.018
data_plagh['hemoglobin_min']=data_plagh['hemoglobin_min']/10
data_plagh['hemoglobin_max']=data_plagh['hemoglobin_max']/10
data_plagh['magnesium_max']=data_plagh['magnesium_max']/0.5
data_plagh['magnesium_min']=data_plagh['magnesium_min']/0.5
data_plagh['calcium_max']=data_plagh['calcium_max']/0.25
data_plagh['calcium_min']=data_plagh['calcium_min']/0.25

In [69]:
data_plagh['unittype'].replace('ICU','SICU',inplace=True)

## Remove outliers

In [70]:
kinds=['_min','_max']
for kind in kinds:
    for l, u in iter(sorted(lab_outlier_ranges.items())):
        if (l.lower()+kind) not in data_plagh.columns.values:
            data_plagh[l.lower()+kind]=np.nan
        if (l.lower()+kind) not in data_eicu.columns.values:
            data_eicu[l.lower()+kind]=np.nan
        
        data_mimic.loc[data_mimic[(data_mimic[l.lower()+kind]<lab_outlier_ranges[l][0])|(data_mimic[l.lower()+kind]>lab_outlier_ranges[l][1])].index,
                       [l.lower()+kind]]=np.nan
        
        data_eicu.loc[data_eicu[(data_eicu[l.lower()+kind]<lab_outlier_ranges[l][0])|(data_eicu[l.lower()+kind]>lab_outlier_ranges[l][1])].index,
                       [l.lower()+kind]]=np.nan
        
        data_plagh.loc[data_plagh[(data_plagh[l.lower()+kind]<lab_outlier_ranges[l][0])|(data_plagh[l.lower()+kind]>lab_outlier_ranges[l][1])].index,
                       [l.lower()+kind]]=np.nan
        
        data_aumc.loc[data_aumc[(data_aumc[l.lower()+kind]<lab_outlier_ranges[l][0])|(data_aumc[l.lower()+kind]>lab_outlier_ranges[l][1])].index,
                       [l.lower()+kind]]=np.nan
        
        data_HJ23.loc[data_HJ23[(data_HJ23[l.lower()+kind]<lab_outlier_ranges[l][0])|(data_HJ23[l.lower()+kind]>lab_outlier_ranges[l][1])].index,
                       [l.lower()+kind]]=np.nan

## save data

In [71]:
data_mimic.to_csv('../datasets/processed data/MIMIC-data.csv',index=False)
data_aumc.to_csv('../datasets/processed data/AUMC-data.csv',index=False)
data_eicu.to_csv('../datasets/processed data/eCRD-data.csv',index=False)
data_plagh.to_csv('../datasets/processed data/PLAGH-data.csv',index=False)
data_HJ23.to_csv('../datasets/processed data/HJ23-data.csv',index=False)