In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
## Random sample of 100 patient ids.
## Later cells use this list to select the matching static-metadata rows and
## to build the per-patient pickle file names that are loaded.
sample_pid_list = ['PUH-2019-032',
    'PUH-2011-125',
    'PUH-2014-216',
    'PUH-2014-187',
    'PUH-2017-229',
    'PUH-2018-046',
    'PUH-2013-069',
    'PUH-2018-164',
    'PUH-2012-210',
    'PUH-2015-122',
    'PUH-2014-150',
    'PUH-2011-163',
    'PUH-2015-136',
    'PUH-2016-092',
    'PUH-2017-027',
    'PUH-2013-140',
    'PUH-2018-054',
    'PUH-2014-174',
    'PUH-2019-022',
    'PUH-2019-037',
    'PUH-2016-048',
    'PUH-2014-130',
    'PUH-2016-181',
    'PUH-2012-158',
    'PUH-2010-205',
    'PUH-2015-095',
    'PUH-2013-194',
    'PUH-2011-166',
    'PUH-2012-186',
    'PUH-2014-211',
    'PUH-2011-017',
    'PUH-2016-175',
    'PUH-2015-270',
    'PUH-2013-063',
    'PUH-2016-277',
    'PUH-2017-296',
    'PUH-2019-108',
    'PUH-2016-313',
    'PUH-2017-098',
    'PUH-2011-089',
    'PUH-2017-077',
    'PUH-2011-071',
    'PUH-2016-142',
    'PUH-2017-057',
    'PUH-2013-165',
    'PUH-2016-304',
    'PUH-2013-002',
    'PUH-2011-154',
    'PUH-2013-059',
    'PUH-2018-302',
    'PUH-2016-080',
    'PUH-2018-042',
    'PUH-2011-120',
    'PUH-2018-167',
    'PUH-2017-294',
    'PUH-2016-246',
    'PUH-2019-124',
    'PUH-2016-100',
    'PUH-2014-079',
    'PUH-2015-293',
    'PUH-2011-152',
    'PUH-2016-027',
    'PUH-2017-181',
    'PUH-2012-043',
    'PUH-2010-181',
    'PUH-2016-257',
    'PUH-2010-165',
    'PUH-2011-030',
    'PUH-2014-106',
    'PUH-2012-095',
    'PUH-2012-214',
    'PUH-2016-242',
    'PUH-2012-223',
    'PUH-2011-111',
    'PUH-2014-060',
    'PUH-2013-032',
    'PUH-2015-264',
    'PUH-2015-207',
    'PUH-2017-292',
    'PUH-2015-051',
    'PUH-2010-140',
    'PUH-2014-088',
    'PUH-2013-172',
    'PUH-2016-324',
    'PUH-2015-258',
    'PUH-2017-008',
    'PUH-2019-033',
    'PUH-2016-154',
    'PUH-2017-222',
    'PUH-2010-163',
    'PUH-2015-209',
    'PUH-2017-146',
    'PUH-2013-119',
    'PUH-2019-010',
    'PUH-2012-153',
    'PUH-2015-064',
    'PUH-2014-100',
    'PUH-2016-087',
    'PUH-2012-172',
    'PUH-2015-216']


In [3]:
## patient metadata
## Load the per-patient metadata, impute missing values, one-hot encode the
## categorical covariates and keep the rows belonging to the sampled patients.
data_path = 'metadata_random_100id.csv'
meta_data = pd.read_csv(data_path)
interest_cols = ['id','surv', 'death_cat', 'follow_com',] # outcome event related
cols = ['id', 'y', 'delta']
invariant_cols = ['age', 'female', 'oohca', 'edarrest', 'rhythm', 'ca_type', 'transfer', 
            'witnessed', 'bystander_cpr', 'shocks', 'duration', 
            'four_r_0', 'four_eye_0', 'four_m_0', 'pupils_0', 'corneals_0', 'cough_0', 'gag_0', 
            ] # time-invariant / static covariates

# Value substituted for NaN in each column.
nan_fill_values = {
    'edarrest': 0,       # No
    'witnessed': 0,      # No
    'bystander_cpr': 0,  # No
    'shocks': 0,         # zero
    'duration': 0,       # zero-minute duration
    'four_r_0': -1,      # unable to determine
    'four_eye_0': -1,    # unable to determine
    'four_m_0': -1,      # unable to determine
    'pupils_0': -1,      # unable to determine
    'corneals_0': -1,    # unable to determine
    'cough_0': -1,       # unable to determine
    'gag_0': -1,         # unable to determine
}
# Assign the filled column back instead of calling
# `meta_data[col].replace(..., inplace=True)`: that form edits a temporary
# Series and, with pandas Copy-on-Write enabled, never reaches `meta_data`.
for col, fill_value in nan_fill_values.items():
    meta_data[col] = meta_data[col].fillna(fill_value)
# A recorded duration of -1 is also treated as a zero-minute duration.
meta_data['duration'] = meta_data['duration'].replace(-1, 0)

# Categorical covariates to be one-hot encoded. The `cci*` columns are not part
# of `invariant_cols` and are not encoded in this cell.
cols_to_be_OHE = ['rhythm','ca_type', 'witnessed', 'bystander_cpr', 
              'four_r_0', 'four_eye_0', 'four_m_0', 'pupils_0', 'corneals_0', 'cough_0', 'gag_0']
# Every static covariate that is not one-hot encoded is kept as a numeric column.
numeric_cols = [col for col in invariant_cols if col not in cols_to_be_OHE]

# Cast to object so numeric category codes are encoded as categories;
# drop_first=True omits the first level of every encoded column.
dummies = pd.get_dummies(meta_data[cols_to_be_OHE].astype('object'), prefix=cols_to_be_OHE, drop_first=True)
inv_data = pd.concat([meta_data[['id']+numeric_cols], dummies], axis=1)
invariant_cols = inv_data.columns.tolist()[1:] # exclude the first col of id

# Inner merge: sampled ids without a metadata row are dropped from static_feat.
static_feat = pd.DataFrame({'id': sample_pid_list})
static_feat = static_feat.merge(inv_data, on='id')
static_feat.head()

Unnamed: 0,id,age,female,oohca,edarrest,transfer,shocks,duration,rhythm_2,rhythm_3,...,pupils_0_0.0,pupils_0_1.0,pupils_0_2.0,corneals_0_0.0,corneals_0_1.0,corneals_0_2.0,cough_0_0.0,cough_0_1.0,gag_0_0.0,gag_0_1.0
0,PUH-2019-032,62,1.0,1.0,0.0,0,1.0,16.0,False,False,...,False,False,True,False,False,True,False,True,False,True
1,PUH-2011-125,21,0.0,1.0,0.0,1,0.0,0.0,False,True,...,True,False,False,True,False,False,True,False,True,False
2,PUH-2014-216,49,0.0,0.0,0.0,0,0.0,45.0,True,False,...,True,False,False,False,False,False,True,False,True,False
3,PUH-2014-187,69,0.0,1.0,0.0,1,3.0,25.0,False,False,...,True,False,False,True,False,False,True,False,True,False
4,PUH-2017-229,26,0.0,1.0,0.0,1,0.0,26.0,False,True,...,True,False,False,True,False,False,True,False,True,False


Causes of death:
- 1: Rearrested, intractable shock, multisystem organ failure
- 2: Brain death
- 3: Withdrawal for non-neurological reasons (DNR, advance directives, etc.); excluded from the study
- 4: Withdrawal for perceived poor neurological prognosis

Death categories 1 and 2 are combined into a single event label (1: natural death).

In [4]:
## time to awake label
time_to_outcome_path = 'time_to_outcome_random_100id.csv'
time_to_outcome = pd.read_csv(time_to_outcome_path)
# awake is 1 when follow_com is exactly 'Yes', otherwise 0
time_to_outcome['awake'] = time_to_outcome['follow_com'].eq('Yes').astype('int64')

## merge outcome-related features
outcome_meta = meta_data[interest_cols]
dt = pd.merge(time_to_outcome, outcome_meta, on='id')

## get event label based on cause of death
# (death_cat value, event label) pairs, applied in order:
#   1 and 2 -> 1: natural death
#   3       -> 3: Withdrawal for non-neurological reasons, not considered in the study, to be excluded later
#   4       -> 4: Withdrawal for perceived poor neurological prognosis
death_cat_to_event = ((1, 1), (2, 1), (3, 3), (4, 4))

dt['event'] = 0 # censored unless one of the rules below matches
for death_category, event_label in death_cat_to_event:
    dt.loc[dt['death_cat'] == death_category, 'event'] = event_label
# applied last so that awake overrides any death category (awake, could possibly die later)
dt.loc[dt['awake'] == 1, 'event'] = 2

dt.head()

Unnamed: 0,id,follow_com_x,obsduration,awake,surv,death_cat,follow_com_y,event
0,PUH-2010-140,Yes,89.699997,1,1.0,,1.0,2
1,PUH-2010-163,No,56.099998,0,0.0,4.0,0.0,4
2,PUH-2010-165,No,286.70001,0,0.0,4.0,0.0,4
3,PUH-2010-181,Yes,61.799999,1,1.0,,1.0,2
4,PUH-2010-205,No,64.0,0,0.0,4.0,0.0,4


In [5]:
## Load each sampled patient's pickle and collect per-patient arrays:
## EEG summary rows, static features, remaining time to outcome per row,
## and the event label repeated for every row.
save_path = 'eeg_data/'

patients_ls = sorted(sample_pid_list)

# `dt` is expected to be the per-patient outcome table with `id` and `event`
# columns (a later cell rebinds `dt`, so this cell must run before that one).
# Build the id -> event lookup once instead of filtering `dt` for every patient;
# keeping the first row per id matches a first-match lookup.
event_by_id = dt.drop_duplicates(subset='id').set_index('id')['event']

X_summary = []
X_static = []
event_times = []
event_indicators = []
time_to_outcomes_one_per_patient = []

for patient in patients_ls:
    if patient not in event_by_id.index:
        raise KeyError(f"no outcome row with an event label for patient {patient!r}")

    # NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
    with open(save_path+patient+'.pickle', 'rb') as handle:
        curr_X_summary, curr_y, curr_x_static, seq_len_in_minute = pickle.load(handle)

    # Overwrite the stored label with the event label derived from the outcome table.
    curr_y[1] = int(event_by_id[patient])
    # One row per time step, 72 EEG summary values per row.
    curr_X_summary = curr_X_summary.reshape((-1,72))
    curr_x_static = list(curr_x_static.squeeze())

    n_steps = curr_X_summary.shape[0]
    # Remaining time to outcome for each row: curr_y[0] - 1, curr_y[0] - 2, ...
    # for all rows but the last; the last row uses the full sequence length
    # converted from minutes (presumably rows are hourly and the final one may
    # be partial -- TODO confirm).
    event_time = np.array(list(curr_y[0] - np.arange(1, n_steps)) + [curr_y[0] - seq_len_in_minute/60])
    event_indicator = np.array([int(curr_y[1])] * n_steps)

    X_summary.append(curr_X_summary)
    X_static.append(curr_x_static)
    event_times.append(event_time)
    event_indicators.append(event_indicator)
    time_to_outcomes_one_per_patient.append(curr_y[0])


In [6]:
## put in the format needed for Dynamic-DeepHit
## One row per patient time step: tte, times, label, id, then the EEG summary
## columns (eeg_1..eeg_N) and the repeated static columns (sta_1..sta_M).
num_eeg_feat = 72
num_static_feat = 43
# Column names do not depend on the patient, so build them once.
eeg_cols = ['eeg_'+str(e+1) for e in range(num_eeg_feat)]
sta_cols = ['sta_'+str(s+1) for s in range(num_static_feat)]

# Collect per-patient frames in a separate list; the loop uses its own scalar
# name so the `time_to_outcome` DataFrame from an earlier cell is not overwritten.
patient_frames = []
for i in range(len(event_times)):
    tte_i = time_to_outcomes_one_per_patient[i]
    seq_len_i = event_times[i].shape[0]
    dt_i = pd.DataFrame({'tte': [tte_i] * seq_len_i,
                         # time to outcome minus the remaining time stored for each row
                         'times': tte_i - event_times[i],
                         'label': event_indicators[i]})
    dt_i['id'] = i+1 # 1-based position in the sorted patient list, not the original patient id

    eeg_i = pd.DataFrame(X_summary[i], columns=eeg_cols)
    # Repeat the single static feature vector once per time step.
    sta_i = pd.DataFrame(np.repeat(np.array(X_static[i]).reshape(1,-1), seq_len_i, 0),
                         columns=sta_cols)
    patient_frames.append(pd.concat([dt_i, eeg_i, sta_i], axis=1))

# NOTE: this rebinds `dt`, which earlier held the per-patient outcome table;
# the cell that loads the pickles cannot be re-run after this one without
# rebuilding that table first.
dt = pd.concat(patient_frames)
# Relabel event 4 as 3.
# NOTE(review): event 3 was described as "to be excluded later", but no exclusion
# is visible before this point; any rows already labelled 3 would be merged with
# the relabelled 4s -- confirm none are present.
dt['label'] = dt['label'].replace({4: 3})
dt.sample(5)

Unnamed: 0,tte,times,label,id,eeg_1,eeg_2,eeg_3,eeg_4,eeg_5,eeg_6,...,sta_34,sta_35,sta_36,sta_37,sta_38,sta_39,sta_40,sta_41,sta_42,sta_43
6,16.5,7.0,1,83,1.644075,2.013129,22.692695,12.704718,16.983665,19.575765,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
6,117.4,7.0,3,20,27.577321,37.142335,24.301794,12.626912,18.157651,21.373582,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,26.0,2.0,2,68,4.705797,6.869078,18.220651,11.326365,14.363237,16.067783,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
11,235.10001,11.95,2,34,6.26098,6.168697,23.377456,12.890406,17.439244,20.207105,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
4,49.700001,5.0,3,32,4.731919,4.116031,51.51573,27.081573,37.756089,44.393967,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0


In [7]:
# Write the combined long-format table to CSV without the row index.
output_csv_path = 'combined_data_long_random_100id.csv'
dt.to_csv(output_csv_path, index=False)