In [1]:
import pandas as pd
import numpy as np
import os
import sys
sys.path.append('/cluster/tufts/hugheslab/prath01/projects/time_series_prediction/src/')
from split_dataset import split_dataframe_by_keys
from feature_transformation import get_fenceposts

In [10]:
# Read the per-timestep feature table and the per-sequence outcome table.
# Both files live under the eicu_v2.0 standardized_data directory.
ts_df = pd.read_csv(
    filepath_or_buffer='/cluster/tufts/hugheslab/datasets/eicu_v2.0/standardized_data/features_per_tstep_first_48_hours_irregular_ts.csv.gz'
)
outcomes_df = pd.read_csv(
    filepath_or_buffer='/cluster/tufts/hugheslab/datasets/eicu_v2.0/standardized_data/outcomes_per_seq_first_48_hours_irregular_ts.csv'
)

In [12]:
# Display the outcomes table loaded above (one row per entry of the outcomes CSV).
outcomes_df

Unnamed: 0,hadm_id,icustay_id,mort,los_icu,hospitalid,unittype
0,128919,141168,1,2.497222,59,Med-Surg ICU
1,128927,141178,0,0.005556,60,Med-Surg ICU
2,128927,141179,0,1.418056,60,Med-Surg ICU
3,128941,141194,0,3.342361,73,CTICU
4,128943,141196,0,1.015972,67,Med-Surg ICU
...,...,...,...,...,...,...
200854,2743084,3353235,0,0.742361,458,Cardiac ICU
200855,2743086,3353237,0,0.881250,458,MICU
200856,2743099,3353251,0,11.290972,458,Cardiac ICU
200857,2743102,3353254,0,0.299306,459,Med-Surg ICU


## Remove stays less than 48 hours

In [15]:
# Keep only stays whose `los_icu` is at least min_stay_hrs/24.0
# (the comparison treats `los_icu` as a length expressed in days).
min_stay_hrs = 48
keep_inds = outcomes_df['los_icu'] >= (min_stay_hrs / 24.0)
outcomes_df = outcomes_df[keep_inds].reset_index(drop=True)

# Drop time-series rows belonging to stays that were filtered out above.
ts_df = ts_df[ts_df['icustay_id'].isin(outcomes_df['icustay_id'])].reset_index(drop=True)

# Cohort-level summary after filtering. Patient-level numbers are not computed
# in this cell (no patient identifier is used here).
stay_lengths = outcomes_df['los_icu'].values
n_stays = outcomes_df['icustay_id'].nunique(dropna=False)
n_deaths = outcomes_df['mort'].sum()

print('Total stays : %d'%n_stays)
print('Frac of stays resulting in death : %.3f'%(n_deaths/n_stays))

# Fraction of stays at or above each length-of-stay threshold.
for min_los in (3, 7, 11):
    inds = stay_lengths >= min_los
    frac_above_min_los = np.count_nonzero(inds) / n_stays
    print('Frac stays > %d days : %.3f'%(min_los, frac_above_min_los))

Total stays : 77832
Frac of stays resulting in death : 0.065
Frac stays > 3 days : 0.636
Frac stays > 7 days : 0.192
Frac stays > 11 days : 0.089


## Get feature names

In [4]:
# Time-varying measurement columns used as model inputs.
ts_feature_cols = [
    'creatinine', 'potassium', 'sodium', 'platelets x 1000', 'glucose',
    'HCO3', 'paO2', 'albumin', 'ALT (SGPT)', 'AST (SGOT)',
    'direct bilirubin', 'total bilirubin', 'troponin - T',
]

# Static demographic columns, identifier columns and the time column.
# NOTE(review): the filtering cell above keys stays by 'icustay_id', whereas the
# identifiers here are 'subject_id' / 'hadm_id' / 'stay_id' -- confirm which
# schema ts_df actually carries.
dem_cols = ['Age', 'is_gender_male', 'is_gender_unknown']
id_col = ['stay_id']
id_cols = ['subject_id', 'hadm_id', 'stay_id']
time_col = ['minutes_from_admission']
feature_cols = [*ts_feature_cols, *dem_cols]

# Attach demographics to every time-series row (left join keeps all ts_df rows).
# NOTE(review): demographics_df is not created in any cell visible in this
# notebook -- it must already exist in the kernel for this line to run.
features_df = ts_df.merge(demographics_df, on=id_cols, how='left')

# Store the time column as integers.
features_df['minutes_from_admission'] = features_df['minutes_from_admission'].astype(int)

## Split into train/valid/test

In [5]:
# Shared settings for every split: group rows by the id columns, hold out 20%,
# fixed seed. The same settings are applied to features and outcomes.
# NOTE(review): features and outcomes are split independently; this presumably
# yields matching partitions only if both frames contain the same set of keys
# -- verify against split_dataframe_by_keys.
split_kwargs = dict(cols_to_group=id_cols, size=0.2, random_state=41)

# Features: first carve out the test set, then carve validation out of the remainder.
x_train_df, x_test_df = split_dataframe_by_keys(features_df, **split_kwargs)
x_train_df, x_valid_df = split_dataframe_by_keys(x_train_df, **split_kwargs)

# Outcomes: same two-stage split.
y_train_df, y_test_df = split_dataframe_by_keys(outcomes_df, **split_kwargs)
y_train_df, y_valid_df = split_dataframe_by_keys(y_train_df, **split_kwargs)


In [6]:
# Remove the pre-split frames from the kernel namespace; the cells shown below
# only use the train/valid/test frames.
del features_df, ts_df, outcomes_df, demographics_df

## Get the train/valid/test stats

In [7]:
# Report cohort statistics for each split and add one binary label column per
# length-of-stay threshold to the corresponding outcomes frame.
# NOTE(review): this cell reads 'length_of_stay_in_hours', 'stay_id',
# 'subject_id' and 'in_icu_mortality', while the filtering cell above reads
# 'los_icu', 'icustay_id' and 'mort' -- confirm which column names the
# outcome frames really have.
for split, y_df, x_df in (
        ('train', y_train_df, x_train_df),
        ('valid', y_valid_df, x_valid_df),
        ('test', y_test_df, x_test_df)):

    stay_lengths = y_df['length_of_stay_in_hours'].values
    n_stays = y_df['stay_id'].nunique(dropna=False)
    n_patients = y_df['subject_id'].nunique(dropna=False)
    n_deaths = y_df['in_icu_mortality'].sum()

    print('Total stays : %d'%n_stays)
    print('Total patients : %d'%n_patients)
    print('Frac of stays resulting in death : %.3f'%(n_deaths/n_stays))
    print('Frac of patients who die : %.3f'%(n_deaths/n_patients))

    for min_los in (3, 7, 11):
        # Threshold is given in days; stay lengths are compared in hours.
        inds = stay_lengths >= min_los*24
        frac_above_min_los = np.count_nonzero(inds)/n_stays
        print('Frac stays > %d days in %s : %.3f'%(min_los, split, frac_above_min_los))
        # 0/1 label stored on the outcomes frame; consumed later as `outcome_col`.
        y_df['los_geq_%s_days'%min_los] = inds.astype(int)

Total stays : 33506
Total patients : 27084
Frac of stays resulting in death : 0.081
Frac of patients who die : 0.100
Frac stays > 3 days in train : 0.467
Frac stays > 7 days in train : 0.161
Frac stays > 11 days in train : 0.082
Total stays : 8377
Total patients : 7821
Frac of stays resulting in death : 0.080
Frac of patients who die : 0.086
Frac stays > 3 days in valid : 0.456
Frac stays > 7 days in valid : 0.160
Frac stays > 11 days in valid : 0.085
Total stays : 10471
Total patients : 9673
Frac of stays resulting in death : 0.084
Frac of patients who die : 0.090
Frac stays > 3 days in test : 0.472
Frac stays > 7 days in test : 0.166
Frac stays > 11 days in test : 0.085


## Get features (NxTxD), times (NxT) and labels (N)

In [8]:
def convert_csv_to_ts_matrix(x_df, y_df, outcome_col, T=1440):
    """Pack a long-format time-series frame into padded dense arrays.

    Sequence boundaries come from ``get_fenceposts(x_df, id_cols)``. The
    notebook-level names ``id_cols``, ``feature_cols`` and ``time_col`` must be
    defined before this function is called.

    Parameters
    ----------
    x_df : DataFrame
        One row per (sequence, time step); must contain ``feature_cols`` and
        the single column named in ``time_col``.
    y_df : DataFrame
        Outcomes; row ``ii`` (by position) is used as the label of the
        ``ii``-th sequence found in ``x_df``.
    outcome_col : str
        Column of ``y_df`` holding the integer label.
    T : int, optional
        Number of time slots allocated per sequence (default 1440, the value
        that was previously hard-coded).

    Returns
    -------
    X_NTD : float32 array (N, T, D), NaN where nothing was written
    y_N : int array (N,)
    times_NT : float32 array (N, T), 0 in unused slots
    mask_times_NT : bool array (N, T), True for slots holding a time step
    mask_obs_NTD : bool array (N, T, D), True where a non-NaN value was written

    Raises
    ------
    ValueError
        If any sequence has more than ``T`` rows.
    """
    import warnings

    fp = get_fenceposts(x_df, id_cols)
    nrows = len(fp) - 1
    D = len(feature_cols)

    # Labels are matched to sequences purely by position, so a differing count
    # means at least some labels cannot line up with their sequence.
    if len(y_df) != nrows:
        warnings.warn(
            'y_df has %d rows but x_df contains %d sequences; labels are taken '
            'by position and may be misaligned' % (len(y_df), nrows))

    X_NTD = np.full((nrows, T, D), np.nan, dtype=np.float32)
    times_NT = np.zeros((nrows, T), dtype=np.float32)
    y_N = np.zeros(nrows, dtype=int)
    mask_times_NT = np.zeros((nrows, T), dtype=bool)
    mask_obs_NTD = np.zeros((nrows, T, D), dtype=bool)

    # Pull the needed columns out of pandas once instead of once per sequence.
    all_vals = x_df[feature_cols].values
    all_ts = np.squeeze(x_df[time_col].values, axis=1)
    all_labels = y_df[outcome_col].values

    for ii in range(nrows):
        start, stop = fp[ii], fp[ii+1]
        cur_seq_len = stop - start
        if cur_seq_len > T:
            raise ValueError(
                'sequence %d has %d time steps, which exceeds T=%d'
                % (ii, cur_seq_len, T))

        curr_vals = all_vals[start:stop]

        X_NTD[ii, :cur_seq_len, :] = curr_vals
        times_NT[ii, :cur_seq_len] = all_ts[start:stop]
        y_N[ii] = all_labels[ii]
        mask_times_NT[ii, :cur_seq_len] = True
        mask_obs_NTD[ii, :cur_seq_len] = np.logical_not(np.isnan(curr_vals))

    return X_NTD, y_N, times_NT, mask_times_NT, mask_obs_NTD

In [11]:
# For each length-of-stay threshold: build dense arrays for every split,
# normalize them with statistics from the training split, and write them to
# <save_dir>/los_geq_<k>_days_prediction/.
# NOTE(review): the inputs read at the top of this notebook come from an eICU
# path, while this directory sits under MIMIC-IV -- confirm the destination
# before running, since existing files with the same names are overwritten.
save_dir = '/cluster/tufts/hugheslab/datasets/MIMIC-IV/ordinal_los_prediction/'
suffix = '_irregular_ts'
for min_los in [3, 7, 11]:
    outcome_col = "los_geq_%s_days"%min_los

    (train_X_NTD, train_y_N, train_times_NT,
     train_mask_times_NT, train_mask_obs_NTD) = convert_csv_to_ts_matrix(
        x_train_df, y_train_df, outcome_col)
    (valid_X_NTD, valid_y_N, valid_times_NT,
     valid_mask_times_NT, valid_mask_obs_NTD) = convert_csv_to_ts_matrix(
        x_valid_df, y_valid_df, outcome_col)
    (test_X_NTD, test_y_N, test_times_NT,
     test_mask_times_NT, test_mask_obs_NTD) = convert_csv_to_ts_matrix(
        x_test_df, y_test_df, outcome_col)

    D = train_X_NTD.shape[-1]

    # Per-feature scaling: subtract the training 1st percentile and divide by
    # the training 99th percentile (the original author notes this follows
    # mTAN). The same training statistics are applied to valid and test.
    for d in range(D):
        mins = np.nanpercentile(train_X_NTD[:, :, d], 1)
        maxs = np.nanpercentile(train_X_NTD[:, :, d], 99)
        if maxs==0:
            # avoid dividing by zero for features whose 99th percentile is 0
            maxs=1
        train_X_NTD[:, :, d] = (train_X_NTD[:, :, d]-mins)/maxs
        valid_X_NTD[:, :, d] = (valid_X_NTD[:, :, d]-mins)/maxs
        test_X_NTD[:, :, d] = (test_X_NTD[:, :, d]-mins)/maxs

    curr_save_dir = os.path.join(save_dir, 'los_geq_%s_days_prediction'%min_los)
    # np.save does not create missing directories, so make sure the target exists.
    os.makedirs(curr_save_dir, exist_ok=True)

    # replace the nan values with 0s (done after scaling so padding/missing
    # entries end up as exactly 0)
    train_X_NTD[np.isnan(train_X_NTD)]=0
    valid_X_NTD[np.isnan(valid_X_NTD)]=0
    test_X_NTD[np.isnan(test_X_NTD)]=0

    # scale the observed timepoints by the largest training time
    max_t = np.max(train_times_NT)
    train_times_NT = train_times_NT/max_t
    valid_times_NT = valid_times_NT/max_t
    test_times_NT = test_times_NT/max_t

    print('Saving data to %s'%curr_save_dir)
    # (file-name stem, array) pairs per split; file names match the previous
    # hand-written np.save calls exactly.
    arrays_by_split = [
        ('train', [('X_train', train_X_NTD),
                   ('y_train', train_y_N),
                   ('train_times_NT', train_times_NT),
                   ('train_mask_times_NT', train_mask_times_NT),
                   ('train_mask_obs_NTD', train_mask_obs_NTD)]),
        ('valid', [('X_valid', valid_X_NTD),
                   ('y_valid', valid_y_N),
                   ('valid_times_NT', valid_times_NT),
                   ('valid_mask_times_NT', valid_mask_times_NT),
                   ('valid_mask_obs_NTD', valid_mask_obs_NTD)]),
        ('test', [('X_test', test_X_NTD),
                  ('y_test', test_y_N),
                  ('test_times_NT', test_times_NT),
                  ('test_mask_times_NT', test_mask_times_NT),
                  ('test_mask_obs_NTD', test_mask_obs_NTD)]),
    ]
    for split_name, named_arrays in arrays_by_split:
        for stem, arr in named_arrays:
            np.save(os.path.join(curr_save_dir, '%s%s.npy'%(stem, suffix)), arr)
        print('Done saving %s..'%split_name)

Saving data to /cluster/tufts/hugheslab/datasets/MIMIC-IV/ordinal_los_prediction/los_geq_3_days_prediction
Done saving train..
Done saving valid..
Done saving test..
Saving data to /cluster/tufts/hugheslab/datasets/MIMIC-IV/ordinal_los_prediction/los_geq_7_days_prediction
Done saving train..
Done saving valid..
Done saving test..
Saving data to /cluster/tufts/hugheslab/datasets/MIMIC-IV/ordinal_los_prediction/los_geq_11_days_prediction
Done saving train..
Done saving valid..
Done saving test..


In [70]:
# Largest value in the training time array.
# NOTE(review): the export cell divides train_times_NT by its own maximum, so
# the saved output below (1439.0) appears to come from a different run --
# re-run to confirm.
np.max(train_times_NT)

1439.0

In [69]:
# Values of feature index 0 for the first training sequence, across all time slots.
# NOTE(review): the saved output contains NaNs although the export cell
# replaces NaNs with 0 -- the output appears stale; re-run to confirm.
train_X_NTD[0, :, 0]

array([       nan,        nan, 0.39855072, ...,        nan,        nan,
              nan], dtype=float32)

In [61]:
# Show the feature column list.
# NOTE(review): the saved output lists different names from the feature_cols
# built in the "Get feature names" cell -- presumably output from an earlier
# version of the notebook; re-run to confirm.
feature_cols

['Heart Rate',
 'Respiratory Rate',
 'O2 saturation pulseoxymetry',
 'Non Invasive Blood Pressure systolic',
 'Non Invasive Blood Pressure diastolic',
 'Temperature Fahrenheit',
 'Height (cm)',
 'Respiratory Rate (Total)',
 'Potassium (serum)',
 'Sodium (serum)',
 'Chloride (serum)',
 'Hematocrit (serum)',
 'Hemoglobin',
 'Creatinine (serum)',
 'Glucose (serum)',
 'Magnesium',
 'Phosphorous',
 'Platelet Count',
 'Glucose (whole blood)',
 'Daily Weight',
 'Absolute Neutrophil Count',
 'Prothrombin time',
 'Fibrinogen',
 'PH (Arterial)',
 'PH (Venous)',
 'HCO3 (serum)',
 'Arterial O2 pressure',
 'Arterial CO2 Pressure',
 'Lactic Acid',
 'Albumin',
 'Calcium non-ionized',
 'C Reactive Protein (CRP)',
 'ALT',
 'AST',
 'Direct Bilirubin',
 'Total Bilirubin',
 'Troponin-T',
 'Venous CO2 Pressure',
 'Age',
 'is_gender_male',
 'is_gender_unknown']

In [27]:
# Show the columns of a_df.
# NOTE(review): a_df is not defined in any cell visible in this notebook; this
# cell will fail on a fresh kernel unless it is defined elsewhere.
a_df.columns

Index(['Unnamed: 0', 'subject_id', 'hadm_id', 'stay_id',
       'hours_from_admission', 'timestamp', 'Heart Rate', 'Respiratory Rate',
       'O2 saturation pulseoxymetry', 'Non Invasive Blood Pressure systolic',
       'Non Invasive Blood Pressure diastolic', 'Temperature Fahrenheit',
       'Height (cm)', 'Respiratory Rate (Total)', 'Potassium (serum)',
       'Sodium (serum)', 'Chloride (serum)', 'Hematocrit (serum)',
       'Hemoglobin', 'Creatinine (serum)', 'Glucose (serum)', 'Magnesium',
       'Phosphorous', 'Platelet Count', 'Glucose (whole blood)',
       'Daily Weight', 'Absolute Neutrophil Count', 'Prothrombin time',
       'Fibrinogen', 'PH (Arterial)', 'PH (Venous)', 'HCO3 (serum)',
       'Arterial O2 pressure', 'Arterial CO2 Pressure', 'Lactic Acid',
       'Albumin', 'Calcium non-ionized', 'C Reactive Protein (CRP)', 'ALT',
       'AST', 'Direct Bilirubin', 'Total Bilirubin', 'Troponin-T',
       'Venous CO2 Pressure', 'admission_timestamp_x', 'Age', 'is_gender_male',


In [13]:
# Show the feature column list (same inspection as an earlier cell).
# NOTE(review): the saved output does not match the feature_cols built in the
# "Get feature names" cell -- presumably stale; re-run to confirm.
feature_cols

['Heart Rate',
 'Respiratory Rate',
 'O2 saturation pulseoxymetry',
 'Non Invasive Blood Pressure systolic',
 'Non Invasive Blood Pressure diastolic',
 'Temperature Fahrenheit',
 'Height (cm)',
 'Respiratory Rate (Total)',
 'Potassium (serum)',
 'Sodium (serum)',
 'Chloride (serum)',
 'Hematocrit (serum)',
 'Hemoglobin',
 'Creatinine (serum)',
 'Glucose (serum)',
 'Magnesium',
 'Phosphorous',
 'Platelet Count',
 'Glucose (whole blood)',
 'Daily Weight',
 'Absolute Neutrophil Count',
 'Prothrombin time',
 'Fibrinogen',
 'PH (Arterial)',
 'PH (Venous)',
 'HCO3 (serum)',
 'Arterial O2 pressure',
 'Arterial CO2 Pressure',
 'Lactic Acid',
 'Albumin',
 'Calcium non-ionized',
 'C Reactive Protein (CRP)',
 'ALT',
 'AST',
 'Direct Bilirubin',
 'Total Bilirubin',
 'Troponin-T',
 'Venous CO2 Pressure',
 'Age',
 'is_gender_male',
 'is_gender_unknown']