## Part III: Feature Engineering and Data Preparation

#### Setup Environment

In [1]:
%run environment-setup.ipynb

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
Stored 's3_datalake_path_csv' (str)
Stored 's3_datalake_path_prepared' (str)
Stored 'local_data_path_csv' (str)
Stored 's3_datalake_path_parquet' (str)


In [2]:
# import additional libs needed
from sklearn.preprocessing import StandardScaler

In [3]:
# load the cleaned datset from Athena/S3
sepsis_dataset = load_clean_dataset()

2024-11-17 00:24:34,899	INFO worker.py:1786 -- Started a local Ray instance.


### Data Transformation

The dataset is cleaned and complete, however additional work is still required to prepare for modeling.  In this section, the following steps will be taken:

-  Encode categorical features
-  Transform the time series data into patient time series sequences
-  Split dataset: the dataset will be split in to train/val/test sets
-  Normalize dataset: the dataset will be normalized using a standard scaler

In [13]:
# helper function to create lag features
def create_lag_features(df, lag_step, feature_cols):
    # get all unique patients
    unique_patient_ids = df['patient_id'].unique()
    
    # create matrix for lag features 
    lag_features = np.zeros(shape=(df.shape[0], len(feature_cols)))
    lag_column_names = [x.lower() + '_t_' + str(lag_step) for x in feature_cols]
    lag_features_found = np.zeros(shape=len(feature_cols))
    feature_cols = [x.lower() for x in feature_cols]
    
    # iterate patients
    row_counter = 0
    for patient_id in unique_patient_ids:
        patient_records = df[df['patient_id'] == patient_id].sort_values(by=['hour'], ascending=True)
        patient_row_counter = 0
        for idx, patient_record in patient_records.iterrows():
            for col_idx, col in enumerate(feature_cols):
                if(patient_row_counter == 0):
                    lag_features[row_counter, col_idx] = patient_record[col]
                else:
                    lag_features[row_counter, col_idx] = patient_records.iloc[patient_row_counter - 1][col]
            row_counter += 1
            patient_row_counter += 1

    return pd.concat([df, pd.DataFrame(index=df.index, data=lag_features, columns=lag_column_names)], axis=1)

# create an index to indicate deterioration in vitals and labs over given set of time steps
#def create_deterioration_index(df, lag_step, feature_cols):
    



In [14]:
# create the lag features for t-1
sepsis_dataset_with_lags = create_lag_features(sepsis_dataset, 1, continuous_feature_cols)
sepsis_dataset_with_lags

Unnamed: 0,patient_id,hour,sepsislabel,hr,o2sat,temp,sbp,map,dbp,resp,...,wbc_t_1,platelets_t_1,creatinine_t_1,glucose_t_1,lactate_t_1,hct_t_1,bun_t_1,potassium_t_1,magnesium_t_1,calcium_t_1
0,11093,0,0,77.0,100.0,36.939241,97.0,26.5,68.0,21.00,...,17.7,158.0,0.90,158.0,1.70,42.2,33.0,4.8,2.0,8.3
1,11093,1,0,71.5,97.5,36.939241,97.0,83.5,68.0,16.50,...,17.7,158.0,0.90,158.0,1.70,42.2,33.0,4.8,2.0,8.3
2,11093,2,0,79.0,100.0,36.939241,106.0,87.0,72.0,20.75,...,17.7,158.0,0.90,158.0,1.70,42.2,33.0,4.8,2.0,8.3
3,11093,3,0,61.0,100.0,36.939241,151.0,95.0,90.0,16.00,...,17.7,158.0,0.90,158.0,1.70,42.2,33.0,4.8,2.0,8.3
4,11093,4,0,49.0,100.0,36.939241,145.0,92.0,84.0,20.00,...,17.7,158.0,0.90,158.0,1.70,42.2,33.0,4.8,2.0,8.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
302722,116212,51,0,76.0,98.0,37.600000,89.0,63.0,44.0,18.00,...,5.0,113.0,3.07,184.0,1.16,32.2,119.0,5.1,2.6,8.1
302723,116212,52,0,77.0,97.5,37.000000,92.0,68.0,50.0,17.50,...,5.0,113.0,3.07,184.0,1.16,32.2,119.0,5.1,2.6,8.1
302724,116212,53,0,72.0,95.0,37.000000,104.0,72.0,51.0,18.00,...,5.0,113.0,3.07,184.0,1.16,32.2,119.0,5.1,2.6,8.1
302725,116212,54,0,71.0,99.0,37.000000,98.0,65.0,45.0,15.00,...,5.0,113.0,3.07,148.0,1.16,32.2,119.0,5.1,2.6,8.1


In [15]:
# create the lag features for t-3
sepsis_dataset_with_lags = create_lag_features(sepsis_dataset_with_lags, 3, continuous_feature_cols)
sepsis_dataset_with_lags

Unnamed: 0,patient_id,hour,sepsislabel,hr,o2sat,temp,sbp,map,dbp,resp,...,wbc_t_3,platelets_t_3,creatinine_t_3,glucose_t_3,lactate_t_3,hct_t_3,bun_t_3,potassium_t_3,magnesium_t_3,calcium_t_3
0,11093,0,0,77.0,100.0,36.939241,97.0,26.5,68.0,21.00,...,17.7,158.0,0.90,158.0,1.70,42.2,33.0,4.8,2.0,8.3
1,11093,1,0,71.5,97.5,36.939241,97.0,83.5,68.0,16.50,...,17.7,158.0,0.90,158.0,1.70,42.2,33.0,4.8,2.0,8.3
2,11093,2,0,79.0,100.0,36.939241,106.0,87.0,72.0,20.75,...,17.7,158.0,0.90,158.0,1.70,42.2,33.0,4.8,2.0,8.3
3,11093,3,0,61.0,100.0,36.939241,151.0,95.0,90.0,16.00,...,17.7,158.0,0.90,158.0,1.70,42.2,33.0,4.8,2.0,8.3
4,11093,4,0,49.0,100.0,36.939241,145.0,92.0,84.0,20.00,...,17.7,158.0,0.90,158.0,1.70,42.2,33.0,4.8,2.0,8.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
302722,116212,51,0,76.0,98.0,37.600000,89.0,63.0,44.0,18.00,...,5.0,113.0,3.07,184.0,1.16,32.2,119.0,5.1,2.6,8.1
302723,116212,52,0,77.0,97.5,37.000000,92.0,68.0,50.0,17.50,...,5.0,113.0,3.07,184.0,1.16,32.2,119.0,5.1,2.6,8.1
302724,116212,53,0,72.0,95.0,37.000000,104.0,72.0,51.0,18.00,...,5.0,113.0,3.07,184.0,1.16,32.2,119.0,5.1,2.6,8.1
302725,116212,54,0,71.0,99.0,37.000000,98.0,65.0,45.0,15.00,...,5.0,113.0,3.07,148.0,1.16,32.2,119.0,5.1,2.6,8.1


In [17]:
# one hot encode the sex feature (M/F)
one_hot = pd.get_dummies(sepsis_dataset_with_lags['gender'], prefix='gender', dtype='int')

# Join the encoded df
sepsis_dataset_encoded = sepsis_dataset_with_lags.drop('gender',axis = 1)
sepsis_dataset_encoded = sepsis_dataset_encoded.join(one_hot)
sepsis_dataset_encoded

Unnamed: 0,patient_id,hour,sepsislabel,hr,o2sat,temp,sbp,map,dbp,resp,...,creatinine_t_3,glucose_t_3,lactate_t_3,hct_t_3,bun_t_3,potassium_t_3,magnesium_t_3,calcium_t_3,gender_0,gender_1
0,11093,0,0,77.0,100.0,36.939241,97.0,26.5,68.0,21.00,...,0.90,158.0,1.70,42.2,33.0,4.8,2.0,8.3,0,1
1,11093,1,0,71.5,97.5,36.939241,97.0,83.5,68.0,16.50,...,0.90,158.0,1.70,42.2,33.0,4.8,2.0,8.3,0,1
2,11093,2,0,79.0,100.0,36.939241,106.0,87.0,72.0,20.75,...,0.90,158.0,1.70,42.2,33.0,4.8,2.0,8.3,0,1
3,11093,3,0,61.0,100.0,36.939241,151.0,95.0,90.0,16.00,...,0.90,158.0,1.70,42.2,33.0,4.8,2.0,8.3,0,1
4,11093,4,0,49.0,100.0,36.939241,145.0,92.0,84.0,20.00,...,0.90,158.0,1.70,42.2,33.0,4.8,2.0,8.3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
302722,116212,51,0,76.0,98.0,37.600000,89.0,63.0,44.0,18.00,...,3.07,184.0,1.16,32.2,119.0,5.1,2.6,8.1,1,0
302723,116212,52,0,77.0,97.5,37.000000,92.0,68.0,50.0,17.50,...,3.07,184.0,1.16,32.2,119.0,5.1,2.6,8.1,1,0
302724,116212,53,0,72.0,95.0,37.000000,104.0,72.0,51.0,18.00,...,3.07,184.0,1.16,32.2,119.0,5.1,2.6,8.1,1,0
302725,116212,54,0,71.0,99.0,37.000000,98.0,65.0,45.0,15.00,...,3.07,148.0,1.16,32.2,119.0,5.1,2.6,8.1,1,0


#### Transform Dataset into Patient-Level Time-Series

Currently, our data is formatted in a row per time step - so the time series for a given paitent would have [x] row entries corresponding to the duration of their time series, [x].  For modeling, this needs to be converted to sequences for each patient.  The sequence will be a single row, with one column per time step (there will be total time steps of LOOKBACK_WINDOW + PREDICTION_HORIZON).  In each column will be a vector of the variables for that patient at that time step.

In [18]:
# set target sequence length for each patient   
target_sequence_length = LOOKBACK_WINDOW + PREDICTION_HORIZON

In [19]:
# helper to filter patient time series to most recent (LOOKBACK_WINDOW + PREDICTION_HORIZON) samples
def truncate_patient_time_series(grouped_df):
  # don't include the positive sepsis time steps - we want to preict 6 hours before
  grouped_df_filtered = grouped_df[grouped_df['sepsislabel'] == 0]

  # filter to get the most recent 
  grouped_df_filtered = grouped_df[grouped_df['hour'] > (max(grouped_df['hour']) - target_sequence_length)]
  grouped_df_filtered = grouped_df_filtered[grouped_df_filtered['hour'] <= (max(grouped_df_filtered['hour']) - PREDICTION_HORIZON)]
  grouped_df_filtered['hour'] = grouped_df_filtered['hour'] - min(grouped_df_filtered['hour'])
  return grouped_df_filtered

# Execute grouping and sequence truncation
ts_limited_sepsis_data = sepsis_dataset_encoded.groupby('patient_id').apply(truncate_patient_time_series).reset_index(drop=True)
ts_limited_sepsis_data

  ts_limited_sepsis_data = sepsis_dataset_encoded.groupby('patient_id').apply(truncate_patient_time_series).reset_index(drop=True)


Unnamed: 0,patient_id,hour,sepsislabel,hr,o2sat,temp,sbp,map,dbp,resp,...,creatinine_t_3,glucose_t_3,lactate_t_3,hct_t_3,bun_t_3,potassium_t_3,magnesium_t_3,calcium_t_3,gender_0,gender_1
0,9,0,0,117.0,98.0,37.72,105.0,79.0,62.0,18.0,...,0.80,124.0,1.500000,23.1,22.0,3.7,2.0,8.1,0,1
1,9,1,0,124.0,99.0,37.72,120.0,92.0,75.0,16.5,...,0.80,124.0,1.500000,23.1,22.0,3.7,2.0,8.1,0,1
2,9,2,0,127.0,100.0,37.72,134.0,100.0,80.0,22.0,...,0.80,124.0,1.500000,23.1,22.0,3.7,2.0,8.1,0,1
3,9,3,0,131.0,96.0,37.72,127.0,94.0,75.0,27.0,...,0.80,124.0,1.500000,23.1,22.0,3.7,2.0,8.1,0,1
4,9,4,0,126.0,98.0,37.94,120.0,91.0,74.0,25.0,...,0.80,124.0,1.500000,23.1,22.0,3.7,2.0,8.1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
186331,119982,43,0,74.0,93.0,36.30,116.0,86.0,69.0,26.0,...,0.84,95.0,1.819955,44.6,9.0,4.2,1.9,19.0,1,0
186332,119982,44,0,68.0,92.0,36.20,128.0,93.0,73.0,24.0,...,0.84,95.0,1.819955,44.6,9.0,4.2,1.9,19.0,1,0
186333,119982,45,0,69.0,92.0,36.20,119.0,85.0,62.0,24.0,...,0.84,95.0,1.819955,44.6,9.0,4.2,1.9,19.0,1,0
186334,119982,46,0,72.0,94.0,36.20,108.0,77.0,56.0,21.0,...,0.84,95.0,1.819955,44.6,9.0,4.2,1.9,19.0,1,0


In [20]:
# narrow down our columns to just the variables
feature_cols = ts_limited_sepsis_data.columns.to_list()
ignore_cols = ['patient_id', 'hour', 'sepsislabel', 'gender']
feature_cols = [x for x in feature_cols if x not in ignore_cols]

In [21]:
feature_cols

['hr',
 'o2sat',
 'temp',
 'sbp',
 'map',
 'dbp',
 'resp',
 'wbc',
 'platelets',
 'creatinine',
 'glucose',
 'lactate',
 'hct',
 'bun',
 'potassium',
 'magnesium',
 'calcium',
 'age',
 'hospadmtime',
 'iculos',
 'hr_t_1',
 'o2sat_t_1',
 'temp_t_1',
 'sbp_t_1',
 'map_t_1',
 'dbp_t_1',
 'resp_t_1',
 'wbc_t_1',
 'platelets_t_1',
 'creatinine_t_1',
 'glucose_t_1',
 'lactate_t_1',
 'hct_t_1',
 'bun_t_1',
 'potassium_t_1',
 'magnesium_t_1',
 'calcium_t_1',
 'hr_t_3',
 'o2sat_t_3',
 'temp_t_3',
 'sbp_t_3',
 'map_t_3',
 'dbp_t_3',
 'resp_t_3',
 'wbc_t_3',
 'platelets_t_3',
 'creatinine_t_3',
 'glucose_t_3',
 'lactate_t_3',
 'hct_t_3',
 'bun_t_3',
 'potassium_t_3',
 'magnesium_t_3',
 'calcium_t_3',
 'gender_0',
 'gender_1']

In [22]:
# Helper function to perform vectorization of features at each time step
def get_patient_feature_vector(row):
  vector = []
  for col in feature_cols:
    vector.append(row[col])
  return vector

# test on a few samples
v = ts_limited_sepsis_data.head().apply(get_patient_feature_vector, axis=1)
len(v[0])

56

In [23]:
# Apply to the whole dataset
ts_limited_sepsis_data["feature_vector"] = ts_limited_sepsis_data.apply(get_patient_feature_vector, axis=1)
ts_limited_sepsis_data["feature_vector"]

0         [117.0, 98.0, 37.72, 105.0, 79.0, 62.0, 18.0, ...
1         [124.0, 99.0, 37.72, 120.0, 92.0, 75.0, 16.5, ...
2         [127.0, 100.0, 37.72, 134.0, 100.0, 80.0, 22.0...
3         [131.0, 96.0, 37.72, 127.0, 94.0, 75.0, 27.0, ...
4         [126.0, 98.0, 37.94, 120.0, 91.0, 74.0, 25.0, ...
                                ...                        
186331    [74.0, 93.0, 36.3, 116.0, 86.0, 69.0, 26.0, 11...
186332    [68.0, 92.0, 36.2, 128.0, 93.0, 73.0, 24.0, 11...
186333    [69.0, 92.0, 36.2, 119.0, 85.0, 62.0, 24.0, 11...
186334    [72.0, 94.0, 36.2, 108.0, 77.0, 56.0, 21.0, 11...
186335    [75.0, 90.0, 36.2, 124.0, 89.0, 64.0, 22.0, 11...
Name: feature_vector, Length: 186336, dtype: object

In [24]:
len(ts_limited_sepsis_data.iloc[0]["feature_vector"])

56

In [25]:
# Drop everything except the patient ID, date, and selected features
drop_columns = [col for col in ts_limited_sepsis_data.columns if col not in ['patient_id', 'hour', 'sepsislabel', 'feature_vector']]
ts_limited_sepsis_data.drop(columns=drop_columns, inplace=True)
ts_limited_sepsis_data.head()

Unnamed: 0,patient_id,hour,sepsislabel,feature_vector
0,9,0,0,"[117.0, 98.0, 37.72, 105.0, 79.0, 62.0, 18.0, ..."
1,9,1,0,"[124.0, 99.0, 37.72, 120.0, 92.0, 75.0, 16.5, ..."
2,9,2,0,"[127.0, 100.0, 37.72, 134.0, 100.0, 80.0, 22.0..."
3,9,3,0,"[131.0, 96.0, 37.72, 127.0, 94.0, 75.0, 27.0, ..."
4,9,4,0,"[126.0, 98.0, 37.94, 120.0, 91.0, 74.0, 25.0, ..."


In [26]:
# Transform the dataset to have time step as columns, features in each col
ts_limited_sepsis_sequence = ts_limited_sepsis_data.pivot(index="patient_id", columns="hour", values="feature_vector")
ts_limited_sepsis_sequence


hour,0,1,2,3,4,5,6,7,8,9,...,38,39,40,41,42,43,44,45,46,47
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9,"[117.0, 98.0, 37.72, 105.0, 79.0, 62.0, 18.0, ...","[124.0, 99.0, 37.72, 120.0, 92.0, 75.0, 16.5, ...","[127.0, 100.0, 37.72, 134.0, 100.0, 80.0, 22.0...","[131.0, 96.0, 37.72, 127.0, 94.0, 75.0, 27.0, ...","[126.0, 98.0, 37.94, 120.0, 91.0, 74.0, 25.0, ...","[126.0, 100.0, 37.94, 127.0, 96.0, 79.0, 21.0,...","[126.0, 98.0, 37.94, 115.0, 88.0, 72.0, 22.0, ...","[136.0, 96.0, 38.78, 132.0, 97.0, 79.0, 30.0, ...","[133.0, 96.0, 38.78, 111.5, 77.0, 66.0, 24.75,...","[134.0, 96.0, 38.78, 112.0, 82.0, 66.0, 26.0, ...",...,"[135.0, 97.0, 38.67, 136.0, 101.0, 82.0, 31.0,...","[129.0, 98.0, 38.67, 139.0, 105.0, 85.0, 29.0,...","[122.0, 99.0, 38.06, 136.0, 105.0, 86.0, 28.0,...","[121.0, 98.0, 38.06, 140.0, 109.0, 89.0, 28.0,...","[115.0, 98.0, 37.72, 139.0, 105.0, 85.0, 27.0,...","[113.0, 97.0, 37.72, 134.0, 101.0, 81.0, 26.0,...","[119.0, 100.0, 37.94, 140.0, 106.0, 85.0, 26.5...","[118.0, 96.0, 37.94, 138.0, 108.0, 88.0, 26.0,...","[111.0, 97.0, 37.39, 136.0, 106.0, 86.0, 26.0,...","[116.0, 96.0, 37.72, 143.0, 109.0, 88.0, 30.0,..."
18,"[90.0, 100.0, 37.1, 157.0, 87.0, 79.0, 13.5, 1...","[94.0, 100.0, 36.53, 166.5, 95.5, 80.0, 19.0, ...","[93.0, 100.0, 36.53, 159.0, 74.0, 70.0, 14.0, ...","[102.0, 99.5, 37.83, 101.0, 82.0, 83.0, 20.0, ...","[119.0, 97.0, 37.83, 83.0, 77.0, 71.0, 22.0, 1...","[122.0, 98.0, 39.72, 164.0, 70.0, 72.0, 20.0, ...","[110.0, 95.0, 39.72, 110.0, 67.0, 63.0, 15.0, ...","[103.0, 95.5, 38.06, 112.0, 70.0, 65.0, 22.0, ...","[98.0, 96.0, 37.5, 127.0, 73.0, 73.0, 25.0, 12...","[103.0, 96.0, 37.5, 137.0, 71.0, 78.0, 22.0, 1...",...,"[109.0, 95.0, 37.72, 133.0, 83.0, 79.0, 19.0, ...","[109.0, 96.0, 37.61, 123.0, 87.0, 80.0, 22.5, ...","[107.0, 95.0, 37.61, 127.0, 102.0, 88.0, 25.0,...","[105.0, 96.0, 37.61, 127.0, 90.0, 88.0, 26.0, ...","[116.0, 95.0, 39.0, 127.0, 89.0, 88.0, 24.0, 1...","[102.0, 95.0, 37.89, 127.0, 64.5, 88.0, 25.0, ...","[110.0, 98.0, 37.89, 127.0, 84.0, 88.0, 20.0, ...","[105.0, 96.0, 37.83, 127.0, 75.0, 88.0, 20.0, ...","[112.0, 96.0, 37.83, 127.0, 94.0, 88.0, 21.0, ...","[116.0, 96.0, 38.61, 127.0, 99.0, 88.0, 21.5, ..."
21,"[93.0, 95.0, 36.44, 128.0, 80.5, 54.5, 21.0, 1...","[94.0, 95.0, 36.44, 121.0, 70.0, 49.0, 14.0, 1...","[96.5, 94.0, 37.56, 114.0, 65.0, 45.0, 14.0, 1...","[97.0, 96.0, 38.0, 97.0, 61.0, 44.0, 14.0, 15....","[92.0, 95.0, 37.39, 110.0, 61.0, 48.0, 15.0, 1...","[99.0, 93.0, 37.28, 96.0, 61.0, 44.0, 16.0, 15...","[98.0, 94.0, 37.28, 95.0, 62.0, 45.0, 15.0, 15...","[97.0, 93.0, 37.56, 104.0, 63.0, 45.0, 18.0, 1...","[93.0, 94.0, 37.56, 97.0, 64.0, 42.0, 17.0, 15...","[106.0, 93.0, 37.56, 99.0, 62.0, 43.0, 17.0, 1...",...,"[113.0, 94.5, 36.72, 144.0, 99.0, 72.5, 31.0, ...","[101.0, 99.0, 36.72, 156.0, 109.0, 75.0, 32.0,...","[94.0, 92.0, 36.72, 139.0, 92.0, 62.0, 27.0, 1...","[94.0, 92.0, 36.72, 139.0, 92.0, 62.0, 27.0, 1...","[100.0, 94.0, 36.56, 149.5, 85.0, 77.0, 31.0, ...","[100.0, 95.0, 36.56, 152.0, 90.0, 74.0, 27.0, ...","[91.0, 93.0, 36.56, 145.5, 72.0, 68.0, 21.0, 1...","[104.0, 91.0, 36.56, 146.5, 80.0, 81.0, 35.0, ...","[102.0, 94.0, 36.67, 158.0, 108.0, 75.0, 28.0,...","[102.0, 92.0, 36.67, 137.0, 91.0, 62.0, 32.0, ..."
31,"[65.0, 100.0, 36.5, 139.5, 83.5, 55.5, 15.0, 1...","[74.5, 99.5, 36.5, 137.0, 83.0, 56.0, 16.0, 10...","[74.0, 100.0, 36.5, 130.0, 79.0, 54.0, 15.0, 1...","[79.0, 99.0, 37.39, 133.0, 76.0, 50.0, 15.0, 1...","[89.0, 98.0, 37.39, 133.0, 71.0, 50.0, 15.0, 1...","[82.0, 100.0, 37.39, 133.0, 81.0, 50.0, 15.0, ...","[84.0, 100.0, 37.39, 133.0, 84.0, 50.0, 17.0, ...","[96.0, 100.0, 37.39, 133.0, 81.0, 50.0, 17.0, ...","[94.0, 100.0, 38.39, 133.0, 83.0, 50.0, 15.0, ...","[97.0, 100.0, 38.39, 133.0, 88.0, 50.0, 20.0, ...",...,"[62.0, 98.0, 37.89, 133.0, 85.0, 50.0, 15.0, 1...","[70.0, 97.0, 37.89, 133.0, 84.0, 50.0, 15.0, 1...","[97.0, 97.0, 37.56, 133.0, 85.0, 50.0, 16.0, 1...","[96.0, 97.0, 37.56, 133.0, 79.0, 50.0, 16.0, 1...","[69.0, 99.0, 37.56, 133.0, 79.0, 50.0, 15.0, 1...","[71.0, 98.0, 37.56, 133.0, 76.0, 50.0, 14.0, 1...","[91.0, 98.0, 37.0, 133.0, 74.0, 50.0, 17.0, 10...","[88.0, 98.0, 37.0, 133.0, 74.0, 50.0, 15.0, 10...","[72.0, 99.0, 37.0, 133.0, 81.5, 50.0, 16.0, 10...","[76.0, 99.0, 37.0, 133.0, 67.0, 50.0, 15.0, 10..."
42,"[111.0, 88.0, 36.67, 115.0, 82.0, 56.0, 24.0, ...","[102.0, 97.0, 36.67, 108.0, 73.0, 56.0, 16.0, ...","[102.0, 97.0, 36.67, 108.0, 73.0, 56.0, 16.0, ...","[100.0, 98.0, 37.67, 97.0, 72.0, 56.0, 12.0, 1...","[100.0, 98.0, 37.67, 97.0, 72.0, 56.0, 12.0, 1...","[101.0, 98.0, 37.67, 91.0, 63.0, 56.0, 20.0, 1...","[101.0, 98.0, 37.67, 91.0, 63.0, 56.0, 20.0, 1...","[112.0, 98.0, 38.11, 136.0, 90.0, 56.0, 22.0, ...","[112.0, 98.0, 38.11, 136.0, 90.0, 56.0, 22.0, ...","[112.0, 98.0, 38.11, 136.0, 90.0, 56.0, 22.0, ...",...,"[101.0, 97.0, 37.56, 126.0, 87.0, 56.0, 14.0, ...","[101.0, 97.0, 37.56, 113.0, 85.0, 56.0, 14.0, ...","[104.0, 97.0, 37.56, 113.0, 85.0, 56.0, 16.0, ...","[105.0, 96.0, 37.56, 126.0, 85.0, 56.0, 16.0, ...","[107.0, 96.0, 37.56, 126.0, 85.0, 56.0, 15.0, ...","[106.0, 97.0, 37.72, 137.0, 92.0, 56.0, 14.0, ...","[109.0, 97.0, 37.72, 114.0, 96.0, 56.0, 19.0, ...","[109.0, 97.0, 37.72, 114.0, 96.0, 56.0, 19.0, ...","[109.0, 97.0, 37.72, 114.0, 96.0, 56.0, 19.0, ...","[89.0, 100.0, 37.72, 128.0, 94.0, 71.0, 12.0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119939,"[77.0, 100.0, 36.1, 148.0, 98.0, 68.0, 20.0, 8...","[72.0, 100.0, 36.1, 155.0, 106.0, 74.0, 24.0, ...","[77.5, 100.0, 36.3, 165.5, 118.0, 83.5, 19.0, ...","[77.5, 100.0, 36.1, 173.0, 114.0, 79.0, 26.0, ...","[77.0, 100.0, 36.1, 171.0, 115.0, 80.0, 25.0, ...","[79.0, 100.0, 35.9, 134.0, 91.0, 63.0, 22.5, 8...","[73.0, 100.0, 35.7, 129.0, 96.0, 76.0, 23.0, 8...","[73.0, 100.0, 35.7, 129.0, 96.0, 76.0, 23.0, 8...","[79.0, 100.0, 35.7, 163.0, 118.0, 87.0, 15.0, ...","[78.0, 100.0, 36.6, 131.0, 99.0, 76.0, 15.0, 8...",...,"[76.0, 100.0, 36.6, 173.0, 119.0, 83.0, 15.0, ...","[102.0, 99.0, 36.6, 180.0, 115.0, 80.0, 13.0, ...","[79.5, 99.0, 36.6, 170.0, 131.0, 103.0, 12.0, ...","[76.0, 100.0, 36.6, 171.0, 121.0, 88.0, 12.0, ...","[86.0, 99.0, 36.6, 179.0, 128.0, 92.0, 12.0, 7...","[91.0, 99.0, 36.6, 169.0, 125.0, 94.0, 23.0, 7...","[104.0, 99.0, 36.6, 191.0, 131.0, 91.0, 12.0, ...","[97.0, 100.0, 36.6, 181.0, 122.0, 85.0, 20.0, ...","[92.0, 100.0, 36.6, 165.0, 108.0, 75.0, 18.0, ...","[92.0, 100.0, 37.2, 177.0, 123.0, 86.0, 18.0, ..."
119946,"[107.5, 97.0, 36.8, 128.5, 102.0, 90.5, 12.0, ...","[106.0, 97.0, 36.8, 124.0, 90.0, 74.0, 13.0, 8...","[109.0, 99.0, 36.8, 109.0, 75.0, 62.0, 13.0, 8...","[111.0, 98.0, 36.8, 111.5, 78.5, 65.5, 15.0, 8...","[103.0, 97.5, 36.8, 107.0, 72.5, 58.0, 13.0, 8...","[103.0, 98.5, 37.0, 122.5, 85.5, 69.5, 14.0, 8...","[101.5, 96.0, 37.0, 116.0, 94.5, 83.0, 14.0, 8...","[100.0, 92.5, 37.0, 112.5, 82.5, 71.5, 21.0, 8...","[98.0, 95.0, 37.0, 110.0, 80.0, 65.0, 15.0, 8....","[96.0, 93.0, 36.7, 102.0, 81.0, 70.0, 16.0, 8....",...,"[87.0, 96.0, 36.7, 100.0, 69.0, 55.0, 19.0, 7....","[91.0, 94.5, 36.7, 100.0, 69.0, 55.0, 18.0, 7....","[95.0, 94.0, 36.7, 135.0, 100.0, 87.0, 20.0, 7...","[105.0, 94.0, 36.7, 135.0, 100.0, 87.0, 20.0, ...","[88.0, 94.0, 36.6, 135.0, 100.0, 87.0, 16.0, 7...","[84.0, 94.0, 36.6, 135.0, 100.0, 87.0, 16.0, 7...","[96.0, 94.0, 36.6, 135.0, 100.0, 87.0, 16.0, 7...","[90.0, 97.0, 36.6, 126.0, 88.0, 74.0, 18.0, 7....","[99.5, 97.0, 36.6, 123.5, 95.5, 79.0, 17.0, 7....","[99.5, 97.0, 36.6, 123.5, 95.5, 79.0, 17.0, 7...."
119954,"[72.0, 98.0, 36.4, 106.0, 68.0, 51.0, 4.5, 10....","[72.0, 99.0, 36.4, 107.0, 68.0, 53.0, 4.5, 10....","[78.0, 98.0, 36.4, 117.0, 76.0, 57.0, 4.5, 10....","[78.0, 98.0, 37.9, 109.0, 74.0, 57.0, 4.5, 10....","[82.0, 99.0, 38.3, 113.5, 75.0, 55.5, 4.5, 10....","[88.0, 98.0, 38.3, 101.0, 67.0, 50.0, 2.0, 10....","[86.0, 100.0, 38.3, 96.0, 70.0, 54.0, 3.0, 10....","[83.0, 99.0, 38.3, 108.0, 72.0, 54.0, 2.0, 10....","[82.0, 100.0, 37.6, 121.0, 82.0, 61.0, 2.0, 10...","[105.0, 100.0, 37.6, 120.0, 83.0, 63.0, 3.0, 1...",...,"[95.0, 97.0, 37.2, 103.0, 99.0, 93.0, 18.0, 11...","[90.0, 96.0, 37.2, 97.0, 89.0, 79.0, 18.0, 11....","[85.0, 97.0, 37.2, 169.0, 113.0, 82.0, 18.0, 1...","[88.0, 95.0, 37.2, 176.0, 112.0, 79.0, 16.0, 1...","[96.0, 97.0, 37.2, 189.0, 125.0, 87.0, 16.0, 1...","[94.0, 98.0, 37.2, 156.0, 107.0, 78.0, 16.0, 1...","[95.0, 94.0, 37.2, 141.0, 93.0, 68.0, 14.0, 11...","[96.0, 96.0, 37.2, 150.0, 103.0, 77.0, 24.0, 1...","[94.0, 91.0, 37.2, 119.0, 78.0, 58.0, 23.0, 11...","[94.0, 93.0, 37.2, 144.0, 92.0, 67.0, 17.0, 11..."
119957,"[62.0, 100.0, 36.9, 162.0, 110.0, 76.0, 12.5, ...","[64.0, 100.0, 36.9, 168.0, 112.0, 76.0, 12.5, ...","[64.0, 100.0, 36.9, 146.0, 94.0, 64.0, 12.5, 1...","[66.0, 100.0, 36.9, 144.0, 90.0, 60.0, 12.5, 1...","[68.0, 100.0, 36.9, 160.0, 106.0, 72.0, 12.5, ...","[68.0, 100.0, 36.9, 138.0, 86.0, 56.0, 12.5, 1...","[66.0, 100.0, 36.9, 140.0, 89.0, 58.0, 12.5, 1...","[70.0, 98.0, 36.7, 140.0, 90.0, 62.0, 12.5, 10...","[72.0, 99.0, 36.7, 140.0, 92.0, 62.0, 15.0, 10...","[78.0, 99.0, 36.7, 142.0, 92.0, 64.0, 11.0, 10...",...,"[72.0, 100.0, 37.8, 156.0, 90.0, 60.0, 23.0, 1...","[70.0, 100.0, 37.7, 132.0, 94.0, 68.0, 26.0, 1...","[74.0, 100.0, 37.7, 144.0, 86.0, 58.0, 26.0, 1...","[74.0, 100.0, 37.7, 142.0, 88.0, 60.0, 26.0, 1...","[74.0, 99.0, 37.7, 128.0, 86.0, 60.0, 20.0, 15...","[80.0, 99.0, 37.4, 130.0, 86.0, 60.0, 30.0, 15...","[76.0, 98.0, 37.4, 122.0, 80.0, 58.0, 19.0, 15...","[76.0, 97.0, 37.4, 118.0, 82.0, 58.0, 20.0, 15...","[81.0, 99.0, 37.4, 116.0, 88.0, 66.0, 19.5, 15...","[82.0, 99.0, 37.5, 128.0, 92.0, 68.0, 20.0, 15..."


In [27]:
# helper to re-map target value to patient in ts dataset
def remap_sepsis_outcome_to_patient_ts(patient_ts_row):
    p_id = patient_ts_row['patient_id']
    patient_ts_row['sepsislabel'] = sepsis_dataset[sepsis_dataset['patient_id'] == p_id]['sepsislabel'].max()
    return patient_ts_row

ts_limited_sepsis_sequence_label = ts_limited_sepsis_sequence.reset_index().apply(remap_sepsis_outcome_to_patient_ts, axis=1)

In [28]:
# grab just the patient label column for use later as our target var
patient_sequence_sepsis_label = ts_limited_sepsis_sequence_label['sepsislabel']
patient_sequence_sepsis_label = np.array(patient_sequence_sepsis_label.to_list())
patient_sequence_sepsis_label.shape

(3882,)

In [29]:
# Convert the data to an array
patient_sepsis_sequences = np.array(ts_limited_sepsis_sequence.values.tolist())
patient_sepsis_sequences.shape

(3882, 48, 56)

In [30]:
ts_limited_sepsis_sequence.values.tolist()[0][0]

[117.0,
 98.0,
 37.72,
 105.0,
 79.0,
 62.0,
 18.0,
 13.0,
 448.0,
 0.8,
 124.0,
 1.5,
 23.1,
 22.0,
 3.7,
 2.0,
 8.1,
 27.92,
 -0.03,
 205.0,
 120.0,
 98.0,
 38.67,
 95.0,
 74.0,
 59.0,
 18.0,
 13.0,
 448.0,
 0.8,
 124.0,
 1.5,
 23.1,
 22.0,
 3.7,
 2.0,
 8.1,
 120.0,
 98.0,
 38.67,
 95.0,
 74.0,
 59.0,
 18.0,
 13.0,
 448.0,
 0.8,
 124.0,
 1.5,
 23.1,
 22.0,
 3.7,
 2.0,
 8.1,
 0.0,
 1.0]

### Split Dataset

In [31]:
# shuffle the dataset
indices = np.arange(patient_sepsis_sequences.shape[0])
np.random.seed(23)
np.random.shuffle(indices, )

X = patient_sepsis_sequences[indices]
y = patient_sequence_sepsis_label[indices]

In [32]:
# Split the data into test/train/val sets with a 80/10/10 split
n = X.shape[0]
X_train = X[:int(n*0.8), :, :]
y_train = y[:int(n*0.8)]

X_test = X[int(n*0.8):int(n*0.9), :, :]
y_test = y[int(n*0.8):int(n*0.9)]

X_val = X[int(n*0.9):, :, :]
y_val = y[int(n*0.9):]

print(f"Train data shape: X: {X_train.shape}, y: {y_train.shape}")
print(f"Test data shape: X: {X_test.shape}, y: {y_test.shape}")
print(f"Validation data shape: X: {X_val.shape}, y: {y_val.shape}")

Train data shape: X: (3105, 48, 56), y: (3105,)
Test data shape: X: (388, 48, 56), y: (388,)
Validation data shape: X: (389, 48, 56), y: (389,)


#### Scale/Normalize Continuous Features

In [33]:
# Will apply standard scaling to the continuous features
scaler = StandardScaler()

# setup index to apply only to cont features
num_continuous_features = len(feature_cols) - 2
num_continuous_features

54

In [34]:
# We need to temporarily flatten our datasets as scaler supports only two dims
X_train_2d = X_train.reshape(-1, X_train.shape[2])
X_test_2d = X_test.reshape(-1, X_test.shape[2])
X_val_2d = X_val.reshape(-1, X_val.shape[2])

print(f"Train data flattened shape: X: {X_train_2d.shape}")
print(f"Test data flattened shape: X: {X_test_2d.shape}")
print(f"Validation data flattened shape: X: {X_val_2d.shape}")

Train data flattened shape: X: (149040, 56)
Test data flattened shape: X: (18624, 56)
Validation data flattened shape: X: (18672, 56)


In [35]:
# apply scaling to continuous features only
X_train_2d[:, :num_continuous_features] = scaler.fit_transform(X_train_2d[:, :num_continuous_features])
X_test_2d[:, :num_continuous_features] = scaler.transform(X_test_2d[:, :num_continuous_features])
X_val_2d[:, :num_continuous_features] = scaler.transform(X_val_2d[:, :num_continuous_features])

In [36]:
# reshape back to original
X_train_norm = X_train_2d.reshape(X_train.shape)
X_test_norm = X_test_2d.reshape(X_test.shape)
X_val_norm = X_val_2d.reshape(X_val.shape)

print(f"Train data un-flattened shape: X: {X_train_norm.shape}")
print(f"Test data un-flattened shape: X: {X_test_norm.shape}")
print(f"Validation data un-flattened shape: X: {X_val_norm.shape}")

Train data un-flattened shape: X: (3105, 48, 56)
Test data un-flattened shape: X: (388, 48, 56)
Validation data un-flattened shape: X: (389, 48, 56)


In [37]:
# Save mean and standard deviation arrays to S3
scaler_mean = scaler.mean_
scaler_stddev = scaler.scale_

print(f"Scaler mean: {scaler_mean} and std dev: {scaler_stddev}")

np.save(f"{local_data_path_csv}/scaler_mean.npy", scaler_mean)
np.save(f"{local_data_path_csv}/scaler_stddev.npy", scaler_stddev)

Scaler mean: [ 86.24514728  97.15120773  36.93236834 124.30621499  82.78228096
  63.92864042  19.16935473  11.51232007 207.00766289   1.49242969
 132.67222495   1.80408645  31.12147233  25.8020957    4.06370459
   2.0670089    8.04263142  62.64175201 -71.09330757  49.61078905
  86.21166969  97.1882347   36.93144514 124.11570798  82.72115381
  63.89688717  19.12961037  11.52075166 206.62213536   1.50938873
 133.01940347   1.81643318  31.11556855  26.027889     4.07002289
   2.06561327   8.04042024  86.21166969  97.1882347   36.93144514
 124.11570798  82.72115381  63.89688717  19.12961037  11.52075166
 206.62213536   1.50938873 133.01940347   1.81643318  31.11556855
  26.027889     4.07002289   2.06561327   8.04042024] and std dev: [ 18.13704565   3.11793741   0.76503963  23.97548526  16.39168037
  13.15333117   5.58036148   6.85650576 107.18202815   1.67884256
  46.61722016   0.9479452    5.6102187   20.96750038   0.58333879
   0.37722344   1.80489817  16.22928864 207.31039539  49.89659

In [38]:
# Save datasets locally
np.save(f"{local_data_path_csv}/X_train.npy", X_train)
np.save(f"{local_data_path_csv}/y_train.npy", y_train)

np.save(f"{local_data_path_csv}/X_test.npy", X_test)
np.save(f"{local_data_path_csv}/y_test.npy", y_test)

np.save(f"{local_data_path_csv}/X_val.npy", X_val)
np.save(f"{local_data_path_csv}/y_val.npy", y_val)

In [39]:
# Set session variables
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
region = sagemaker_session.boto_session.region_name
bucket = sess.default_bucket()

##### Store results in S3

In [40]:
# copy the dataset files to the datalake
!aws s3 cp $local_data_path_csv $s3_datalake_path_prepared --recursive --exclude "*" --include "*.npy"

upload: ../data/X_test.npy to s3://sagemaker-us-east-1-343218227212/sepsis-prediction/prepared/X_test.npy
upload: ../data/y_train.npy to s3://sagemaker-us-east-1-343218227212/sepsis-prediction/prepared/y_train.npy
upload: ../data/scaler_stddev.npy to s3://sagemaker-us-east-1-343218227212/sepsis-prediction/prepared/scaler_stddev.npy
upload: ../data/y_val.npy to s3://sagemaker-us-east-1-343218227212/sepsis-prediction/prepared/y_val.npy
upload: ../data/y_test.npy to s3://sagemaker-us-east-1-343218227212/sepsis-prediction/prepared/y_test.npy
upload: ../data/scaler_mean.npy to s3://sagemaker-us-east-1-343218227212/sepsis-prediction/prepared/scaler_mean.npy
upload: ../data/X_val.npy to s3://sagemaker-us-east-1-343218227212/sepsis-prediction/prepared/X_val.npy
upload: ../data/X_train.npy to s3://sagemaker-us-east-1-343218227212/sepsis-prediction/prepared/X_train.npy
