## Part III: Feature Engineering and Data Preparation

#### Setup Environment

In [31]:
%run environment-setup.ipynb

Stored 's3_datalake_path_csv' (str)
Stored 'local_data_path_csv' (str)
Stored 's3_datalake_path_parquet' (str)


In [3]:
# import additional libraries
from imblearn.over_sampling import RandomOverSampler, SMOTE, SVMSMOTE

In [2]:
# load the cleaned datset from Athena/S3
sepsis_dataset = load_clean_dataset()

2024-11-12 23:38:08,115	INFO worker.py:1786 -- Started a local Ray instance.


### Data Transformation

The dataset is cleaned and complete, however additional work is still required to prepare for modeling.  In this section, the following steps will be taken:

-  Encode categorical features
-  Transform the time series data into patient time series sequences
-  Split dataset: the dataset will be split in to train/val/test sets
-  Normalize dataset: the dataset will be normalized using a standard scaler

In [3]:
# one hot encode the sex feature (M/F)
one_hot = pd.get_dummies(sepsis_dataset['gender'], prefix='gender', dtype='int')

# Join the encoded df
sepsis_dataset_encoded = sepsis_dataset.drop('gender',axis = 1)
sepsis_dataset_encoded = sepsis_dataset.join(one_hot)
sepsis_dataset_encoded

Unnamed: 0,patient_id,hour,sepsislabel,hr,o2sat,temp,sbp,map,dbp,resp,...,creatinine_lag,glucose_lag,lactate_lag,hct_lag,bun_lag,potassium_lag,magnesium_lag,calcium_lag,gender_0,gender_1
0,17072,0,0,65.0,100.0,35.78,129.0,72.0,69.0,16.5,...,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,1,0
1,17072,1,0,65.0,100.0,35.78,129.0,72.0,69.0,16.5,...,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,1,0
2,17072,2,0,78.0,100.0,35.78,129.0,42.5,69.0,16.5,...,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,1,0
3,17072,3,0,73.0,100.0,35.78,129.0,42.5,69.0,17.0,...,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,1,0
4,17072,4,0,70.0,100.0,35.78,129.0,74.0,69.0,14.0,...,0.0,0.0,99.0,0.0,0.0,0.0,0.0,0.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1181709,104763,33,0,81.0,98.0,36.80,122.0,71.0,53.0,18.0,...,5.0,5.0,132.0,5.0,5.0,5.0,5.0,5.0,1,0
1181710,104763,34,0,80.0,98.0,36.80,119.0,66.0,47.0,17.0,...,6.0,6.0,133.0,6.0,6.0,6.0,6.0,6.0,1,0
1181711,104763,35,0,80.0,100.0,36.70,113.0,67.0,52.0,12.0,...,7.0,0.0,134.0,7.0,7.0,7.0,7.0,7.0,1,0
1181712,104763,36,0,80.0,100.0,36.70,111.0,68.0,54.0,16.0,...,8.0,1.0,135.0,8.0,8.0,8.0,8.0,8.0,1,0


#### Transform Dataset into Patient-Level Time-Series

Currently, our data is formatted in a row per time step - so the time series for a given paitent would have [x] row entries corresponding to the duration of their time series, [x].  For modeling, this needs to be converted to sequences for each patient.  The sequence will be a single row, with one column per time step (there will be total time steps of LOOKBACK_WINDOW + PREDICTION_HORIZON).  In each column will be a vector of the variables for that patient at that time step.

In [28]:
# set target sequence length for each patient   
target_sequence_length = LOOKBACK_WINDOW + PREDICTION_HORIZON

In [29]:
# helper to filter patient time series to most recent (LOOKBACK_WINDOW + PREDICTION_HORIZON) samples
def truncate_patient_time_series(grouped_df):
  grouped_df_filtered = grouped_df[grouped_df['hour'] > (max(grouped_df['hour']) - target_sequence_length)]
  return grouped_df_filtered

# Execute grouping and sequence truncation
ts_limited_sepsis_data = sepsis_dataset_encoded.groupby('patient_id').apply(truncate_patient_time_series).reset_index(drop=True)
ts_limited_sepsis_data

  ts_limited_sepsis_data = sepsis_dataset_encoded.groupby('patient_id').apply(truncate_patient_time_series).reset_index(drop=True)


Unnamed: 0,patient_id,hour,sepsislabel,hr,o2sat,temp,sbp,map,dbp,resp,...,creatinine_lag,glucose_lag,lactate_lag,hct_lag,bun_lag,potassium_lag,magnesium_lag,calcium_lag,gender_0,gender_1
0,1,24,0,108.0,87.0,36.67,149.0,89.67,63.995019,30.0,...,8.0,8.0,123.0,8.0,8.0,8.0,8.0,8.0,1,0
1,1,25,0,107.0,90.0,36.67,156.0,96.67,63.995019,26.0,...,9.0,9.0,124.0,9.0,9.0,9.0,9.0,9.0,1,0
2,1,26,0,104.0,91.0,36.67,168.0,141.33,63.995019,29.0,...,10.0,10.0,125.0,10.0,10.0,10.0,10.0,10.0,1,0
3,1,27,0,102.0,88.0,36.50,146.0,90.67,63.995019,27.0,...,11.0,11.0,126.0,11.0,11.0,11.0,11.0,11.0,1,0
4,1,28,0,106.0,91.0,36.50,137.0,75.67,63.995019,25.0,...,12.0,12.0,127.0,12.0,12.0,12.0,12.0,12.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
732145,120000,30,0,80.0,96.0,36.40,115.0,87.00,65.000000,15.0,...,11.0,2.0,129.0,11.0,11.0,11.0,11.0,11.0,1,0
732146,120000,31,0,74.0,97.0,36.40,114.0,83.00,67.000000,15.0,...,12.0,3.0,130.0,12.0,12.0,12.0,12.0,12.0,1,0
732147,120000,32,0,78.0,98.0,36.40,110.0,83.00,69.000000,15.0,...,13.0,4.0,131.0,13.0,13.0,13.0,13.0,13.0,1,0
732148,120000,33,0,82.0,99.0,36.60,124.0,91.00,71.000000,16.0,...,14.0,0.0,132.0,14.0,14.0,14.0,14.0,14.0,1,0


In [40]:
# narrow down our columns to just the variables
feature_cols = ts_limited_sepsis_data.columns.to_list()
ignore_cols = ['patient_id', 'hour', 'sepsislabel']
feature_cols = [x for x in feature_cols if x not in ignore_cols]

In [46]:
# Helper function to perform vectorization of features at each time step
def get_patient_feature_vector(row):
  vector = []
  for col in feature_cols:
    vector.append(row[col])
  return vector

# test on a few samples
ts_limited_sepsis_data.head().apply(get_patient_feature_vector, axis=1)

0    [108.0, 87.0, 36.67, 149.0, 89.67, 63.99501869...
1    [107.0, 90.0, 36.67, 156.0, 96.67, 63.99501869...
2    [104.0, 91.0, 36.67, 168.0, 141.33, 63.9950186...
3    [102.0, 88.0, 36.5, 146.0, 90.67, 63.995018699...
4    [106.0, 91.0, 36.5, 137.0, 75.67, 63.995018699...
dtype: object