## Import packages

In [1]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from utils.tools import forward_fill_pipeline, normalize_dataframe, normalize_df_with_statistics, calculate_missing_rate, export_missing_mask_pipeline, get_time_interval_term

# Root folder of the MIMIC-III data; every output below is written beneath it.
data_dir = "./mimic-iii/"

# Make sure the output folders exist (no-op when they are already there).
for _subdir in ('processed', 'statistics'):
    Path(data_dir, _subdir).mkdir(parents=True, exist_ok=True)

# Random seed passed to the train/val/test splits.
SEED = 42

## Read data from files

### Record feature names

In [2]:
# Bookkeeping columns carried alongside every record.
basic_records = ['PatientID', 'RecordTime', 'AdmissionTime', 'DischargeTime']
# Columns used as prediction targets.
target_features = ['Outcome', 'LOS', 'Readmission']
# Sex is a binary feature, Age is a continuous feature.
demographic_features = ['Sex', 'Age']

# Columns named "<measurement>-><category>".
# NOTE(review): presumably one-hot indicators of categorical chart values -- confirm
# against the script that produces format_mimic3_ehr.csv.
categorical_labtest_features = [
    'Capillary refill rate->0.0', 'Capillary refill rate->1.0',
    'Glascow coma scale eye opening->To Pain',
    'Glascow coma scale eye opening->3 To speech',
    'Glascow coma scale eye opening->1 No Response',
    'Glascow coma scale eye opening->4 Spontaneously',
    'Glascow coma scale eye opening->None',
    'Glascow coma scale eye opening->To Speech',
    'Glascow coma scale eye opening->Spontaneously',
    'Glascow coma scale eye opening->2 To pain',
    'Glascow coma scale motor response->1 No Response',
    'Glascow coma scale motor response->3 Abnorm flexion',
    'Glascow coma scale motor response->Abnormal extension',
    'Glascow coma scale motor response->No response',
    'Glascow coma scale motor response->4 Flex-withdraws',
    'Glascow coma scale motor response->Localizes Pain',
    'Glascow coma scale motor response->Flex-withdraws',
    'Glascow coma scale motor response->Obeys Commands',
    'Glascow coma scale motor response->Abnormal Flexion',
    'Glascow coma scale motor response->6 Obeys Commands',
    'Glascow coma scale motor response->5 Localizes Pain',
    'Glascow coma scale motor response->2 Abnorm extensn',
    'Glascow coma scale total->11', 'Glascow coma scale total->10',
    'Glascow coma scale total->13', 'Glascow coma scale total->12',
    'Glascow coma scale total->15', 'Glascow coma scale total->14',
    'Glascow coma scale total->3', 'Glascow coma scale total->5',
    'Glascow coma scale total->4', 'Glascow coma scale total->7',
    'Glascow coma scale total->6', 'Glascow coma scale total->9',
    'Glascow coma scale total->8',
    'Glascow coma scale verbal response->1 No Response',
    'Glascow coma scale verbal response->No Response',
    'Glascow coma scale verbal response->Confused',
    'Glascow coma scale verbal response->Inappropriate Words',
    'Glascow coma scale verbal response->Oriented',
    'Glascow coma scale verbal response->No Response-ETT',
    'Glascow coma scale verbal response->5 Oriented',
    'Glascow coma scale verbal response->Incomprehensible sounds',
    'Glascow coma scale verbal response->1.0 ET/Trach',
    'Glascow coma scale verbal response->4 Confused',
    'Glascow coma scale verbal response->2 Incomp sounds',
    'Glascow coma scale verbal response->3 Inapprop words',
]

# Continuous measurements. Declared once so that the feature list and the
# normalisation list below cannot drift apart.
continuous_labtest_features = [
    'Diastolic blood pressure', 'Fraction inspired oxygen', 'Glucose',
    'Heart Rate', 'Height', 'Mean blood pressure', 'Oxygen saturation',
    'Respiratory rate', 'Systolic blood pressure', 'Temperature', 'Weight',
    'pH',
]

# Full lab-test feature list: categorical columns first, continuous columns last.
labtest_features = categorical_labtest_features + continuous_labtest_features
# Every lab-test column is subject to imputation (same list object as labtest_features).
require_impute_features = labtest_features
# Columns that get normalised: Age, the continuous lab tests, and the LOS target.
normalize_features = ['Age'] + continuous_labtest_features + ['LOS']

In [3]:
# Load the formatted EHR table from the "processed" folder and preview it.
ehr_csv_path = os.path.join(data_dir, "processed", "format_mimic3_ehr.csv")
df = pd.read_csv(ehr_csv_path)
df

Unnamed: 0,PatientID,RecordTime,AdmissionTime,DischargeTime,Outcome,LOS,Readmission,Decompensation,Acute and unspecified renal failure,Acute cerebrovascular disease,...,Glucose,Heart Rate,Height,Mean blood pressure,Oxygen saturation,Respiratory rate,Systolic blood pressure,Temperature,Weight,pH
0,10000_1,1,2186-08-14 10:50:09,2186-08-15 18:40:05,0,30.8312,0,0.0,1,0,...,,54.0,,77.000000,100.0,15.0,119.0,,,
1,10000_1,2,2186-08-14 10:50:09,2186-08-15 18:40:05,0,29.8312,0,0.0,1,0,...,,55.0,,80.333298,97.0,10.0,113.0,36.111111,,
2,10000_1,3,2186-08-14 10:50:09,2186-08-15 18:40:05,0,28.8312,0,0.0,1,0,...,,57.0,,69.333298,97.0,8.0,110.0,,,
3,10000_1,4,2186-08-14 10:50:09,2186-08-15 18:40:05,0,27.8312,0,0.0,1,0,...,,63.0,,71.333298,97.0,14.0,108.0,35.777802,,
4,10000_1,5,2186-08-14 10:50:09,2186-08-15 18:40:05,0,26.8312,0,0.0,1,0,...,,63.0,,70.666702,96.0,13.0,100.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3509000,99999_1,22,2117-12-31 11:52:36,2118-01-01 14:51:27,0,4.9808,0,0.0,0,0,...,,91.0,,78.000000,100.0,19.0,142.0,,,
3509001,99999_1,23,2117-12-31 11:52:36,2118-01-01 14:51:27,0,3.9808,0,0.0,0,0,...,175.0,83.0,,77.000000,100.0,20.0,116.0,,,
3509002,99999_1,24,2117-12-31 11:52:36,2118-01-01 14:51:27,0,2.9808,0,0.0,0,0,...,,78.0,,80.000000,99.0,16.0,129.0,,,
3509003,99999_1,25,2117-12-31 11:52:36,2118-01-01 14:51:27,0,1.9808,0,0.0,0,0,...,,82.0,,79.000000,99.0,17.0,117.0,37.277778,,


In [4]:
# Each patient is represented by exactly N_RECORDS rows:
#   - patients with fewer than N_RECORDS rows are discarded;
#   - patients with more keep only their first N_RECORDS rows.
N_RECORDS = 48

# Sort by PatientID and RecordTime so that "first N_RECORDS" means the earliest records
df = df.sort_values(['PatientID', 'RecordTime'])

# Keep patients with at least N_RECORDS rows. The per-row group size is computed
# in one vectorised pass instead of calling a Python lambda once per patient;
# the selected rows (and their order) are the same.
records_per_patient = df.groupby('PatientID')['PatientID'].transform('size')
df = df[records_per_patient >= N_RECORDS]

# Truncate every remaining patient to the first N_RECORDS rows
df = df.groupby('PatientID').head(N_RECORDS)


In [5]:
# Preview the dataframe after the per-patient filtering and truncation above
df

Unnamed: 0,PatientID,RecordTime,AdmissionTime,DischargeTime,Outcome,LOS,Readmission,Decompensation,Acute and unspecified renal failure,Acute cerebrovascular disease,...,Glucose,Heart Rate,Height,Mean blood pressure,Oxygen saturation,Respiratory rate,Systolic blood pressure,Temperature,Weight,pH
51,10011_1,1,2126-08-14 22:34:00,2126-08-28 18:59:00,1,331.4168,1,0.0,0,0,...,,76.0,,62.666698,99.0,21.0,102.0,36.333334,,
52,10011_1,2,2126-08-14 22:34:00,2126-08-28 18:59:00,1,330.4168,1,0.0,0,0,...,,79.0,,65.666702,99.0,20.0,107.0,,,
53,10011_1,3,2126-08-14 22:34:00,2126-08-28 18:59:00,1,329.4168,1,0.0,0,0,...,79.0,72.0,,65.000000,99.0,20.0,103.0,36.222221,,6.5
54,10011_1,4,2126-08-14 22:34:00,2126-08-28 18:59:00,1,328.4168,1,0.0,0,0,...,,74.0,,71.000000,98.0,21.0,109.0,,,
55,10011_1,5,2126-08-14 22:34:00,2126-08-28 18:59:00,1,327.4168,1,0.0,0,0,...,,78.0,,67.333298,97.0,20.0,106.0,36.388901,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3222189,9_1,44,2149-11-09 13:07:02,2149-11-14 20:52:14,1,83.7544,1,0.0,1,1,...,,83.0,,104.000000,99.0,14.0,159.0,,,
3222190,9_1,45,2149-11-09 13:07:02,2149-11-14 20:52:14,1,82.7544,1,0.0,1,1,...,,85.0,,97.000000,99.0,14.0,144.0,,,
3222191,9_1,46,2149-11-09 13:07:02,2149-11-14 20:52:14,1,81.7544,1,0.0,1,1,...,,84.0,,85.000000,97.0,14.0,128.0,,,
3222192,9_1,47,2149-11-09 13:07:02,2149-11-14 20:52:14,1,80.7544,1,0.0,1,1,...,,90.0,,98.000000,97.0,15.0,147.0,35.888901,,


## Stratified split of the dataset into `Training`, `Validation` and `Test` sets

- The dataset is stratified by the `Outcome` column
- Baseline: 70% Training, 10% Validation, 20% Test (No Calibration)
    - Name: train, val, test
- Calibration Method: 65% Training, 10% Validation, 5% Calibration, 20% Test
    - Name: traincal, val, calib, test

The validation and test sets are the same for both methods; only the training portion differs, since traincal + calib = train.

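Split order (each fraction is relative to the patients remaining at that step):

1. test: 20/100 of all patients
2. val: 10/80 of the remaining train + val patients
3. calib: 5/70 of the remaining train patients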

In [6]:
# Group the dataframe by patient ID
grouped = df.groupby('PatientID')

# Patient IDs, in group-key order
patients = np.array(list(grouped.groups.keys()))

# Outcome of each patient's first row, indexed by PatientID.
# Built once and reused for both splits instead of calling get_group() per patient twice.
first_outcome = df.drop_duplicates(subset='PatientID', keep='first').set_index('PatientID')['Outcome']
patients_outcome = first_outcome.loc[patients].to_numpy()

# Split off the test patients (20/100 of all patients), stratified by outcome
train_val_patients, test_patients = train_test_split(patients, test_size=20/100, random_state=SEED, stratify=patients_outcome)

# Split the remainder into train and val (10/80 of the remaining patients), stratified by outcome
train_val_patients_outcome = first_outcome.loc[train_val_patients].to_numpy()
train_patients, val_patients = train_test_split(train_val_patients, test_size=10/80, random_state=SEED, stratify=train_val_patients_outcome)


In [7]:
# Record-level train / val / test frames: keep the rows whose PatientID belongs to each split
train_df, val_df, test_df = (
    df[df['PatientID'].isin(split_patients)]
    for split_patients in (train_patients, val_patients, test_patients)
)


In [8]:
# Output folder for this split (forward-fill variant); created if it does not exist yet
save_dir = os.path.join(data_dir, 'processed', 'fold_0')
os.makedirs(save_dir, exist_ok=True)

# Optional raw export of the splits, kept disabled.
# NOTE(review): traincal_df and calib_df are not defined in the cells shown here,
# so the last two lines would fail if re-enabled as-is.
# train_df.to_csv(os.path.join(save_dir, "train_raw.csv"), index=False)
# val_df.to_csv(os.path.join(save_dir, "val_raw.csv"), index=False)
# test_df.to_csv(os.path.join(save_dir, "test_raw.csv"), index=False)
# traincal_df.to_csv(os.path.join(save_dir, "traincal_raw.csv"), index=False)
# calib_df.to_csv(os.path.join(save_dir, "calib_raw.csv"), index=False)

In [9]:
# Missing rate of every demographic + lab-test feature, measured on the train split only
feature_names = demographic_features + labtest_features
feature_missing_stats = calculate_missing_rate(train_df, list(feature_names))
feature_missing_array = [feature_missing_stats[name]['missing_rate'] for name in feature_names]

# gfe = 1 - missing rate, one entry per feature; saved next to the other artefacts
gfe = 1 - np.asarray(feature_missing_array)
pd.to_pickle(gfe, os.path.join(save_dir, "gfe.pkl"))


def _export_masks(split_df):
    """Run export_missing_mask_pipeline on one split, using the train-set missing rates."""
    return export_missing_mask_pipeline(split_df, feature_missing_array, demographic_features, labtest_features)


# Missing masks for each split (the same train-derived missing rates are passed for all three)
train_mask_string, train_mask_value = _export_masks(train_df)
val_mask_string, val_mask_value = _export_masks(val_df)
test_mask_string, test_mask_value = _export_masks(test_df)

# Time-interval terms derived from the string masks
train_td = get_time_interval_term(train_mask_string)
val_td = get_time_interval_term(val_mask_string)
test_td = get_time_interval_term(test_mask_string)

# Pickle the mask values and time-interval terms.
# The *_mask_string objects are not written to disk here.
for _stem, _payload in (
    ("train_mask_value", train_mask_value),
    ("val_mask_value", val_mask_value),
    ("test_mask_value", test_mask_value),
    ("train_td", train_td),
    ("val_td", val_td),
    ("test_td", test_td),
):
    pd.to_pickle(_payload, os.path.join(save_dir, f"{_stem}.pkl"))



In [10]:
# Normalise Age, the continuous lab tests and LOS with statistics taken from the train split.
# Per the original note, mean/std are computed on the data within the 5%-95% quantile
# range -- see utils.tools.normalize_dataframe for the exact rule.
# NOTE: train_df / val_df / test_df are overwritten with their normalised versions, so
# re-running this cell alone feeds already-normalised frames back into normalize_dataframe.
train_df, val_df, test_df, default_fill, los_info, train_mean, train_std = normalize_dataframe(
    train_df, val_df, test_df, normalize_features
)

# Optional export of the z-scored frames, kept disabled:
# train_df.to_csv(os.path.join(save_dir, "train_after_zscore.csv"), index=False)
# val_df.to_csv(os.path.join(save_dir, "val_after_zscore.csv"), index=False)
# test_df.to_csv(os.path.join(save_dir, "test_after_zscore.csv"), index=False)


def _forward_fill(split_df):
    """Run forward_fill_pipeline on one split with the shared feature lists and default_fill."""
    return forward_fill_pipeline(
        split_df, default_fill, demographic_features, labtest_features,
        target_features, require_impute_features,
    )


# Forward imputation, grouped by PatientID.
# Per the original note: when a patient never had a given lab test, the value is
# filled from default_fill (described as the train-set median).
train_x, train_y, train_pid = _forward_fill(train_df)
val_x, val_y, val_pid = _forward_fill(val_df)
test_x, test_y, test_pid = _forward_fill(test_df)

# Pickle the imputed datasets, plus the LOS statistics (calculated from the train set)
for _stem, _payload in (
    ("train_x", train_x),
    ("train_y", train_y),
    ("train_pid", train_pid),
    ("val_x", val_x),
    ("val_y", val_y),
    ("val_pid", val_pid),
    ("test_x", test_x),
    ("test_y", test_y),
    ("test_pid", test_pid),
    ("los_info", los_info),
):
    pd.to_pickle(_payload, os.path.join(save_dir, f"{_stem}.pkl"))