## Import packages

In [10]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from utils.tools import forward_fill_pipeline, normalize_dataframe, normalize_df_with_statistics, calculate_missing_rate, export_missing_mask_pipeline, get_time_interval_term

# Root folder of the MIMIC-IV data; the paths used below are built from it.
data_dir = "./mimic-iv/"

# Make sure the output folders exist before anything is written into them.
for subfolder in ('processed', 'statistics'):
    os.makedirs(os.path.join(data_dir, subfolder), exist_ok=True)

# Seed passed as `random_state` to the dataset splits.
SEED = 42

## Read data from files

### Record feature names

In [2]:
# Identifier and timestamp columns that accompany every record.
basic_records = ['PatientID', 'RecordTime', 'AdmissionTime', 'DischargeTime']

# Label columns (passed to the forward-fill pipeline as the targets).
target_features = ['Outcome', 'LOS', 'Readmission']

# Static patient attributes. Per the original note, Sex is a binary feature
# and Age is continuous.
demographic_features = ['Sex', 'Age']

# Columns named "<measurement>-><category>".
# NOTE(review): these look like one-hot indicator columns - confirm against the
# formatting step that produced the CSV.
categorical_labtest_features = [
    'Capillary refill rate->0.0',
    'Capillary refill rate->1.0',
    'Glascow coma scale eye opening->To Pain',
    'Glascow coma scale eye opening->3 To speech',
    'Glascow coma scale eye opening->1 No Response',
    'Glascow coma scale eye opening->4 Spontaneously',
    'Glascow coma scale eye opening->None',
    'Glascow coma scale eye opening->To Speech',
    'Glascow coma scale eye opening->Spontaneously',
    'Glascow coma scale eye opening->2 To pain',
    'Glascow coma scale motor response->1 No Response',
    'Glascow coma scale motor response->3 Abnorm flexion',
    'Glascow coma scale motor response->Abnormal extension',
    'Glascow coma scale motor response->No response',
    'Glascow coma scale motor response->4 Flex-withdraws',
    'Glascow coma scale motor response->Localizes Pain',
    'Glascow coma scale motor response->Flex-withdraws',
    'Glascow coma scale motor response->Obeys Commands',
    'Glascow coma scale motor response->Abnormal Flexion',
    'Glascow coma scale motor response->6 Obeys Commands',
    'Glascow coma scale motor response->5 Localizes Pain',
    'Glascow coma scale motor response->2 Abnorm extensn',
    'Glascow coma scale total->11',
    'Glascow coma scale total->10',
    'Glascow coma scale total->13',
    'Glascow coma scale total->12',
    'Glascow coma scale total->15',
    'Glascow coma scale total->14',
    'Glascow coma scale total->3',
    'Glascow coma scale total->5',
    'Glascow coma scale total->4',
    'Glascow coma scale total->7',
    'Glascow coma scale total->6',
    'Glascow coma scale total->9',
    'Glascow coma scale total->8',
    'Glascow coma scale verbal response->1 No Response',
    'Glascow coma scale verbal response->No Response',
    'Glascow coma scale verbal response->Confused',
    'Glascow coma scale verbal response->Inappropriate Words',
    'Glascow coma scale verbal response->Oriented',
    'Glascow coma scale verbal response->No Response-ETT',
    'Glascow coma scale verbal response->5 Oriented',
    'Glascow coma scale verbal response->Incomprehensible sounds',
    'Glascow coma scale verbal response->1.0 ET/Trach',
    'Glascow coma scale verbal response->4 Confused',
    'Glascow coma scale verbal response->2 Incomp sounds',
    'Glascow coma scale verbal response->3 Inapprop words',
]

# Numeric measurements; these are the lab-test columns that get normalised.
continuous_labtest_features = [
    'Diastolic blood pressure',
    'Fraction inspired oxygen',
    'Glucose',
    'Heart Rate',
    'Height',
    'Mean blood pressure',
    'Oxygen saturation',
    'Respiratory rate',
    'Systolic blood pressure',
    'Temperature',
    'Weight',
    'pH',
]

# Full lab-test feature list: categorical columns first, then the numeric ones.
labtest_features = categorical_labtest_features + continuous_labtest_features

# Every lab-test column is subject to imputation (same list object, not a copy).
require_impute_features = labtest_features

# Columns to normalise: Age, the numeric lab tests, and the LOS target.
normalize_features = ['Age'] + continuous_labtest_features + ['LOS']

In [3]:
# Load the formatted MIMIC-IV EHR table from the "processed" folder.
ehr_csv_path = os.path.join(data_dir, "processed", "format_mimic4_ehr.csv")
df = pd.read_csv(ehr_csv_path)
df

Unnamed: 0,PatientID,RecordTime,AdmissionTime,DischargeTime,Outcome,LOS,Readmission,Decompensation,Acute and unspecified renal failure,Acute cerebrovascular disease,...,Glucose,Heart Rate,Height,Mean blood pressure,Oxygen saturation,Respiratory rate,Systolic blood pressure,Temperature,Weight,pH
0,10000032_1,0,2180-07-23 14:00:00,2180-07-23 23:50:47,0,9.846389,0,,0,0,...,,94.0,,64.0,97.0,23.0,88.0,37.055556,39.326426,
1,10000032_1,1,2180-07-23 14:00:00,2180-07-23 23:50:47,0,8.846389,0,0.0,0,0,...,,105.0,,,94.0,21.0,,,,
2,10000032_1,2,2180-07-23 14:00:00,2180-07-23 23:50:47,0,7.846389,0,0.0,0,0,...,,97.0,,67.0,95.0,20.0,95.0,37.055556,,
3,10000032_1,3,2180-07-23 14:00:00,2180-07-23 23:50:47,0,6.846389,0,0.0,0,0,...,,100.0,,60.0,95.0,21.0,86.0,,,
4,10000032_1,4,2180-07-23 14:00:00,2180-07-23 23:50:47,0,5.846389,0,0.0,0,0,...,,97.0,,56.0,98.0,16.0,93.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4055514,19999987_1,43,2145-11-02 22:59:00,2145-11-04 21:29:30,0,3.508333,0,0.0,1,1,...,,,,,,,,,,
4055515,19999987_1,44,2145-11-02 22:59:00,2145-11-04 21:29:30,0,2.508333,0,0.0,1,1,...,,93.0,,,,20.0,,,,
4055516,19999987_1,45,2145-11-02 22:59:00,2145-11-04 21:29:30,0,1.508333,0,0.0,1,1,...,,92.0,,131.0,100.0,23.0,163.0,37.833333,,
4055517,19999987_1,46,2145-11-02 22:59:00,2145-11-04 21:29:30,0,0.508333,0,0.0,1,1,...,,87.0,,,98.0,24.0,,37.000000,,


In [4]:
# Keep only patients that have at least MIN_RECORDS records, and truncate each
# of them to their first MIN_RECORDS records (in RecordTime order).
# Patients with fewer records are discarded entirely.
MIN_RECORDS = 48

# Sort so that "first" means lowest RecordTime within each patient.
df = df.sort_values(['PatientID', 'RecordTime'])

# Size of each row's patient group, aligned to df's index. This is a vectorised
# replacement for groupby(...).filter(lambda x: len(x) >= 48), which calls a
# Python function once per patient.
records_per_patient = df.groupby('PatientID')['PatientID'].transform('size')
df = df[records_per_patient >= MIN_RECORDS]

# Select the first MIN_RECORDS records for each remaining patient.
df = df.groupby('PatientID').head(MIN_RECORDS)


In [5]:
# Inspect the frame after the 48-record filtering/truncation step.
df

Unnamed: 0,PatientID,RecordTime,AdmissionTime,DischargeTime,Outcome,LOS,Readmission,Decompensation,Acute and unspecified renal failure,Acute cerebrovascular disease,...,Glucose,Heart Rate,Height,Mean blood pressure,Oxygen saturation,Respiratory rate,Systolic blood pressure,Temperature,Weight,pH
101,10001884_1,1,2131-01-11 04:20:05,2131-01-20 08:27:30,1,219.123611,1,0.0,0,0,...,,60.0,,70.0,98.0,10.0,167.0,,,
102,10001884_1,2,2131-01-11 04:20:05,2131-01-20 08:27:30,1,218.123611,1,0.0,0,0,...,,72.0,,75.0,100.0,20.0,102.0,,,
103,10001884_1,3,2131-01-11 04:20:05,2131-01-20 08:27:30,1,217.123611,1,0.0,0,0,...,140.0,70.0,,73.0,100.0,20.0,93.0,35.400000,,7.33
104,10001884_1,4,2131-01-11 04:20:05,2131-01-20 08:27:30,1,216.123611,1,0.0,0,0,...,,71.0,,80.0,100.0,20.0,102.0,35.800000,,
105,10001884_1,5,2131-01-11 04:20:05,2131-01-20 08:27:30,1,215.123611,1,0.0,0,0,...,,71.0,,92.0,98.0,20.0,138.0,36.300000,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4055287,19999442_1,46,2148-11-19 14:23:43,2148-11-26 13:12:15,0,120.808889,0,0.0,0,1,...,,61.0,,91.0,99.0,16.0,142.0,37.055556,,
4055288,19999442_1,47,2148-11-19 14:23:43,2148-11-26 13:12:15,0,119.808889,0,0.0,0,1,...,,61.0,,93.0,97.0,15.0,151.0,,,
4055289,19999442_1,48,2148-11-19 14:23:43,2148-11-26 13:12:15,0,118.808889,0,0.0,0,1,...,,55.0,,93.0,98.0,10.0,135.0,,,
4055290,19999442_1,49,2148-11-19 14:23:43,2148-11-26 13:12:15,0,117.808889,0,0.0,0,1,...,,53.0,,94.0,97.0,15.0,137.0,,,


## Stratified split dataset into `Training`, `Validation` and `Test` sets

- Stratify the dataset by the `Outcome` column
- Baseline: 70% Training, 10% Validation, 20% Test (No Calibration)
    - Name: train, val, test
- Calibration Method: 65% Training, 10% Validation, 5% Calibration, 20% Test
    - Name: traincal, val, calib, test

The validation and test sets are the same for both methods; in other words, traincal + calib = train.

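1. test: 20/100 of all patients
2. val: 10/80 of the patients remaining after the test split
3. calib: 5/70 of the patients remaining after the test and validation splits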

In [6]:
# Split at the patient level (not the record level): the units being split are
# PatientIDs, stratified by each patient's Outcome.
grouped = df.groupby('PatientID')

# Patient IDs in group-key order; this order feeds the seeded split below.
patients = np.array(list(grouped.groups.keys()))

# Outcome of the first row of every patient, indexed by PatientID.
# Built once and reused for both splits instead of calling get_group() per
# patient in two separate Python loops.
first_outcome = grouped.head(1).set_index('PatientID')['Outcome']
patients_outcome = first_outcome.loc[patients].to_numpy()

# Step 1: hold out 20/100 of all patients as the test set.
train_val_patients, test_patients = train_test_split(patients, test_size=20/100, random_state=SEED, stratify=patients_outcome)

# Step 2: hold out 10/80 of the remaining patients as the validation set.
train_val_patients_outcome = first_outcome.loc[train_val_patients].to_numpy()
train_patients, val_patients = train_test_split(train_val_patients, test_size=10/80, random_state=SEED, stratify=train_val_patients_outcome)


In [7]:
# Build the train / val / test frames: each one holds every row of `df` whose
# PatientID belongs to the corresponding patient split.
train_df, val_df, test_df = (
    df.loc[df['PatientID'].isin(split_patient_ids)]
    for split_patient_ids in (train_patients, val_patients, test_patients)
)


In [8]:
# Output folder for the artifacts written by the following cells
# (the original note tags this folder as the forward-fill variant).
save_dir = os.path.join(data_dir, 'processed', 'fold_0')
os.makedirs(save_dir, exist_ok=True)

# Optional raw CSV dumps of the splits, left disabled.
# NOTE(review): traincal_df / calib_df are not created in the cells shown here;
# define them before re-enabling the last two lines.
# train_df.to_csv(os.path.join(save_dir, "train_raw.csv"), index=False)
# val_df.to_csv(os.path.join(save_dir, "val_raw.csv"), index=False)
# test_df.to_csv(os.path.join(save_dir, "test_raw.csv"), index=False)
# traincal_df.to_csv(os.path.join(save_dir, "traincal_raw.csv"), index=False)
# calib_df.to_csv(os.path.join(save_dir, "calib_raw.csv"), index=False)

In [11]:
# Per-feature missing rate, measured on the training split only, for the
# demographic + lab-test columns (in that order).
mask_features = demographic_features + labtest_features
feature_missing_stats = calculate_missing_rate(train_df, mask_features)
feature_missing_array = [feature_missing_stats[name]['missing_rate'] for name in mask_features]

# gfe holds (1 - missing rate) for every feature, in the same order.
gfe = np.array([1 - rate for rate in feature_missing_array])
pd.to_pickle(gfe, os.path.join(save_dir, "gfe.pkl"))

# Export the missing-mask outputs for each split. All three calls use the
# missing rates computed from the training split.
mask_args = (feature_missing_array, demographic_features, labtest_features)
train_mask_string, train_mask_value = export_missing_mask_pipeline(train_df, *mask_args)
val_mask_string, val_mask_value = export_missing_mask_pipeline(val_df, *mask_args)
test_mask_string, test_mask_value = export_missing_mask_pipeline(test_df, *mask_args)

# Time-interval terms derived from the mask strings
# (semantics live in utils.tools.get_time_interval_term - not visible here).
train_td = get_time_interval_term(train_mask_string)
val_td = get_time_interval_term(val_mask_string)
test_td = get_time_interval_term(test_mask_string)

# Persist the mask values and time-interval terms per split.
# The *_mask_string objects are intentionally not saved (their dumps were
# already disabled in the original cell).
for split_name, split_mask_value, split_td in (
    ("train", train_mask_value, train_td),
    ("val", val_mask_value, val_td),
    ("test", test_mask_value, test_td),
):
    pd.to_pickle(split_mask_value, os.path.join(save_dir, f"{split_name}_mask_value.pkl"))
    pd.to_pickle(split_td, os.path.join(save_dir, f"{split_name}_td.pkl"))



In [12]:
# Normalise the columns in `normalize_features` (Age, numeric lab tests, LOS).
# Per the original note, the mean/std come from the train set, computed on the
# data in the 5% to 95% quantile range - that logic lives in
# utils.tools.normalize_dataframe and is not visible here.
# NOTE: train_df / val_df / test_df are rebound to the normalised frames.
(
    train_df,
    val_df,
    test_df,
    default_fill,
    los_info,
    train_mean,
    train_std,
) = normalize_dataframe(train_df, val_df, test_df, normalize_features)

# Optional CSV dumps of the z-scored frames, left disabled.
# train_df.to_csv(os.path.join(save_dir, "train_after_zscore.csv"), index=False)
# val_df.to_csv(os.path.join(save_dir, "val_after_zscore.csv"), index=False)
# test_df.to_csv(os.path.join(save_dir, "test_after_zscore.csv"), index=False)

# Forward imputation, grouped by PatientID.
# Per the original note: if a patient never had a given lab test, the value is
# filled with the median calculated from the train set (supplied through
# `default_fill`).
fill_args = (default_fill, demographic_features, labtest_features, target_features, require_impute_features)
train_x, train_y, train_pid = forward_fill_pipeline(train_df, *fill_args)
val_x, val_y, val_pid = forward_fill_pipeline(val_df, *fill_args)
test_x, test_y, test_pid = forward_fill_pipeline(test_df, *fill_args)

# Persist the imputed datasets. Each entry is written to "<key>.pkl" inside
# save_dir; los_info holds the LOS statistics calculated from the train set.
artifacts = {
    "train_x": train_x,
    "train_y": train_y,
    "train_pid": train_pid,
    "val_x": val_x,
    "val_y": val_y,
    "val_pid": val_pid,
    "test_x": test_x,
    "test_y": test_y,
    "test_pid": test_pid,
    "los_info": los_info,
}
for artifact_name, artifact in artifacts.items():
    pd.to_pickle(artifact, os.path.join(save_dir, f"{artifact_name}.pkl"))