## Import packages

In [1]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from datasets.preprocess.tools import forward_fill_pipeline, normalize_dataframe, normalize_df_with_statistics

# Root folder of the MIMIC-IV data; the paths used in this notebook are built from it.
data_dir = "./datasets/mimic-iv/"

# Create the output folders up front (no error if they already exist).
for _subdir in ('processed', 'statistics'):
    Path(os.path.join(data_dir, _subdir)).mkdir(parents=True, exist_ok=True)

# Random seed handed to the train/validation/test splits.
SEED = 42

## Read data from files

### Record feature names

In [2]:
# Identifier and timestamp columns.
basic_records = ['PatientID', 'RecordTime', 'AdmissionTime', 'DischargeTime']
# Prediction targets.
target_features = ['Outcome', 'LOS', 'Readmission']
# Sex is a binary feature, Age is a continuous feature.
demographic_features = ['Sex', 'Age']

# Columns named '<measurement>-><category>'
# (presumably one-hot indicators of the recorded category -- confirm against the formatting step).
categorical_labtest_features = [
    'Capillary refill rate->0.0', 'Capillary refill rate->1.0',
    'Glascow coma scale eye opening->To Pain',
    'Glascow coma scale eye opening->3 To speech',
    'Glascow coma scale eye opening->1 No Response',
    'Glascow coma scale eye opening->4 Spontaneously',
    'Glascow coma scale eye opening->None',
    'Glascow coma scale eye opening->To Speech',
    'Glascow coma scale eye opening->Spontaneously',
    'Glascow coma scale eye opening->2 To pain',
    'Glascow coma scale motor response->1 No Response',
    'Glascow coma scale motor response->3 Abnorm flexion',
    'Glascow coma scale motor response->Abnormal extension',
    'Glascow coma scale motor response->No response',
    'Glascow coma scale motor response->4 Flex-withdraws',
    'Glascow coma scale motor response->Localizes Pain',
    'Glascow coma scale motor response->Flex-withdraws',
    'Glascow coma scale motor response->Obeys Commands',
    'Glascow coma scale motor response->Abnormal Flexion',
    'Glascow coma scale motor response->6 Obeys Commands',
    'Glascow coma scale motor response->5 Localizes Pain',
    'Glascow coma scale motor response->2 Abnorm extensn',
    'Glascow coma scale total->11', 'Glascow coma scale total->10',
    'Glascow coma scale total->13', 'Glascow coma scale total->12',
    'Glascow coma scale total->15', 'Glascow coma scale total->14',
    'Glascow coma scale total->3', 'Glascow coma scale total->5',
    'Glascow coma scale total->4', 'Glascow coma scale total->7',
    'Glascow coma scale total->6', 'Glascow coma scale total->9',
    'Glascow coma scale total->8',
    'Glascow coma scale verbal response->1 No Response',
    'Glascow coma scale verbal response->No Response',
    'Glascow coma scale verbal response->Confused',
    'Glascow coma scale verbal response->Inappropriate Words',
    'Glascow coma scale verbal response->Oriented',
    'Glascow coma scale verbal response->No Response-ETT',
    'Glascow coma scale verbal response->5 Oriented',
    'Glascow coma scale verbal response->Incomprehensible sounds',
    'Glascow coma scale verbal response->1.0 ET/Trach',
    'Glascow coma scale verbal response->4 Confused',
    'Glascow coma scale verbal response->2 Incomp sounds',
    'Glascow coma scale verbal response->3 Inapprop words',
]

# Numeric measurements. Defined once and reused below so the feature list and
# the normalisation list cannot drift apart.
continuous_labtest_features = [
    'Diastolic blood pressure', 'Fraction inspired oxygen', 'Glucose',
    'Heart Rate', 'Height', 'Mean blood pressure', 'Oxygen saturation',
    'Respiratory rate', 'Systolic blood pressure', 'Temperature', 'Weight',
    'pH',
]

# Full lab-test feature list: categorical columns first, then the numeric ones.
labtest_features = categorical_labtest_features + continuous_labtest_features

# Every lab-test column is subject to imputation (same list object, not a copy).
require_impute_features = labtest_features

# Columns handed to the normalisation step: Age, the numeric measurements and LOS.
normalize_features = ['Age'] + continuous_labtest_features + ['LOS']

In [3]:
# Load the formatted EHR table from <data_dir>/processed.
df = pd.read_csv(os.path.join(data_dir, "processed", "format_mimic4_ehr.csv"))

In [4]:
# Keep only patients that have at least 48 records, and of those keep only the
# first 48 records (in RecordTime order).
# NOTE(review): this counts records, not hours -- the displayed result still
# contains a row with RecordTime 49, so "first 48 records" is not necessarily
# "first 48 hours". Confirm which one is intended.
n_records = 48

# Sort by PatientID and RecordTime so that "first" means earliest record
df = df.sort_values(['PatientID', 'RecordTime'])

# Number of records of the patient each row belongs to, computed in one
# vectorised pass instead of calling a Python lambda once per patient
records_per_patient = df.groupby('PatientID')['PatientID'].transform('size')

# Drop patients with fewer than n_records records
df = df[records_per_patient >= n_records]

# Select the first n_records records of each remaining patient
df = df.groupby('PatientID').head(n_records)

df


Unnamed: 0,PatientID,RecordTime,AdmissionTime,DischargeTime,Outcome,LOS,Readmission,Decompensation,Acute and unspecified renal failure,Acute cerebrovascular disease,...,Glucose,Heart Rate,Height,Mean blood pressure,Oxygen saturation,Respiratory rate,Systolic blood pressure,Temperature,Weight,pH
101,10001884_1,1,2131-01-11 04:20:05,2131-01-20 08:27:30,1,219.123611,1,0.0,0,0,...,,60.0,,70.0,98.0,10.0,167.0,,,
102,10001884_1,2,2131-01-11 04:20:05,2131-01-20 08:27:30,1,218.123611,1,0.0,0,0,...,,72.0,,75.0,100.0,20.0,102.0,,,
103,10001884_1,3,2131-01-11 04:20:05,2131-01-20 08:27:30,1,217.123611,1,0.0,0,0,...,140.0,70.0,,73.0,100.0,20.0,93.0,35.400000,,7.33
104,10001884_1,4,2131-01-11 04:20:05,2131-01-20 08:27:30,1,216.123611,1,0.0,0,0,...,,71.0,,80.0,100.0,20.0,102.0,35.800000,,
105,10001884_1,5,2131-01-11 04:20:05,2131-01-20 08:27:30,1,215.123611,1,0.0,0,0,...,,71.0,,92.0,98.0,20.0,138.0,36.300000,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4055287,19999442_1,46,2148-11-19 14:23:43,2148-11-26 13:12:15,0,120.808889,0,0.0,0,1,...,,61.0,,91.0,99.0,16.0,142.0,37.055556,,
4055288,19999442_1,47,2148-11-19 14:23:43,2148-11-26 13:12:15,0,119.808889,0,0.0,0,1,...,,61.0,,93.0,97.0,15.0,151.0,,,
4055289,19999442_1,48,2148-11-19 14:23:43,2148-11-26 13:12:15,0,118.808889,0,0.0,0,1,...,,55.0,,93.0,98.0,10.0,135.0,,,
4055290,19999442_1,49,2148-11-19 14:23:43,2148-11-26 13:12:15,0,117.808889,0,0.0,0,1,...,,53.0,,94.0,97.0,15.0,137.0,,,


In [5]:
def assign_group(rt, group_size=12):
    """Map a 1-based record time to the index of its fixed-size window.

    With the default ``group_size`` of 12, record times 1..12 map to 0,
    13..24 map to 1, and so on. Floor division is used, so a record time
    of 0 maps to -1.

    Args:
        rt: Record time; a scalar or anything supporting ``-`` and ``//``
            elementwise (e.g. a pandas Series).
        group_size: Number of consecutive record times per window.

    Returns:
        The window index, of the same kind as ``rt``.
    """
    return (rt - 1) // group_size

# Label every record with the index of the window it falls in.
# assign_group is plain arithmetic, so it is applied to the whole column at
# once; assign() returns a new frame instead of writing into one that was
# derived by selection.
df = df.assign(Group=assign_group(df['RecordTime']))

# Every demographic, lab-test and target column is averaged within a window.
aggregation_logic = {
    column: 'mean'
    for column in demographic_features + labtest_features + ['Outcome', 'LOS', 'Readmission']
}

# One row per (PatientID, Group), with the grouping keys restored as columns
aggregated_df = (
    df.groupby(['PatientID', 'Group'])
    .agg(aggregation_logic)
    .reset_index()
)

aggregated_df

Unnamed: 0,PatientID,Group,Sex,Age,Capillary refill rate->0.0,Capillary refill rate->1.0,Glascow coma scale eye opening->To Pain,Glascow coma scale eye opening->3 To speech,Glascow coma scale eye opening->1 No Response,Glascow coma scale eye opening->4 Spontaneously,...,Mean blood pressure,Oxygen saturation,Respiratory rate,Systolic blood pressure,Temperature,Weight,pH,Outcome,LOS,Readmission
0,10001884_1,0,0.0,68.0,,,0.200000,0.0,0.0,0.0,...,84.083333,98.727273,17.916667,115.000000,36.411111,,7.340000,1.0,213.623611,1.0
1,10001884_1,1,0.0,68.0,,,0.000000,0.0,0.0,0.0,...,91.500000,99.000000,18.583333,128.916667,36.720000,,7.340000,1.0,201.623611,1.0
2,10001884_1,2,0.0,68.0,,,0.000000,0.0,0.0,0.0,...,94.250000,98.272727,20.250000,131.833333,36.908333,,,1.0,189.623611,1.0
3,10001884_1,3,0.0,68.0,,,0.000000,0.0,0.0,0.0,...,103.250000,94.416667,23.166667,153.083333,36.761111,,7.400000,1.0,177.623611,1.0
4,10002155_1,-1,0.0,80.0,,,,,,,...,,,18.000000,,,53.000000,,0.0,148.293889,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102215,19999442_1,0,1.0,41.0,,,0.285714,0.0,0.0,0.0,...,102.125000,99.375000,17.500000,136.000000,37.194444,,7.433333,0.0,159.308889,0.0
102216,19999442_1,1,1.0,41.0,,,0.000000,0.0,0.0,0.0,...,92.250000,99.583333,17.333333,122.916667,37.314815,,7.440000,0.0,148.308889,0.0
102217,19999442_1,2,1.0,41.0,,,0.100000,0.0,0.0,0.0,...,95.666667,97.250000,17.666667,138.750000,36.759259,107.274508,,0.0,136.308889,0.0
102218,19999442_1,3,1.0,41.0,,,0.000000,0.0,0.0,0.0,...,93.833333,96.500000,14.916667,141.000000,36.870370,,,0.0,124.308889,0.0


In [6]:
# From here on `df` holds one row per (patient, window): the window index takes
# over the RecordTime column and LOS is overwritten with 5 - RecordTime.
# A new frame is built so that aggregated_df is not modified behind the
# reader's back after it has been displayed.
# NOTE(review): the constant 5 is not explained in this notebook, and the
# window index can be -1 in the displayed data (giving LOS 6) -- confirm the
# intended definition of this target.
df = (
    aggregated_df
    .rename(columns={'Group': 'RecordTime'})
    .assign(LOS=lambda frame: 5 - frame['RecordTime'])
)
df

Unnamed: 0,PatientID,RecordTime,Sex,Age,Capillary refill rate->0.0,Capillary refill rate->1.0,Glascow coma scale eye opening->To Pain,Glascow coma scale eye opening->3 To speech,Glascow coma scale eye opening->1 No Response,Glascow coma scale eye opening->4 Spontaneously,...,Mean blood pressure,Oxygen saturation,Respiratory rate,Systolic blood pressure,Temperature,Weight,pH,Outcome,LOS,Readmission
0,10001884_1,0,0.0,68.0,,,0.200000,0.0,0.0,0.0,...,84.083333,98.727273,17.916667,115.000000,36.411111,,7.340000,1.0,5,1.0
1,10001884_1,1,0.0,68.0,,,0.000000,0.0,0.0,0.0,...,91.500000,99.000000,18.583333,128.916667,36.720000,,7.340000,1.0,4,1.0
2,10001884_1,2,0.0,68.0,,,0.000000,0.0,0.0,0.0,...,94.250000,98.272727,20.250000,131.833333,36.908333,,,1.0,3,1.0
3,10001884_1,3,0.0,68.0,,,0.000000,0.0,0.0,0.0,...,103.250000,94.416667,23.166667,153.083333,36.761111,,7.400000,1.0,2,1.0
4,10002155_1,-1,0.0,80.0,,,,,,,...,,,18.000000,,,53.000000,,0.0,6,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102215,19999442_1,0,1.0,41.0,,,0.285714,0.0,0.0,0.0,...,102.125000,99.375000,17.500000,136.000000,37.194444,,7.433333,0.0,5,0.0
102216,19999442_1,1,1.0,41.0,,,0.000000,0.0,0.0,0.0,...,92.250000,99.583333,17.333333,122.916667,37.314815,,7.440000,0.0,4,0.0
102217,19999442_1,2,1.0,41.0,,,0.100000,0.0,0.0,0.0,...,95.666667,97.250000,17.666667,138.750000,36.759259,107.274508,,0.0,3,0.0
102218,19999442_1,3,1.0,41.0,,,0.000000,0.0,0.0,0.0,...,93.833333,96.500000,14.916667,141.000000,36.870370,,,0.0,2,0.0


## Stratified split dataset into `Training`, `Validation` and `Test` sets (ML/DL settings)

- Stratified dataset according to `Outcome` column
- 50% Training, 49% Validation, 1% Test (1% of patients are held out for test; the remaining 99% are split 50:49)
  - Name: train, val, test


In [7]:
# Group the dataframe by patient ID
grouped = df.groupby('PatientID')

# Patient IDs, and the Outcome found on each patient's first row (the
# stratification label). The outcomes are looked up from a single Series
# instead of calling get_group once per patient.
patients = np.array(list(grouped.groups.keys()))
first_outcome = df.drop_duplicates(subset='PatientID').set_index('PatientID')['Outcome']
patients_outcome = first_outcome.loc[patients].to_numpy()

# Hold out 1% of the patients as the test set
train_val_patients, test_patients = train_test_split(patients, test_size=1/100, random_state=SEED, stratify=patients_outcome)

# Split the remaining 99% into 50 parts train and 49 parts validation
train_val_patients_outcome = first_outcome.loc[train_val_patients].to_numpy()
train_patients, val_patients = train_test_split(train_val_patients, test_size=49/99, random_state=SEED, stratify=train_val_patients_outcome)

# Create train, val, test dataframes (all rows of a patient go to the same split)
train_df = df[df['PatientID'].isin(train_patients)]
val_df = df[df['PatientID'].isin(val_patients)]
test_df = df[df['PatientID'].isin(test_patients)]
save_dir = os.path.join(data_dir, 'processed', 'fold_ml') # forward fill
Path(save_dir).mkdir(parents=True, exist_ok=True)

# Calculate the mean and std of the train set (include age, lab test features, and LOS) on the data in 5% to 95% quantile range
train_df, val_df, test_df, default_fill, los_info, train_mean, train_std = normalize_dataframe(train_df, val_df, test_df, normalize_features)

# Forward Imputation after grouped by PatientID
# Notice: if a patient has never done certain lab test, the imputed value will be the median value calculated from train set
train_x, train_y, train_pid, _ = forward_fill_pipeline(train_df, default_fill, demographic_features, labtest_features, target_features, require_impute_features)
val_x, val_y, val_pid, _ = forward_fill_pipeline(val_df, default_fill, demographic_features, labtest_features, target_features, require_impute_features)
test_x, test_y, test_pid, _ = forward_fill_pipeline(test_df, default_fill, demographic_features, labtest_features, target_features, require_impute_features)

# Save the imputed dataset to pickle files, one "<name>.pkl" per object
ml_artifacts = {
    "train_x": train_x, "train_y": train_y, "train_pid": train_pid,
    "val_x": val_x, "val_y": val_y, "val_pid": val_pid,
    "test_x": test_x, "test_y": test_y, "test_pid": test_pid,
    "los_info": los_info,  # LOS statistics (calculated from the train set)
}
for artifact_name, artifact in ml_artifacts.items():
    pd.to_pickle(artifact, os.path.join(save_dir, f"{artifact_name}.pkl"))

### Hold-out dataset setting (Stratified) LLM settings

- 50% training, 49% validation, 1% testing (1% of patients are held out for testing; the remaining 99% are split 50:49)


In [None]:
# Group the dataframe by patient ID
grouped = df.groupby('PatientID')

# Patient IDs, and the Outcome found on each patient's first row (the
# stratification label). The outcomes are looked up from a single Series
# instead of calling get_group once per patient.
patients = np.array(list(grouped.groups.keys()))
first_outcome = df.drop_duplicates(subset='PatientID').set_index('PatientID')['Outcome']
patients_outcome = first_outcome.loc[patients].to_numpy()

# Hold out 1% of the patients as the test set
train_val_patients, test_patients = train_test_split(patients, test_size=1/100, random_state=SEED, stratify=patients_outcome)

# Split the remaining 99% into 50 parts train and 49 parts validation
train_val_patients_outcome = first_outcome.loc[train_val_patients].to_numpy()
train_patients, val_patients = train_test_split(train_val_patients, test_size=49/99, random_state=SEED, stratify=train_val_patients_outcome)

# Create train, val, test dataframes (all rows of a patient go to the same split)
train_df = df[df['PatientID'].isin(train_patients)]
val_df = df[df['PatientID'].isin(val_patients)]
test_df = df[df['PatientID'].isin(test_patients)]
save_dir = os.path.join(data_dir, 'processed', 'fold_llm') # forward fill
Path(save_dir).mkdir(parents=True, exist_ok=True)


# Only the default fill values are kept here; normalize_dataframe is called with require_norm_later=False
default_fill = normalize_dataframe(train_df, val_df, test_df, normalize_features, require_norm_later=False)

# Forward Imputation after grouped by PatientID
# Notice: if a patient has never done certain lab test, the imputed value will be the median value calculated from train set

test_x, test_y, test_pid, test_x_record_times = forward_fill_pipeline(test_df, default_fill, demographic_features, labtest_features, target_features, require_impute_features)

# Save the imputed test set to pickle files, one "<name>.pkl" per object.
# NOTE: los_info is not computed in this cell. It is whatever an earlier cell
# left in the kernel (the fold_ml cell produces it from a split made with the
# same SEED and the same test sizes), so that cell has to run first.
llm_artifacts = {
    "test_x": test_x,
    "test_y": test_y,
    "test_pid": test_pid,
    "los_info": los_info,  # LOS statistics (calculated from the train set)
    "test_x_record_times": test_x_record_times,
}
for artifact_name, artifact in llm_artifacts.items():
    pd.to_pickle(artifact, os.path.join(save_dir, f"{artifact_name}.pkl"))

# Column order of the feature vectors: demographics first, then lab tests
all_features = demographic_features + labtest_features
pd.to_pickle(all_features, os.path.join(save_dir, "all_features.pkl"))

In [8]:
# Export the LLM test set a second time without any imputation.
# Paths are built from data_dir, like everywhere else in the notebook, instead
# of repeating the folder as a string literal.
llm_dir = os.path.join(data_dir, 'processed', 'fold_llm')

# Reuse the test patient IDs that were saved with the imputed LLM test set
test_patients = pd.read_pickle(os.path.join(llm_dir, 'test_pid.pkl'))
test_df = df[df['PatientID'].isin(test_patients)]

# No default fill values and an empty impute list are passed to the pipeline.
# Note that test_x / test_y / test_pid / test_x_record_times are rebound here.
test_x, test_y, test_pid, test_x_record_times = forward_fill_pipeline(test_df, None, demographic_features, labtest_features, target_features, [])
pd.to_pickle(test_x, os.path.join(llm_dir, 'test_x_no_impute.pkl'))