Based on https://github.com/YerevaNN/mimic3-benchmarks & https://github.com/kaggarwal/ClinicalNotesICU

Import packages

In [None]:
import os
import pickle

from mimic3models.in_hospital_mortality import utils as ihm_utils
from mimic3benchmark.readers import InHospitalMortalityReader
from mimic3models.preprocessing import Discretizer, Normalizer

Config

In [None]:
# Discretization settings: `timestep` is handed to the Discretizer (as a float)
# and `imputation` is used as its impute strategy.
timestep = 1.0
imputation = "previous"

# Root of the in-hospital-mortality benchmark; the 'train' and 'test'
# sub-directories (each with a listfile.csv) are resolved beneath it.
ihm_dir = "data/in-hospital-mortality/"

# Saved normalizer parameters that are loaded via Normalizer.load_params.
ihm_normalizer_state = (
    "mimic3models/in_hospital_mortality/"
    "ihm_ts1.0.input_str_previous.start_time_zero.normalizer"
)

# Output root; the pickled splits are written to <data_dir>/ihm.
data_dir = "./datasets/mimiciii/"

Build readers, discretizers, normalizers

In [None]:
def _make_ihm_reader(split):
    """Return an InHospitalMortalityReader for one split ('train' or 'test').

    The reader is pointed at ``<ihm_dir>/<split>`` and at the ``listfile.csv``
    inside that directory. ``period_length=48.0`` is passed through unchanged
    (presumably the observation window in hours -- confirm against the reader).
    """
    split_dir = os.path.join(ihm_dir, split)
    return InHospitalMortalityReader(
        dataset_dir=split_dir,
        listfile=os.path.join(split_dir, 'listfile.csv'),
        period_length=48.0,
    )


train_reader = _make_ihm_reader('train')
test_reader = _make_ihm_reader('test')

# Discretizer configured from the notebook settings, with masks stored and
# the start time fixed to 'zero'.
discretizer = Discretizer(
    timestep=float(timestep),
    store_masks=True,
    impute_strategy=imputation,
    start_time='zero',
)

# Run the discretizer on the first training example only to obtain the
# comma-separated header (second element of the transform result).
first_example = train_reader.read_example(0)
header_line = discretizer.transform(first_example["X"])[1]
discretizer_header = header_line.split(',')

# Indices of header columns whose name does not contain "->".
# NOTE(review): presumably "->" marks one-hot categorical columns, leaving the
# continuous channels -- confirm against the Discretizer header format.
cont_channels = [
    idx for idx, column in enumerate(discretizer_header) if "->" not in column
]

Choose here which columns to standardize

In [None]:
# Create a Normalizer restricted to the selected channels and restore its
# saved parameters from disk.
normalizer = Normalizer(fields=cont_channels)
normalizer_state = ihm_normalizer_state
if normalizer_state is None:
    # No explicit state file configured: derive the default file name from the
    # discretization settings and look for it next to this source file.
    # NOTE(review): this file looks like a notebook export; __file__ is
    # typically undefined in an interactive kernel, so this branch would
    # likely raise NameError if it were ever reached -- confirm.
    default_state_name = 'ihm_ts{}.input_str_{}.start_time_zero.normalizer'.format(
        timestep, imputation)
    normalizer_state = os.path.join(os.path.dirname(__file__), default_state_name)
normalizer.load_params(normalizer_state)

# NOTE(review): the normalizer loaded above is dropped here, so load_data is
# called with normalizer=None for both splits. Presumably this is intentional
# (keeping the data un-normalized) -- confirm before removing either part.
normalizer = None

# Both splits are loaded in full, with the example names returned as well.
load_options = {'small_part': False, 'return_names': True}
train_ihm = ihm_utils.load_data(
    train_reader, discretizer, normalizer, **load_options)
test_ihm = ihm_utils.load_data(
    test_reader, discretizer, normalizer, **load_options)

# Quick sanity check on how many examples each split contains.
print("Number of train_ihm_names: ", len(train_ihm['names']))
print("Number of test_ihm_names: ", len(test_ihm['names']))

Save the train and test data as pickle files

In [None]:
# Directory that receives the pickled splits: <data_dir>/ihm.
save_dir = os.path.join(data_dir, 'ihm')

# open(..., 'wb') does not create missing parent directories, so make sure the
# target exists first; otherwise the run fails only after all data is loaded.
os.makedirs(save_dir, exist_ok=True)

# Only the 'data' entry of each loaded split is persisted (the 'names' entry
# is not written), using the highest pickle protocol available.
with open(os.path.join(save_dir, 'train.pkl'), 'wb') as f:
    pickle.dump(train_ihm['data'], f, protocol=pickle.HIGHEST_PROTOCOL)

with open(os.path.join(save_dir, 'test.pkl'), 'wb') as f:
    pickle.dump(test_ihm['data'], f, protocol=pickle.HIGHEST_PROTOCOL)