In [1]:
import os
import numpy as np
import pickle
import json
import math
from reader import InHospitalMortalityReader, PhenotypingReader, LengthOfStayReader, DecompensationReader
from tqdm import tqdm

In [2]:
with open('resources/channel_info.json') as f:
    series_channel_info = json.load(f)

with open('resources/discretizer_config.json') as f:
    series_config = json.load(f)
    id_to_channel = series_config['id_to_channel']
    is_categorical_channel = series_config['is_categorical_channel']
    normal_values = series_config['normal_values']
    possible_values = series_config['possible_values']

In [3]:
def read_chunk(reader, chunk_size):
    data = {}
    for i in range(chunk_size):
        ret = reader.read_next()
        for k, v in ret.items():
            if k not in data:
                data[k] = []
            data[k].append(v)
    data["header"] = data["header"][0]
    return data

In [9]:
period_length = 48
path = 'in-hospital-mortality'

data_all = []
mask_all = []
label_all = []
name_all = []
for mode in ['train', 'val', 'test']:
    reader = InHospitalMortalityReader(dataset_dir=os.path.join(path, 'train' if mode != 'test' else 'test'),
            listfile=os.path.join(path, mode + '_listfile.csv'), period_length=period_length)
    N = reader.get_number_of_examples()
    ret = read_chunk(reader, N)
    data = ret["X"]
    ts = ret["t"]
    labels = ret["y"]
    names = ret["name"]
    label_all += labels
    name_all += names
    for patient, name in zip(data, names):
        data_patient = np.zeros(shape=(len(id_to_channel), period_length), dtype=np.float32)
        mask_patient = np.zeros(shape=(len(id_to_channel), period_length), dtype=np.float32)
        last_time = -1
        for row in patient:
            time = int(float(row[0]))
            if time == period_length:
                time -= 1
            if time > period_length:
                raise ValueError('This should not happen')
                break
            for index in range(len(row) - 1):
                value = row[index + 1]
                if value == '':
                    # continue
                    if mask_patient[index, time] == 0 and time - last_time > 0:
                        if last_time >= 0:
                            data_patient[index, last_time + 1:time + 1] = data_patient[index, last_time]
                        else:
                            if is_categorical_channel[id_to_channel[index]]:
                                data_patient[index, last_time + 1:time + 1] = series_channel_info[id_to_channel[index]]['values'][normal_values[id_to_channel[index]]]
                            else:
                                data_patient[index, last_time + 1:time + 1] = float(normal_values[id_to_channel[index]])
                else:
                    mask_patient[index, time] = 1
                    if is_categorical_channel[id_to_channel[index]]:
                        data_patient[index, time] = series_channel_info[id_to_channel[index]]['values'][value]
                    else:
                        data_patient[index, time] = float(value)
            last_time = time
        if last_time < period_length - 1:
            data_patient[:, last_time + 1:period_length] = data_patient[:, last_time, None]
        data_all.append(data_patient.transpose(-1, -2))
        mask_all.append(mask_patient.transpose(-1, -2))
print(len(data_all), len(mask_all), len(label_all), len(name_all))

FileNotFoundError: [Errno 2] No such file or directory: 'in-hospital-mortality/train_listfile.csv'

In [None]:
data_all = np.array(data_all)
mask_all = np.array(mask_all)
data_all_concat = np.concatenate(data_all, axis=0)
x_masked = np.ma.masked_array(data_all_concat, np.concatenate(mask_all, axis=0) == 0)
mean = np.mean(x_masked, 0)
std = np.std(x_masked, 0)
print(mean, std)
data_normalized = np.where(mask_all == 1, (data_all - mean.reshape(1, 1, -1)) / std.reshape(1, 1, -1), 0)

(21139, 48, 17)
[0.13342736248236953 61.46879638121556 0.5394284355238489
 3.119246761936625 5.290596950448792 11.61723736264916 3.180522838632143
 143.22674480392305 86.3000568382371 168.72015948168453 78.73798307306927
 97.69934906068663 19.29756913160995 120.31029497386514 37.0390698629806
 83.27462812014336 7.282118883324198] [0.34003602959594353 250.38515121981368 0.20068257457910565
 1.262228263075122 1.404701993389681 3.9093606844685147 1.897389901895581
 69.23859216770816 19.169864801840426 15.020152083998529 154.8090453358367
 1030.9393550934726 6.63088371639584 25.232502325232193 9.535566627978861
 26.058995236709116 2.217256925266793]


In [None]:
# pickle.dump((data_all.tolist(), label_all, np.array(), mask_all.tolist(), name_all), open('mortality.pkl', 'wb'))
pickle.dump((data_normalized.tolist(), label_all, mask_all.tolist(), name_all), open('mortality_normalized.pkl', 'wb'))

In [None]:
cnt = 0
cnt1 = 0
for i in range(len(mask_all)):
    for j in range(len(mask_all[i])):
        cnt += sum(mask_all[i][j])
        cnt1 += len(mask_all[i][j])
print('Observed Rate:', cnt / cnt1)

Observed Rate: 0.433020198239663


In [None]:
sum(label_all) / len(label_all)

0.13231467902928237

In [None]:
period_length = 48
path = 'phenotyping'

data_all = []
mask_all = []
label_all = []
name_all = []
for mode in ['train', 'val', 'test']:
    reader = PhenotypingReader(dataset_dir=os.path.join(path, 'train' if mode != 'test' else 'test'),
            listfile=os.path.join(path, mode + '_listfile.csv'))
    N = reader.get_number_of_examples()
    ret = read_chunk(reader, N)
    data = ret["X"]
    ts = ret["t"]
    labels = ret["y"]
    names = ret["name"]
    label_all += labels
    name_all += names
    for patient, name, t in tqdm(zip(data, names, ts), total=len(data)):
        N_bins = min(int(t + 1 - 1e-6), period_length)
        data_patient = np.zeros(shape=(len(id_to_channel), N_bins), dtype=np.float32)
        mask_patient = np.zeros(shape=(len(id_to_channel), N_bins), dtype=np.float32)
        last_time = -1
        for row in patient:
            time = int(float(row[0]))
            if time == N_bins:
                time -= 1
            if time > N_bins:
                # raise ValueError('This should not happen')
                break
            for index in range(len(row) - 1):
                value = row[index + 1]
                if value == '':
                    if mask_patient[index, time] == 0 and time - last_time > 0:
                        # if last_time >= 0:
                        #     data_patient[index, last_time + 1:time + 1] = data_patient[index, last_time]
                        # else:
                        if is_categorical_channel[id_to_channel[index]]:
                            data_patient[index, last_time + 1:time + 1] = series_channel_info[id_to_channel[index]]['values'][normal_values[id_to_channel[index]]]
                        else:
                            data_patient[index, last_time + 1:time + 1] = float(normal_values[id_to_channel[index]])
                else:
                    mask_patient[index, time] += 1
                    if is_categorical_channel[id_to_channel[index]]:
                        data_patient[index, time] += series_channel_info[id_to_channel[index]]['values'][value]
                    else:
                        data_patient[index, time] += float(value)
            last_time = time
        data_patient = np.where(mask_patient > 0, data_patient / mask_patient, data_patient)
        mask_patient = np.where(mask_patient > 0, 1, 0)
        # assert np.count_nonzero(data_patient == np.nan) == 0
        data_all.append(data_patient.transpose(-1, -2))
        mask_all.append(mask_patient.transpose(-1, -2))
print(len(data_all), len(mask_all), len(label_all), len(name_all))

KeyboardInterrupt: 

In [None]:
max_bins = 0
ts = []
label_all = []
for mode in ['train', 'val', 'test']:
    reader = PhenotypingReader(dataset_dir=os.path.join(path, 'train' if mode != 'test' else 'test'),
            listfile=os.path.join(path, mode + '_listfile.csv'))
    N = reader.get_number_of_examples()
    ret = read_chunk(reader, N)
    data = ret["X"]
    ts += ret["t"]
    labels = ret["y"]
    label_all += labels
print(ts)
label_all = np.array(label_all)
print(np.sum(label_all) / len(label_all))

KeyboardInterrupt: 

In [None]:
print([len(data) for data in data_all])

[]


In [None]:
data_all_concat = np.concatenate(data_all, axis=0)
mean = np.mean(data_all_concat, 0)
std = np.std(data_all_concat, 0)
print(mean, std)
data_normalized = [((data - mean.reshape(1, -1)) / std.reshape(1, -1)).tolist() for data in data_all]
mask_all = [mask.tolist() for mask in mask_all]

pickle.dump((data_normalized, label_all, mask_all, name_all), open('phenotyping_normalized.pkl', 'wb'))

cnt = 0
cnt1 = 0
for i in range(len(mask_all)):
    for j in range(len(mask_all[i])):
        cnt += sum(mask_all[i][j])
        cnt1 += len(mask_all[i][j])
print('Observed Rate:', cnt / cnt1)

[3.52940027e-04 6.57730408e+01 2.25078121e-01 3.93079996e+00
 6.04431057e+00 1.43859215e+01 4.74529743e+00 1.34997192e+02
 9.01411362e+01 1.66275803e+02 8.40404968e+01 1.02522522e+02
 2.07139950e+01 1.28963440e+02 3.74240494e+01 8.24912872e+01
 7.47933054e+00] [1.87711418e-02 2.13222717e+02 1.11536264e-01 1.38611352e+00
 2.04904175e+00 3.90546131e+00 1.78289831e+00 2.66230988e+02
 3.87443848e+01 2.92791004e+01 1.20651512e+02 7.68572571e+02
 9.21587463e+02 1.61897217e+02 1.20404739e+01 3.22362085e+03
 2.04241657e+00]
Observed Rate: 0.41815378281625576


In [4]:
period_length = 24
path = 'decompensation'

data_all = []
mask_all = []
label_all = []
name_all = []
for mode in ['train', 'val', 'test']:
    reader = DecompensationReader(dataset_dir=os.path.join(path, 'train' if mode != 'test' else 'test'),
            listfile=os.path.join(path, mode + '_listfile.csv'))
    N = reader.get_number_of_examples()
    ret = read_chunk(reader, N)
    data = ret["X"]
    ts = ret["t"]
    labels = ret["y"]
    names = ret["name"]
    label_all += labels
    name_all += names
    for patient, name, t in tqdm(zip(data, names, ts), total=len(data)):
        N_bins = min(int(t + 1 - 1e-6), period_length)
        data_patient = np.zeros(shape=(len(id_to_channel), N_bins), dtype=np.float32)
        mask_patient = np.zeros(shape=(len(id_to_channel), N_bins), dtype=np.float32)
        last_time = -1
        for row in patient:
            time = int(float(row[0]))
            if time == N_bins:
                time -= 1
            if time > N_bins:
                # raise ValueError('This should not happen')
                break
            for index in range(len(row) - 1):
                value = row[index + 1]
                if value == '':
                    if mask_patient[index, time] == 0 and time - last_time > 0:
                        # if last_time >= 0:
                        #     data_patient[index, last_time + 1:time + 1] = data_patient[index, last_time]
                        # else:
                        if is_categorical_channel[id_to_channel[index]]:
                            data_patient[index, last_time + 1:time + 1] = series_channel_info[id_to_channel[index]]['values'][normal_values[id_to_channel[index]]]
                        else:
                            data_patient[index, last_time + 1:time + 1] = float(normal_values[id_to_channel[index]])
                else:
                    mask_patient[index, time] += 1
                    if is_categorical_channel[id_to_channel[index]]:
                        data_patient[index, time] += series_channel_info[id_to_channel[index]]['values'][value]
                    else:
                        data_patient[index, time] += float(value)
            last_time = time
        data_patient = np.where(mask_patient > 0, data_patient / mask_patient, data_patient)
        mask_patient = np.where(mask_patient > 0, 1, 0)
        # assert np.count_nonzero(data_patient == np.nan) == 0
        data_all.append(data_patient.transpose(-1, -2))
        mask_all.append(mask_patient.transpose(-1, -2))
print(len(data_all), len(mask_all), len(label_all), len(name_all))

  data_patient = np.where(mask_patient > 0, data_patient / mask_patient, data_patient)
  data_patient = np.where(mask_patient > 0, data_patient / mask_patient, data_patient)
100%|██████████| 29143/29143 [01:27<00:00, 334.03it/s]
100%|██████████| 6346/6346 [00:18<00:00, 338.37it/s]
100%|██████████| 6255/6255 [00:18<00:00, 332.03it/s]

41744 41744 41744 41744





In [None]:
print([len(data) for data in data_all])

[48, 48, 23, 48, 17, 48, 23, 48, 48, 44, 30, 48, 18, 48, 48, 29, 48, 48, 9, 48, 42, 48, 24, 48, 48, 38, 48, 27, 48, 48, 39, 24, 36, 48, 48, 48, 48, 31, 19, 48, 16, 48, 27, 48, 48, 48, 45, 31, 48, 16, 48, 20, 48, 48, 48, 48, 43, 47, 33, 46, 16, 48, 48, 32, 40, 30, 47, 48, 24, 48, 19, 16, 48, 48, 48, 25, 48, 33, 48, 48, 48, 48, 48, 48, 47, 38, 48, 24, 48, 23, 16, 25, 28, 47, 38, 46, 43, 32, 48, 48, 29, 48, 48, 48, 29, 32, 48, 43, 16, 32, 24, 48, 19, 48, 24, 43, 48, 47, 28, 25, 30, 18, 48, 38, 22, 48, 48, 48, 48, 31, 36, 29, 48, 48, 34, 48, 48, 23, 48, 14, 27, 45, 26, 28, 48, 48, 22, 24, 48, 48, 31, 35, 25, 21, 48, 33, 42, 48, 48, 48, 43, 29, 48, 48, 20, 29, 48, 48, 48, 20, 48, 48, 40, 23, 25, 24, 27, 48, 48, 15, 48, 48, 44, 10, 2, 48, 48, 48, 21, 24, 21, 48, 48, 13, 48, 21, 18, 48, 48, 48, 15, 48, 48, 34, 45, 48, 48, 46, 23, 21, 44, 24, 45, 48, 48, 19, 38, 37, 48, 48, 27, 48, 48, 45, 20, 23, 44, 48, 41, 48, 48, 16, 34, 19, 48, 43, 18, 48, 48, 41, 5, 45, 48, 27, 20, 48, 37, 39, 21, 48, 31

In [5]:
data_all_concat = np.concatenate(data_all, axis=0)
mean = np.mean(data_all_concat, 0)
std = np.std(data_all_concat, 0)
print(mean, std)
data_normalized = [((data - mean.reshape(1, -1)) / std.reshape(1, -1)).tolist() for data in data_all]
mask_all = [mask.tolist() for mask in mask_all]

pickle.dump((data_normalized, label_all, mask_all, name_all), open('decompensation_normalized.pkl', 'wb'))

cnt = 0
cnt1 = 0
for i in range(len(mask_all)):
    for j in range(len(mask_all[i])):
        cnt += sum(mask_all[i][j])
        cnt1 += len(mask_all[i][j])
print('Observed Rate:', cnt / cnt1)

[3.6462824e-04 6.7509583e+01 2.3202808e-01 4.0216231e+00 6.1931319e+00
 1.4663564e+01 4.8571868e+00 1.3957463e+02 9.2549011e+01 1.6925241e+02
 8.6117294e+01 1.0576494e+02 2.1513866e+01 1.3186606e+02 3.8373699e+01
 8.1611977e+01 7.6491199e+00] [1.9084949e-02 1.9184895e+02 1.1985364e-01 1.3376771e+00 1.9729450e+00
 3.5173326e+00 1.7233016e+00 3.4439746e+02 3.7673790e+01 2.2092722e+01
 1.2524643e+02 1.0012802e+03 1.2013160e+03 1.4728951e+02 1.1425072e+01
 1.7897316e+01 2.0367055e+00]
Observed Rate: 0.4313917233395309


In [6]:
sum(label_all) / len(label_all)

0.035310463779225754

In [5]:
path = 'decompensation'
max_bins = 0
ts = []
label_all = []
for mode in ['train', 'val', 'test']:
    reader = DecompensationReader(dataset_dir=os.path.join(path, 'train' if mode != 'test' else 'test'),
            listfile=os.path.join(path, mode + '_listfile.csv'))
    N = reader.get_number_of_examples()
    ret = read_chunk(reader, N)
    data = ret["X"]
    ts += ret["t"]
    labels = ret["y"]
    label_all += labels
print(ts)
label_all = np.array(label_all)
print(np.sum(label_all) / len(label_all))

[48.0, 48.0, 23.0, 48.0, 17.0, 48.0, 23.0, 48.0, 48.0, 44.0, 30.0, 48.0, 18.0, 48.0, 48.0, 29.0, 48.0, 48.0, 9.0, 48.0, 42.0, 48.0, 24.0, 48.0, 48.0, 38.0, 48.0, 27.0, 48.0, 48.0, 39.0, 24.0, 36.0, 48.0, 48.0, 48.0, 48.0, 31.0, 19.0, 48.0, 16.0, 48.0, 27.0, 48.0, 48.0, 48.0, 45.0, 31.0, 48.0, 16.0, 48.0, 20.0, 48.0, 48.0, 48.0, 48.0, 43.0, 47.0, 33.0, 46.0, 16.0, 48.0, 48.0, 32.0, 40.0, 30.0, 47.0, 48.0, 24.0, 48.0, 19.0, 16.0, 48.0, 48.0, 48.0, 25.0, 48.0, 33.0, 48.0, 48.0, 48.0, 48.0, 48.0, 48.0, 47.0, 38.0, 48.0, 24.0, 48.0, 23.0, 16.0, 25.0, 28.0, 47.0, 38.0, 46.0, 43.0, 32.0, 48.0, 48.0, 29.0, 48.0, 48.0, 48.0, 29.0, 32.0, 48.0, 43.0, 16.0, 32.0, 24.0, 48.0, 19.0, 48.0, 24.0, 43.0, 48.0, 47.0, 28.0, 25.0, 30.0, 18.0, 48.0, 38.0, 22.0, 48.0, 48.0, 48.0, 48.0, 31.0, 36.0, 29.0, 48.0, 48.0, 34.0, 48.0, 48.0, 23.0, 48.0, 14.0, 27.0, 45.0, 26.0, 28.0, 48.0, 48.0, 22.0, 24.0, 48.0, 48.0, 31.0, 35.0, 25.0, 21.0, 48.0, 33.0, 42.0, 48.0, 48.0, 48.0, 43.0, 29.0, 48.0, 48.0, 20.0, 29.0, 48.0

In [None]:
period_length = 24
path = 'length-of-stay'

data_all = []
mask_all = []
label_all = []
name_all = []
for mode in ['train', 'val', 'test']:
    reader = LengthOfStayReader(dataset_dir=os.path.join(path, 'train' if mode != 'test' else 'test'),
            listfile=os.path.join(path, mode + '_listfile.csv'))
    N = reader.get_number_of_examples()
    ret = read_chunk(reader, N)
    data = ret["X"]
    ts = ret["t"]
    labels = ret["y"]
    names = ret["name"]
    label_all += labels
    name_all += names
    for patient, name, t in tqdm(zip(data, names, ts), total=len(data)):
        N_bins = min(int(t + 1 - 1e-6), period_length)
        data_patient = np.zeros(shape=(len(id_to_channel), N_bins), dtype=np.float32)
        mask_patient = np.zeros(shape=(len(id_to_channel), N_bins), dtype=np.float32)
        last_time = -1
        for row in patient:
            time = int(float(row[0]))
            if time == N_bins:
                time -= 1
            if time > N_bins:
                # raise ValueError('This should not happen')
                break
            for index in range(len(row) - 1):
                value = row[index + 1]
                if value == '':
                    if mask_patient[index, time] == 0 and time - last_time > 0:
                        # if last_time >= 0:
                        #     data_patient[index, last_time + 1:time + 1] = data_patient[index, last_time]
                        # else:
                        if is_categorical_channel[id_to_channel[index]]:
                            data_patient[index, last_time + 1:time + 1] = series_channel_info[id_to_channel[index]]['values'][normal_values[id_to_channel[index]]]
                        else:
                            data_patient[index, last_time + 1:time + 1] = float(normal_values[id_to_channel[index]])
                else:
                    mask_patient[index, time] += 1
                    if is_categorical_channel[id_to_channel[index]]:
                        data_patient[index, time] += series_channel_info[id_to_channel[index]]['values'][value]
                    else:
                        data_patient[index, time] += float(value)
            last_time = time
        data_patient = np.where(mask_patient > 0, data_patient / mask_patient, data_patient)
        mask_patient = np.where(mask_patient > 0, 1, 0)
        # assert np.count_nonzero(data_patient == np.nan) == 0
        data_all.append(data_patient.transpose(-1, -2))
        mask_all.append(mask_patient.transpose(-1, -2))
print(len(data_all), len(mask_all), len(label_all), len(name_all))

  data_patient = np.where(mask_patient > 0, data_patient / mask_patient, data_patient)
  data_patient = np.where(mask_patient > 0, data_patient / mask_patient, data_patient)
100%|██████████| 23220/23220 [01:14<00:00, 312.41it/s]
100%|██████████| 5106/5106 [00:16<00:00, 311.23it/s]
100%|██████████| 5034/5034 [00:16<00:00, 304.80it/s]

33360 33360 33360 33360





In [None]:
data_all_concat = np.concatenate(data_all, axis=0)
mean = np.mean(data_all_concat, 0)
std = np.std(data_all_concat, 0)
print(mean, std)
data_normalized = [((data - mean.reshape(1, -1)) / std.reshape(1, -1)).tolist() for data in data_all]
mask_all = [mask.tolist() for mask in mask_all]

pickle.dump((data_normalized, label_all, mask_all, name_all), open('lengthofstay_normalized.pkl', 'wb'))

cnt = 0
cnt1 = 0
for i in range(len(mask_all)):
    for j in range(len(mask_all[i])):
        cnt += sum(mask_all[i][j])
        cnt1 += len(mask_all[i][j])
print('Observed Rate:', cnt / cnt1)

[4.0155376e-04 6.8193481e+01 2.3595384e-01 4.0348959e+00 6.2306409e+00
 1.4745326e+01 4.8527946e+00 1.4117714e+02 9.4147171e+01 1.7033537e+02
 8.6961792e+01 1.0695137e+02 2.2040630e+01 1.3308524e+02 3.8764061e+01
 8.2098953e+01 7.7307324e+00] [2.0026991e-02 2.0938132e+02 1.2357162e-01 1.3062618e+00 1.9164379e+00
 3.3472373e+00 1.6949662e+00 3.7657034e+02 3.7187466e+01 1.7419001e+01
 1.3589072e+02 1.0969733e+03 1.3163010e+03 1.5891556e+02 1.1114523e+01
 1.6920904e+01 2.0583203e+00]
Observed Rate: 0.4376028588893591
