# Task Specific Data Preparation

In this notebook we look at the task-specific data preparation. Recall the four tasks:

- In Hospital Mortality
- Decompensation
- Length of Stay
- Phenotyping

In [1]:
import os
import argparse
import pandas as pd
import random
import numpy as np
import pdb
from pathlib import Path
import yaml

In [5]:
# "mimic-iii-demo" folder below the directory named by the DATA environment variable
dataset_folder = Path(os.getenv("DATA"), "mimic-iii-demo")

# Example episode read from the local resources folder: time series, labels and stays table
timeserie_df = pd.read_csv(Path('resources') / 'episode1_timeseries.csv')
label_df = pd.read_csv(Path('resources') / 'episode1.csv')
stays_df = pd.read_csv(Path('resources') / 'stays.csv')

# Tolerance for comparisons on the Hours column, and the mortality window length in hours
eps, label_start_time = 1e-6, 48

In [6]:
# Preview the first rows of the episode time series
timeserie_df.head()

Unnamed: 0,Hours,Capillary refill rate,Diastolic blood pressure,Fraction inspired oxygen,Glascow coma scale eye opening,Glascow coma scale motor response,Glascow coma scale total,Glascow coma scale verbal response,Glucose,Heart Rate,Height,Mean blood pressure,Oxygen saturation,Respiratory rate,Systolic blood pressure,Temperature,Weight,pH
0,0.305833,,,,,,,,,,,,,,,,52.3,
1,0.739167,,57.0,,,,,,,,,70.0,,,115.0,,,
2,0.7725,,,,,,,,,149.0,,,94.0,24.0,,,,
3,1.239167,,,,,,,,132.0,,,,,,,,,
4,1.239167,,,,,,,,132.0,,,,,,,,,


## In Hospital Mortality

### Characteristics:
- Binary
- Single

### Inputs:
- label_df
- timeserie_df
- label_start_time

In [23]:
# Upper bound (in hours) of the window of events kept as input for this task
label_start_time = 48

In [24]:
# Binary y value: the Mortality entry of the first row of label_df, cast to int
mortality = int(label_df.iloc[0]["Mortality"])

In [10]:
# Samples: all events recorded within the first label_start_time hours (eps tolerance on both ends)
hours = timeserie_df['Hours']
in_window = (hours > -eps) & (hours < label_start_time + eps)
X = timeserie_df[in_window]
X.head()

Unnamed: 0,Hours,Capillary refill rate,Diastolic blood pressure,Fraction inspired oxygen,Glascow coma scale eye opening,Glascow coma scale motor response,Glascow coma scale total,Glascow coma scale verbal response,Glucose,Heart Rate,Height,Mean blood pressure,Oxygen saturation,Respiratory rate,Systolic blood pressure,Temperature,Weight,pH
0,0.305833,,,,,,,,,,,,,,,,52.3,
1,0.739167,,57.0,,,,,,,,,70.0,,,115.0,,,
2,0.7725,,,,,,,,,149.0,,,94.0,24.0,,,,
3,1.239167,,,,,,,,132.0,,,,,,,,,
4,1.239167,,,,,,,,132.0,,,,,,,,,


In [11]:
# The label for this task is the single binary mortality flag
y = mortality

In [12]:
# Display the label
y

0

## Decompensation

### Characteristics
- Binary
- Series

### Inputs
- sample_rate
- sample_start_time
- horizon
- label_df
- timeserie_df

In [13]:
# Sampling configuration; every value is compared against times expressed in hours
horizon = 24.0            # a sample is positive when death is less than this far away
sample_start_time = 4.0   # only sample times strictly later than this are kept
sample_rate = 1.0         # spacing between consecutive sample times
eps = 1e-6                # tolerance for comparisons on the Hours column

In [14]:
# Length of stay of the first row of label_df; used below to trim the time series
los = 24.0 * label_df.iloc[0]['Length of Stay']  # in hours
# Mortality flag of the same row, cast to int
mortality = int(label_df.iloc[0]["Mortality"])

In [15]:
# Look up the stays row whose ICUSTAY_ID matches the episode's Icustay value
icustay_id = label_df.iloc[0]['Icustay']
stay = stays_df[stays_df['ICUSTAY_ID'] == icustay_id]

# Death and admission timestamps of the first matching stay
deathtime, intime = stay['DEATHTIME'].iloc[0], stay['INTIME'].iloc[0]

In [17]:
# Hours between INTIME and DEATHTIME; effectively infinite when no death time is recorded
if pd.isnull(deathtime):
    lived_time = 1e18
else:
    # `datetime` is not imported in the import cell, so the former strptime calls raised a
    # NameError for every patient with a DEATHTIME. pandas parses the same timestamp format.
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    lived_time = (pd.to_datetime(deathtime, format=timestamp_format) -
                  pd.to_datetime(intime, format=timestamp_format)).total_seconds() / 3600.0
lived_time

1e+18

In [None]:
# Return samples: events whose Hours value lies within the stay (eps tolerance on both ends)
hours = timeserie_df['Hours']
X = timeserie_df[(hours > -eps) & (hours < los + eps)]

In [29]:
# Timestamps (hours) of the events that fall inside the stay
in_stay = (timeserie_df['Hours'] < los + eps) & (timeserie_df['Hours'] > -eps)
event_times = timeserie_df['Hours'][in_stay]

# Candidate sample times: one every sample_rate hours until discharge or death, whichever is first
sample_times = np.arange(0.0, min(los, lived_time) + eps, sample_rate)

if event_times.empty:
    # No event inside the stay, so there is nothing to sample from
    sample_times = []
else:
    # Positional access is required here: the filter keeps the original row labels, so
    # `event_times[0]` (a label lookup) raises a KeyError whenever row 0 was filtered out.
    first_event_time = event_times.iloc[0]
    # Keep only times after both the configured start and the first recorded event
    sample_times = [t for t in sample_times
                    if t > sample_start_time and t > first_event_time]

In [28]:
# One (sample time, label) pair per sample time. The label is 1 only for patients who died
# and whose death is less than `horizon` hours after the sample time.
y = [(t, 0 if mortality == 0 else int(lived_time - t < horizon))
     for t in sample_times]

In [19]:
# Display the (sample time, label) pairs
y

[(5.0, 0),
 (6.0, 0),
 (7.0, 0),
 (8.0, 0),
 (9.0, 0),
 (10.0, 0),
 (11.0, 0),
 (12.0, 0),
 (13.0, 0),
 (14.0, 0),
 (15.0, 0),
 (16.0, 0),
 (17.0, 0),
 (18.0, 0),
 (19.0, 0),
 (20.0, 0),
 (21.0, 0),
 (22.0, 0),
 (23.0, 0),
 (24.0, 0),
 (25.0, 0),
 (26.0, 0),
 (27.0, 0),
 (28.0, 0),
 (29.0, 0),
 (30.0, 0),
 (31.0, 0),
 (32.0, 0),
 (33.0, 0),
 (34.0, 0),
 (35.0, 0),
 (36.0, 0),
 (37.0, 0),
 (38.0, 0),
 (39.0, 0),
 (40.0, 0),
 (41.0, 0),
 (42.0, 0),
 (43.0, 0),
 (44.0, 0),
 (45.0, 0),
 (46.0, 0),
 (47.0, 0),
 (48.0, 0),
 (49.0, 0),
 (50.0, 0),
 (51.0, 0),
 (52.0, 0),
 (53.0, 0),
 (54.0, 0),
 (55.0, 0),
 (56.0, 0),
 (57.0, 0),
 (58.0, 0),
 (59.0, 0),
 (60.0, 0),
 (61.0, 0),
 (62.0, 0),
 (63.0, 0),
 (64.0, 0),
 (65.0, 0),
 (66.0, 0),
 (67.0, 0),
 (68.0, 0),
 (69.0, 0),
 (70.0, 0),
 (71.0, 0),
 (72.0, 0),
 (73.0, 0),
 (74.0, 0),
 (75.0, 0),
 (76.0, 0),
 (77.0, 0),
 (78.0, 0),
 (79.0, 0),
 (80.0, 0),
 (81.0, 0),
 (82.0, 0),
 (83.0, 0),
 (84.0, 0),
 (85.0, 0),
 (86.0, 0),
 (87.0, 0),
 (88.0, 0

## Length of Stay

### Characteristics
- Numerical
- Series

### Inputs
- sample_rate
- sample_start_time
- label_df
- timeserie_df

In [25]:
# Total length of stay of the first row of label_df; trims the data and defines the targets
los = 24.0 * label_df.iloc[0]['Length of Stay']  # in hours

In [26]:
# Return samples: events whose Hours value lies within the stay (eps tolerance on both ends)
hours = timeserie_df['Hours']
X = timeserie_df[(hours > -eps) & (hours < los + eps)]

In [27]:
# Timestamps (hours) of the events that fall inside the stay
in_stay = (timeserie_df['Hours'] < los + eps) & (timeserie_df['Hours'] > -eps)
event_times = timeserie_df['Hours'][in_stay]

# Candidate sample times: one every sample_rate hours until the end of the stay.
# sample_rate and sample_start_time are the values set in the Decompensation section.
sample_times = np.arange(0.0, los + eps, sample_rate)

if event_times.empty:
    # No event inside the stay, so there is nothing to sample from
    sample_times = []
else:
    # Positional access is required here: the filter keeps the original row labels, so
    # `event_times[0]` (a label lookup) raises a KeyError whenever row 0 was filtered out.
    first_event_time = event_times.iloc[0]
    # Keep only times after both the configured start and the first recorded event
    sample_times = [t for t in sample_times
                    if t > sample_start_time and t > first_event_time]

In [21]:
# Pair every sample time with the length of stay (hours) still remaining at that time
y = [(sample_time, los - sample_time) for sample_time in sample_times]

In [22]:
# Display the (sample time, remaining length of stay) pairs
y

[(5.0, 105.85840000000002),
 (6.0, 104.85840000000002),
 (7.0, 103.85840000000002),
 (8.0, 102.85840000000002),
 (9.0, 101.85840000000002),
 (10.0, 100.85840000000002),
 (11.0, 99.85840000000002),
 (12.0, 98.85840000000002),
 (13.0, 97.85840000000002),
 (14.0, 96.85840000000002),
 (15.0, 95.85840000000002),
 (16.0, 94.85840000000002),
 (17.0, 93.85840000000002),
 (18.0, 92.85840000000002),
 (19.0, 91.85840000000002),
 (20.0, 90.85840000000002),
 (21.0, 89.85840000000002),
 (22.0, 88.85840000000002),
 (23.0, 87.85840000000002),
 (24.0, 86.85840000000002),
 (25.0, 85.85840000000002),
 (26.0, 84.85840000000002),
 (27.0, 83.85840000000002),
 (28.0, 82.85840000000002),
 (29.0, 81.85840000000002),
 (30.0, 80.85840000000002),
 (31.0, 79.85840000000002),
 (32.0, 78.85840000000002),
 (33.0, 77.85840000000002),
 (34.0, 76.85840000000002),
 (35.0, 75.85840000000002),
 (36.0, 74.85840000000002),
 (37.0, 73.85840000000002),
 (38.0, 72.85840000000002),
 (39.0, 71.85840000000002),
 (40.0, 70.85840000

## Phenotyping

### Characteristics
- Multi-label: one binary flag per phenotype (the label vector shown below has 25 entries)

### Inputs
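- label_df
- timeserie_df
- diagnoses.csv (diagnoses per ICU stay)
- hcup_ccs_2015_definitions.yaml (phenotype definitions)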

In [34]:
# Contains phenotypes and a list of codes referring to the phenotype
definitions_path = Path(dataset_folder, "resources", "hcup_ccs_2015_definitions.yaml")
# The context manager closes the file handle, which the bare open() call never did.
# NOTE(review): yaml.safe_load would suffice if the file holds only plain mappings - confirm.
with open(definitions_path, "r") as definitions_file:
    phenotypes_yaml = yaml.full_load(definitions_file)

In [35]:
# Lookup table: code (string, as the printed mapping shows) -> phenotype group name
code_to_group = {}
for group_name, definition in phenotypes_yaml.items():
    for icd_code in definition['codes']:
        # setdefault stores the group the first time a code is seen and otherwise
        # returns the group already stored for it
        known_group = code_to_group.setdefault(icd_code, group_name)
        # A code must never be claimed by two different groups
        assert known_group == group_name



In [36]:
# Inspect the code -> group name mapping
code_to_group

{'01000': 'Tuberculosis',
 '01001': 'Tuberculosis',
 '01002': 'Tuberculosis',
 '01003': 'Tuberculosis',
 '01004': 'Tuberculosis',
 '01005': 'Tuberculosis',
 '01006': 'Tuberculosis',
 '01010': 'Tuberculosis',
 '01011': 'Tuberculosis',
 '01012': 'Tuberculosis',
 '01013': 'Tuberculosis',
 '01014': 'Tuberculosis',
 '01015': 'Tuberculosis',
 '01016': 'Tuberculosis',
 '01080': 'Tuberculosis',
 '01081': 'Tuberculosis',
 '01082': 'Tuberculosis',
 '01083': 'Tuberculosis',
 '01084': 'Tuberculosis',
 '01085': 'Tuberculosis',
 '01086': 'Tuberculosis',
 '01090': 'Tuberculosis',
 '01091': 'Tuberculosis',
 '01092': 'Tuberculosis',
 '01093': 'Tuberculosis',
 '01094': 'Tuberculosis',
 '01095': 'Tuberculosis',
 '01096': 'Tuberculosis',
 '01100': 'Tuberculosis',
 '01101': 'Tuberculosis',
 '01102': 'Tuberculosis',
 '01103': 'Tuberculosis',
 '01104': 'Tuberculosis',
 '01105': 'Tuberculosis',
 '01106': 'Tuberculosis',
 '01110': 'Tuberculosis',
 '01111': 'Tuberculosis',
 '01112': 'Tuberculosis',
 '01113': 'T

In [47]:
# Alphabetically sorted phenotype names; a name's position in this list is its id
id_to_group = sorted(phenotypes_yaml)
# Reverse lookup: phenotype name -> id
group_to_id = {name: idx for idx, name in enumerate(id_to_group)}
id_to_group

['Abdominal hernia',
 'Abdominal pain',
 'Acquired foot deformities',
 'Acute and chronic tonsillitis',
 'Acute and unspecified renal failure',
 'Acute bronchitis',
 'Acute cerebrovascular disease',
 'Acute myocardial infarction',
 'Acute posthemorrhagic anemia',
 'Adjustment disorders',
 'Administrative/social admission',
 'Adverse effects of medical care',
 'Adverse effects of medical drugs',
 'Alcohol-related disorders',
 'Allergic reactions',
 'Anal and rectal conditions',
 'Anxiety disorders',
 'Aortic and peripheral arterial embolism or thrombosis',
 'Aortic; peripheral; and visceral artery aneurysms',
 'Appendicitis and other appendiceal conditions',
 'Aspiration pneumonitis; food/vomitus',
 'Asthma',
 'Attention-deficit, conduct, and disruptive behavior disorders',
 'Bacterial infection; unspecified site',
 'Benign neoplasm of uterus',
 'Biliary tract disease',
 'Birth trauma',
 'Blindness and vision defects',
 'Burns',
 'Calculus of urinary tract',
 'Cancer of bladder',
 'Canc

In [40]:
# Length of stay of the first row of label_df; used below to trim the time series
los = 24.0 * label_df.iloc[0]['Length of Stay']  # in hours

In [43]:
# ICU stay id of the first row of label_df
icustay = label_df['Icustay'].iloc[0]

# Read ICD9_CODE as strings so the codes can be looked up among the string keys of code_to_group
all_diagnoses_df = pd.read_csv(Path("resources", "diagnoses.csv"), dtype={"ICD9_CODE": str})

# Keep only the diagnoses that belong to this ICU stay
diagnoses_df = all_diagnoses_df[all_diagnoses_df.ICUSTAY_ID == icustay]

In [44]:
# Return samples: events whose Hours value lies within the stay (eps tolerance on both ends)
hours = timeserie_df['Hours']
X = timeserie_df[(hours > -eps) & (hours < los + eps)]

In [45]:
# One slot per phenotype group, switched on when the stay has a matching diagnosis
cur_labels = [0 for _ in id_to_group]

for _, diagnosis in diagnoses_df.iterrows():
    # Diagnoses not flagged for the benchmark are ignored
    if not diagnosis['USE_IN_BENCHMARK']:
        continue
    group_name = code_to_group[diagnosis['ICD9_CODE']]
    cur_labels[group_to_id[group_name]] = 1

# Keep only the slots of phenotypes whose definition has use_in_benchmark set
cur_labels = [label for name, label in zip(id_to_group, cur_labels)
              if phenotypes_yaml[name]['use_in_benchmark']]

# Single sample: the length of stay in hours together with the phenotype label vector
y = [(los, cur_labels)]

In [46]:
# Display the (length of stay, label vector) sample
y

[(110.85840000000002,
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1])]

In [None]:
# Scratch check: all-zero vector with one slot per phenotype group (same shape as the initial cur_labels)
[0]*len(id_to_group)