# extract_data_eicu


## Prepare


In [2]:
from functools import reduce
import json
import pandas as pd
import numpy as np
from common_eicu import *
from math import isnan


In [3]:
TEST_ROWS = 50_000

COMPACT_MODE = False
# COMPACT_MODE = True

TEST_MODE = False
# TEST_MODE = True

# COMPRESS_OUTPUT = False
COMPRESS_OUTPUT = True


In [4]:
if COMPACT_MODE:
    NON_TEMPORAL_COLUMNS = NON_TEMPORAL_COLUMNS_COMPACT
    LAB_VARIABLES = LAB_VARIABLES_COMPACT
    EXAM_ITEMS = EXAM_ITEMS_COMPACT
    TREATMENT_KEYWORDS = TREATMENT_KEYWORDS_COMPACT
    CATEGORICAL_COLUMNS = CATEGORICAL_COLUMNS_COMPACT
    CONDITION_ONLY_COLUMNS = CONDITION_ONLY_COLUMNS_COMPACT
    APERIODIC_COLUMNS = APERIODIC_COLUMNS_COMPACT
    PERIODIC_COLUMNS = PERIODIC_COLUMNS_COMPACT
else:
    NON_TEMPORAL_COLUMNS = NON_TEMPORAL_COLUMNS_FULL
    LAB_VARIABLES = LAB_VARIABLES_FULL
    EXAM_ITEMS = EXAM_ITEMS_FULL
    TREATMENT_KEYWORDS = TREATMENT_KEYWORDS_FULL
    CATEGORICAL_COLUMNS = CATEGORICAL_COLUMNS_FULL
    CONDITION_ONLY_COLUMNS = CONDITION_ONLY_COLUMNS_FULL
    APERIODIC_COLUMNS = APERIODIC_COLUMNS_FULL
    PERIODIC_COLUMNS = PERIODIC_COLUMNS_FULL


## Non-Temporal Data


In [5]:
# generate the map of non-temporal data sources
data_sources = {}
with open(CATALOGUE_PATH, 'r') as catalogue_file:
    catalogue = json.load(catalogue_file)
    for column_name in NON_TEMPORAL_COLUMNS:
        if not column_name in catalogue:
            raise Exception(
                f'Cannot find column "{column_name}" in catalogue!'
            )
        file_path = catalogue[column_name]
        if file_path in data_sources:
            data_sources[file_path].append(column_name)
        else:
            data_sources[file_path] = [column_name]


In [6]:
# collect non-temporal data frames
data_frames = []
for input_path, column_names in data_sources.items():
    usecols = [KEY_IDENTITY, *column_names]
    data_frame = pd.read_csv(
        input_path,
        usecols=usecols,
        index_col=KEY_IDENTITY,
    )
    data_frame.columns = map(
        map_column_name,
        data_frame.columns,
    )
    data_frames.append(data_frame)


In [7]:
def map_age(age):
    if age != age:
        return age
    elif age == '> 89':
        return 90
    else:
        return int(age)


In [8]:
def map_gender(gender):
    if gender == 'Female':
        return 0
    elif gender == 'Male':
        return 1
    else:
        return pd.NA


In [9]:
# join non-temporal data frames
df_non_temporal = pd.concat(
    data_frames,
    axis='columns',
    join='outer',
)

# fix some columns
df_non_temporal['age'] = df_non_temporal['age'].map(map_age)
df_non_temporal['gender'] = df_non_temporal['gender'].map(map_gender)

df_non_temporal.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 200859 entries, 141168 to 3353263
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   gender  200682 non-null  object 
 1   age     200764 non-null  float64
 2   height  196644 non-null  float64
 3   weight  184141 non-null  float64
dtypes: float64(3), object(1)
memory usage: 7.7+ MB


## Init Temporal Data


In [10]:
# identity -> { raw_col -> { 'offsets': [], 'values': []  } }
temporal_data = {}

# init indices
for index in df_non_temporal.index:
    temporal_data[index] = {}

len(temporal_data)


200859

## Filter Diagnosis


In [11]:
df_sepsis = pd.read_csv(
    relative_path('./data/sepsis_eicu.csv.gz'),
    usecols=[KEY_IDENTITY, KEY_DIAGNOSIS_STRING],
    nrows=(TEST_ROWS if TEST_MODE else None),
)

non_sepsis = set(temporal_data.keys())
diagnosis_iterator = SimpleProgress(df_sepsis.index)
for index in diagnosis_iterator:
    identity = df_sepsis.at[index, KEY_IDENTITY]
    non_sepsis.discard(identity)

for identity in non_sepsis:
    del temporal_data[identity]


100%


In [12]:
len(temporal_data)


23479

## Treatment Info


In [13]:
df_treatment = pd.read_csv(
    relative_path('./data/treatment_eicu_filtered.csv.gz'),
    nrows=(TEST_ROWS if TEST_MODE else None),
)

# init treatment columns in temporal_data
for record in temporal_data.values():
    for keyword in TREATMENT_KEYWORDS:
        record[keyword] = {
            'offsets': [],
            'values': [],
        }

# collect treatment info
treatment_iterator = SimpleProgress(df_treatment.index)
for index in treatment_iterator:
    identity = df_treatment.at[index, KEY_IDENTITY]
    if not identity in temporal_data:
        continue
    record = temporal_data[identity]
    raw_value = df_treatment.at[index, KEY_TREATMENT_STRING]
    treatment_string = str(raw_value).lower()
    for keyword in TREATMENT_KEYWORDS:
        if not keyword in treatment_string:
            continue
        offset = df_treatment.at[index, KEY_TREATMENT_OFFSET]
        store = record[keyword]
        store['offsets'].append(offset)
        store['values'].append(1)


100%


## Exam Items


In [14]:
df_exam = pd.read_csv(
    relative_path('./data/exam_eicu_filtered.csv.gz'),
    nrows=(TEST_ROWS if TEST_MODE else None),
)

# init exam columns in temporal_data
for record in temporal_data.values():
    for item_name in EXAM_ITEMS:
        record[item_name] = {
            'offsets': [],
            'values': [],
        }

# collect exam items
exam_iterator = SimpleProgress(df_exam.index)
for index in exam_iterator:
    identity = df_exam.at[index, KEY_IDENTITY]
    if not identity in temporal_data:
        continue
    item_name = df_exam.at[index, KEY_EXAM_NAME]
    if not item_name in EXAM_ITEMS:
        continue
    offset = df_exam.at[index, KEY_EXAM_OFFSET]
    value = df_exam.at[index, KEY_EXAM_RESULT]
    record = temporal_data[identity]
    store = record[item_name]
    store['offsets'].append(offset)
    store['values'].append(value)


100%


## Lab Variables


In [15]:
df_lab = pd.read_csv(
    relative_path('./data/lab_eicu_filtered.csv.gz'),
    nrows=(TEST_ROWS if TEST_MODE else None),
)

# init lab columns in temporal_data
for record in temporal_data.values():
    for var_name in LAB_VARIABLES:
        record[var_name] = {
            'offsets': [],
            'values': [],
        }

# collect lab variables
lab_iterator = SimpleProgress(df_lab.index)
for index in lab_iterator:
    identity = df_lab.at[index, KEY_IDENTITY]
    if not identity in temporal_data:
        continue
    var_name = df_lab.at[index, KEY_LAB_NAME]
    if not var_name in LAB_VARIABLES:
        continue
    offset = df_lab.at[index, KEY_LAB_OFFSET]
    value = df_lab.at[index, KEY_LAB_RESULT]
    record = temporal_data[identity]
    store = record[var_name]
    store['offsets'].append(offset)
    store['values'].append(value)


100%


## Aperiodic Data


In [16]:
df_aperiodic = pd.read_csv(
    relative_path('./data/aperiodic_eicu_filtered.csv.gz'),
    nrows=(TEST_ROWS if TEST_MODE else None),
)

# init aperiodic columns in temporal_data
for record in temporal_data.values():
    for name in APERIODIC_COLUMNS:
        record[name] = {
            'offsets': [],
            'values': [],
        }

# collect aperiodic columns
aperiodic_iterator = SimpleProgress(df_aperiodic.index)
for index in aperiodic_iterator:
    identity = df_aperiodic.at[index, KEY_IDENTITY]
    if not identity in temporal_data:
        continue
    offset = df_aperiodic.at[index, KEY_APERIODIC_OFFSET]
    record = temporal_data[identity]
    for name in APERIODIC_COLUMNS:
        value = df_aperiodic.at[index, name]
        store = record[name]
        store['offsets'].append(offset)
        store['values'].append(value)


100%


## Periodic Data


In [17]:
df_periodic = pd.read_csv(
    relative_path('./data/periodic_eicu_filtered.csv.gz'),
    nrows=(TEST_ROWS if TEST_MODE else None),
)

# init periodic columns in temporal_data
for record in temporal_data.values():
    for name in PERIODIC_COLUMNS:
        record[name] = {
            'offsets': [],
            'values': [],
        }

# collect periodic columns
periodic_iterator = SimpleProgress(df_periodic.index)
for index in periodic_iterator:
    identity = df_periodic.at[index, KEY_IDENTITY]
    if not identity in temporal_data:
        continue
    offset = df_periodic.at[index, KEY_PERIODIC_OFFSET]
    record = temporal_data[identity]
    for name in PERIODIC_COLUMNS:
        value = df_periodic.at[index, name]
        store = record[name]
        store['offsets'].append(offset)
        store['values'].append(value)


100%


## Join Temporal Data


In [18]:
# construct temporal data rows

raw_temporal_columns = [
    KEY_IDENTITY,
    KEY_OFFSET,
    *TREATMENT_KEYWORDS,
    *EXAM_ITEMS,
    *LAB_VARIABLES,
    *APERIODIC_COLUMNS,
    *PERIODIC_COLUMNS,
]

temporal_data_rows = []
temporal_data_iterator = SimpleProgress(temporal_data.items())
for identity, record in temporal_data_iterator:

    stores = record.values()
    all_offsets = [store['offsets'] for store in stores]

    offset_begin = max(
        min(offsets) if len(offsets) > 0 else MIN_OFFSET
        for offsets in all_offsets
    )
    offset_end = max(
        max(offsets) if len(offsets) > 0 else MIN_OFFSET
        for offsets in all_offsets
    )
    if offset_begin < MIN_OFFSET:
        offset_begin = MIN_OFFSET

    for offset in range(offset_begin, offset_end + 1):
        row = []

        for column_name in raw_temporal_columns:

            if column_name == KEY_IDENTITY:
                row.append(identity)
                continue
            elif column_name == KEY_OFFSET:
                row.append(offset)
                continue

            store = record[column_name]
            offsets = store['offsets']
            values = store['values']
            count = len(offsets)

            indices = list(
                filter(
                    lambda i: offsets[i] == offset,
                    range(count)
                )
            )
            if len(indices) == 0:
                if len(offsets) > 0 and offsets[-1] < offset:
                    row.append(values[-1])
                else:
                    row.append(pd.NA)
            else:
                value = mean(list(values[i] for i in indices))
                row.append(value)

        temporal_data_rows.append(row)


100%


In [19]:
df_temporal = pd.DataFrame(
    temporal_data_rows,
    columns=map(
        map_column_name,
        raw_temporal_columns,
    ),
)

# fill NAs in treatment columns with zeros
for keyword in TREATMENT_KEYWORDS:
    column_name = map_column_name(keyword)
    df_temporal[column_name].fillna(0, inplace=True)

# fill other NAs
df_temporal.groupby(KEY_IDENTITY, sort=False).ffill()
df_temporal.groupby(KEY_IDENTITY, sort=False).bfill()

df_temporal.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134582 entries, 0 to 134581
Data columns (total 20 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   patientunitstayid  134582 non-null  int64  
 1   offset             134582 non-null  int64  
 2   vasopressor        134582 non-null  float64
 3   urine              45035 non-null   object 
 4   creatinine         130482 non-null  object 
 5   platelet           128924 non-null  object 
 6   inr                101202 non-null  object 
 7   pt                 98800 non-null   object 
 8   ptt                81385 non-null   object 
 9   lactate            103912 non-null  object 
 10  rdw                121697 non-null  object 
 11  bilirubin          112850 non-null  object 
 12  bicarbonate        122619 non-null  object 
 13  crp                11357 non-null   object 
 14  lymph              109655 non-null  object 
 15  albumin            116795 non-null  object 
 16  pr

## Construct Output


In [20]:
df_output = pd.merge(
    df_non_temporal,
    df_temporal,
    on=KEY_IDENTITY,
)
df_output.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 134582 entries, 0 to 134581
Data columns (total 24 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   patientunitstayid  134582 non-null  int64  
 1   gender             134572 non-null  object 
 2   age                134582 non-null  float64
 3   height             133864 non-null  float64
 4   weight             130300 non-null  float64
 5   offset             134582 non-null  int64  
 6   vasopressor        134582 non-null  float64
 7   urine              45035 non-null   object 
 8   creatinine         130482 non-null  object 
 9   platelet           128924 non-null  object 
 10  inr                101202 non-null  object 
 11  pt                 98800 non-null   object 
 12  ptt                81385 non-null   object 
 13  lactate            103912 non-null  object 
 14  rdw                121697 non-null  object 
 15  bilirubin          112850 non-null  object 
 16  bi

In [21]:
# consider -1 as NAs
df_output.replace(-1.0, pd.NA)

# fill NAs in non-categorical columns with means
df_output.fillna(
    df_output.drop(columns=CATEGORICAL_COLUMNS).mean(),
    inplace=True,
)

# and those in categorical columns with mode values
df_output.fillna(
    df_output[CATEGORICAL_COLUMNS].mode().iloc[0, :],
    inplace=True,
)

df_output.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 134582 entries, 0 to 134581
Data columns (total 24 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   patientunitstayid  134582 non-null  int64  
 1   gender             134582 non-null  float64
 2   age                134582 non-null  float64
 3   height             134582 non-null  float64
 4   weight             134582 non-null  float64
 5   offset             134582 non-null  int64  
 6   vasopressor        134582 non-null  float64
 7   urine              134582 non-null  float64
 8   creatinine         134582 non-null  float64
 9   platelet           134582 non-null  float64
 10  inr                134582 non-null  float64
 11  pt                 134582 non-null  float64
 12  ptt                134582 non-null  float64
 13  lactate            134582 non-null  float64
 14  rdw                134582 non-null  float64
 15  bilirubin          134582 non-null  float64
 16  bi

## Post Processing


In [22]:
def post_process_compact(df):
    pass


In [23]:
def post_process_full(df):

    post_process_compact(df)

    df['bmi'] = df['weight'] / (df['height'] / 100) ** 2
    del df['weight']
    del df['height']


In [24]:
if COMPACT_MODE:
    post_process_compact(df_output)
else:
    post_process_full(df_output)


In [25]:
df_output.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 134582 entries, 0 to 134581
Data columns (total 23 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   patientunitstayid  134582 non-null  int64  
 1   gender             134582 non-null  float64
 2   age                134582 non-null  float64
 3   offset             134582 non-null  int64  
 4   vasopressor        134582 non-null  float64
 5   urine              134582 non-null  float64
 6   creatinine         134582 non-null  float64
 7   platelet           134582 non-null  float64
 8   inr                134582 non-null  float64
 9   pt                 134582 non-null  float64
 10  ptt                134582 non-null  float64
 11  lactate            134582 non-null  float64
 12  rdw                134582 non-null  float64
 13  bilirubin          134582 non-null  float64
 14  bicarbonate        134582 non-null  float64
 15  crp                134582 non-null  float64
 16  ly

In [26]:
print(
    'patient count: {:,d}'.format(
        len(df_output[KEY_IDENTITY].unique())
    )
)


patient count: 23,479


## Check PICS


In [27]:
def check_pics(index):
    return all(
        indicator(df_output.at[index, col])
        for col, indicator in PICS_CONDITIONS.items()
    )


In [28]:
# Compute PICS flags:
# `FLAG_POSITIVE` if all PICS conditions
# are fulfilled today or yesterday;
# `FLAG_NEGATIVE` otherwise.

df_output[KEY_FLAG] = FLAG_NEGATIVE  # init

last_identity = None
last_index = None
output_iterator = SimpleProgress(df_output.index)
for current_index in output_iterator:

    current_identity = df_output.at[current_index, KEY_IDENTITY]

    if check_pics(current_index):
        df_output.at[current_index, KEY_FLAG] = FLAG_POSITIVE
        if last_identity == current_identity:
            df_output.at[last_index, KEY_FLAG] = FLAG_POSITIVE

    if (
        last_identity != current_identity
        and last_index != None
        and df_output.at[last_index, KEY_FLAG] != FLAG_POSITIVE
    ):
        df_output.at[last_index, KEY_FLAG] = pd.NA

    last_identity = current_identity
    last_index = current_index


100%


In [29]:
# remove extra columns
df_output.drop(
    columns=[
        KEY_IDENTITY,
        KEY_OFFSET,
        *CONDITION_ONLY_COLUMNS,
    ],
    inplace=True,
)

# remove NA flags
df_output.dropna(inplace=True)

df_output.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 111511 entries, 1 to 134581
Data columns (total 22 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   gender       111511 non-null  float64
 1   age          111511 non-null  float64
 2   vasopressor  111511 non-null  float64
 3   urine        111511 non-null  float64
 4   creatinine   111511 non-null  float64
 5   platelet     111511 non-null  float64
 6   inr          111511 non-null  float64
 7   pt           111511 non-null  float64
 8   ptt          111511 non-null  float64
 9   lactate      111511 non-null  float64
 10  rdw          111511 non-null  float64
 11  bilirubin    111511 non-null  float64
 12  bicarbonate  111511 non-null  float64
 13  crp          111511 non-null  float64
 14  lymph        111511 non-null  float64
 15  albumin      111511 non-null  float64
 16  prealbumin   111511 non-null  float64
 17  wbc          111511 non-null  float64
 18  bp           111511 non-

## Output


In [30]:
OUTPUT_PATH = relative_path(
    './data/data_eicu_'
    + ('compact' if COMPACT_MODE else 'full')
    + ('_test' if TEST_MODE else '')
    + '.csv'
    + ('.gz' if COMPRESS_OUTPUT else '')
)

df_output.to_csv(
    OUTPUT_PATH,
    index=False,
    compression=('gzip' if COMPRESS_OUTPUT else None),
)
