# extract_data_eicu


## Prepare


In [32]:
from functools import reduce
import json
import pandas as pd
import numpy as np
from common_eicu import *
from math import isnan


In [90]:
TEST_ROWS = 50_000

# COMPACT_MODE = False
COMPACT_MODE = True

TEST_MODE = False
# TEST_MODE = True

# COMPRESS_OUTPUT = False
COMPRESS_OUTPUT = True


In [49]:
if COMPACT_MODE:
    NON_TEMPORAL_COLUMNS = NON_TEMPORAL_COLUMNS_COMPACT
    LAB_VARIABLES = LAB_VARIABLES_COMPACT
    EXAM_ITEMS = EXAM_ITEMS_COMPACT
    TREATMENT_KEYWORDS = TREATMENT_KEYWORDS_COMPACT
    CATEGORICAL_COLUMNS = CATEGORICAL_COLUMNS_COMPACT
    CONDITION_ONLY_COLUMNS = CONDITION_ONLY_COLUMNS_COMPACT
else:
    NON_TEMPORAL_COLUMNS = NON_TEMPORAL_COLUMNS_FULL
    LAB_VARIABLES = LAB_VARIABLES_FULL
    EXAM_ITEMS = EXAM_ITEMS_FULL
    TREATMENT_KEYWORDS = TREATMENT_KEYWORDS_FULL
    CATEGORICAL_COLUMNS = CATEGORICAL_COLUMNS_FULL
    CONDITION_ONLY_COLUMNS = CONDITION_ONLY_COLUMNS_FULL


## Non-Temporal Data


In [5]:
# generate the map of non-temporal data sources
data_sources = {}
with open(CATALOGUE_PATH, 'r') as catalogue_file:
    catalogue = json.load(catalogue_file)
    for column_name in NON_TEMPORAL_COLUMNS:
        if not column_name in catalogue:
            raise Exception(
                f'Cannot find column "{column_name}" in catalogue!'
            )
        file_path = catalogue[column_name]
        if file_path in data_sources:
            data_sources[file_path].append(column_name)
        else:
            data_sources[file_path] = [column_name]


In [6]:
# collect non-temporal data frames
data_frames = []
for input_path, column_names in data_sources.items():
    usecols = [KEY_IDENTITY, *column_names]
    data_frame = pd.read_csv(
        input_path,
        usecols=usecols,
        index_col=KEY_IDENTITY,
    )
    data_frame.columns = map(
        map_column_name,
        data_frame.columns,
    )
    data_frames.append(data_frame)


In [None]:
def map_age(age):
    if age != age:
        return age
    elif age == '> 89':
        return 90
    else:
        return int(age)


In [146]:
# join non-temporal data frames
df_non_temporal = pd.concat(
    data_frames,
    axis='columns',
    join='outer',
)

# fix age
df_non_temporal['age'] = \
    df_non_temporal['age'].map(map_age)

df_non_temporal.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 200859 entries, 141168 to 3353263
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   age     200764 non-null  float64
dtypes: float64(1)
memory usage: 3.1 MB


## Init Temporal Data


In [8]:
# identity -> { col -> { 'offsets': [], 'values': []  } }
temporal_data = {}

# init indices
for index in df_non_temporal.index:
    temporal_data[index] = {}

len(temporal_data)


200859

## Filter Diagnosis


In [9]:
df_sepsis = pd.read_csv(
    relative_path('./sepsis_eicu.csv.gz'),
    usecols=[KEY_IDENTITY, KEY_DIAGNOSIS_STRING],
    nrows=(TEST_ROWS if TEST_MODE else None),
)

non_sepsis = set(temporal_data.keys())
diagnosis_iterator = SimpleProgress(df_sepsis.index)
for index in diagnosis_iterator:
    identity = df_sepsis.at[index, KEY_IDENTITY]
    non_sepsis.discard(identity)

for identity in non_sepsis:
    del temporal_data[identity]


100%


In [10]:
len(temporal_data)


23479

## Treatment Info


In [102]:
df_treatment = pd.read_csv(
    relative_path('treatment_eicu_filtered.csv.gz'),
    usecols=TREATMENT_USE_COLS,
    nrows=(TEST_ROWS if TEST_MODE else None),
)

# init treatment columns in temporal_data
for record in temporal_data.values():
    for keyword in TREATMENT_KEYWORDS:
        column_name = map_column_name(keyword)
        record[column_name] = {
            'offsets': [],
            'values': [],
        }

# collect treatment info
treatment_iterator = SimpleProgress(df_treatment.index)
for index in treatment_iterator:
    identity = df_treatment.at[index, KEY_IDENTITY]
    if not identity in temporal_data:
        continue
    raw_value = df_treatment.at[index, KEY_TREATMENT_STRING]
    treatment_string = str(raw_value).lower()
    for keyword in TREATMENT_KEYWORDS:
        if not keyword in treatment_string:
            continue
        offset = df_treatment.at[index, KEY_TREATMENT_OFFSET]
        column_name = map_column_name(keyword)
        store = temporal_data[identity][column_name]
        store['offsets'].append(offset)
        store['values'].append(1)


100%


## Exam Items


In [18]:
df_exam = pd.read_csv(
    relative_path('exam_eicu_filtered.csv.gz'),
    usecols=EXAM_USE_COLS,
    nrows=(TEST_ROWS if TEST_MODE else None),
)

# init exam columns in temporal_data
for record in temporal_data.values():
    for name in EXAM_ITEMS:
        column_name = map_column_name(name)
        record[column_name] = {
            'offsets': [],
            'values': [],
        }

# collect exam items
exam_iterator = SimpleProgress(df_exam.index)
for index in exam_iterator:
    identity = df_exam.at[index, KEY_IDENTITY]
    if not identity in temporal_data:
        continue
    item_name = df_exam.at[index, KEY_EXAM_NAME]
    if not item_name in EXAM_ITEMS:
        continue
    column_name = map_column_name(item_name)
    offset = df_exam.at[index, KEY_EXAM_OFFSET]
    value = df_exam.at[index, KEY_EXAM_RESULT]
    store = temporal_data[identity][column_name]
    store['offsets'].append(offset)
    store['values'].append(value)


100%


## Lab Variables


In [13]:
df_lab = pd.read_csv(
    relative_path('lab_eicu_filtered.csv.gz'),
    usecols=LAB_USE_COLS,
    nrows=(TEST_ROWS if TEST_MODE else None),
)

# init lab columns in temporal_data
for record in temporal_data.values():
    for name in LAB_VARIABLES:
        column_name = map_column_name(name)
        record[column_name] = {
            'offsets': [],
            'values': [],
        }

# collect lab variables
lab_iterator = SimpleProgress(df_lab.index)
for index in lab_iterator:
    identity = df_lab.at[index, KEY_IDENTITY]
    if not identity in temporal_data:
        continue
    var_name = df_lab.at[index, KEY_LAB_NAME]
    if not var_name in LAB_VARIABLES:
        continue
    column_name = map_column_name(var_name)
    offset = df_lab.at[index, KEY_LAB_OFFSET]
    value = df_lab.at[index, KEY_LAB_RESULT]
    store = temporal_data[identity][column_name]
    store['offsets'].append(offset)
    store['values'].append(value)


100%


## Join Temporal Data


In [124]:
# construct temporal data rows

temporal_column_names = list(
    map(map_column_name, [
        KEY_IDENTITY,
        KEY_OFFSET,
        *TREATMENT_KEYWORDS,
        *EXAM_ITEMS,
        *LAB_VARIABLES,
    ])
)
temporal_data_rows = []

temporal_data_iterator = SimpleProgress(temporal_data.items())
for identity, record in temporal_data_iterator:

    stores = record.values()
    all_offsets = [store['offsets'] for store in stores]

    offset_begin = max(
        min(offsets) if len(offsets) > 0 else MIN_OFFSET
        for offsets in all_offsets
    )
    offset_end = max(
        max(offsets) if len(offsets) > 0 else MIN_OFFSET
        for offsets in all_offsets
    )
    if offset_begin < MIN_OFFSET:
        offset_begin = MIN_OFFSET

    for offset in range(offset_begin, offset_end + 1):
        row = []

        for column_name in temporal_column_names:

            if column_name == KEY_IDENTITY:
                row.append(identity)
                continue
            elif column_name == KEY_OFFSET:
                row.append(offset)
                continue

            store = record[column_name]
            offsets = store['offsets']
            values = store['values']
            count = len(offsets)

            indices = list(
                filter(
                    lambda i: offsets[i] == offset,
                    range(count)
                )
            )
            if len(indices) == 0:
                if len(offsets) > 0 and offsets[-1] < offset:
                    row.append(values[-1])
                else:
                    row.append(pd.NA)
            else:
                value = mean(list(values[i] for i in indices))
                row.append(value)

        temporal_data_rows.append(row)


100%


In [125]:
df_temporal = pd.DataFrame(
    temporal_data_rows,
    columns=temporal_column_names,
)
df_temporal.groupby(KEY_IDENTITY, sort=False).ffill()
df_temporal.groupby(KEY_IDENTITY, sort=False).bfill()

df_temporal.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131377 entries, 0 to 131376
Data columns (total 21 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   patientunitstayid       131377 non-null  int64 
 1   offset                  131377 non-null  int64 
 2   vasopressor             42639 non-null   object
 3   BP (systolic) Current   114932 non-null  object
 4   BP (diastolic) Current  114925 non-null  object
 5   hr                      111410 non-null  object
 6   urine                   44271 non-null   object
 7   creatinine              127359 non-null  object
 8   platelet                125886 non-null  object
 9   inr                     98854 non-null   object
 10  pt                      96502 non-null   object
 11  ptt                     79642 non-null   object
 12  lactate                 101509 non-null  object
 13  rdw                     118966 non-null  object
 14  bilirubin               110110 non-n

## Construct Output


In [14]:
df_output = pd.merge(
    df_non_temporal,
    df_temporal,
    on=KEY_IDENTITY,
)
df_output.info()


NameError: name 'df_temporal' is not defined

In [None]:
# consider -1 as NAs
df_output.replace(-1.0, pd.NA)

# fill NAs in non-categorical columns with means
df_output.fillna(
    df_output.drop(columns=CATEGORICAL_COLUMNS).mean(),
    inplace=True,
)

# and those in categorical columns with mode values
df_output.fillna(
    df_output[CATEGORICAL_COLUMNS].mode().iloc[0, :],
    inplace=True,
)

df_output.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 131377 entries, 0 to 131376
Data columns (total 22 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   patientunitstayid       131377 non-null  int64  
 1   age                     131377 non-null  float64
 2   offset                  131377 non-null  int64  
 3   vasopressor             131377 non-null  float64
 4   BP (systolic) Current   131377 non-null  float64
 5   BP (diastolic) Current  131377 non-null  float64
 6   hr                      131377 non-null  float64
 7   urine                   131377 non-null  float64
 8   creatinine              131377 non-null  float64
 9   platelet                131377 non-null  float64
 10  inr                     131377 non-null  float64
 11  pt                      131377 non-null  float64
 12  ptt                     131377 non-null  float64
 13  lactate                 131377 non-null  float64
 14  rdw                 

## Post Processing


In [None]:
def post_process_compact(df):

    # Mean Artery Pressure
    df['map'] = (
        df['BP (systolic) Current'] * 2
        + df['BP (diastolic) Current']
    ) / 3
    del df['BP (systolic) Current']
    del df['BP (diastolic) Current']


In [None]:
def post_process_full(df):

    post_process_compact(df)

    df['bmi'] = df['weight'] / (df['height'] / 100) ** 2
    del df['weight']
    del df['height']


In [None]:
if COMPACT_MODE:
    post_process_compact(df_output)
else:
    post_process_full(df_output)


In [None]:
df_output.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 131377 entries, 0 to 131376
Data columns (total 21 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   patientunitstayid  131377 non-null  int64  
 1   age                131377 non-null  float64
 2   offset             131377 non-null  int64  
 3   vasopressor        131377 non-null  float64
 4   hr                 131377 non-null  float64
 5   urine              131377 non-null  float64
 6   creatinine         131377 non-null  float64
 7   platelet           131377 non-null  float64
 8   inr                131377 non-null  float64
 9   pt                 131377 non-null  float64
 10  ptt                131377 non-null  float64
 11  lactate            131377 non-null  float64
 12  rdw                131377 non-null  float64
 13  bilirubin          131377 non-null  float64
 14  bicarbonate        131377 non-null  float64
 15  crp                131377 non-null  float64
 16  ly

In [None]:
print(
    'patient count: {:,d}'.format(
        len(df_output[KEY_IDENTITY].unique())
    )
)


patient count: 23,479


## Check PICS


In [None]:
def check_pics(index):
    return all(
        indicator(df_output.at[index, col])
        for col, indicator in PICS_CONDITIONS.items()
    )


In [None]:
# Compute PICS flags:
# `FLAG_POSITIVE` if all PICS conditions
# are fulfilled today or yesterday;
# `FLAG_NEGATIVE` otherwise.

df_output[KEY_FLAG] = FLAG_NEGATIVE  # init

last_identity = None
last_index = None
output_iterator = SimpleProgress(df_output.index)
for current_index in output_iterator:

    current_identity = df_output.at[current_index, KEY_IDENTITY]

    if check_pics(current_index):
        df_output.at[current_index, KEY_FLAG] = FLAG_POSITIVE
        if last_identity == current_identity:
            df_output.at[last_index, KEY_FLAG] = FLAG_POSITIVE

    if (
        last_identity != current_identity
        and last_index != None
        and df_output.at[last_index, KEY_FLAG] != FLAG_POSITIVE
    ):
        df_output.at[last_index, KEY_FLAG] = pd.NA

    last_identity = current_identity
    last_index = current_index


100%


In [None]:
# remove extra columns
df_output.drop(
    columns=[
        KEY_IDENTITY,
        KEY_OFFSET,
        *CONDITION_ONLY_COLUMNS,
    ],
    inplace=True,
)

# remove NA flags
df_output.dropna(inplace=True)

df_output.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 108296 entries, 1 to 131376
Data columns (total 20 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   age          108296 non-null  float64
 1   vasopressor  108296 non-null  float64
 2   hr           108296 non-null  float64
 3   urine        108296 non-null  float64
 4   creatinine   108296 non-null  float64
 5   platelet     108296 non-null  float64
 6   inr          108296 non-null  float64
 7   pt           108296 non-null  float64
 8   ptt          108296 non-null  float64
 9   lactate      108296 non-null  float64
 10  rdw          108296 non-null  float64
 11  bilirubin    108296 non-null  float64
 12  bicarbonate  108296 non-null  float64
 13  crp          108296 non-null  float64
 14  lymph        108296 non-null  float64
 15  albumin      108296 non-null  float64
 16  prealbumin   108296 non-null  float64
 17  wbc          108296 non-null  float64
 18  map          108296 non-

## Output


In [None]:
OUTPUT_PATH = relative_path(
    'data_eicu_'
    + ('compact' if COMPACT_MODE else 'full')
    + ('_test' if TEST_MODE else '')
    + '.csv'
    + ('.gz' if COMPRESS_OUTPUT else '')
)

df_output.to_csv(
    OUTPUT_PATH,
    index=False,
    compression=('gzip' if COMPRESS_OUTPUT else None),
)
