# extract_data_eicu


## Prepare


In [1]:
from functools import reduce
import json
import pandas as pd
import numpy as np
from common_eicu import *
from math import isnan


In [2]:
# Run switches. Flip a value by editing the assignment below.
#   COMPACT_MODE    - selects post_process_compact vs post_process_full and
#                     the 'compact' / 'full' tag in the output file name
#   TEST_MODE       - limits the filtered CSV reads to TEST_ROWS rows and
#                     adds '_test' to the output file name
#   COMPRESS_OUTPUT - writes the result gzip-compressed ('.gz' suffix)
COMPACT_MODE = True
TEST_MODE = False
COMPRESS_OUTPUT = True

# Inputs read by the treatment / exam / lab cells further down.
FILTERED_TREATMENT_PATH = relative_path('treatment_filtered.csv.gz')
FILTERED_EXAM_PATH = relative_path('exam_filtered.csv.gz')
FILTERED_LAB_PATH = relative_path('lab_filtered.csv.gz')

# The output file name encodes the three switches above.
OUTPUT_PATH = relative_path(''.join([
    'data_eicu_',
    'compact' if COMPACT_MODE else 'full',
    '_raw',
    '_test' if TEST_MODE else '',
    '.csv',
    '.gz' if COMPRESS_OUTPUT else '',
]))



In [3]:
# Time-stamped observations, filled in by the cells below. Layout:
#   identity -> { column name -> { 'offsets': [...], 'values': [...] } }
# where offsets[i] is the time offset at which values[i] was recorded.
temporal_data = dict()


In [4]:
def post_process_compact(df):
    """Normalise the 'age' and 'vasopressor' columns of *df* in place.

    'age' entries equal to the string '> 89' become 90 and every other
    non-missing entry is passed through int(). Non-missing 'vasopressor'
    entries are passed through int(). Missing (NaN) entries are kept as
    they are in both columns. Returns None; *df* itself is modified.
    """

    def _age_to_number(raw):
        # NaN is the only value that is unequal to itself
        if raw != raw:
            return raw
        return 90 if raw == '> 89' else int(raw)

    def _flag_to_int(flag):
        if flag != flag:
            return flag
        return int(flag)

    df['age'] = df['age'].map(_age_to_number)
    df['vasopressor'] = df['vasopressor'].map(_flag_to_int)


In [5]:
def post_process_full(df):
    """Apply the compact post-processing, then replace weight/height by BMI.

    Adds a 'bmi' column computed as weight / (height / 100) ** 2 and removes
    the 'weight' and 'height' columns. *df* is modified in place and nothing
    is returned.
    """
    post_process_compact(df)

    # presumably height is in centimetres and weight in kilograms - confirm
    height_scaled = df['height'] / 100
    df['bmi'] = df['weight'] / height_scaled ** 2
    df.drop(columns=['weight', 'height'], inplace=True)


## Required Columns


In [6]:
# Group the required non-temporal columns by the file that provides them:
#   file path -> [column name, ...]
# The catalogue (a JSON file) is looked up by column name to get that path.
data_sources = {}
with open(CATALOGUE_PATH, 'r') as catalogue_file:
    catalogue = json.load(catalogue_file)

for column_name in REQUIRED_COLUMNS:
    if column_name not in catalogue:
        raise Exception(
            f'Cannot find column "{column_name}" in catalogue!'
        )
    file_path = catalogue[column_name]
    data_sources.setdefault(file_path, []).append(column_name)


In [7]:
# Load one data frame per source file, indexed by identity, keeping only
# the required columns and renaming them through map_column_name.
data_frames = []
for input_path, column_names in data_sources.items():
    data_frame = pd.read_csv(
        input_path,
        usecols=[KEY_IDENTITY, *column_names],
        index_col=KEY_IDENTITY,
    )
    data_frame.columns = [
        map_column_name(original_name)
        for original_name in data_frame.columns
    ]
    data_frames.append(data_frame)


In [12]:
# join non-temporal data frames (each one is indexed by identity)
df_non_temporal = reduce(
    lambda df_0, df_1: df_0.join(df_1),
    data_frames,
)
# DataFrame.drop_duplicates() compares column values only and ignores the
# index, so two different identities that happen to share every value would
# be collapsed into one. Compare the index together with the values instead,
# so that only genuinely repeated rows (same identity, same values) go away.
# The first occurrence of each repeated row is kept.
is_repeated_row = df_non_temporal.reset_index().duplicated().to_numpy()
df_non_temporal = df_non_temporal[~is_repeated_row]
# discard identities with any missing non-temporal value
df_non_temporal = df_non_temporal.dropna()
df_non_temporal.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 148502 entries, 141168 to 3353263
Data columns (total 5 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   age                148502 non-null  object 
 1   urine              148502 non-null  float64
 2   wbc                148502 non-null  float64
 3   meanbp             148502 non-null  float64
 4   actualhospitallos  148502 non-null  float64
dtypes: float64(4), object(1)
memory usage: 6.8+ MB


In [9]:
# Give every identity that survived the non-temporal join an empty record;
# the treatment / exam / lab cells below add their columns to it.
temporal_data.update(
    (index, {}) for index in df_non_temporal.index
)


## Treatment Info


In [10]:
df_treatment = pd.read_csv(
    FILTERED_TREATMENT_PATH,
    usecols=TREATMENT_USE_COLS,
    nrows=TEST_ROWS if TEST_MODE else None,
)

# Seed every treatment column of every record with a False entry at
# MIN_OFFSET; matching treatment rows append True entries after it.
for record in temporal_data.values():
    record.update({
        map_column_name(keyword): {
            'offsets': [MIN_OFFSET],
            'values': [False],
        }
        for keyword in REQUIRED_TREATMENT_KEYWORDS
    })

# Scan the treatment rows: whenever the lower-cased treatment string
# contains a required keyword, record True at that row's offset.
treatment_iterator = SimpleProgress(df_treatment.index)
for index in treatment_iterator:
    identity = df_treatment.at[index, KEY_IDENTITY]
    if identity in temporal_data:
        raw_value = df_treatment.at[index, KEY_TREATMENT_STRING]
        treatment_string = str(raw_value).lower()
        for keyword in REQUIRED_TREATMENT_KEYWORDS:
            if keyword in treatment_string:
                offset = df_treatment.at[index, KEY_TREATMENT_OFFSET]
                column_name = map_column_name(keyword)
                store = temporal_data[identity][column_name]
                store['offsets'].append(offset)
                store['values'].append(True)


100.00%


## Exam Items


In [None]:
df_exam = pd.read_csv(
    FILTERED_EXAM_PATH,
    usecols=EXAM_USE_COLS,
    nrows=TEST_ROWS if TEST_MODE else None,
)

# Give every record an empty offsets/values store per required exam item.
for record in temporal_data.values():
    record.update({
        map_column_name(name): {
            'offsets': [],
            'values': [],
        }
        for name in REQUIRED_EXAM_ITEMS
    })

# Copy each relevant exam row (known identity, required item) into the
# matching store as an (offset, result) pair.
exam_iterator = SimpleProgress(df_exam.index)
for index in exam_iterator:
    identity = df_exam.at[index, KEY_IDENTITY]
    if identity in temporal_data:
        item_name = df_exam.at[index, KEY_EXAM_NAME]
        if item_name in REQUIRED_EXAM_ITEMS:
            column_name = map_column_name(item_name)
            offset = df_exam.at[index, KEY_EXAM_OFFSET]
            value = df_exam.at[index, KEY_EXAM_RESULT]
            store = temporal_data[identity][column_name]
            store['offsets'].append(offset)
            store['values'].append(value)


## Lab Variables


In [None]:
df_lab = pd.read_csv(
    FILTERED_LAB_PATH,
    usecols=LAB_USE_COLS,
    nrows=(TEST_ROWS if TEST_MODE else None),
)

# init lab columns in temporal_data: one empty offsets/values store per
# required lab variable in every record
for record in temporal_data.values():
    for name in REQUIRED_LAB_VARIABLES:
        column_name = map_column_name(name)
        record[column_name] = {
            'offsets': [],
            'values': [],
        }

# collect lab variables: copy each relevant lab row (known identity,
# required variable) into the matching store as an (offset, result) pair
lab_iterator = SimpleProgress(df_lab.index)
for index in lab_iterator:
    identity = df_lab.at[index, KEY_IDENTITY]
    if identity not in temporal_data:
        continue
    var_name = df_lab.at[index, KEY_LAB_NAME]
    if var_name not in REQUIRED_LAB_VARIABLES:
        continue
    # The former `lab_var_dict[var_name]` lookup was dropped here: its
    # result was never used, and it made this loop depend on a name that
    # nothing else in this notebook needs.
    column_name = map_column_name(var_name)
    offset = df_lab.at[index, KEY_LAB_OFFSET]
    value = df_lab.at[index, KEY_LAB_RESULT]
    store = temporal_data[identity][column_name]
    store['offsets'].append(offset)
    store['values'].append(value)


## Join Data


In [11]:
# construct temporal data rows: one row per (identity, offset) holding the
# value recorded at exactly that offset for every temporal column, or NaN
# when nothing was recorded there

temporal_column_names = list(
    map(map_column_name, [
        KEY_IDENTITY,
        KEY_OFFSET,
        *REQUIRED_TREATMENT_KEYWORDS,
        *REQUIRED_EXAM_ITEMS,
        *REQUIRED_LAB_VARIABLES,
    ])
)
temporal_data_rows = []

temporal_data_iterator = SimpleProgress(temporal_data.items())
for identity, record in temporal_data_iterator:

    # Iterating a dict yields its keys, so the stores have to be taken from
    # .values(); indexing a column-name string with 'offsets' raises
    # TypeError.
    stores = list(record.values())

    # A column without a single observation would be NaN on every row of
    # this identity, and rows containing NaN are dropped after the forward
    # fill anyway. Skipping also keeps min()/max() away from empty lists,
    # which would raise ValueError.
    if not stores or any(not store['offsets'] for store in stores):
        continue

    # NOTE(review): `begin` is the latest *first* observation among the
    # columns, so values recorded before it (including the False seed that
    # treatment columns get at MIN_OFFSET) never reach a row and cannot be
    # forward-filled. Confirm that this is intended.
    begin = max(min(store['offsets']) for store in stores)
    end = max(max(store['offsets']) for store in stores)
    if begin < MIN_OFFSET:
        begin = MIN_OFFSET

    # offset -> value per column, built once per identity instead of a
    # linear list search per cell. On repeated offsets the first recorded
    # value wins, which is what list.index() returned before.
    values_by_offset = {}
    for column_name, store in record.items():
        first_values = {}
        for recorded_offset, value in zip(store['offsets'], store['values']):
            first_values.setdefault(recorded_offset, value)
        values_by_offset[column_name] = first_values

    for offset in range(begin, end + 1):
        row = []

        for column_name in temporal_column_names:
            if column_name == KEY_IDENTITY:
                row.append(identity)
            elif column_name == KEY_OFFSET:
                row.append(offset)
            else:
                # NaN marks "nothing recorded at this offset"
                row.append(values_by_offset[column_name].get(offset, np.nan))

        temporal_data_rows.append(row)


In [None]:
df_temporal = pd.DataFrame(
    temporal_data_rows,
    columns=temporal_column_names,
)
# Carry the last known value forward within each identity. GroupBy.ffill()
# has no `inplace` argument and returns a new frame, so the filled columns
# have to be assigned back explicitly. The grouping key itself is left out
# of the selection because it never needs filling.
value_columns = [
    column_name
    for column_name in df_temporal.columns
    if column_name != KEY_IDENTITY
]
df_temporal[value_columns] = (
    df_temporal.groupby(KEY_IDENTITY)[value_columns].ffill()
)
# rows that still contain NaN precede the first observation of some column
df_temporal.dropna(inplace=True)
df_temporal.info()


In [None]:
# Attach the non-temporal values of each identity to every one of its
# temporal rows (default inner merge keyed on KEY_IDENTITY).
df_output = df_non_temporal.merge(df_temporal, on=KEY_IDENTITY)
df_output.info()


## Post Processing


In [None]:
# Run the post-processing variant selected by COMPACT_MODE; both variants
# modify df_output in place.
(post_process_compact if COMPACT_MODE else post_process_full)(df_output)


## Output


In [None]:
# Remove any row that still has a missing value, then write the CSV.
df_output.dropna(inplace=True)

# Request gzip explicitly only when compression is enabled; otherwise
# to_csv is called with its defaults.
csv_options = {'compression': 'gzip'} if COMPRESS_OUTPUT else {}
df_output.to_csv(OUTPUT_PATH, **csv_options)
