# filter_eicu


## common


In [10]:
%load_ext cython
import pandas as pd
from common_eicu import *


The cython extension is already loaded. To reload it, use:
  %reload_ext cython


In [11]:
# Smoke-test switch used by the loading cells in this notebook: when True, each
# read_csv call is limited to TEST_ROWS rows and the result is written to an
# uncompressed "*_test.csv" file instead of the gzip-compressed full output.
TEST_MODE = False
# TEST_MODE = True


In [12]:
%%cython
import cython

# Number of minutes in one day (24 hours of 60 minutes); divisor used below.
MINUTES_PER_DAY: cython.int = 24 * 60


def minutes2days(offset: cython.int) -> cython.int:
    """Convert an offset expressed in minutes into a whole number of days.

    The offset is divided by MINUTES_PER_DAY and the quotient is passed
    through int().
    """
    # NOTE(review): if "/" is true division here, int() truncates toward zero,
    # so negative offsets within one day also map to day 0 -- confirm intended.
    days = offset / MINUTES_PER_DAY
    return int(days)


## diagnosis


In [13]:
%%time

# Columns loaded from the diagnosis table.
DIAGNOSIS_USE_COLS = [KEY_IDENTITY, KEY_DIAGNOSIS_STRING]

df_diagnosis = pd.read_csv(
    DIAGNOSIS_PATH,
    usecols=DIAGNOSIS_USE_COLS,
    nrows=(TEST_ROWS if TEST_MODE else None),
)


def _mentions_sepsis(diagnosis_string):
    """Return True when SEPSIS_KEYWORD occurs in the lower-cased text."""
    # str() converts any non-string cell (e.g. a missing value) to text first,
    # so the membership test never raises on it.
    return SEPSIS_KEYWORD in str(diagnosis_string).lower()


# Keep only the rows whose diagnosis string contains the keyword.
diagnosis_mask = df_diagnosis[KEY_DIAGNOSIS_STRING].map(_mentions_sepsis)
df_diagnosis = df_diagnosis[diagnosis_mask].copy()

# Test runs write a plain CSV; full runs write a gzip-compressed one.
DIAGNOSIS_OUTPUT_PATH = relative_path(
    './data/sepsis_eicu_test.csv' if TEST_MODE
    else './data/sepsis_eicu.csv.gz'
)

df_diagnosis.to_csv(
    DIAGNOSIS_OUTPUT_PATH,
    index=False,
    compression=(None if TEST_MODE else 'gzip'),
)


CPU times: total: 3.62 s
Wall time: 3.64 s


## exam


In [14]:
%%time

# Columns loaded from the exam table.
EXAM_USE_COLS = [
    KEY_IDENTITY, KEY_EXAM_OFFSET, KEY_EXAM_NAME, KEY_EXAM_RESULT,
]

df_exam = pd.read_csv(
    EXAM_PATH,
    nrows=(TEST_ROWS if TEST_MODE else None),
    usecols=EXAM_USE_COLS,
)

# Select the rows whose exam name is one of EXAM_ITEMS.
exam_values = {KEY_EXAM_NAME: EXAM_ITEMS}
exam_mask = df_exam.isin(exam_values).any(axis=1)

# On the selected rows: convert the offset from minutes to days and cast the
# result column to float.
df_exam = df_exam.loc[exam_mask].assign(**{
    KEY_EXAM_OFFSET: lambda frame: frame[KEY_EXAM_OFFSET].map(minutes2days),
    KEY_EXAM_RESULT: lambda frame: frame[KEY_EXAM_RESULT].astype('float'),
})

# No aggregation happens here: how to aggregate depends on the exam name,
# so it is deferred to the data-extraction step.

exam_suffix = '_test.csv' if TEST_MODE else '.csv.gz'
EXAM_OUTPUT_PATH = relative_path('./data/exam_eicu_processed' + exam_suffix)

# NOTE(review): no index=False here, so the surviving row index is written as
# an extra unnamed column -- unlike the diagnosis/lab/treatment exports.
# Confirm whether downstream readers rely on it.
df_exam.to_csv(
    EXAM_OUTPUT_PATH,
    compression=(None if TEST_MODE else 'gzip'),
)


CPU times: total: 14.5 s
Wall time: 14.5 s


## lab


In [16]:
%%time

# Columns loaded from the lab table.
LAB_USE_COLS = [
    KEY_IDENTITY,
    KEY_LAB_OFFSET,
    KEY_LAB_NAME,
    KEY_LAB_RESULT,
]

# One aggregated row per (identity, day, lab name).
LAB_GROUP_KEYS = [KEY_IDENTITY, KEY_LAB_OFFSET, KEY_LAB_NAME]

print('Importing data...')
df_lab = pd.read_csv(
    LAB_PATH,
    nrows=(TEST_ROWS if TEST_MODE else None),
    usecols=LAB_USE_COLS,
)

print('Filtering...')
# Keep only the rows whose lab name is one of LAB_VARIABLES.
lab_values = {
    KEY_LAB_NAME: LAB_VARIABLES,
}
lab_mask = df_lab.isin(lab_values).any(axis='columns')
df_lab = df_lab[lab_mask].copy()

print('Transforming offset...')
df_lab[KEY_LAB_OFFSET] = df_lab[KEY_LAB_OFFSET].map(minutes2days)

print('Computing daily means...')
# Aggregate the lab frame with the lab keys. This step must not touch
# df_exam: grouping the exam frame here would leave df_lab un-aggregated and
# silently overwrite the exam data for any later cell.
# reset_index() turns the group keys back into ordinary columns so they are
# still present when the frame is written with index=False below.
df_lab = df_lab \
    .groupby(LAB_GROUP_KEYS) \
    .mean() \
    .reset_index()

print('Writing to file...')
LAB_OUTPUT_PATH = relative_path(
    './data/lab_eicu_processed'
    + ('_test.csv' if TEST_MODE else '.csv.gz')
)
df_lab.to_csv(
    LAB_OUTPUT_PATH,
    compression=(None if TEST_MODE else 'gzip'),
    index=False,
)

print('Done.')


Importing data...
Filtering...
Transforming offset...
Computing daily means...
Writing to file...
Done.
CPU times: total: 2min 49s
Wall time: 2min 50s


## treatment


In [17]:
%%time

# Columns loaded from the treatment table.
TREATMENT_USE_COLS = [
    KEY_IDENTITY, KEY_TREATMENT_OFFSET, KEY_TREATMENT_STRING,
]

df_treatment = pd.read_csv(
    TREATMENT_PATH,
    usecols=TREATMENT_USE_COLS,
    nrows=(TEST_ROWS if TEST_MODE else None),
)


def _has_treatment_keyword(treatment_string):
    """Return True when any entry of TREATMENT_KEYWORDS is a substring."""
    # NOTE(review): the match is case-sensitive and does not wrap the value in
    # str(), so a missing (NaN) treatment string would raise TypeError here.
    for keyword in TREATMENT_KEYWORDS:
        if keyword in treatment_string:
            return True
    return False


# Keep only the rows whose treatment string contains one of the keywords.
treatment_mask = df_treatment[KEY_TREATMENT_STRING].map(_has_treatment_keyword)
df_treatment = df_treatment[treatment_mask].copy()

# Convert the offset from minutes to days.
df_treatment[KEY_TREATMENT_OFFSET] = (
    df_treatment[KEY_TREATMENT_OFFSET].map(minutes2days)
)

treatment_suffix = '_test.csv' if TEST_MODE else '.csv.gz'
TREATMENT_OUTPUT_PATH = relative_path(
    './data/treatment_eicu_processed' + treatment_suffix
)

df_treatment.to_csv(
    TREATMENT_OUTPUT_PATH,
    index=False,
    compression=(None if TEST_MODE else 'gzip'),
)


CPU times: total: 5.66 s
Wall time: 5.67 s


## aperiodic


In [18]:
%%time

# Identity and offset columns plus every column listed in APERIODIC_COLUMNS.
APERIODIC_USE_COLS = [KEY_IDENTITY, KEY_APERIODIC_OFFSET, *APERIODIC_COLUMNS]

print('Importing data...')
df_aperiodic = pd.read_csv(
    APERIODIC_PATH,
    usecols=APERIODIC_USE_COLS,
    nrows=(TEST_ROWS if TEST_MODE else None),
)

print('Transforming offsets...')
# Convert the offset from minutes to days.
aperiodic_days = df_aperiodic[KEY_APERIODIC_OFFSET].map(minutes2days)
df_aperiodic[KEY_APERIODIC_OFFSET] = aperiodic_days

print('Computing daily means...')
# One row per (identity, day); the group keys become the index of the result.
aperiodic_keys = [KEY_IDENTITY, KEY_APERIODIC_OFFSET]
df_aperiodic = df_aperiodic.groupby(aperiodic_keys).mean()

aperiodic_suffix = '_test.csv' if TEST_MODE else '.csv.gz'
APERIODIC_OUTPUT_PATH = relative_path(
    './data/aperiodic_eicu_processed' + aperiodic_suffix
)

print('Exporting data...')
# The index is written on purpose: it carries the group keys.
df_aperiodic.to_csv(
    APERIODIC_OUTPUT_PATH,
    compression=(None if TEST_MODE else 'gzip'),
)

print('Done.')

print('Done.')


Importing data...
Transforming offsets...
Computing daily means...
Exporting data...
Done.
CPU times: total: 22.9 s
Wall time: 23 s


## periodic


In [19]:
%%time

# Identity and offset columns plus every column listed in PERIODIC_COLUMNS.
PERIODIC_USE_COLS = [KEY_IDENTITY, KEY_PERIODIC_OFFSET, *PERIODIC_COLUMNS]

print('Importing data...')
df_periodic = pd.read_csv(
    PERIODIC_PATH,
    usecols=PERIODIC_USE_COLS,
    nrows=(TEST_ROWS if TEST_MODE else None),
)

print('Transforming offsets...')
# Convert the offset from minutes to days.
periodic_days = df_periodic[KEY_PERIODIC_OFFSET].map(minutes2days)
df_periodic[KEY_PERIODIC_OFFSET] = periodic_days

print('Computing daily means...')
# One row per (identity, day); the group keys become the index of the result.
periodic_keys = [KEY_IDENTITY, KEY_PERIODIC_OFFSET]
df_periodic = df_periodic.groupby(periodic_keys).mean()

periodic_suffix = '_test.csv' if TEST_MODE else '.csv.gz'
PERIODIC_OUTPUT_PATH = relative_path(
    './data/periodic_eicu_processed' + periodic_suffix
)

print('Exporting data...')
# The index is written on purpose: it carries the group keys.
df_periodic.to_csv(
    PERIODIC_OUTPUT_PATH,
    compression=(None if TEST_MODE else 'gzip'),
)

print('Done.')

print('Done.')


Importing data...
Transforming offsets...
Computing daily means...
Exporting data...
Done.
CPU times: total: 3min 5s
Wall time: 3min 10s


## infusion drug


In [20]:
%%time

# Columns loaded from the infusion drug table.
INFUSION_USE_COLS = [
    KEY_IDENTITY, KEY_INFUSION_OFFSET, KEY_INFUSION_NAME, KEY_INFUSION_AMOUNT,
]

# In test mode only the first TEST_ROWS rows are read.
infusion_row_limit = TEST_ROWS if TEST_MODE else None
df_infusion = pd.read_csv(
    INFUSION_PATH,
    usecols=INFUSION_USE_COLS,
    nrows=infusion_row_limit,
)


def map_infusion_name(name):
    """Map a raw infusion drug name to the first matching keyword.

    Returns the first entry of INFUSION_KEYWORDS that occurs in the
    lower-cased name, or pd.NA when the name is NaN or no keyword matches.
    """
    # NaN is the only value that is not equal to itself.
    if not name == name:
        return pd.NA
    return next(
        (keyword for keyword in INFUSION_KEYWORDS if keyword in name.lower()),
        pd.NA,
    )


# Collapse each raw drug name to its matching keyword (pd.NA when none matches).
infusion_names = df_infusion[KEY_INFUSION_NAME].map(map_infusion_name)
df_infusion[KEY_INFUSION_NAME] = infusion_names

# Convert the offset from minutes to days.
infusion_days = df_infusion[KEY_INFUSION_OFFSET].map(minutes2days)
df_infusion[KEY_INFUSION_OFFSET] = infusion_days

# Daily sums per (identity, day, drug keyword), then drop rows containing NA.
# NOTE(review): sum() skips missing values, so the dropna() step is probably a
# no-op on the summed column -- confirm whether groups with no recorded
# amount are meant to be removed.
infusion_keys = [KEY_IDENTITY, KEY_INFUSION_OFFSET, KEY_INFUSION_NAME]
df_infusion = df_infusion.groupby(infusion_keys).sum().dropna()

infusion_suffix = '_test.csv' if TEST_MODE else '.csv.gz'
INFUSION_OUTPUT_PATH = relative_path(
    './data/infusion_eicu_processed' + infusion_suffix
)

# The index is written on purpose: it carries the group keys.
df_infusion.to_csv(
    INFUSION_OUTPUT_PATH,
    compression=(None if TEST_MODE else 'gzip'),
)


CPU times: total: 7.22 s
Wall time: 7.24 s
