# filter_eicu


## common


In [13]:
%load_ext cython
import pandas as pd
from common_eicu import *


In [6]:
# Trial-run switch read by the cells below: when True they pass nrows=TEST_ROWS
# to pd.read_csv and write an uncompressed '*_test.csv' file; when False they
# read every row and write a gzip-compressed '*.csv.gz' file.
TEST_MODE = False
# TEST_MODE = True


In [14]:
%%cython
import cython

# Number of minutes in one day (24 hours of 60 minutes).
MINUTES_PER_DAY: cython.int = 24 * 60


def offset2days(offset: cython.int) -> cython.int:
    """Convert an offset expressed in minutes into a whole number of days.

    The offset is divided by MINUTES_PER_DAY and the fractional part is
    discarded by ``int()``, which truncates toward zero. As a consequence
    every offset strictly between -MINUTES_PER_DAY and +MINUTES_PER_DAY
    maps to day 0.

    Args:
        offset: offset in minutes.

    Returns:
        The truncated day count.
    """
    days = offset / MINUTES_PER_DAY
    return int(days)


## diagnosis


In [None]:
# Only the identity key and the diagnosis text column are loaded.
DIAGNOSIS_USE_COLS = [KEY_IDENTITY, KEY_DIAGNOSIS_STRING]

# In test mode only the first TEST_ROWS rows are read; otherwise the whole file.
diagnosis_row_limit = TEST_ROWS if TEST_MODE else None
df_diagnosis = pd.read_csv(
    DIAGNOSIS_PATH,
    usecols=DIAGNOSIS_USE_COLS,
    nrows=diagnosis_row_limit,
)


def _mentions_sepsis(diagnosis_string):
    """Return True when the lower-cased cell text contains SEPSIS_KEYWORD."""
    # str() lets non-string cells go through .lower() without raising.
    return SEPSIS_KEYWORD in str(diagnosis_string).lower()


# Keep only the rows whose diagnosis text matches the keyword.
diagnosis_mask = df_diagnosis[KEY_DIAGNOSIS_STRING].map(_mentions_sepsis)
df_diagnosis = df_diagnosis.loc[diagnosis_mask].copy()

# Test runs go to a plain CSV, full runs to a gzip-compressed CSV.
DIAGNOSIS_OUTPUT_PATH = relative_path(
    'sepsis_eicu_test.csv' if TEST_MODE else 'sepsis_eicu.csv.gz'
)

df_diagnosis.to_csv(
    DIAGNOSIS_OUTPUT_PATH,
    index=False,
    compression=('gzip' if not TEST_MODE else None),
)


## exam


In [None]:
# Columns loaded from the exam table: identity key, time offset, item name
# and item result.
EXAM_USE_COLS = [
    KEY_IDENTITY,
    KEY_EXAM_OFFSET,
    KEY_EXAM_NAME,
    KEY_EXAM_RESULT,
]

# In test mode only the first TEST_ROWS rows are read; otherwise the whole file.
df_exam = pd.read_csv(
    EXAM_PATH,
    usecols=EXAM_USE_COLS,
    nrows=(TEST_ROWS if TEST_MODE else None),
)

# filter exam items
# Mapping of column name -> accepted values; only the name column is constrained.
exam_values = {
    KEY_EXAM_NAME: EXAM_ITEMS_FULL,
}
# Test the single constrained column directly instead of running
# DataFrame.isin over every loaded column and reducing with any(): the
# other columns can never match, so the resulting mask is the same.
exam_mask = df_exam[KEY_EXAM_NAME].isin(exam_values[KEY_EXAM_NAME])
df_exam = df_exam[exam_mask].copy()

# transform offset: replace each offset with the value offset2days returns
df_exam[KEY_EXAM_OFFSET] = df_exam[KEY_EXAM_OFFSET].map(offset2days)

# Test runs go to a plain CSV, full runs to a gzip-compressed CSV.
if TEST_MODE:
    EXAM_OUTPUT_PATH = relative_path('exam_eicu_filtered_test.csv')
else:
    EXAM_OUTPUT_PATH = relative_path('exam_eicu_filtered.csv.gz')

df_exam.to_csv(
    EXAM_OUTPUT_PATH,
    compression=(None if TEST_MODE else 'gzip'),
    index=False,
)


## lab


In [None]:
# Columns loaded from the lab table: identity key, time offset, lab name
# and lab result.
LAB_USE_COLS = [
    KEY_IDENTITY,
    KEY_LAB_OFFSET,
    KEY_LAB_NAME,
    KEY_LAB_RESULT,
]

# In test mode only the first TEST_ROWS rows are read; otherwise the whole file.
df_lab = pd.read_csv(
    LAB_PATH,
    nrows=(TEST_ROWS if TEST_MODE else None),
    usecols=LAB_USE_COLS,
)

# filter lab items
# Mapping of column name -> accepted values; only the name column is constrained.
lab_values = {
    KEY_LAB_NAME: LAB_VARIABLES_FULL,
}
# Test the single constrained column directly instead of running
# DataFrame.isin over every loaded column and reducing with any(): the
# other columns can never match, so the resulting mask is the same.
lab_mask = df_lab[KEY_LAB_NAME].isin(lab_values[KEY_LAB_NAME])
df_lab = df_lab[lab_mask].copy()

# transform offset: replace each offset with the value offset2days returns
df_lab[KEY_LAB_OFFSET] = df_lab[KEY_LAB_OFFSET].map(offset2days)

# Test runs go to a plain CSV, full runs to a gzip-compressed CSV.
if TEST_MODE:
    LAB_OUTPUT_PATH = relative_path('lab_eicu_filtered_test.csv')
else:
    LAB_OUTPUT_PATH = relative_path('lab_eicu_filtered.csv.gz')

df_lab.to_csv(
    LAB_OUTPUT_PATH,
    compression=(None if TEST_MODE else 'gzip'),
    index=False,
)


## treatment


In [None]:
# Columns loaded from the treatment table: identity key, time offset and
# the treatment text.
TREATMENT_USE_COLS = [
    KEY_IDENTITY,
    KEY_TREATMENT_OFFSET,
    KEY_TREATMENT_STRING,
]

# In test mode only the first TEST_ROWS rows are read; otherwise the whole file.
df_treatment = pd.read_csv(
    TREATMENT_PATH,
    nrows=(TEST_ROWS if TEST_MODE else None),
    usecols=TREATMENT_USE_COLS,
)

# filter treatment strings: keep rows whose text contains at least one keyword
treatment_mask = df_treatment[KEY_TREATMENT_STRING].map(
    lambda treatment_string: (
        # Empty CSV cells are loaded as float NaN, and `keyword in NaN`
        # raises TypeError; treat any non-string cell as "no match".
        isinstance(treatment_string, str)
        and any(
            keyword in treatment_string
            for keyword in TREATMENT_KEYWORDS_FULL
        )
    )
)
df_treatment = df_treatment[treatment_mask].copy()

# transform offset: replace each offset with the value offset2days returns
df_treatment[KEY_TREATMENT_OFFSET] = \
    df_treatment[KEY_TREATMENT_OFFSET].map(offset2days)

# Test runs go to a plain CSV, full runs to a gzip-compressed CSV.
if TEST_MODE:
    TREATMENT_OUTPUT_PATH = relative_path('treatment_eicu_filtered_test.csv')
else:
    TREATMENT_OUTPUT_PATH = relative_path('treatment_eicu_filtered.csv.gz')

df_treatment.to_csv(
    TREATMENT_OUTPUT_PATH,
    compression=(None if TEST_MODE else 'gzip'),
    index=False,
)


## aperiodic


In [15]:
# Load the identity key, the offset, and every column named in
# APERIODIC_COLUMNS_FULL.
APERIODIC_USE_COLS = (
    [KEY_IDENTITY, KEY_APERIODIC_OFFSET] + list(APERIODIC_COLUMNS_FULL)
)

print('Importing data...')
# In test mode only the first TEST_ROWS rows are read; otherwise the whole file.
aperiodic_row_limit = TEST_ROWS if TEST_MODE else None
df_aperiodic = pd.read_csv(
    APERIODIC_PATH,
    usecols=APERIODIC_USE_COLS,
    nrows=aperiodic_row_limit,
)

print('Transforming offsets...')
# Replace each offset with the value offset2days returns for it.
df_aperiodic[KEY_APERIODIC_OFFSET] = (
    df_aperiodic[KEY_APERIODIC_OFFSET].map(offset2days)
)

print('Computing daily means...')
# One row per (identity, transformed offset) pair, holding the mean of each
# remaining column; the two grouping keys become the index of the result.
df_aperiodic = (
    df_aperiodic
    .groupby([KEY_IDENTITY, KEY_APERIODIC_OFFSET])
    .mean()
)

# Test runs go to a plain CSV, full runs to a gzip-compressed CSV.
APERIODIC_OUTPUT_PATH = relative_path(
    'aperiodic_eicu_filtered_test.csv' if TEST_MODE
    else 'aperiodic_eicu_filtered.csv.gz'
)

print('Exporting data...')
# The index is written as well (no index=False), so the grouping keys
# end up in the output file.
df_aperiodic.to_csv(
    APERIODIC_OUTPUT_PATH,
    compression=('gzip' if not TEST_MODE else None),
)

print('Done.')


Importing data...
Transforming offsets...
Computing daily means...
Exporting data...
Done.


## periodic


In [16]:
# Load the identity key, the offset, and every column named in
# PERIODIC_COLUMNS_FULL.
PERIODIC_USE_COLS = (
    [KEY_IDENTITY, KEY_PERIODIC_OFFSET] + list(PERIODIC_COLUMNS_FULL)
)

print('Importing data...')
# In test mode only the first TEST_ROWS rows are read; otherwise the whole file.
periodic_row_limit = TEST_ROWS if TEST_MODE else None
df_periodic = pd.read_csv(
    PERIODIC_PATH,
    usecols=PERIODIC_USE_COLS,
    nrows=periodic_row_limit,
)

print('Transforming offsets...')
# Replace each offset with the value offset2days returns for it.
df_periodic[KEY_PERIODIC_OFFSET] = (
    df_periodic[KEY_PERIODIC_OFFSET].map(offset2days)
)

print('Computing daily means...')
# One row per (identity, transformed offset) pair, holding the mean of each
# remaining column; the two grouping keys become the index of the result.
df_periodic = (
    df_periodic
    .groupby([KEY_IDENTITY, KEY_PERIODIC_OFFSET])
    .mean()
)

# Test runs go to a plain CSV, full runs to a gzip-compressed CSV.
PERIODIC_OUTPUT_PATH = relative_path(
    'periodic_eicu_filtered_test.csv' if TEST_MODE
    else 'periodic_eicu_filtered.csv.gz'
)

print('Exporting data...')
# The index is written as well (no index=False), so the grouping keys
# end up in the output file.
df_periodic.to_csv(
    PERIODIC_OUTPUT_PATH,
    compression=('gzip' if not TEST_MODE else None),
)

print('Done.')


Importing data...
Transforming offsets...
Computing daily means...
Exporting data...
Done.
