# Create dataset

In [1]:
from functools import reduce
import numpy as np
import pandas as pd
import datetime as dt
from cyclops.processors.impute import np_ffill_bfill, np_fill_null_num

from cyclops.processors.aggregate import (
    Aggregator,
    tabular_as_aggregated,
    timestamp_ffill_agg,
)
from cyclops.processors.column_names import (
    ADMIT_TIMESTAMP,
    DISCHARGE_TIMESTAMP,
    ENCOUNTER_ID,
    EVENT_NAME,
    EVENT_TIMESTAMP,
    EVENT_VALUE,
    RESTRICT_TIMESTAMP,
    TIMESTEP,
)
from cyclops.processors.column_names import ENCOUNTER_ID
from cyclops.processors.constants import FEATURES, NUMERIC, ORDINAL, STANDARD
from cyclops.processors.feature.feature import TabularFeatures
from cyclops.processors.constants import ALL, FEATURES, MEAN, NUMERIC, ORDINAL, STANDARD
from cyclops.processors.feature.feature import TemporalFeatures
from cyclops.processors.feature.vectorize import (
    Vectorized,
    intersect_vectorized,
    split_vectorized,
    vec_index_exp,
)
from cyclops.utils.file import (
    join,
    load_dataframe,
    load_pickle,
    save_dataframe,
    save_pickle,
    yield_dataframes,
    yield_pickled_files,
)
from drift_detection.gemini.utils import get_use_case_params, impute, get_source_target
from drift_detection.gemini.constants import DIAGNOSIS_DICT, HOSPITALS, COMORBIDITIES

In [2]:
# Display the diagnosis mapping imported from drift_detection.gemini.constants.
# Its keys are the values accepted by the "diagnosis trajectory" prompt in the
# configuration cell; each value is a pair of code strings (see output below).
DIAGNOSIS_DICT

{'Certain infectious and parasitic diseases': ('A00', 'B99'),
 'Neoplasms': ('C00', 'D49'),
 'Diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism': ('D50',
  'D89'),
 'Endocrine, nutritional and metabolic diseases': ('E00', 'E89'),
 'Mental, Behavioral and Neurodevelopmental disorders': ('F01', 'F99'),
 'Diseases of the nervous system': ('G00', 'G99'),
 'Diseases of the eye and adnexa': ('H00', 'H59'),
 'Diseases of the ear and mastoid process': ('H60', 'H95'),
 'Diseases of the circulatory system': ('I00', 'I99'),
 'Diseases of the respiratory system': ('J00', 'J99'),
 'Diseases of the digestive system': ('K00', 'K95'),
 'Diseases of the skin and subcutaneous tissue': ('L00', 'L99'),
 'Diseases of the musculoskeletal system and connective tissue': ('M00',
  'M99'),
 'Diseases of the genitourinary system': ('N00', 'N99'),
 'Pregnancy, childbirth and the puerperium': ('O00', 'O99'),
 'Certain conditions originating in the perinatal period'

## Load cohort data

In [3]:
# Experiment configuration.
# The three selections are read interactively, so this cell blocks until every
# prompt has been answered.
DATASET = "gemini"
USE_CASE = "mortality"

# Strip surrounding whitespace so that e.g. "all " still matches the "all" sentinel.
SPLIT = input("Select data split: ").strip()
DIAGNOSIS_TRAJECTORY = input("Select diagnosis trajectory to filter on: ").strip()
HOSPITAL = input("Select hospital to filter on: ").strip()

# Fail early with a readable message instead of a bare KeyError further down
# (diagnosis) or a silently mismatched filter (hospital).
if DIAGNOSIS_TRAJECTORY != "all" and DIAGNOSIS_TRAJECTORY not in DIAGNOSIS_DICT:
    raise ValueError(
        f"Unknown diagnosis trajectory {DIAGNOSIS_TRAJECTORY!r}; "
        f"expected 'all' or one of: {sorted(DIAGNOSIS_DICT)}"
    )
if HOSPITAL != "all" and HOSPITAL not in HOSPITALS:
    raise ValueError(
        f"Unknown hospital {HOSPITAL!r}; expected 'all' or one of: {list(HOSPITALS)}"
    )

# ID is the suffix appended to every output file name written by this notebook:
#   [<hospital>_]<split>[_<code-start>_<code-end>]
ID = SPLIT

# Filters handed to get_source_target; by default every hospital is kept.
splice_map = {
    "hospital_id": HOSPITALS
}

if DIAGNOSIS_TRAJECTORY != "all":
    # The mapping's value is a pair of code strings; join them into one token.
    diagnosis_trajectory = '_'.join(DIAGNOSIS_DICT[DIAGNOSIS_TRAJECTORY])
    ID = ID + "_" + diagnosis_trajectory
    splice_map["diagnosis_trajectory"] = [diagnosis_trajectory]

if HOSPITAL != "all":
    ID = HOSPITAL + "_" + ID
    splice_map["hospital_id"] = [HOSPITAL]

use_case_params = get_use_case_params(DATASET, USE_CASE)

Select data split:  weekend
Select diagnosis trajectory to filter on:  all
Select hospital to filter on:  all


In [None]:
# Read in total data (tabular, temporal and combined vectorized arrays).
# NOTE(review): load_pickle presumably unpickles these files, and unpickling can
# execute arbitrary code -- only point these paths at files produced by this
# project's own pipeline.
tab_vectorized = load_pickle(use_case_params.TAB_VECTORIZED_FILE)
temp_vectorized = load_pickle(use_case_params.TEMP_VECTORIZED_FILE)
comb_vectorized = load_pickle(use_case_params.COMB_VECTORIZED_FILE)

# Read in tabular features
tab_features = load_pickle(use_case_params.TAB_FEATURES_FILE)

# Preview the first rows only: the frame holds encounter-level records, and
# rendering all of it bloats the saved notebook and exposes more data than a
# sanity check needs.
tab_features.data.head()

In [5]:
# Register normalizers on the three vectorized datasets.
# Note: normalization is not occurring here -- this is setup only; the
# fit_normalizer()/normalize() calls happen in later cells.

# Tabular data: standardize the numeric features only (e.g., not binary indicators).
numeric_features = tab_features.features_by_type(NUMERIC)
normalizer_map = dict.fromkeys(numeric_features, STANDARD)
tab_vectorized.add_normalizer(FEATURES, normalizer_map=normalizer_map)

# Temporal and combined data: one standard normalizer along the event-name axis.
for event_vectorized in (temp_vectorized, comb_vectorized):
    event_vectorized.add_normalizer(EVENT_NAME, normalization_method=STANDARD)

## Add custom variables

In [6]:
# Derive calendar features from the admission timestamp.
admit_ts = tab_features.data['admit_timestamp']

# Create admit month/day columns (weekday: Monday=0 ... Sunday=6).
tab_features.data['admit_month'] = admit_ts.dt.month
tab_features.data['admit_day'] = admit_ts.dt.weekday

# Create night shift indicator column: 1 when admitted after 19:30 or before
# 07:30, else 0.
# The thresholds are fractional hours, so the time of day must include minutes
# and seconds. Comparing the integer `.dt.hour` against 19.5 / 7.5 ignores them
# and labels 19:30-19:59 as day shift and 07:30-07:59 as night shift.
admit_hour_of_day = (
    admit_ts.dt.hour + admit_ts.dt.minute / 60 + admit_ts.dt.second / 3600
)
tab_features.data['night_shift'] = np.where(
    (admit_hour_of_day > 19.5) | (admit_hour_of_day < 7.5), 1, 0
)

# Cast the two mapped admission-source columns to int.
for flag_column in ('from_nursing_home_mapped', 'from_acute_care_institution_mapped'):
    tab_features.data[flag_column] = tab_features.data[flag_column].astype(int)

## Split data into source and target

In [7]:
# Build the source and target tabular sets for the selected SPLIT, applying the
# filters collected in splice_map.
tab_x_source, tab_x_target = get_source_target(
    tab_features,
    tab_vectorized,
    SPLIT,
    splice_map,
    train_frac=0.8,
    axis="encounter_id",
)

## Get balanced source and target split

In [11]:
# Placeholder: both statements below are commented out, so running this cell
# changes nothing and no balancing is applied.
# Sketch of the idea: group by 'class' and down-sample every group to the size
# of the smallest group.
#g = tab_features.groupby('class')
#g.apply(lambda x: x.sample(g.size().min()).reset_index(drop=True))

<cyclops.processors.feature.feature.TabularFeatures at 0x7fe2ad230910>

## Split source data into train and val

In [8]:
# Intersect the source tabular vector with the full temporal and combined
# vectors along the encounter axis. tab_x_source is rebound to the
# intersected result.
source_inputs = [tab_x_source, temp_vectorized, comb_vectorized]
tab_x_source, temp_x_source, comb_x_source = intersect_vectorized(
    source_inputs,
    axes=ENCOUNTER_ID,
)

In [None]:
# Split source data into training and validation (80% / 20% along the encounter axis).
# NOTE(review): the three *_x_source_splits results are not referenced by any
# other cell visible in this notebook, and the same split_vectorized call is
# repeated in the "Dataset splits" section, whose results are the ones used.
# NOTE(review): if split_vectorized shuffles without a fixed seed, the two calls
# produce different partitions and the train/val split is not reproducible
# across runs -- confirm, and consider passing a seed if the API supports one.
tab_x_source_splits, temp_x_source_splits, comb_x_source_splits = split_vectorized(
    [tab_x_source, temp_x_source, comb_x_source],
    [0.8, 0.2],
    axes=ENCOUNTER_ID,
)

In [None]:
# Intersect the target tabular vector with the full temporal and combined
# vectors along the encounter axis; the results are the test split.
target_inputs = [tab_x_target, temp_vectorized, comb_vectorized]
tab_test, temp_test, comb_test = intersect_vectorized(
    target_inputs,
    axes=ENCOUNTER_ID,
)
# Show the three test-set shapes as a sanity check.
tuple(vec.shape for vec in (tab_test, temp_test, comb_test))

## Dataset splits

In [12]:
# Split the source data 80% / 20% along the encounter axis; each result is a
# (train, val) pair for one modality.
tab_train_val_splits, temp_train_val_splits, comb_train_val_splits = split_vectorized(
    [tab_x_source, temp_x_source, comb_x_source], [0.8, 0.2], axes=ENCOUNTER_ID
)

# Unpack every pair into its train and validation parts.
(tab_train, tab_val), (temp_train, temp_val), (comb_train, comb_val) = (
    tab_train_val_splits,
    temp_train_val_splits,
    comb_train_val_splits,
)

In [13]:
# Tabular shapes, in (train, val, test) order.
tuple(vec.shape for vec in (tab_train, tab_val, tab_test))

((42279, 52), (10570, 52), (85338, 52))

In [14]:
# Temporal shapes, in (train, val, test) order.
tuple(vec.shape for vec in (temp_train, temp_val, temp_test))

((1, 42279, 68, 6), (1, 10570, 68, 6), (1, 85338, 68, 6))

In [15]:
# Combined shapes, in (train, val, test) order.
tuple(vec.shape for vec in (comb_train, comb_val, comb_test))

((1, 42279, 92, 6), (1, 10570, 92, 6), (1, 85338, 92, 6))

## Split features/targets

In [16]:
# Tabular train: split the target column(s) out of the FEATURES axis -> (X, y).
tab_train_X, tab_train_y = tab_train.split_out(
    FEATURES,
    use_case_params.TAB_TARGETS,
)
(tab_train_X.shape, tab_train_y.shape)

((42279, 51), (42279, 1))

In [17]:
# Tabular validation: split the target column(s) out of the FEATURES axis -> (X, y).
tab_val_X, tab_val_y = tab_val.split_out(
    FEATURES,
    use_case_params.TAB_TARGETS,
)
(tab_val_X.shape, tab_val_y.shape)

((10570, 51), (10570, 1))

In [18]:
# Tabular test: split the target column(s) out of the FEATURES axis -> (X, y).
tab_test_X, tab_test_y = tab_test.split_out(
    FEATURES,
    use_case_params.TAB_TARGETS,
)
(tab_test_X.shape, tab_test_y.shape)

((85338, 51), (85338, 1))

In [19]:
# Temporal train: split the target event(s) out of the EVENT_NAME axis -> (X, y).
temp_train_X, temp_train_y = temp_train.split_out(EVENT_NAME, use_case_params.TEMP_TARGETS)
(temp_train_X.shape, temp_train_y.shape)

((1, 42279, 67, 6), (1, 42279, 1, 6))

In [20]:
# Temporal validation: split the target event(s) out of the EVENT_NAME axis -> (X, y).
temp_val_X, temp_val_y = temp_val.split_out(
    EVENT_NAME,
    use_case_params.TEMP_TARGETS,
)
(temp_val_X.shape, temp_val_y.shape)

((1, 10570, 67, 6), (1, 10570, 1, 6))

In [21]:
# Temporal test: split the target event(s) out of the EVENT_NAME axis -> (X, y).
temp_test_X, temp_test_y = temp_test.split_out(
    EVENT_NAME,
    use_case_params.TEMP_TARGETS,
)
(temp_test_X.shape, temp_test_y.shape)

((1, 85338, 67, 6), (1, 85338, 1, 6))

In [22]:
# Combined train: split the target event(s) out of the EVENT_NAME axis -> (X, y).
comb_train_X, comb_train_y = comb_train.split_out(EVENT_NAME, use_case_params.TEMP_TARGETS)
(comb_train_X.shape, comb_train_y.shape)

((1, 42279, 91, 6), (1, 42279, 1, 6))

In [23]:
# Combined validation: split the target event(s) out of the EVENT_NAME axis -> (X, y).
comb_val_X, comb_val_y = comb_val.split_out(
    EVENT_NAME,
    use_case_params.TEMP_TARGETS,
)
(comb_val_X.shape, comb_val_y.shape)

((1, 10570, 91, 6), (1, 10570, 1, 6))

In [24]:
# Combined test: split the target event(s) out of the EVENT_NAME axis -> (X, y).
comb_test_X, comb_test_y = comb_test.split_out(
    EVENT_NAME,
    use_case_params.TEMP_TARGETS,
)
(comb_test_X.shape, comb_test_y.shape)

((1, 85338, 91, 6), (1, 85338, 1, 6))

In [25]:
# Run the project's impute helper over the feature arrays of every split.
# Targets (the *_y arrays) are left untouched.
temp_train_X, temp_val_X, temp_test_X = [
    impute(features) for features in (temp_train_X, temp_val_X, temp_test_X)
]

comb_train_X, comb_val_X, comb_test_X = [
    impute(features) for features in (comb_train_X, comb_val_X, comb_test_X)
]

  mean = np.nanmean(data_slice)


In [26]:
# Store data (serialize): every X/y array of every split, before normalization.
# Keys are the file-name stems; ID is appended to each to form the final name.
arrays_by_stem = {
    "tab_train_X_": tab_train_X,
    "tab_train_y_": tab_train_y,
    "tab_val_X_": tab_val_X,
    "tab_val_y_": tab_val_y,
    "tab_test_X_": tab_test_X,
    "tab_test_y_": tab_test_y,
    "temp_train_X_": temp_train_X,
    "temp_train_y_": temp_train_y,
    "temp_val_X_": temp_val_X,
    "temp_val_y_": temp_val_y,
    "temp_test_X_": temp_test_X,
    "temp_test_y_": temp_test_y,
    "comb_train_X_": comb_train_X,
    "comb_train_y_": comb_train_y,
    "comb_val_X_": comb_val_X,
    "comb_val_y_": comb_val_y,
    "comb_test_X_": comb_test_X,
    "comb_test_y_": comb_test_y,
}
vectorized = [(array, stem + ID) for stem, array in arrays_by_stem.items()]

# TAB_VEC_COMB is used as a path prefix, so the name is concatenated onto it.
for vec, name in vectorized:
    save_pickle(vec, use_case_params.TAB_VEC_COMB + name + "_not_normalized.pkl")

2023-01-23 15:38:36,552 [1;37mINFO[0m cyclops.utils.file - Pickling data to /mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality/./data/4-final/aligned_tab_train_X_day_not_normalized.pkl
2023-01-23 15:38:37,350 [1;37mINFO[0m cyclops.utils.file - Pickling data to /mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality/./data/4-final/aligned_tab_train_y_day_not_normalized.pkl
2023-01-23 15:38:37,607 [1;37mINFO[0m cyclops.utils.file - Pickling data to /mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality/./data/4-final/aligned_tab_val_X_day_not_normalized.pkl
2023-01-23 15:38:37,921 [1;37mINFO[0m cyclops.utils.file - Pickling data to /mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality/./data/4-final/aligned_tab_val_y_day_not_normalized.pkl
2023-01-23 15:38:37,957 [1;37mINFO[0m cyclops.utils.file - Pickling data to /mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality/./data/4-final/aligned_tab_test_X_day_not_normalized.pkl


## Normalize separately

In [27]:
# Fit a normalizer on each feature array independently and apply it, so every
# split is normalized with its own statistics.
splits = (
    tab_train_X, tab_val_X, tab_test_X,
    temp_train_X, temp_val_X, temp_test_X,
    comb_train_X, comb_val_X, comb_test_X,
)

# The return values of fit_normalizer()/normalize() are discarded, so their
# effect is on the objects themselves. The nine names above keep pointing at
# those same objects and need no re-assignment afterwards.
for split in splits:
    split.fit_normalizer()
    split.normalize()

In [28]:
# Store data (serialize): every X/y array of every split, after normalization.
# Keys are the file-name stems; ID is appended to each to form the final name.
arrays_by_stem = {
    "tab_train_X_": tab_train_X,
    "tab_train_y_": tab_train_y,
    "tab_val_X_": tab_val_X,
    "tab_val_y_": tab_val_y,
    "tab_test_X_": tab_test_X,
    "tab_test_y_": tab_test_y,
    "temp_train_X_": temp_train_X,
    "temp_train_y_": temp_train_y,
    "temp_val_X_": temp_val_X,
    "temp_val_y_": temp_val_y,
    "temp_test_X_": temp_test_X,
    "temp_test_y_": temp_test_y,
    "comb_train_X_": comb_train_X,
    "comb_train_y_": comb_train_y,
    "comb_val_X_": comb_val_X,
    "comb_val_y_": comb_val_y,
    "comb_test_X_": comb_test_X,
    "comb_test_y_": comb_test_y,
}
vectorized = [(array, stem + ID) for stem, array in arrays_by_stem.items()]

# TAB_VEC_COMB is used as a path prefix, so the name is concatenated onto it.
for vec, name in vectorized:
    save_pickle(vec, use_case_params.TAB_VEC_COMB + name + ".pkl")

2023-01-23 15:39:20,965 [1;37mINFO[0m cyclops.utils.file - Pickling data to /mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality/./data/4-final/aligned_tab_train_X_day.pkl
2023-01-23 15:39:22,013 [1;37mINFO[0m cyclops.utils.file - Pickling data to /mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality/./data/4-final/aligned_tab_train_y_day.pkl
2023-01-23 15:39:22,846 [1;37mINFO[0m cyclops.utils.file - Pickling data to /mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality/./data/4-final/aligned_tab_val_X_day.pkl
2023-01-23 15:39:24,061 [1;37mINFO[0m cyclops.utils.file - Pickling data to /mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality/./data/4-final/aligned_tab_val_y_day.pkl
2023-01-23 15:39:24,510 [1;37mINFO[0m cyclops.utils.file - Pickling data to /mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality/./data/4-final/aligned_tab_test_X_day.pkl
2023-01-23 15:39:26,457 [1;37mINFO[0m cyclops.utils.file - Pickling data 

## Normalize using training data

In [None]:
# Fit the normalizer on un-normalized training data only.
#
# The "Normalize separately" section calls normalize() on comb_train_X,
# comb_val_X and comb_test_X and discards the return value, so after a
# top-to-bottom run those objects already hold normalized values. Fitting and
# normalizing again would operate on already-normalized data. Reload the
# imputed, un-normalized copies written earlier as "*_not_normalized.pkl"
# (that save cell must have been run for the current ID).
comb_train_X = load_pickle(
    use_case_params.TAB_VEC_COMB + "comb_train_X_" + ID + "_not_normalized.pkl"
)
comb_val_X = load_pickle(
    use_case_params.TAB_VEC_COMB + "comb_val_X_" + ID + "_not_normalized.pkl"
)
comb_test_X = load_pickle(
    use_case_params.TAB_VEC_COMB + "comb_test_X_" + ID + "_not_normalized.pkl"
)

comb_train_X.fit_normalizer()
normalizer = comb_train_X.normalizer

# NOTE(review): written relative to the working directory and without ID in
# the name, so successive runs overwrite each other -- confirm this is intended.
save_pickle(normalizer, "normalizer.pkl")

In [None]:
# Apply the normalizer fitted on the training features to all three combined
# splits, so validation and test are scaled with training statistics.
splits = (comb_train_X, comb_val_X, comb_test_X)

# normalize()'s return value is discarded, so its effect is on the objects
# themselves; comb_train_X / comb_val_X / comb_test_X keep pointing at those
# same objects and need no re-assignment afterwards.
for split in splits:
    split.normalizer = normalizer
    split.normalize()

In [None]:
# Store data (serialize): the combined arrays normalized with the training normalizer.
# NOTE(review): these paths are identical to the comb_* paths written by the
# save cell in the "Normalize separately" section, so running both sections
# overwrites those files -- confirm that is intended.
arrays_by_stem = {
    "comb_train_X_": comb_train_X,
    "comb_train_y_": comb_train_y,
    "comb_val_X_": comb_val_X,
    "comb_val_y_": comb_val_y,
    "comb_test_X_": comb_test_X,
    "comb_test_y_": comb_test_y,
}
vectorized = [(array, stem + ID) for stem, array in arrays_by_stem.items()]

for vec, name in vectorized:
    save_pickle(vec, use_case_params.TAB_VEC_COMB + name + ".pkl")