# Processing tabular and temporal features

## Imports

In [1]:
from functools import reduce
import numpy as np
import pandas as pd
import datetime as dt

from cyclops.processors.aggregate import (
    Aggregator,
    tabular_as_aggregated,
    timestamp_ffill_agg,
)
from cyclops.processors.column_names import (
    ADMIT_TIMESTAMP,
    DISCHARGE_TIMESTAMP,
    ENCOUNTER_ID,
    EVENT_NAME,
    EVENT_TIMESTAMP,
    EVENT_VALUE,
    RESTRICT_TIMESTAMP,
    TIMESTEP,
)
from cyclops.processors.column_names import ENCOUNTER_ID
from cyclops.processors.constants import FEATURES, NUMERIC, ORDINAL, STANDARD
from cyclops.processors.feature.feature import TabularFeatures
from cyclops.processors.constants import ALL, FEATURES, MEAN, NUMERIC, ORDINAL, STANDARD
from cyclops.processors.feature.feature import TemporalFeatures
from cyclops.processors.feature.vectorize import (
    Vectorized,
    intersect_vectorized,
    split_vectorized,
    vec_index_exp,
)
from cyclops.processors.impute import np_ffill_bfill, np_fill_null_num
from cyclops.utils.file import (
    join,
    load_dataframe,
    load_pickle,
    save_dataframe,
    save_pickle,
    yield_dataframes,
    yield_pickled_files,
)
from drift_detection.gemini.utils import get_use_case_params, compute_timestep
from drift_detection.gemini.constants import COMORBIDITIES

## Choose dataset and use-case

In [100]:
# Dataset and use-case identifiers; together they select the parameter set below.
DATASET = "gemini"
USE_CASE = "mortality_cm"

# Parameter object for this dataset/use-case. Later cells read file paths
# (e.g. ENCOUNTERS_FILE, CLEANED_DIR) and settings (e.g. TOP_N_EVENTS,
# TIMESTEP_SIZE, WINDOW_DURATION) from it.
use_case_params = get_use_case_params(DATASET, USE_CASE)

In [101]:
# Load the encounter-level cohort and replace its stored index with a fresh
# positional one.
cohort = load_dataframe(use_case_params.ENCOUNTERS_FILE).reset_index(drop=True)

2022-11-16 14:38:08,665 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from /mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality_cm/./data/encounters.parquet


## Add comorbidities

In [102]:
# Add one 0/1 indicator column per comorbidity.
for comorbidity, spec in COMORBIDITIES.items():
    # Only the first (source column -> accepted values) entry of each spec is used.
    source_col, accepted_values = next(iter(spec.items()))
    cohort[comorbidity] = cohort[source_col].isin(accepted_values).astype(int)

# Tabular feature list = use-case features followed by the comorbidity indicators.
features = use_case_params.TAB_FEATURES + list(COMORBIDITIES)
cohort.head(5)

Unnamed: 0,encounter_id,admit_timestamp,discharge_timestamp,age,sex,hospital_id,outcome_death,readmission,from_nursing_home_mapped,from_acute_care_institution_mapped,...,Cancer,Dyspnea,COPD,Asthma,Pulmonary embolism,Connective tissue disease,Inflammatory bowel disease,Osteoarthritis,Rheumatoid arthritis,HIV
0,11100040,2018-07-03 21:36:00,2018-08-03 09:35:00,84,M,SMH,False,planned_from_acute,False,False,...,0,0,0,0,0,0,0,0,0,0
1,11100041,2016-12-26 18:21:00,2016-12-27 11:00:00,76,F,SMH,False,new_to_acute,False,False,...,0,0,0,0,0,0,0,0,0,0
2,11100072,2016-08-13 15:20:00,2016-08-16 16:45:00,72,F,SMH,False,new_to_acute,False,False,...,0,0,0,0,0,0,0,0,0,0
3,11100095,2015-12-20 18:36:00,2015-12-30 10:25:00,84,M,SMH,False,unplanned_8_to_28_day_acute,False,False,...,0,0,0,0,0,0,0,0,0,0
4,11100097,2019-05-23 06:09:00,2019-05-28 13:07:00,65,M,SMH,False,planned_from_acute,True,False,...,0,0,0,0,0,0,0,0,0,0


In [103]:
# Wrap the cohort as tabular features keyed by encounter, forcing the feature
# types declared for this use-case.
tab_features = TabularFeatures(
    data=cohort,
    features=features,
    by=ENCOUNTER_ID,
    force_types=use_case_params.TAB_FEATURES_TYPES,
)

numeric_features = tab_features.features_by_type(NUMERIC)
ordinal_features = tab_features.features_by_type(ORDINAL)

# Print the value mapping of the first ordinal feature, when there is one.
if len(ordinal_features) > 0:
    first_ordinal = ordinal_features[0]
    print(first_ordinal, "mapping:")
    print(tab_features.meta[first_ordinal].get_mapping())

# Vectorize with the ordinal features passed as `to_binary_indicators`, then
# persist both the vectorized result and the feature object itself.
tab_vectorized = tab_features.vectorize(to_binary_indicators=ordinal_features)
save_pickle(tab_vectorized, use_case_params.TAB_VECTORIZED_FILE)
save_pickle(tab_features, use_case_params.TAB_FEATURES_FILE)

hospital_id mapping:
{0: 'MSH', 1: 'SBK', 2: 'SMH', 3: 'THPC', 4: 'THPM', 5: 'UHNTG', 6: 'UHNTW'}


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[feat].replace(mapping, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[feat].replace(mapping, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[feat].replace(mapping, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[feat].replace(mapping, inplace=True)


'/mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality_cm/./data/tab_features.pkl'

In [104]:
# Per-encounter timestamps needed for windowing and target creation.
timestamp_cols = [
    ENCOUNTER_ID,
    ADMIT_TIMESTAMP,
    DISCHARGE_TIMESTAMP,
    use_case_params.TARGET_TIMESTAMP,
]
timestamps = load_dataframe(use_case_params.ENCOUNTERS_FILE)[timestamp_cols]

# Admission time per encounter, indexed by encounter and exposed under the
# RESTRICT_TIMESTAMP column name; passed to the aggregation step as the
# window start.
admit_by_encounter = timestamps[[ENCOUNTER_ID, ADMIT_TIMESTAMP]].set_index(ENCOUNTER_ID)
start_timestamps = admit_by_encounter.rename(
    columns={ADMIT_TIMESTAMP: RESTRICT_TIMESTAMP}
)
start_timestamps

2022-11-16 14:38:39,162 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from /mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality_cm/./data/encounters.parquet


Unnamed: 0_level_0,restrict_timestamp
encounter_id,Unnamed: 1_level_1
11100040,2018-07-03 21:36:00
11100041,2016-12-26 18:21:00
11100072,2016-08-13 15:20:00
11100095,2015-12-20 18:36:00
11100097,2019-05-23 06:09:00
...,...
15999822,2012-04-09 21:44:00
15999864,2018-04-30 20:30:00
15999918,2019-09-18 05:59:00
15999943,2015-01-17 08:17:00


## Temporal-specific processing

In [105]:
# Decide which events to keep: for every cleaned file, take the most frequent
# event names among the rows whose value is not null, then keep only the names
# that made the list in every file.
all_top_events = []
for events in yield_dataframes(use_case_params.CLEANED_DIR, log=False):
    has_value = events[EVENT_VALUE].notna()
    name_counts = events.loc[has_value, EVENT_NAME].value_counts()
    all_top_events.append(name_counts.iloc[: use_case_params.TOP_N_EVENTS].index)

    # Drop the reference to this batch before the next file is yielded.
    del events

# Take only the events common to every file
top_events = reduce(np.intersect1d, all_top_events)

top_events

Index(['sodium', 'potassium', 'lymphocyte', 'hemoglobin', 'bicarbonate',
       'creatinine', 'white blood cell count', 'platelet count',
       'mean cell volume', 'hematocrit', 'neutrophils',
       'glucose point of care', 'glucose random', 'urinalysis',
       'blood urea nitrogen', 'calcium', 'albumin', 'inr', 'x-ray', 'alt',
       'alp', 'bilirubin', 'ast', 'pt', 'aptt', 'lactate venous', 'ct',
       'troponin', 'arterial pao2', 'arterial paco2', 'unmapped_intervention',
       'arterial ph', 'high sensitivity troponin', 'venous pco2', 'ketone',
       'tsh', 'ultrasound', 'ldh', 'venous ph', 'urine specific gravity',
       'echo', 'vitamin b12', 'lactate arterial', 'urine sodium', 'rbc',
       'urine osmolality', 'ferritin', 'serum osmolality', 'mri',
       'endoscopy_mapped', 'crp', 'other', 'non-rbc', 'calcium, ionized',
       'hba1c', 'interventional', 'esr', 'fibrinogen', 'serum alcohol',
       'glucose fasting', 'd-dimer', 'inv_mech_vent_mapped', 'vitamin d',
       

In [106]:
# Number of event names kept after intersecting the per-file top events.
len(top_events)

67

In [107]:
# Aggregator shared by the rest of the notebook: it aggregates the temporal
# events and is later reused to vectorize both the temporal and the tabular
# aggregated tables. EVENT_VALUE is aggregated with MEAN, grouped by
# encounter and event name.
# NOTE(review): timestep_size / window_duration presumably set the bucket width
# and the total window covered per encounter - confirm against cyclops.
aggregator = Aggregator(
    aggfuncs={EVENT_VALUE: MEAN},
    timestamp_col=EVENT_TIMESTAMP,
    time_by=ENCOUNTER_ID,
    agg_by=[ENCOUNTER_ID, EVENT_NAME],
    timestep_size=use_case_params.TIMESTEP_SIZE,
    window_duration=use_case_params.WINDOW_DURATION,
)

In [108]:
# Aggregate every cleaned batch and save the result under AGGREGATED_DIR.
# `skip_n` batches are skipped by the generator; numbering continues from there.
skip_n = 0
generator = yield_dataframes(use_case_params.CLEANED_DIR, skip_n=skip_n, log=False)

for batch_num, events in enumerate(generator, start=skip_n):
    # Keep only the selected events and renumber the remaining rows.
    events = events[events[EVENT_NAME].isin(top_events)].reset_index(drop=True)

    temporal_features = TemporalFeatures(
        events,
        features=EVENT_VALUE,
        by=[ENCOUNTER_ID, EVENT_NAME],
        timestamp_col=EVENT_TIMESTAMP,
        aggregator=aggregator,
    )

    # Windows start at each encounter's admission time.
    aggregated = temporal_features.aggregate(window_start_time=start_timestamps)

    save_dataframe(
        aggregated,
        join(use_case_params.AGGREGATED_DIR, f"batch_{batch_num:04d}"),
    )
    del events

2022-11-16 14:39:25,070 [1;37mINFO[0m cyclops.processors.clean - Dropped nulls over columns: event_timestamp. Removed 6861 rows.
2022-11-16 14:47:53,321 [1;37mINFO[0m cyclops.utils.profile - Finished executing function __call__ in 512.932359 s
2022-11-16 14:47:53,326 [1;37mINFO[0m cyclops.utils.file - Saving dataframe to /mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality_cm/./data/2-agg/batch_0000.parquet


In [109]:
# Vectorize every aggregated batch and pickle it under VECTORIZED_DIR, using
# the same batch numbering as the aggregated files.
skip_n = 0
generator = yield_dataframes(use_case_params.AGGREGATED_DIR, skip_n=skip_n, log=False)
for batch_num, aggregated in enumerate(generator, start=skip_n):
    batch_path = join(use_case_params.VECTORIZED_DIR, f"batch_{batch_num:04d}")
    vec = aggregator.vectorize(aggregated)
    save_pickle(vec, batch_path)

2022-11-16 14:49:37,874 [1;37mINFO[0m cyclops.utils.profile - Finished executing function vectorize in 96.458791 s
2022-11-16 14:49:37,904 [1;37mINFO[0m cyclops.utils.file - Pickling data to /mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality_cm/./data/3-vec/batch_0000.pkl


In [110]:
# Take all Vectorized objects and turn them into a single object by stacking
# them along the encounter axis.
# NOTE(review): yield_pickled_files unpickles files from VECTORIZED_DIR; only
# point it at files written by this pipeline (pickle can execute code on load).
vecs = list(yield_pickled_files(use_case_params.VECTORIZED_DIR))
encounter_axis = vecs[0].get_axis(ENCOUNTER_ID)
res = np.concatenate([vec.data for vec in vecs], axis=encounter_axis)

# Work on a shallow copy of the first batch's index list: assigning into
# `vecs[0].indexes` directly would overwrite that batch's own encounter index
# and leave it inconsistent with its data.
indexes = list(vecs[0].indexes)
indexes[encounter_axis] = np.concatenate([vec.indexes[encounter_axis] for vec in vecs])

# All other axes (and the axis names) are taken from the first batch.
# NOTE(review): assumes every batch shares the same non-encounter indexes - confirm.
temp_vectorized = Vectorized(res, indexes, vecs[0].axis_names)
del res

2022-11-16 14:50:17,717 [1;37mINFO[0m cyclops.utils.file - Loading pickled data from /mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality_cm/./data/3-vec/batch_0000.pkl


In [111]:
# Shape of the combined temporal Vectorized object.
temp_vectorized.shape

(1, 138187, 67, 6)

In [112]:
# Axis names of the combined temporal Vectorized object.
temp_vectorized.axis_names

['aggfuncs', 'encounter_id', 'event_name', 'timestep']

## Target creation

In [113]:
# The prediction target time is the target timestamp shifted back by the
# prediction offset (in hours).
prediction_offset = pd.DateOffset(hours=use_case_params.PREDICT_OFFSET)
timestamps["target"] = timestamps[use_case_params.TARGET_TIMESTAMP] - prediction_offset

# Derive timestep columns for the target time first, then for discharge.
for reference_col in ("target", DISCHARGE_TIMESTAMP):
    timestamps = compute_timestep(
        timestamps, use_case_params.TIMESTEP_SIZE, reference_col
    )
timestamps

Unnamed: 0,encounter_id,admit_timestamp,discharge_timestamp,deathtime,target,target_after_admit,target_timestep,discharge_timestamp_after_admit,discharge_timestamp_timestep
0,11100040,2018-07-03 21:36:00,2018-08-03 09:35:00,NaT,NaT,NaT,,30 days 11:59:00,30.0
1,11100041,2016-12-26 18:21:00,2016-12-27 11:00:00,NaT,NaT,NaT,,0 days 16:39:00,0.0
2,11100072,2016-08-13 15:20:00,2016-08-16 16:45:00,NaT,NaT,NaT,,3 days 01:25:00,3.0
3,11100095,2015-12-20 18:36:00,2015-12-30 10:25:00,NaT,NaT,NaT,,9 days 15:49:00,9.0
4,11100097,2019-05-23 06:09:00,2019-05-28 13:07:00,NaT,NaT,NaT,,5 days 06:58:00,5.0
...,...,...,...,...,...,...,...,...,...
143044,15999822,2012-04-09 21:44:00,2012-04-10 17:21:00,NaT,NaT,NaT,,0 days 19:37:00,0.0
143045,15999864,2018-04-30 20:30:00,2018-05-06 12:46:00,NaT,NaT,NaT,,5 days 16:16:00,5.0
143046,15999918,2019-09-18 05:59:00,2019-09-20 14:45:00,NaT,NaT,NaT,,2 days 08:46:00,2.0
143047,15999943,2015-01-17 08:17:00,2015-01-23 11:33:00,NaT,NaT,NaT,,6 days 03:16:00,6.0


In [114]:
# Show only the encounters whose TARGET_TIMESTAMP is not null.
timestamps[~timestamps[use_case_params.TARGET_TIMESTAMP].isna()]

Unnamed: 0,encounter_id,admit_timestamp,discharge_timestamp,deathtime,target,target_after_admit,target_timestep,discharge_timestamp_after_admit,discharge_timestamp_timestep
20,11100856,2019-08-11 18:24:00,2019-08-14 17:15:00,2019-08-14 17:15:00,2019-07-31 17:15:00,-12 days +22:51:00,-12.0,2 days 22:51:00,2.0
32,11101834,2020-03-31 16:43:00,2020-04-06 12:04:00,2020-04-06 12:04:00,2020-03-23 12:04:00,-9 days +19:21:00,-9.0,5 days 19:21:00,5.0
37,11102089,2011-01-02 17:00:00,2011-01-26 06:00:00,2011-01-26 06:00:00,2011-01-12 06:00:00,9 days 13:00:00,9.0,23 days 13:00:00,23.0
56,11103117,2019-03-17 23:37:00,2019-04-01 23:03:00,2019-04-01 23:03:00,2019-03-18 23:03:00,0 days 23:26:00,0.0,14 days 23:26:00,14.0
86,11104649,2017-06-11 16:39:00,2017-06-12 11:30:00,2017-06-12 11:30:00,2017-05-29 11:30:00,-14 days +18:51:00,-14.0,0 days 18:51:00,0.0
...,...,...,...,...,...,...,...,...,...
143009,15998960,2010-12-15 17:35:00,2010-12-19 15:00:00,2010-12-19 15:00:00,2010-12-05 15:00:00,-11 days +21:25:00,-11.0,3 days 21:25:00,3.0
143032,15999355,2011-08-29 21:52:00,2011-09-06 19:45:00,2011-09-06 19:45:00,2011-08-23 19:45:00,-7 days +21:53:00,-7.0,7 days 21:53:00,7.0
143033,15999370,2017-04-23 15:01:00,2017-04-26 05:07:00,2017-04-26 05:07:00,2017-04-12 05:07:00,-12 days +14:06:00,-12.0,2 days 14:06:00,2.0
143035,15999393,2017-01-05 16:22:00,2017-01-10 03:55:00,2017-01-10 03:55:00,2016-12-27 03:55:00,-10 days +11:33:00,-10.0,4 days 11:33:00,4.0


In [115]:
# One-column frame holding the encounter IDs in the order they appear along
# the encounter axis of the temporal array.
encounter_order = pd.DataFrame({ENCOUNTER_ID: temp_vectorized.get_index(ENCOUNTER_ID)})
encounter_order

Unnamed: 0,encounter_id
0,11100040
1,11100041
2,11100072
3,11100095
4,11100097
...,...
138182,15999822
138183,15999864
138184,15999918
138185,15999943


In [116]:
# Bring the target/discharge timesteps into the encounter order of the
# temporal array (left join keeps every encounter of `encounter_order`).
discharge_timestep = f"{DISCHARGE_TIMESTAMP}_timestep"
timestep_cols = [ENCOUNTER_ID, "target_timestep", discharge_timestep]
timesteps = timestamps[timestep_cols]
aligned_timestamps = encounter_order.merge(timesteps, on=ENCOUNTER_ID, how="left")
aligned_timestamps

Unnamed: 0,encounter_id,target_timestep,discharge_timestamp_timestep
0,11100040,,30.0
1,11100041,,0.0
2,11100072,,3.0
3,11100095,,9.0
4,11100097,,5.0
...,...,...,...
138182,15999822,,0.0
138183,15999864,,5.0
138184,15999918,,2.0
138185,15999943,,6.0


In [117]:
num_timesteps = int(use_case_params.WINDOW_DURATION / use_case_params.TIMESTEP_SIZE)

# Placeholder passed as `fill_nan` to both calls below. It is larger than the
# values otherwise seen in the result (1 and -1), so np.minimum keeps a real
# value wherever either array has one and the placeholder only survives where
# neither does; those cells are then reset to 0.
# NOTE(review): presumably 1 marks timesteps from the target onwards, -1 marks
# timesteps from discharge onwards and 0 means neither - confirm against
# timestamp_ffill_agg.
NO_LABEL = 2

target_labels = timestamp_ffill_agg(
    aligned_timestamps["target_timestep"], num_timesteps, fill_nan=NO_LABEL
)
discharge_labels = timestamp_ffill_agg(
    aligned_timestamps[discharge_timestep], num_timesteps, val=-1, fill_nan=NO_LABEL
)

# Where both arrays carry a value, the discharge marker (-1) wins.
targets = np.minimum(target_labels, discharge_labels)
targets[targets == NO_LABEL] = 0

# Sample of rows for a visual check.
targets[126:146]

array([[ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0., -1., -1., -1., -1., -1.],
       [ 0.,  0., -1., -1., -1., -1.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0., -1., -1., -1., -1., -1.],
       [ 0.,  0.,  0.,  0.,  0., -1.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0., -1., -1., -1.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  1.,  1.,  1.,  1.,  1.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0., -1., -1., -1.],
       [-1., -1., -1., -1., -1., -1.],
       [ 0.,  0., -1., -1., -1., -1.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [-1., -1., -1., -1., -1., -1.]])

In [118]:
# Timestep rows 126-145, i.e. the same row range as the targets sample, for a
# side-by-side visual check.
aligned_timestamps.iloc[126:146]

Unnamed: 0,encounter_id,target_timestep,discharge_timestamp_timestep
126,11106664,,20.0
127,11106666,,1.0
128,11106673,,2.0
129,11106716,,14.0
130,11106811,,6.0
131,11106827,,10.0
132,11106961,,1.0
133,11106980,,5.0
134,11107038,,10.0
135,11107053,,17.0


In [119]:
# Insert singleton axes at positions 0 and 2 so the targets have the same
# number of dimensions as the temporal array.
# Note: this reassigns `targets`, so running the cell again adds more axes.
targets = targets[np.newaxis, :, np.newaxis, :]
targets.shape

(1, 138187, 1, 6)

In [120]:
# Temporal array shape, to compare with the targets shape before concatenating.
temp_vectorized.shape

(1, 138187, 67, 6)

In [121]:
# Include target: append the target array along the event-name axis, indexed
# by TEMP_TARGETS.
# NOTE(review): running this cell twice presumably appends the target a second
# time; if a re-run is needed, first drop the existing entry, e.g.
#   temp_vectorized = temp_vectorized.remove_with_index(EVENT_NAME, use_case_params.TEMP_TARGETS)
temp_vectorized = temp_vectorized.concat_over_axis(
    EVENT_NAME, targets, use_case_params.TEMP_TARGETS
)
temp_vectorized.shape

(1, 138187, 68, 6)

In [122]:
# Sanity check: the target slice of the temporal array must contain no NaNs.
only_targets = temp_vectorized.take_with_index(EVENT_NAME, use_case_params.TEMP_TARGETS)
assert not np.isnan(only_targets.data).any()

In [123]:
# Persist the temporal features (now including the target); the next section
# reloads them from this file.
save_pickle(temp_vectorized, use_case_params.TEMP_VECTORIZED_FILE)

2022-11-16 14:50:20,123 [1;37mINFO[0m cyclops.utils.file - Pickling data to /mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality_cm/./data/temp_vectorized.pkl


'/mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality_cm/./data/temp_vectorized.pkl'

## Combined processing

In [124]:
# Reload the temporal features from the file written at the end of the
# previous section.
temp_vectorized = load_pickle(use_case_params.TEMP_VECTORIZED_FILE)

2022-11-16 14:50:39,408 [1;37mINFO[0m cyclops.utils.file - Loading pickled data from /mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality_cm/./data/temp_vectorized.pkl


In [125]:
# Inspect the tabular data held by the TabularFeatures object.
tab_features.data

Unnamed: 0,encounter_id,admit_timestamp,discharge_timestamp,age,sex,hospital_id,outcome_death,readmission,from_nursing_home_mapped,from_acute_care_institution_mapped,...,Cancer,Dyspnea,COPD,Asthma,Pulmonary embolism,Connective tissue disease,Inflammatory bowel disease,Osteoarthritis,Rheumatoid arthritis,HIV
0,11100040,2018-07-03 21:36:00,2018-08-03 09:35:00,84,1,SMH,False,planned_from_acute,False,False,...,0,0,0,0,0,0,0,0,0,0
1,11100041,2016-12-26 18:21:00,2016-12-27 11:00:00,76,0,SMH,False,new_to_acute,False,False,...,0,0,0,0,0,0,0,0,0,0
2,11100072,2016-08-13 15:20:00,2016-08-16 16:45:00,72,0,SMH,False,new_to_acute,False,False,...,0,0,0,0,0,0,0,0,0,0
3,11100095,2015-12-20 18:36:00,2015-12-30 10:25:00,84,1,SMH,False,unplanned_8_to_28_day_acute,False,False,...,0,0,0,0,0,0,0,0,0,0
4,11100097,2019-05-23 06:09:00,2019-05-28 13:07:00,65,1,SMH,False,planned_from_acute,True,False,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143044,15999822,2012-04-09 21:44:00,2012-04-10 17:21:00,61,1,THPM,False,new_to_acute,False,False,...,0,0,0,0,0,0,0,0,0,0
143045,15999864,2018-04-30 20:30:00,2018-05-06 12:46:00,58,0,THPC,False,new_to_acute,False,False,...,0,0,0,0,0,0,0,0,0,0
143046,15999918,2019-09-18 05:59:00,2019-09-20 14:45:00,68,0,THPM,False,new_to_acute,False,False,...,0,0,0,0,0,0,0,0,0,0
143047,15999943,2015-01-17 08:17:00,2015-01-23 11:33:00,78,0,THPM,False,nota,False,False,...,0,0,0,0,0,0,0,0,0,0


In [126]:
# Tabular data with the ordinal features passed as `to_binary_indicators`.
tab = tab_features.get_data(to_binary_indicators=ordinal_features).reset_index()

# Take only the encounters with temporal events.
# Series.isin replaces np.in1d, which is deprecated in NumPy 2.0.
has_temporal_events = tab[ENCOUNTER_ID].isin(temp_vectorized.get_index(ENCOUNTER_ID))
tab = tab[has_temporal_events]

# Aggregate tabular: reshape the tabular features into the same
# (encounter, event name, timestep) layout as the aggregated temporal events,
# with as many timesteps as the aggregator's window holds.
tab_aggregated = tabular_as_aggregated(
    tab=tab,
    index=ENCOUNTER_ID,
    var_name=EVENT_NAME,
    value_name=EVENT_VALUE,
    strategy=ALL,
    num_timesteps=aggregator.window_duration // aggregator.timestep_size,
)
tab_aggregated

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[feat].replace(mapping, inplace=True)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,event_value
encounter_id,event_name,timestep,Unnamed: 3_level_1
11100040,Asthma,0,0.0
11100040,Asthma,1,0.0
11100040,Asthma,2,0.0
11100040,Asthma,3,0.0
11100040,Asthma,4,0.0
...,...,...,...
15999969,triage_level_urgent,1,0.0
15999969,triage_level_urgent,2,0.0
15999969,triage_level_urgent,3,0.0
15999969,triage_level_urgent,4,0.0


In [127]:
# Vectorize tabular, using the same aggregator that vectorized the temporal
# events.
tab_aggregated_vec = aggregator.vectorize(tab_aggregated)
tab_aggregated_vec.shape

2022-11-16 14:52:40,769 [1;37mINFO[0m cyclops.utils.profile - Finished executing function vectorize in 48.473379 s


(1, 138187, 70, 6)

In [128]:
# Temporal shape, for comparison with the tabular shape before combining.
temp_vectorized.shape

(1, 138187, 68, 6)

In [129]:
# Combine
# The raw tabular array is appended along the event-name axis, so both objects
# must list the same encounters in the same order. Fail loudly here rather than
# silently pairing features with the wrong encounter.
assert np.array_equal(
    temp_vectorized.get_index(ENCOUNTER_ID),
    tab_aggregated_vec.get_index(ENCOUNTER_ID),
), "Temporal and tabular encounter indexes are not aligned"

comb_vectorized = temp_vectorized.concat_over_axis(
    EVENT_NAME, tab_aggregated_vec.data, tab_aggregated_vec.get_index(EVENT_NAME)
)
comb_vectorized.shape

(1, 138187, 138, 6)

In [130]:
# Don't include any of the tabular targets - split out to avoid label leakage
comb_vectorized, _ = comb_vectorized.split_out(EVENT_NAME, use_case_params.TAB_TARGETS)

# Also split out every feature whose name contains one of the EXCLUDE
# substrings. Each name is listed at most once, even when it matches several
# substrings, so no duplicate names are passed to split_out.
label_leakage_cols = [
    name
    for name in comb_vectorized.get_index(EVENT_NAME)
    if any(pattern in name for pattern in use_case_params.EXCLUDE)
]
comb_vectorized, _ = comb_vectorized.split_out(EVENT_NAME, label_leakage_cols)
comb_vectorized.shape

(1, 138187, 110, 6)

In [131]:
# Feature names remaining along the event-name axis after the split-outs.
comb_vectorized.get_index(EVENT_NAME)

array(['Asthma', 'COPD', 'Cancer', 'Cerebrovascular disease',
       'Congestive heart failure', 'Connective tissue disease',
       'Diabetes', 'Dyspnea', 'HIV', 'Hyperlipidemia', 'Hypertension',
       'Inflammatory bowel disease', 'Ischemic heart disease',
       'Kidney disease', 'Osteoarthritis', 'Other heart disease',
       'Pulmonary embolism', 'Rheumatoid arthritis',
       'admit_via_ambulance_', 'admit_via_ambulance_air',
       'admit_via_ambulance_ground', 'admit_via_ambulance_no_ambulance',
       'admit_via_ambulance_no_info', 'age', 'albumin', 'alp', 'alt',
       'aptt', 'arterial paco2', 'arterial pao2', 'arterial ph', 'ast',
       'bicarbonate', 'bilirubin', 'blood urea nitrogen', 'calcium',
       'calcium, ionized', 'creatinine', 'crp', 'ct', 'd-dimer',
       'dialysis_mapped', 'echo', 'endoscopy_mapped', 'esr', 'ferritin',
       'fibrinogen', 'from_acute_care_institution_mapped',
       'from_nursing_home_mapped', 'glucose fasting',
       'glucose point of car

In [132]:
# Fraction of NaN entries in the vectorized tabular features.
np.isnan(tab_aggregated_vec.data).sum() / tab_aggregated_vec.data.size

0.0

In [133]:
# Raw array backing the vectorized tabular features.
tab_aggregated_vec.data

array([[[[0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         ...,
         [1., 1., 1., 1., 1., 1.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         ...,
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         ...,
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.]],

        ...,

        [[0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         ...,
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0.

In [134]:
# Fraction of NaN entries in the temporal features (target included).
np.isnan(temp_vectorized.data).sum() / temp_vectorized.data.size

0.8710733976990827

In [135]:
# Fraction of NaN entries in the combined features.
np.isnan(comb_vectorized.data).sum() / comb_vectorized.data.size

0.5384817367594329

In [137]:
# Persist the combined features (temporal + tabular, with the tabular targets
# and EXCLUDE-matched columns split out).
save_pickle(comb_vectorized, use_case_params.COMB_VECTORIZED_FILE)

2022-11-16 15:21:23,316 [1;37mINFO[0m cyclops.utils.file - Pickling data to /mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality_cm/./data/comb_vectorized.pkl


'/mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality_cm/./data/comb_vectorized.pkl'