# Imports

In [3]:
import time
import pandas as pd
from cyclops.processors.clean import normalize_names, normalize_values
from cyclops.processors.column_names import (
    DISCHARGE_TIMESTAMP,
    ENCOUNTER_ID,
    EVENT_NAME,
    EVENT_VALUE,
)
from cyclops.processors.feature.split import intersect_datasets
from cyclops.utils.file import join, save_dataframe
from drift_detection.gemini.mortality_allDx.constants import (
    CLEANED_DIR,
    ENCOUNTERS_FILE,
    OUTCOME_DEATH,
    QUERIED_DIR,
    TARGET_TIMESTAMP,
)
from drift_detection.gemini.query import main

2023-03-06 18:50:16,396 [1;37mINFO[0m cyclops.orm     - Database setup, ready to run queries!


# Query

In [None]:
# Run the query entry point and report how long it took, in seconds.
# perf_counter() is used instead of time.time() because it is monotonic and
# intended for measuring intervals; time.time() follows the system clock and
# can jump if the clock is adjusted while the query is running.
query_start = time.perf_counter()
cohort, events = main()
print(time.perf_counter() - query_start)
# Display the cohort table as the cell output.
cohort

2023-03-06 18:50:24,254 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2023-03-06 18:50:24,257 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 3.460177 s
2023-03-06 18:50:25,557 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2023-03-06 18:50:25,560 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 1.301137 s
2023-03-06 18:50:32,267 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2023-03-06 18:50:32,270 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 6.384751 s
2023-03-06 18:50:57,870 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2023-03-06 18:50:57,873 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 0.404854 s
2023-03-06 18:51:00,779 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2023-03-06 18:51:00,782 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query

In [1]:
# Share of cohort rows with the death outcome set: the sum of the
# OUTCOME_DEATH column divided by the total number of rows.
n_deaths = cohort[OUTCOME_DEATH].sum()
n_deaths / len(cohort)

NameError: name 'cohort' is not defined

In [None]:
# Display the events table returned by main() as the cell output.
events

In [None]:
# Intersect over encounter IDs to get only those encounters common to both
# the cohort and the events tables. Both names are rebound to the results, so
# the unfiltered tables are no longer reachable through `cohort` / `events`.
cohort, events = intersect_datasets([cohort, events], ENCOUNTER_ID)

In [6]:
# Write the queried events to the first batch file under QUERIED_DIR.
# The save call stays last so its return value is shown as the cell output.
queried_batch_path = join(QUERIED_DIR, "batch_0000.parquet")
save_dataframe(events, queried_batch_path)

2022-11-08 13:16:27,570 [1;37mINFO[0m cyclops.utils.file - Saving dataframe to /mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality_decompensation/./data/0-queried/batch_0000.parquet


'/mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality_decompensation/./data/0-queried/batch_0000.parquet'

# Clean / Preprocess

In [None]:
# Add a TARGET_TIMESTAMP column to the cohort: for encounters whose
# OUTCOME_DEATH flag is True it holds the discharge timestamp; every other
# encounter gets a missing value from the left merge.
#
# Drop any TARGET_TIMESTAMP left over from a previous run of this cell first.
# Without this, re-running the cell merges a second TARGET_TIMESTAMP column
# into the cohort and pandas renames the pair with `_x` / `_y` suffixes.
cohort = cohort.drop(columns=[TARGET_TIMESTAMP], errors="ignore")

# Rows with the death outcome, reduced to the merge key and the timestamp.
death_events = cohort[cohort[OUTCOME_DEATH] == True]  # noqa: E712
death_events = death_events[[ENCOUNTER_ID, DISCHARGE_TIMESTAMP]].rename(
    columns={DISCHARGE_TIMESTAMP: TARGET_TIMESTAMP}
)

# Left merge keeps every cohort row, including those without a death event.
cohort = pd.merge(cohort, death_events, on=ENCOUNTER_ID, how="left")

In [8]:
# Save the cohort (including the TARGET_TIMESTAMP column merged in above) to
# ENCOUNTERS_FILE; the call's return value is shown as the cell output.
save_dataframe(cohort, ENCOUNTERS_FILE)

2022-11-08 13:21:01,087 [1;37mINFO[0m cyclops.utils.file - Saving dataframe to /mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality_decompensation/./data/encounters.parquet


'/mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality_decompensation/./data/encounters.parquet'

In [9]:
# Clean up the event name and event value columns with the cyclops helpers.
events[EVENT_NAME] = normalize_names(events[EVENT_NAME])
events[EVENT_VALUE] = normalize_values(events[EVENT_VALUE])

# Coerce values to numbers; anything that cannot be parsed becomes NaN.
events[EVENT_VALUE] = pd.to_numeric(events[EVENT_VALUE], errors="coerce")

# Keep only the rows that have a value, reporting the row count on each side
# of the filter.
print("Length before:", len(events))
events = events[events[EVENT_VALUE].notna()]
print("Length after:", len(events))

# Display the cleaned events table as the cell output.
events

Length before: 17502603
Length after: 17029291


Unnamed: 0,encounter_id,event_name,event_value,event_value_unit,event_timestamp,event_category
17371788,11100040,unmapped_intervention,1.0,,2018-07-04 00:00:00,interventions
15174588,11100040,glucose point of care,10.9,mmol/L,2018-07-11 21:08:00,labs
15174587,11100040,glucose point of care,10.9,mmol/L,2018-07-04 21:18:00,labs
15174586,11100040,glucose point of care,10.8,mmol/L,2018-07-06 12:25:00,labs
15174585,11100040,glucose point of care,10.7,mmol/L,2018-07-18 17:15:00,labs
...,...,...,...,...,...,...
66069,15999969,albumin,28.0,g/L,2015-02-20 03:50:00,labs
66068,15999969,aptt,26.5,SEC,2015-02-17 22:00:00,labs
66067,15999969,aptt,24.8,SEC,2015-02-19 05:50:00,labs
66076,15999969,arterial paco2,31.0,MMHG,2015-02-19 05:50:00,labs


In [10]:
# Write the cleaned events to the first batch file under CLEANED_DIR.
# The save call stays last so its return value is shown as the cell output.
cleaned_batch_path = join(CLEANED_DIR, "batch_0000.parquet")
save_dataframe(events, cleaned_batch_path)

2022-11-08 13:26:45,574 [1;37mINFO[0m cyclops.utils.file - Saving dataframe to /mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality_decompensation/./data/1-cleaned/batch_0000.parquet


'/mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality_decompensation/./data/1-cleaned/batch_0000.parquet'