In [None]:
import os
import sys

# Make the directory two levels up importable; the `src` package imported
# in the next cell is resolved through this entry.
src_path = os.path.abspath("../..")
print(src_path)
# Guard the append so re-running this cell does not stack duplicate entries
# onto sys.path.
if src_path not in sys.path:
    sys.path.append(src_path)

In [None]:
from src.utils import create_directory, raw_data_path, processed_data_path, set_seed

In [None]:
# Seed the run through the project's set_seed helper (defined in src.utils,
# not shown here — presumably seeds the RNGs in use; confirm in src.utils).
set_seed(seed=42)

In [None]:
import pandas as pd

In [None]:
# Root of the raw MIMIC-IV 2.2 files under raw_data_path.
mimic_iv_path = os.path.join(raw_data_path, "physionet.org/files/mimiciv/2.2")
# Directory the cohort is read from and the per-admission event files are written to.
output_path = os.path.join(processed_data_path, "mimic4")

In [None]:
# Load the cohort table from <output_path>/cohort.csv, then show its shape
# and first rows.
cohort = pd.read_csv(os.path.join(output_path, "cohort.csv"))
print(cohort.shape)
cohort.head()

In [None]:
# Parse the admission and stay boundary columns into datetimes so they can be
# subtracted from event times later on.
for time_col in ("hadm_intime", "hadm_outtime", "stay_intime", "stay_outtime"):
    cohort[time_col] = pd.to_datetime(cohort[time_col])

In [None]:
# Distinct admission ids present in the cohort; the cell displays how many there are.
unique_hadm = cohort["hadm_id"].unique()
hadm_ids = set(unique_hadm.tolist())
len(hadm_ids)

## Helper functions

In [None]:
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from pandarallel import pandarallel

In [None]:
# Start pandarallel (with progress bars) so DataFrame.parallel_apply can be
# used further down.
pandarallel.initialize(progress_bar=True)

In [None]:
def save_group(group_df, hadm_id, event_type):
    """Write one admission's events to a CSV file.

    The file is written to
    ``<output_path>/event_<event_type>/event_<hadm_id>.csv`` (``output_path``
    is the notebook-level variable), without the DataFrame index.

    Args:
        group_df: DataFrame holding the rows to write.
        hadm_id: Admission id; cast to ``int`` for the file name.
        event_type: Event type name used in the directory name.

    Returns:
        Always ``True`` once the file has been written.
    """
    target_dir = f"{output_path}/event_{event_type}"
    group_df.to_csv(f"{target_dir}/event_{int(hadm_id)}.csv", index=False)
    return True

## Dictionary tables

In [None]:
# Load the icu/d_items dictionary table; show its shape and first rows.
d_items = pd.read_csv(os.path.join(mimic_iv_path, "icu/d_items.csv.gz"))
print(d_items.shape)
d_items.head()

In [None]:
# Load the hosp/d_labitems dictionary table (joined onto the lab events by
# itemid further down); show its shape and first rows.
d_labitems = pd.read_csv(os.path.join(mimic_iv_path, "hosp/d_labitems.csv.gz"))
print(d_labitems.shape)
d_labitems.head()

## labevents

In [None]:
# Name of the source table; it selects the input file, names the output
# directory, and is stored in the event_type column of every exported row.
event_type = "labevents"

In [None]:
import shutil

# Remove any previous export for this event type. The existence check keeps a
# first run (no directory yet) quiet, and using shutil instead of a shell
# command avoids problems with spaces in the path.
event_dir = f"{output_path}/event_{event_type}"
if os.path.isdir(event_dir):
    shutil.rmtree(event_dir)

In [None]:
# Create the output directory for this event type. create_directory comes from
# src.utils; its exact behavior is not shown in this notebook.
create_directory(f"{output_path}/event_{event_type}")

In [None]:
# Read the whole hosp/<event_type>.csv.gz table into memory; show its shape
# and first rows.
df_raw = pd.read_csv(os.path.join(mimic_iv_path, f"hosp/{event_type}.csv.gz"))
print(df_raw.shape)
df_raw.head()

In [None]:
# Keep only events whose admission is in the cohort and attach the admission
# start time. The cohort also carries stay-level columns, so a hadm_id may
# appear on more than one row (assumption — confirm against cohort.csv);
# de-duplicating the join keys guarantees the inner join cannot multiply
# event rows. When hadm_id is already unique this is a no-op.
admit_times = cohort[["hadm_id", "hadm_intime"]].drop_duplicates()
df = df_raw.merge(admit_times, on=["hadm_id"], how="inner")
print(df.shape)
df.head()

In [None]:
# Parse both event time columns, then express each as hours elapsed since the
# admission start: charttime -> timestamp, storetime -> timestamp_avail.
for raw_col in ("charttime", "storetime"):
    df[raw_col] = pd.to_datetime(df[raw_col])

SECONDS_PER_HOUR = 3600
for raw_col, hours_col in (("charttime", "timestamp"), ("storetime", "timestamp_avail")):
    elapsed = df[raw_col] - df["hadm_intime"]
    df[hours_col] = elapsed.dt.total_seconds() / SECONDS_PER_HOUR
df.head()

In [None]:
# Order events by patient, admission, time since admission and specimen
# (ascending is the sort_values default).
sort_keys = ["subject_id", "hadm_id", "timestamp", "specimen_id"]
df = df.sort_values(by=sort_keys)
df.head()

In [None]:
# Attach the d_labitems columns to every event; the left join keeps events
# whose itemid has no dictionary entry.
df = pd.merge(df, d_labitems, how="left", on="itemid")
df.head()

In [None]:
import numpy as np


# Treat the "___" placeholder as a missing value. `np.nan` is used because the
# `np.NaN` alias was removed in NumPy 2.0.
df["value"] = df["value"].replace("___", np.nan)

In [None]:
# Missing-value count per column before back-filling `value` from `valuenum`.
df.isna().sum()

In [None]:
# Where the textual value is missing, fall back to the numeric value rendered
# with two decimals; rows with no numeric value stay missing.
df["value"] = df["value"].fillna(
    df["valuenum"].map(lambda num: np.nan if pd.isna(num) else f"{num:.2f}")
)

In [None]:
# Missing-value count per column after back-filling `value` from `valuenum`.
df.isna().sum()

In [None]:
# Drop events that still have no value, or no timestamp_avail (i.e. their
# storetime was missing); show the remaining shape and first rows.
df = df.dropna(subset=["value", "timestamp_avail"])
print(df.shape)
df.head()

In [None]:
def generate_event_value(x):
    """Render one event row as a single descriptive string.

    The result has the form ``"<fluid> <label> <category>: <value>"``,
    followed by ``" <valueuom>"`` when a unit is present, and ends with
    ``" (normal)"`` when ``flag`` is missing or ``" (abnormal)"`` otherwise.

    Args:
        x: Row-like object exposing ``fluid``, ``label``, ``category``,
            ``value``, ``valueuom`` and ``flag`` as attributes.

    Returns:
        The formatted description string.
    """
    pieces = [f"{x.fluid} {x.label} {x.category}: {x.value}"]
    if not pd.isna(x.valueuom):
        pieces.append(f" {x.valueuom}")
    pieces.append(" (normal)" if pd.isna(x.flag) else " (abnormal)")
    return "".join(pieces)

In [None]:
# Copy the descriptive columns under a "meta_" prefix and keep the list of
# prefixed names for the export step.
raw_meta_cols = ("fluid", "label", "category", "value", "valueuom", "flag")
meta_cols = []
for c in raw_meta_cols:
    prefixed = "meta_" + c
    df[prefixed] = df[c]
    meta_cols.append(prefixed)

In [None]:
# Spot-check the formatter on a single row (positional row 8).
generate_event_value(df.iloc[8])

In [None]:
# Tag every row with the event type and build its text description row by row
# using pandarallel's parallel_apply.
df["event_type"] = event_type
df["event_value"] = df.parallel_apply(generate_event_value, axis=1)

In [None]:
# Inspect all rows of one example admission.
df.loc[df["hadm_id"] == 29079034]

In [None]:
# Summary statistics of the number of events per admission.
events_per_hadm = df.groupby("hadm_id")["timestamp"].count()
events_per_hadm.describe()

In [None]:
# Write one CSV per admission, using a small thread pool for the file writes.
groups = df.groupby("hadm_id")
export_cols = ["hadm_id", "event_type", "timestamp", "event_value", "timestamp_avail"] + meta_cols

futures = []
with ThreadPoolExecutor(max_workers=4) as executor:
    for hadm_id, group_df in tqdm(groups, total=groups.ngroups):
        futures.append(
            executor.submit(save_group, group_df[export_cols], hadm_id, event_type)
        )

# Future.result() re-raises whatever the worker raised. Without this check a
# failed write (missing directory, full disk, ...) would be dropped silently
# and the export would look complete.
for future in futures:
    future.result()

In [None]:
# Count the entries in the output directory (the export writes one CSV per hadm_id group).
!ls -1 {output_path}/event_{event_type} | wc -l