In [None]:
import os
import sys

# Make the repository root (two levels above this notebook) importable so that
# `src.*` modules can be imported by the cells below.
src_path = os.path.abspath('../..')
print(src_path)
# Guard the append so that re-running this cell does not keep adding duplicate
# entries to sys.path.
if src_path not in sys.path:
    sys.path.append(src_path)

In [None]:
from src.utils import create_directory, dump_pickle, raw_data_path, processed_data_path, set_seed

In [None]:
# Fix the random seed through the project helper imported from src.utils.
# NOTE(review): which RNGs set_seed covers is not visible here - confirm in src/utils.
set_seed(seed=42)

In [None]:
import pandas as pd

In [None]:
# Locations of the two raw datasets (structured tables and clinical notes),
# both resolved relative to the project's raw data directory.
mimic_iv_path, mimic_iv_note_path = (
    os.path.join(raw_data_path, relative_dir)
    for relative_dir in (
        "physionet.org/files/mimiciv/2.2",
        "physionet.org/files/mimic-iv-note/2.2",
    )
)

In [None]:
# Sanity check: list the contents of the MIMIC-IV directory through the shell
# (`{...}` interpolates the Python variable into the command).
!ls {mimic_iv_path}

In [None]:
# Sanity check: list the contents of the MIMIC-IV-Note directory through the shell
# (`{...}` interpolates the Python variable into the command).
!ls {mimic_iv_note_path}

In [None]:
# Load the patients table, then show its dimensions and first rows.
patients_csv = os.path.join(mimic_iv_path, "hosp/patients.csv.gz")
patients = pd.read_csv(patients_csv)
print(patients.shape)
patients.head()

In [None]:
# Load the hospital admissions table, then show its dimensions and first rows.
admissions_csv = os.path.join(mimic_iv_path, "hosp/admissions.csv.gz")
admissions = pd.read_csv(admissions_csv)
print(admissions.shape)
admissions.head()

In [None]:
# Load the ICU stays table, then show its dimensions and first rows.
icustays_csv = os.path.join(mimic_iv_path, "icu/icustays.csv.gz")
icustays = pd.read_csv(icustays_csv)
print(icustays.shape)
icustays.head()

In [None]:
# Load the discharge notes table, then show its dimensions and first rows.
discharge_csv = os.path.join(mimic_iv_note_path, "note/discharge.csv.gz")
discharge = pd.read_csv(discharge_csv)
print(discharge.shape)
discharge.head()

In [None]:
# Number of distinct patients in each table, printed in the order
# patients, admissions, icustays, discharge.
for table in (patients, admissions, icustays, discharge):
    print(table["subject_id"].nunique())

In [None]:
# Number of distinct hospital admissions in each table, printed in the order
# admissions, icustays, discharge.
for table in (admissions, icustays, discharge):
    print(table["hadm_id"].nunique())

In [None]:
# Number of distinct ICU stays.
print(icustays["stay_id"].nunique())

In [None]:
# Collect the admission ids present in each table as Python sets; they are used
# below for the overlap plot and for the cohort membership filters.
admissions_hadm_ids, icustays_hadm_ids, discharge_hadm_ids = (
    set(table["hadm_id"].tolist())
    for table in (admissions, icustays, discharge)
)

In [None]:
from matplotlib_venn import venn3
import matplotlib.pyplot as plt


# Three-way Venn diagram of the admission ids found in the hospital, ICU and
# discharge-note tables.
plt.figure(figsize=(8, 8))
hadm_id_sets = [admissions_hadm_ids, icustays_hadm_ids, discharge_hadm_ids]
venn3(subsets=hadm_id_sets, set_labels=('Hospital', 'ICU', 'Discharge'))
plt.show()

starting cohort

In [None]:
# Starting cohort size: distinct patients, admissions and ICU stays.
for id_column in ("subject_id", "hadm_id", "stay_id"):
    print(icustays[id_column].nunique())

remove ICU stays without a matching hospital admission

In [None]:
# Keep only ICU stays whose hadm_id appears in the admissions table.
has_admission = icustays["hadm_id"].isin(admissions_hadm_ids)
icustays = icustays[has_admission]

In [None]:
# Cohort size after the admission filter: distinct patients, admissions and ICU stays.
for id_column in ("subject_id", "hadm_id", "stay_id"):
    print(icustays[id_column].nunique())

remove ICU stays whose admission has no discharge note

In [None]:
# Keep only ICU stays whose hadm_id appears in the discharge notes table.
has_discharge_note = icustays["hadm_id"].isin(discharge_hadm_ids)
icustays = icustays[has_discharge_note]

In [None]:
# Cohort size after the discharge-note filter: distinct patients, admissions and ICU stays.
for id_column in ("subject_id", "hadm_id", "stay_id"):
    print(icustays[id_column].nunique())

remove admissions with more than one ICU stay (keep only admissions with exactly one ICU stay)

In [None]:
# Count the distinct ICU stays of every admission, then keep only the
# admissions that have exactly one ICU stay.
stay_counts = (
    icustays.groupby("hadm_id")["stay_id"]
    .nunique()
    .rename("n_stays")
    .reset_index()
)
to_keep = stay_counts.loc[stay_counts["n_stays"] == 1, ["hadm_id"]]
# The inner join drops every ICU stay whose admission is not in `to_keep`.
icustays = pd.merge(icustays, to_keep, how="inner", on="hadm_id")

In [None]:
# Cohort size after the single-ICU-stay filter: distinct patients, admissions and ICU stays.
for id_column in ("subject_id", "hadm_id", "stay_id"):
    print(icustays[id_column].nunique())

remove patients younger than 18 years at ICU admission

In [None]:
# Minimum age (in years) for an ICU stay to remain in the cohort.
MIN_ADULT_AGE = 18

# Use bracket assignment: attribute-style assignment (`df.col = ...`) only
# updates a column when it already exists and otherwise silently sets a plain
# Python attribute instead of a column.
icustays["intime"] = pd.to_datetime(icustays["intime"])
icustays = icustays.merge(
    patients[["subject_id", "anchor_age", "anchor_year"]],
    on="subject_id",
    how="inner",
)
# Age at ICU admission: anchor_age plus the number of calendar years between
# anchor_year and the year of the ICU intime.
icustays["age"] = (
    icustays["intime"].dt.year - icustays["anchor_year"] + icustays["anchor_age"]
)

# Apply the exclusion explicitly. Previously the age was only computed and its
# minimum inspected, so minors would never have been removed had any existed.
n_stays_before = len(icustays)
icustays = icustays[icustays["age"] >= MIN_ADULT_AGE]
print(f"removed {n_stays_before - len(icustays)} ICU stays with age < {MIN_ADULT_AGE}")
icustays.head()

In [None]:
# Youngest age at ICU admission present in the cohort.
icustays["age"].min()

remove stays with a negative length of stay (LOS)

In [None]:
# Attach the hospital admission and discharge timestamps to every ICU stay.
admission_times = admissions[["hadm_id", "admittime", "dischtime"]]
icustays = icustays.merge(admission_times, on="hadm_id", how="inner")

# Prefix the timestamp columns so that hospital-level (hadm_*) and ICU-level
# (stay_*) times are clearly distinguished.
time_column_names = {
    "admittime": "hadm_intime",
    "dischtime": "hadm_outtime",
    "intime": "stay_intime",
    "outtime": "stay_outtime",
}
icustays = icustays.rename(columns=time_column_names)
icustays.head()

In [None]:
# Parse all four timestamp columns into datetimes.
for time_column in ('hadm_intime', 'hadm_outtime', 'stay_intime', 'stay_outtime'):
    icustays[time_column] = pd.to_datetime(icustays[time_column])

In [None]:
# Length of stay in hours for the ICU stay and for the whole hospital admission.
SECONDS_PER_HOUR = 3600
for los_column, start_column, end_column in (
    ('stay_los', 'stay_intime', 'stay_outtime'),
    ('hadm_los', 'hadm_intime', 'hadm_outtime'),
):
    elapsed = icustays[end_column] - icustays[start_column]
    icustays[los_column] = elapsed.dt.total_seconds() / SECONDS_PER_HOUR

In [None]:
# Shortest ICU length of stay (hours); a negative value indicates inconsistent timestamps.
icustays["stay_los"].min()

In [None]:
# Shortest hospital length of stay (hours); a negative value indicates inconsistent timestamps.
icustays["hadm_los"].min()

In [None]:
# Keep only rows where both the ICU and the hospital length of stay are non-negative.
non_negative_los = (icustays["stay_los"] >= 0) & (icustays["hadm_los"] >= 0)
icustays = icustays[non_negative_los]

In [None]:
# Cohort size after the length-of-stay filter: distinct patients, admissions and ICU stays.
for id_column in ("subject_id", "hadm_id", "stay_id"):
    print(icustays[id_column].nunique())

statistics

In [None]:
# How many patients have 1, 2, 3, ... admissions in the cohort.
admissions_per_patient = icustays.groupby("subject_id")["hadm_id"].nunique()
admissions_per_patient.value_counts()

In [None]:
# Summary statistics of the number of cohort admissions per patient.
admissions_per_patient = icustays.groupby("subject_id")["hadm_id"].nunique()
admissions_per_patient.describe()

In [None]:
# How many admissions have 1, 2, 3, ... ICU stays in the cohort.
stays_per_admission = icustays.groupby("hadm_id")["stay_id"].nunique()
stays_per_admission.value_counts()

In [None]:
# Summary statistics of the number of ICU stays per admission.
stays_per_admission = icustays.groupby("hadm_id")["stay_id"].nunique()
stays_per_admission.describe()

In [None]:
# Summary statistics of the ICU length of stay (hours).
icustays["stay_los"].describe()

In [None]:
# Summary statistics of the hospital length of stay (hours).
icustays["hadm_los"].describe()

save

In [None]:
# Preview the cohort table with all of its working columns before trimming it for export.
icustays.head(n=5)

In [None]:
# Columns written to the cohort file: identifiers first, then the hospital-level
# and ICU-level timestamps with their lengths of stay.
cohort_columns = [
    "subject_id",
    "hadm_id",
    "stay_id",
    "hadm_intime",
    "hadm_outtime",
    "hadm_los",
    "stay_intime",
    "stay_outtime",
    "stay_los",
]
icustays = icustays[cohort_columns]
icustays.head()

In [None]:
# Destination directory for the processed cohort, created through the project
# helper imported from src.utils.
cohort_dir_name = "mimic4"
output_path = os.path.join(processed_data_path, cohort_dir_name)
create_directory(output_path)

In [None]:
# Write the final cohort as CSV without the DataFrame index.
cohort_csv_path = os.path.join(output_path, 'cohort.csv')
icustays.to_csv(cohort_csv_path, index=False)