In [2]:
import datetime
import json
import os

import pandas as pd

# Load Demographics and Dataset_Part-Patient_id bindings


In [3]:
# Shared configuration: folder holding the raw observation tables and the
# variable ids that the later cells treat as lactate measurements.
raw_obs_path = "./datasets/raw_stage/observation_tables"
lactate_var_ids = (24000524, 24000732, 24000485)

# Demographics table, with admissiontime parsed into a datetime column.
df_demo = pd.read_csv("./datasets/reference_data/general_table.csv")
df_demo = df_demo.assign(admissiontime=pd.to_datetime(df_demo["admissiontime"]))
# NOTE(review): datetime_is_numeric appears to be accepted only by pandas 1.x;
# confirm the pinned pandas version before upgrading.
display(df_demo.describe(include="all", datetime_is_numeric=True))

# Patient id <-> dataset part bindings, ordered by part.
df_pid_part = pd.read_csv(
    f"{raw_obs_path}/observation_tables_index.csv"
).sort_values(by=["part"])

Unnamed: 0,patientid,admissiontime,sex,age,discharge_status
count,33905.0,33905,33905,33905.0,33666
unique,,,2,,2
top,,,M,,alive
freq,,,21767,,31604
mean,16953.0,2151-03-13 08:23:47.234832384,,63.523964,
min,1.0,2102-11-27 06:45:00,,20.0,
25%,8477.0,2129-02-14 12:20:00,,55.0,
50%,16953.0,2151-05-23 19:30:00,,65.0,
75%,25429.0,2173-04-07 18:30:00,,75.0,
max,33905.0,2198-09-03 16:35:00,,90.0,


# Filter Age >= 18 Years Old

This filter has no effect: all patients in HiRID are at least 18 years old (the minimum age in the table above is 20).


In [3]:
# Keep only patients aged 18 or more and renumber the rows from 0.
df_demo = df_demo.loc[df_demo["age"] >= 18].reset_index(drop=True)

# Filter Stay >= 1 Day


In [5]:
# Keep the patients whose last recorded observation is at least 1 day after
# their admission time, and persist their ids (one per line).
filtered_pids = list()
part_ids = df_pid_part["part"].unique()
n_parts = len(part_ids)  # only needed for the progress print
min_stay = datetime.timedelta(days=1)
# Admission time per patient; the first row wins if a patient id is repeated.
admission_by_pid = df_demo.drop_duplicates(subset="patientid").set_index(
    "patientid"
)["admissiontime"]

for x in part_ids:
    if (x + 1) % 25 == 0:
        print((x + 1) / n_parts)
    df_part_X = pd.read_parquet(f"{raw_obs_path}/parquet/part-{x}.parquet")
    # Sort the part once and take the last row of every patient instead of
    # re-scanning and re-sorting the whole part for each patient.
    last_datetime_by_pid = (
        df_part_X[["patientid", "datetime", "entertime"]]
        .sort_values(by=["datetime", "entertime"])
        .groupby("patientid", sort=False)
        .tail(1)
        .set_index("patientid")["datetime"]
    )
    for pid in df_pid_part[df_pid_part["part"] == x]["patientid"]:
        # A patient without observations in this part, or without a row in
        # df_demo (e.g. removed by an earlier filter), cannot satisfy the
        # criterion: skip it instead of failing on an empty selection.
        if (
            pid not in last_datetime_by_pid.index
            or pid not in admission_by_pid.index
        ):
            continue
        # last datetime - admissiontime
        if last_datetime_by_pid.at[pid] - admission_by_pid.at[pid] >= min_stay:
            filtered_pids.append(pid)

with open("./results/patients_18+_1d+.txt", "w") as f:
    f.writelines(f"{pid}\n" for pid in filtered_pids)
print(len(filtered_pids))


0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
1.0
16787


# Filter Lactate_Measurements >= 2

This step is redundant: the next filter requires at least 2 lactate measurements within the first 2 days of stay (CHECK).

If a patient has 2+ lactate measurements in the first 2 days, they necessarily have 2+ lactate measurements overall.


In [6]:
# Among the patients kept by the previous filter, keep those with at least
# 2 lactate measurements over the whole stay, and persist their ids.
filtered_pids = list()
with open("./results/patients_18+_1d+.txt", "r") as f:
    # ignore blank lines (e.g. a trailing newline added by hand)
    previously_filtered = [int(line) for line in f if line.strip()]

part_ids = df_pid_part["part"].unique()
n_parts = len(part_ids)  # only needed for the progress print
for x in part_ids:
    if (x + 1) % 25 == 0:
        print((x + 1) / n_parts)
    df_part_X = pd.read_parquet(f"{raw_obs_path}/parquet/part-{x}.parquet")
    # Count the lactate rows of every patient in one pass over the part
    # instead of masking and copying the whole part once per patient.
    lactate_count_by_pid = df_part_X.loc[
        df_part_X["variableid"].isin(lactate_var_ids), "patientid"
    ].value_counts()
    for pid in df_pid_part[
        (df_pid_part["part"] == x)
        & (df_pid_part["patientid"].isin(previously_filtered))
    ]["patientid"]:
        # patients without any lactate row are absent from the counts
        if lactate_count_by_pid.get(pid, 0) >= 2:
            filtered_pids.append(pid)

with open("./results/patients_18+_1d+_2+lactate_measurements.txt", "w") as f:
    f.writelines(f"{pid}\n" for pid in filtered_pids)
print(len(filtered_pids))


0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
1.0
15209


# Filter Lactate_Measurements >= 2 in First 2d of Stay


In [7]:
# Among the patients kept by the previous filter, keep those with at least
# 2 lactate measurements recorded no later than 2 days after admission,
# and persist their ids.
filtered_pids = list()
with open("./results/patients_18+_1d+_2+lactate_measurements.txt", "r") as f:
    # ignore blank lines (e.g. a trailing newline added by hand)
    previously_filtered = [int(line) for line in f if line.strip()]

# admissiontime + 2 days per patient; the first row wins if an id is repeated
cutoff_by_pid = df_demo.drop_duplicates(subset="patientid").set_index("patientid")[
    "admissiontime"
] + datetime.timedelta(days=2)

part_ids = df_pid_part["part"].unique()
n_parts = len(part_ids)  # only needed for the progress print
for x in part_ids:
    if (x + 1) % 25 == 0:
        print((x + 1) / n_parts)
    df_part_X = pd.read_parquet(f"{raw_obs_path}/parquet/part-{x}.parquet")
    lactate_rows = df_part_X.loc[
        df_part_X["variableid"].isin(lactate_var_ids), ["patientid", "datetime"]
    ]
    # keep only first 2d; a patient missing from df_demo gets a NaT cutoff,
    # which compares False, so it is excluded instead of raising
    in_first_2d = lactate_rows["datetime"] <= lactate_rows["patientid"].map(
        cutoff_by_pid
    )
    lactate_count_by_pid = lactate_rows.loc[in_first_2d, "patientid"].value_counts()
    for pid in df_pid_part[
        (df_pid_part["part"] == x)
        & (df_pid_part["patientid"].isin(previously_filtered))
    ]["patientid"]:
        if lactate_count_by_pid.get(pid, 0) >= 2:
            filtered_pids.append(pid)

with open("./results/patients_18+_1d+_2+lactate_measurements_in_first2d.txt", "w") as f:
    f.writelines(f"{pid}\n" for pid in filtered_pids)
print(len(filtered_pids))


0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
1.0
15046


# Extract 5 Patients and Initial Sample Dataset


In [31]:
# Read every filtered patient id back from disk and keep the first five.
with open("./results/patients_18+_1d+_2+lactate_measurements_in_first2d.txt", "r") as f:
    patientids = [int(line) for line in f.readlines()][:5]
patientids


[17786, 16503, 22946, 9465, 6586]

In [32]:
# Collect the raw observations of the sampled patients, part by part.
df_pid_part_sample = df_pid_part[df_pid_part["patientid"].isin(patientids)].copy()
patients_sample = list()
for x in df_pid_part_sample["part"].unique():
    df_part_X = pd.read_parquet(f"{raw_obs_path}/parquet/part-{x}.parquet")
    for pid in df_pid_part_sample[df_pid_part_sample["part"] == x]["patientid"]:
        patients_sample.append(df_part_X[df_part_X["patientid"] == pid].copy())

# One concat instead of growing the frame with DataFrame.append in a loop
# (quadratic copying, and append is no longer available in recent pandas).
df_sample = pd.concat(patients_sample)
# attach the demographics of each patient
df_sample = df_sample.merge(df_demo, on="patientid")

df_variables = pd.read_csv("./datasets/reference_data/hirid_variable_reference.csv")
# extract only observation variables
df_variables = df_variables[df_variables["Source Table"] == "Observation"]

# attach variable name/unit, drop the join helper columns and order the rows
df_sample = (
    df_sample.merge(df_variables, left_on="variableid", right_on="ID")
    .drop(columns=["Source Table", "ID"])
    .sort_values(by=["patientid", "admissiontime", "datetime", "entertime"])
    .reset_index(drop=True)
)
df_sample.to_csv("./results/sample.csv")
display(df_sample)


Unnamed: 0,datetime,entertime,patientid,status,stringvalue,type,value,variableid,admissiontime,sex,age,discharge_status,Variable Name,Unit,Additional information
0,2133-07-29 23:05:00.000,2133-07-29 23:28:41.100,6586,8,,,160.000000,10000450,2133-07-29 23:05:00,F,80,alive,Body height measure,cm,
1,2133-07-29 23:05:00.000,2133-07-29 23:28:41.100,6586,8,,,60.000000,10000400,2133-07-29 23:05:00,F,80,alive,Body weight,kg,
2,2133-07-29 23:10:38.810,2133-07-29 23:10:37.066,6586,4,,,57.000000,200,2133-07-29 23:05:00,F,80,alive,Heart rate,/min,
3,2133-07-29 23:10:38.810,2133-07-29 23:10:37.096,6586,4,,,0.300000,210,2133-07-29 23:05:00,F,80,alive,ST elevation,mm,
4,2133-07-29 23:10:38.810,2133-07-29 23:10:37.130,6586,4,,,0.000000,211,2133-07-29 23:05:00,F,80,alive,ST elevation,mm,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
327394,2162-08-05 13:26:54.110,2162-08-05 13:27:23.540,22946,8,,,0.501662,30010009,2162-08-03 21:20:00,M,30,alive,End tidal carbon dioxide concentration,mmHg,
327395,2162-08-05 13:29:07.040,2162-08-05 13:31:17.200,22946,8,,,0.501662,30010009,2162-08-03 21:20:00,M,30,alive,End tidal carbon dioxide concentration,mmHg,
327396,2162-08-05 13:31:20.000,2162-08-05 13:36:16.680,22946,8,,,0.573328,30010009,2162-08-03 21:20:00,M,30,alive,End tidal carbon dioxide concentration,mmHg,
327397,2162-08-05 13:32:26.460,2162-08-05 13:33:22.820,22946,8,,,0.573328,30010009,2162-08-03 21:20:00,M,30,alive,End tidal carbon dioxide concentration,mmHg,


In [37]:
# Load the observations of every sampled patient, sorted chronologically.
df_pid_part_sample = df_pid_part[df_pid_part["patientid"].isin(patientids)].copy()
patients_sample = dict()
for x in df_pid_part_sample["part"].unique():
    df_part_X = pd.read_parquet(f"{raw_obs_path}/parquet/part-{x}.parquet")
    for pid in df_pid_part_sample[df_pid_part_sample["part"] == x]["patientid"]:
        patients_sample[pid] = df_part_X[df_part_X["patientid"] == pid].copy()
        patients_sample[pid].sort_values(by=["datetime", "entertime"], inplace=True)

# Source-name -> output-column bindings, with a "static" and a "dynamic" section.
with open("./var_id_var_name_bindings.json", "r") as f:
    var_bindings = json.load(f)

# --- static (one row per patient) files ---
static_columns = list(var_bindings["static"].values())
for demo_row in df_demo[df_demo["patientid"].isin(patientids)].itertuples():
    os.makedirs(f"./results/{demo_row.patientid}", exist_ok=True)
    df_static_sample = pd.DataFrame(columns=static_columns)
    # attributes missing from the demographics row are stored as None
    df_static_sample.loc[0] = [
        getattr(demo_row, x, None) for x in var_bindings["static"].keys()
    ]
    # compute length of stay: last observation - admission, in days (2 decimals)
    df_static_sample.at[0, "Length of stay (days)"] = round(
        (
            patients_sample[demo_row.patientid].iloc[-1]["datetime"]
            - df_static_sample.iloc[0]["Admission Time"]
        ).total_seconds()
        / 60
        / 60
        / 24,
        2,
    )
    df_static_sample.to_csv(
        f"./results/{demo_row.patientid}/{demo_row.patientid}_static.csv"
    )

# --- dynamic (one row per observation time) files ---
dynamic_bindings = var_bindings["dynamic"]
# avoid duplicated columns (several variable ids can map to the same column)
dynamic_columns = list(dict.fromkeys(dynamic_bindings.values()).keys())
for pid, df_pid_data in patients_sample.items():
    # convert to str as the json bindings file has all strings
    df_pid_data["variableid"] = df_pid_data["variableid"].astype(str)
    # filter only variables that have bindings with MIMIC
    df_pid_data = df_pid_data[df_pid_data["variableid"].isin(dynamic_bindings.keys())]
    pid_admission_time = df_demo[df_demo["patientid"] == pid]["admissiontime"].iloc[0]
    # minutes since admission -> {output column: value}
    tmp_dict = dict()
    for obs_row in df_pid_data.itertuples():
        row_key = round(
            (obs_row.datetime - pid_admission_time).total_seconds() / 60, 2
        )
        if row_key not in tmp_dict:
            tmp_dict[row_key] = dict.fromkeys(dynamic_columns)
            tmp_dict[row_key][dynamic_bindings["datetime"]] = obs_row.datetime
            # entertime could be different for different values of the same record, maybe we should remove it
            # i.e. some values have the same datetime (observation) but different entertime (insertion into db)
            tmp_dict[row_key][dynamic_bindings["entertime"]] = obs_row.entertime

        column = dynamic_bindings[obs_row.variableid]
        # lactate variables priority (use arterial, if not available use venous):
        # a non-arterial lactate never overwrites a lactate value already stored
        # for this row, since that one is probably the arterial measurement
        if (
            column == "Lactate"
            and obs_row.variableid != "24000524"
            and tmp_dict[row_key][column] is not None
        ):
            continue

        tmp_dict[row_key][column] = obs_row.value
    df_dynamic_sample = pd.DataFrame(
        data=list(tmp_dict.values()),
        columns=dynamic_columns,
    )
    df_dynamic_sample["Patient ID"] = pid
    df_dynamic_sample.set_index(
        pd.Index(tmp_dict.keys(), name="Minutes of Stay"),
        drop=True,
        inplace=True,
    )
    # Build the path from pid, not from the inner loop variable: when a patient
    # has no bound variables the inner loop never runs and its variable would
    # still refer to a previous patient.
    os.makedirs(f"./results/{pid}", exist_ok=True)
    df_dynamic_sample.to_csv(f"./results/{pid}/{pid}_dynamic.csv")
