In [1]:
import datetime
import json
import os

import pandas as pd


# Load Demographics and Dataset_Part-Patient_id bindings


In [2]:
# Dataset locations and the variable ids that identify lactate measurements.
base_datasets_path = "D:/Data/Thesis_datasets"
raw_obs_path = base_datasets_path + "/raw_stage/observation_tables"
lactate_var_ids = ("24000524", "24000732", "24000485")

# Patient demographics table; admission time is parsed as a timestamp.
df_demo = pd.read_csv(
    base_datasets_path + "/reference_data/general_table.csv",
    dtype=dict(patientid=int, sex=str, age=int, discharge_status=str),
    parse_dates=["admissiontime"],
)
# NOTE(review): `datetime_is_numeric` is only accepted by some pandas releases
# (1.1-1.5, removed in 2.0) -- confirm against the pandas version in use.
display(df_demo.describe(include="all", datetime_is_numeric=True))

# Index binding each patient id to the observation-table part that stores it,
# ordered by part so that parts are later visited in ascending order.
df_pid_part = pd.read_csv(
    raw_obs_path + "/observation_tables_index.csv",
    dtype=dict(patientid=int, part=int),
).sort_values(by=["part"])


Unnamed: 0,patientid,admissiontime,sex,age,discharge_status
count,33905.0,33905,33905,33905.0,33666
unique,,,2,,2
top,,,M,,alive
freq,,,21767,,31604
mean,16953.0,2151-03-13 08:23:47.234832384,,63.523964,
min,1.0,2102-11-27 06:45:00,,20.0,
25%,8477.0,2129-02-14 12:20:00,,55.0,
50%,16953.0,2151-05-23 19:30:00,,65.0,
75%,25429.0,2173-04-07 18:30:00,,75.0,
max,33905.0,2198-09-03 16:35:00,,90.0,


# Filter Age >= 18 Years Old

This filter has no effect: every patient in HiRID is at least 18 years old (the minimum age in the summary above is 20).


In [42]:
# Keep adult patients only and renumber the rows from 0.
is_adult = df_demo["age"].ge(18)
df_demo = df_demo.loc[is_adult].reset_index(drop=True)


# Filter Stay >= 1 Day


In [10]:
# Keep the patients whose last recorded observation is at least one day after
# their admission time, then persist their ids for the next filtering stage.
filtered_pids = list()
part_ids = df_pid_part["part"].unique()  # loop-invariant, computed once
min_stay = datetime.timedelta(days=1)

for x in part_ids:
    # progress report every 25 parts (progress ratio, patients kept so far)
    if (x + 1) % 25 == 0:
        print((x + 1) / len(part_ids), len(filtered_pids))
    df_part_X = pd.read_csv(
        f"{raw_obs_path}/csv/part-{x}.csv",
        dtype={
            "patientid": int,
            "status": int,
            "stringvalue": str,
            "type": str,
            "value": str,
            "variableid": str,
        },
        parse_dates=["datetime", "entertime"],
    )
    for pid in df_pid_part[df_pid_part["part"] == x]["patientid"]:
        pid_datetimes = df_part_X.loc[df_part_X["patientid"] == pid, "datetime"]
        pid_demo = df_demo[df_demo["patientid"] == pid]
        # A patient listed in the index but without observations in this part,
        # or without a demographics row, cannot be evaluated: skip it instead
        # of raising IndexError on an empty selection.
        if pid_datetimes.empty or pid_demo.empty:
            continue
        # last datetime - admissiontime
        # max() yields the same timestamp as sorting and reading the last row,
        # without copying/sorting the slice (missing timestamps are skipped).
        if pid_datetimes.max() - pid_demo.iloc[0]["admissiontime"] >= min_stay:
            filtered_pids.append(pid)

# The results folder is not guaranteed to exist yet when this cell runs.
os.makedirs("./results", exist_ok=True)
with open("./results/patients_18+_1d+.txt", "w") as f:
    f.writelines(f"{pid}\n" for pid in filtered_pids)
print(len(filtered_pids))


0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
1.0
16787


# Filter Lactate_Measurements >= 2

This filter should be redundant, because the next filter requires at least 2 lactate measurements within the first 2 days of stay (TODO: check).

If a patient has 2 or more lactate measurements in the first 2 days, they necessarily have 2 or more lactate measurements overall.


In [35]:
# Ids that passed the previous stage (read back from its result file).
with open("./results/patients_18+_1d+.txt", "r") as f:
    previously_filtered = [int(line) for line in f]
filtered_pids = []

part_ids = df_pid_part["part"].unique()
for x in part_ids:
    # progress report every 25 parts
    if (x + 1) % 25 == 0:
        print((x + 1) / len(part_ids), len(filtered_pids))
    df_part_X = pd.read_csv(
        f"{raw_obs_path}/csv/part-{x}.csv",
        dtype={
            "patientid": int,
            "status": int,
            "stringvalue": str,
            "type": str,
            "value": str,
            "variableid": str,
        },
        parse_dates=["datetime", "entertime"],
    )
    # Rows of this part holding any of the lactate variables.
    is_lactate = df_part_X["variableid"].isin(lactate_var_ids)
    # Patients stored in this part that are still in the cohort.
    candidates = df_pid_part.loc[
        (df_pid_part["part"] == x)
        & (df_pid_part["patientid"].isin(previously_filtered)),
        "patientid",
    ]
    for pid in candidates:
        n_lactate = int((is_lactate & (df_part_X["patientid"] == pid)).sum())
        if n_lactate >= 2:
            filtered_pids.append(pid)

with open("./results/patients_18+_1d+_2+lactate_measurements.txt", "w") as f:
    f.writelines(f"{pid}\n" for pid in filtered_pids)
print(len(filtered_pids))


0.1 1432
0.2 2963
0.3 4522
0.4 5981
0.5 7570
0.6 8996
0.7 10496
0.8 12050
0.9 13593
1.0 15143
15209


# Filter Lactate_Measurements >= 2 in First 2d of Stay


In [36]:
# Ids that passed the previous stage (read back from its result file).
with open("./results/patients_18+_1d+_2+lactate_measurements.txt", "r") as f:
    previously_filtered = [int(line) for line in f]
filtered_pids = []

part_ids = df_pid_part["part"].unique()
for x in part_ids:
    # progress report every 25 parts
    if (x + 1) % 25 == 0:
        print((x + 1) / len(part_ids), len(filtered_pids))
    df_part_X = pd.read_csv(
        f"{raw_obs_path}/csv/part-{x}.csv",
        dtype={
            "patientid": int,
            "status": int,
            "stringvalue": str,
            "type": str,
            "value": str,
            "variableid": str,
        },
        parse_dates=["datetime", "entertime"],
    )
    # Rows of this part holding any of the lactate variables.
    is_lactate = df_part_X["variableid"].isin(lactate_var_ids)
    # Patients stored in this part that are still in the cohort.
    candidates = df_pid_part.loc[
        (df_pid_part["part"] == x)
        & (df_pid_part["patientid"].isin(previously_filtered)),
        "patientid",
    ]
    for pid in candidates:
        admission_time = df_demo.loc[
            df_demo["patientid"] == pid, "admissiontime"
        ].iloc[0]
        post_2d = admission_time + datetime.timedelta(days=2)
        # keep only first 2d: observations timestamped up to admission + 2 days
        in_window = (df_part_X["patientid"] == pid) & (
            df_part_X["datetime"] <= post_2d
        )
        n_lactate = int((in_window & is_lactate).sum())
        if n_lactate >= 2:
            filtered_pids.append(pid)

with open("./results/patients_18+_1d+_2+lactate_measurements_in_first2d.txt", "w") as f:
    f.writelines(f"{pid}\n" for pid in filtered_pids)
print(len(filtered_pids))


0.1 1417
0.2 2929
0.3 4473
0.4 5915
0.5 7488
0.6 8899
0.7 10379
0.8 11918
0.9 13447
1.0 14980
15046


# Extract 5 Patients


In [4]:
# Read the final cohort and keep its first five patient ids as the sample.
with open("./results/patients_18+_1d+_2+lactate_measurements_in_first2d.txt", "r") as f:
    patientids = [int(line) for line in f][:5]
# create a folder for each patient
for pid in patientids:
    os.makedirs(f"./results/{pid}", exist_ok=True)
patientids


[17786, 16503, 22946, 9465, 6586]

# Split HiRID into per-patient datasets


In [5]:
# Load the variable-id -> variable-name bindings ("dynamic" section used here).
with open("./var_id_var_name_bindings.json", "r") as f:
    var_bindings = json.load(f)

# Only the parts that contain at least one of the sampled patients are read.
df_pid_part_sample = df_pid_part[df_pid_part["patientid"].isin(patientids)].copy()
for x in df_pid_part_sample["part"].unique():
    df_part_X = pd.read_csv(
        f"{raw_obs_path}/csv/part-{x}.csv",
        dtype={
            "patientid": int,
            "status": int,
            "stringvalue": str,
            "type": str,
            "value": str,
            "variableid": str,
        },
        parse_dates=["datetime", "entertime"],
    )
    # Rows whose variable id is listed in the dynamic bindings.
    is_dynamic_var = df_part_X["variableid"].isin(var_bindings["dynamic"].keys())
    pids_in_part = df_pid_part_sample.loc[
        df_pid_part_sample["part"] == x, "patientid"
    ]
    for pid in pids_in_part:
        os.makedirs(f"./results/{pid}", exist_ok=True)
        # This patient's dynamic observations, in chronological order.
        df_pid_data = (
            df_part_X[(df_part_X["patientid"] == pid) & is_dynamic_var]
            .sort_values(by=["datetime", "entertime"])
            .reset_index(drop=True)
        )
        df_pid_data.to_csv(f"./results/{pid}/{pid}_original.csv")


In [7]:
# Load the variable-id -> variable-name bindings ("static" section used here).
with open("./var_id_var_name_bindings.json", "r") as f:
    var_bindings = json.load(f)

static_columns = list(var_bindings["static"].values())
df_demo_sample = df_demo[df_demo["patientid"].isin(patientids)]
for tup in df_demo_sample.itertuples():
    df_pid_data = pd.read_csv(
        f"./results/{tup.patientid}/{tup.patientid}_original.csv",
        dtype={
            "patientid": int,
            "status": int,
            "stringvalue": str,
            "type": str,
            "value": str,
            "variableid": str,
        },
        parse_dates=["datetime", "entertime"],
    )
    # One-row frame: each static column takes the demographics attribute named
    # by its binding key (None when the demographics row has no such attribute).
    df_static_sample = pd.DataFrame(columns=static_columns)
    df_static_sample.loc[0] = [
        getattr(tup, attr, None) for attr in var_bindings["static"]
    ]
    # compute length of stay: last row's datetime minus admission time, in days
    last_observation = df_pid_data.iloc[-1]["datetime"]
    admission_time = df_static_sample.iloc[0]["Admission Time"]
    stay_seconds = (last_observation - admission_time).total_seconds()
    df_static_sample.at[0, "Length of stay (days)"] = round(
        stay_seconds / 60 / 60 / 24,
        2,
    )
    df_static_sample.to_csv(f"./results/{tup.patientid}/{tup.patientid}_static.csv")

# Build, for every sampled patient, a wide "dynamic" table with one row per
# observation time (minutes since admission) and one column per variable name.

# avoid duplicated columns
dynamic_columns = list(dict.fromkeys(var_bindings["dynamic"].values()))
dynamic_names = var_bindings["dynamic"]

# Raw blood-pressure variable name -> bindings key of the merged column that
# receives the value (invasive readings take priority over non-invasive ones).
computed_bp_keys = {
    "Invasive Diastolic Arterial Blood Pressure": "diastolic_arterial_bp_computed",
    "Non-Invasive Diastolic Arterial Blood Pressure": "diastolic_arterial_bp_computed",
    "Invasive Mean Arterial Blood Pressure": "mean_arterial_bp_computed",
    "Non-Invasive Mean Arterial Blood Pressure": "mean_arterial_bp_computed",
    "Invasive Systolic Arterial Blood Pressure": "systolic_arterial_bp_computed",
    "Non-Invasive Systolic Arterial Blood Pressure": "systolic_arterial_bp_computed",
}
# Variable ids that always overwrite an existing value in their target column.
invasive_bp_var_ids = ("100", "110", "120")
arterial_lactate_var_id = "24000524"

for pid in patientids:
    df_pid_data = pd.read_csv(
        f"./results/{pid}/{pid}_original.csv",
        dtype={
            "patientid": int,
            "status": int,
            "stringvalue": str,
            "type": str,
            "value": str,
            "variableid": str,
        },
        parse_dates=["datetime", "entertime"],
    )
    df_pid_data = df_pid_data[df_pid_data["variableid"].isin(dynamic_names.keys())]
    pid_admission_time = df_demo[df_demo["patientid"] == pid]["admissiontime"].iloc[0]

    # minutes of stay -> {column name: value}
    rows_by_minute = dict()
    for obs in df_pid_data.itertuples():
        row_key = round((obs.datetime - pid_admission_time).total_seconds() / 60, 2)
        if row_key not in rows_by_minute:
            rows_by_minute[row_key] = dict.fromkeys(dynamic_columns)
            rows_by_minute[row_key][dynamic_names["datetime"]] = obs.datetime
            # entertime could be different for different values of the same record, maybe we should remove it
            # i.e. some values have the same datetime (observation) but different entertime (insertion into db)
            rows_by_minute[row_key][dynamic_names["entertime"]] = obs.entertime
        row = rows_by_minute[row_key]
        var_name = dynamic_names[obs.variableid]

        # check lactate variables priority (use arterial, if not available use venous):
        # a non-arterial lactate never replaces a lactate value already stored
        if (
            var_name == "Lactate"
            and obs.variableid != arterial_lactate_var_id
            and row[var_name] is not None
        ):
            continue

        row[var_name] = obs.value

        # create new vars for bp
        if var_name in computed_bp_keys:
            computed_column = dynamic_names[computed_bp_keys[var_name]]
            # check bp variables priority (use invasive, if not available use non-invasive):
            # a non-invasive reading only fills a still-empty merged column
            if obs.variableid in invasive_bp_var_ids or row[computed_column] is None:
                row[computed_column] = obs.value

    df_dynamic_sample = pd.DataFrame(
        data=list(rows_by_minute.values()),
        columns=dynamic_columns,
    )
    df_dynamic_sample["Patient ID"] = pid
    df_dynamic_sample = df_dynamic_sample.set_index(
        pd.Index(list(rows_by_minute.keys()), name="Minutes of Stay"),
        drop=True,
    )
    # Use the outer-loop `pid` for the output path. The original read the
    # patient id from the inner-loop row variable, which is stale (it belongs
    # to a previous patient) whenever the current patient has no dynamic rows.
    df_dynamic_sample.to_csv(f"./results/{pid}/{pid}_dynamic.csv")
