In [7]:
import datetime

import pandas as pd

# Load Demographics and Dataset_Part-Patient_id bindings


In [20]:
# Locations and constants shared by the cells below.
raw_obs_path = "./datasets/raw_stage/observation_tables"
# Variable IDs that the lactate filters below count as lactate measurements.
lactate_var_ids = (24000524, 24000732, 24000485)

df_demo = pd.read_csv("./datasets/reference_data/general_table.csv")
# Parse admissiontime so it can take part in datetime arithmetic later on.
df_demo["admissiontime"] = pd.to_datetime(df_demo["admissiontime"])
try:
    demo_summary = df_demo.describe(include="all", datetime_is_numeric=True)
except TypeError:
    # pandas >= 2.0 removed the `datetime_is_numeric` keyword (datetime columns
    # are always summarised numerically there), so retry without it.
    demo_summary = df_demo.describe(include="all")
display(demo_summary)

# Part <-> patient bindings, ordered by part so the parts are visited in
# ascending order by the loops in the following cells.
df_pid_part = pd.read_csv(f"{raw_obs_path}/observation_tables_index.csv")
df_pid_part = df_pid_part.sort_values(by=["part"])

Unnamed: 0,patientid,admissiontime,sex,age,discharge_status
count,33905.0,33905,33905,33905.0,33666
unique,,,2,,2
top,,,M,,alive
freq,,,21767,,31604
mean,16953.0,2151-03-13 08:23:47.234832384,,63.523964,
min,1.0,2102-11-27 06:45:00,,20.0,
25%,8477.0,2129-02-14 12:20:00,,55.0,
50%,16953.0,2151-05-23 19:30:00,,65.0,
75%,25429.0,2173-04-07 18:30:00,,75.0,
max,33905.0,2198-09-03 16:35:00,,90.0,


# Filter Age >= 18 Years Old

This filter has no effect: all patients in HiRID are at least 18 years old (the minimum age in the summary above is 20).


In [21]:
# Keep adult patients only and renumber the remaining rows from 0.
is_adult = df_demo["age"] >= 18
df_demo = df_demo.loc[is_adult].reset_index(drop=True)

# Filter Stay >= 1 Day


In [29]:
# Keep the patients whose last recorded observation lies at least one day
# after their admission time, then persist their IDs (one per line).
min_stay = datetime.timedelta(days=1)
part_ids = df_pid_part["part"].unique()
n_parts = len(part_ids)
# Admission time per patient; the first matching row wins, like `.iloc[0]`.
admission_by_pid = df_demo.drop_duplicates(subset="patientid").set_index(
    "patientid"
)["admissiontime"]

filtered_pids = []
for x in part_ids:
    # Progress report every 25 parts (expressed as a fraction of all parts).
    if (x + 1) % 25 == 0:
        print((x + 1) / n_parts)
    df_part_X = pd.read_parquet(f"{raw_obs_path}/parquet/part-{x}.parquet")
    # Latest observation time of every patient in this part, computed in a
    # single pass instead of scanning and sorting the part once per patient.
    # Note: `max()` ignores missing timestamps.
    last_obs_by_pid = df_part_X.groupby("patientid")["datetime"].max()
    for pid in df_pid_part.loc[df_pid_part["part"] == x, "patientid"]:
        # A patient without any observation in the part, or without a
        # demographics row (e.g. removed by an earlier filter), cannot be
        # shown to have stayed for a day: skip instead of raising IndexError.
        if pid not in last_obs_by_pid.index or pid not in admission_by_pid.index:
            continue
        if last_obs_by_pid.loc[pid] - admission_by_pid.loc[pid] >= min_stay:
            filtered_pids.append(pid)

with open("./results/patients_18+_1d+.txt", "w") as f:
    f.writelines(f"{pid}\n" for pid in filtered_pids)
print(len(filtered_pids))


0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
1.0
16787


# Filter Lactate_Measurements >= 2

This filter should be redundant, because the next filter already requires at least 2 lactate measurements within the first 2 days of the stay (TODO: check).

Any patient with 2+ lactate measurements in the first 2 days necessarily has 2+ lactate measurements overall.


In [23]:
# Among the patients kept by the previous filter, keep those with at least
# two lactate measurements overall, then persist their IDs (one per line).
with open("./results/patients_18+_1d+.txt", "r") as f:
    # Blank lines (e.g. a stray trailing newline) are skipped rather than
    # crashing int().
    previously_filtered = [int(line) for line in f if line.strip()]

part_ids = df_pid_part["part"].unique()
n_parts = len(part_ids)
# Loop-invariant: which index rows belong to previously selected patients.
was_selected = df_pid_part["patientid"].isin(previously_filtered)

filtered_pids = []
for x in part_ids:
    # Progress report every 25 parts (expressed as a fraction of all parts).
    if (x + 1) % 25 == 0:
        print((x + 1) / n_parts)
    df_part_X = pd.read_parquet(f"{raw_obs_path}/parquet/part-{x}.parquet")
    # Number of lactate rows per patient, computed once for the whole part
    # instead of scanning and copying the part once per patient.
    lactate_counts = df_part_X.loc[
        df_part_X["variableid"].isin(lactate_var_ids), "patientid"
    ].value_counts()
    for pid in df_pid_part.loc[(df_pid_part["part"] == x) & was_selected, "patientid"]:
        # Patients without any lactate row are absent from the counts -> 0.
        if lactate_counts.get(pid, 0) >= 2:
            filtered_pids.append(pid)

with open("./results/patients_18+_1d+_2+lactate_measurements.txt", "w") as f:
    f.writelines(f"{pid}\n" for pid in filtered_pids)
print(len(filtered_pids))


0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
1.0
15209


# Filter Lactate_Measurements >= 2 in First 2d of Stay


In [30]:
# Among the patients kept by the previous filter, keep those with at least
# two lactate measurements time-stamped no later than admission + 2 days,
# then persist their IDs (one per line).
with open("./results/patients_18+_1d+_2+lactate_measurements.txt", "r") as f:
    # Blank lines (e.g. a stray trailing newline) are skipped rather than
    # crashing int().
    previously_filtered = [int(line) for line in f if line.strip()]

first_window = datetime.timedelta(days=2)
part_ids = df_pid_part["part"].unique()
n_parts = len(part_ids)
# Loop-invariant: which index rows belong to previously selected patients.
was_selected = df_pid_part["patientid"].isin(previously_filtered)
# Admission time per patient; the first matching row wins, like `.iloc[0]`.
admission_by_pid = df_demo.drop_duplicates(subset="patientid").set_index(
    "patientid"
)["admissiontime"]

filtered_pids = []
for x in part_ids:
    # Progress report every 25 parts (expressed as a fraction of all parts).
    if (x + 1) % 25 == 0:
        print((x + 1) / n_parts)
    df_part_X = pd.read_parquet(f"{raw_obs_path}/parquet/part-{x}.parquet")
    lactate_rows = df_part_X.loc[
        df_part_X["variableid"].isin(lactate_var_ids), ["patientid", "datetime"]
    ]
    # End of each row's 2-day window. Patients missing from df_demo map to
    # NaT, which never satisfies the comparison below, so they are excluded
    # instead of raising IndexError.
    window_end = lactate_rows["patientid"].map(admission_by_pid) + first_window
    # Only an upper bound is applied (as before): rows time-stamped before
    # the admission time are counted too.
    early_counts = lactate_rows.loc[
        lactate_rows["datetime"] <= window_end, "patientid"
    ].value_counts()
    for pid in df_pid_part.loc[(df_pid_part["part"] == x) & was_selected, "patientid"]:
        # Patients without any qualifying row are absent from the counts -> 0.
        if early_counts.get(pid, 0) >= 2:
            filtered_pids.append(pid)

with open("./results/patients_18+_1d+_2+lactate_measurements_in_first2d.txt", "w") as f:
    f.writelines(f"{pid}\n" for pid in filtered_pids)
print(len(filtered_pids))


0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
1.0
15046


# Extract 5 Patients and Initial Sample Dataset


In [31]:
# Read every selected patient ID back from disk and keep the first five
# (in file order) as the sample cohort.
with open("./results/patients_18+_1d+_2+lactate_measurements_in_first2d.txt", "r") as f:
    all_selected_ids = [int(line) for line in f.readlines()]
patientids = all_selected_ids[:5]
patientids


[17786, 16503, 22946, 9465, 6586]

In [None]:
# Build the sample dataset: all observation rows of the sampled patients,
# enriched with demographics and with the variable reference metadata.
df_pid_part_sample = df_pid_part[df_pid_part["patientid"].isin(patientids)].copy()
n_parts = len(df_pid_part["part"].unique())
patients_sample = []
for x in df_pid_part_sample["part"].unique():
    print((x + 1) / n_parts)
    df_part_X = pd.read_parquet(f"{raw_obs_path}/parquet/part-{x}.parquet")
    for pid in df_pid_part_sample.loc[df_pid_part_sample["part"] == x, "patientid"]:
        patients_sample.append(df_part_X[df_part_X["patientid"] == pid].copy())

# Concatenate once: `DataFrame.append` in a loop re-copies the accumulated
# frame on every iteration and no longer exists in pandas >= 2.0.
df_sample = pd.concat(patients_sample) if patients_sample else pd.DataFrame()
df_sample = df_sample.merge(df_demo, on="patientid")
df_variables = pd.read_csv("./datasets/reference_data/hirid_variable_reference.csv")
# extract only observation variables
df_variables = df_variables[df_variables["Source Table"] == "Observation"]
df_sample = df_sample.merge(df_variables, left_on="variableid", right_on="ID")
# Drop the join helper columns, order the rows chronologically per patient
# and renumber them from 0.
df_sample = (
    df_sample.drop(columns=["Source Table", "ID"])
    .sort_values(by=["patientid", "admissiontime", "datetime", "entertime"])
    .reset_index(drop=True)
)
df_sample.to_csv("./results/sample.csv")
display(df_sample)
