In [1]:
import datetime
import glob
import json
import os
import sqlite3

import pandas as pd
import seaborn as sns
from IPython.display import display
from matplotlib import pyplot as plt
from numpy import nanmedian as np_nanmedian


# Load Demographics and Dataset_Part-Patient_id bindings


In [2]:
config = None
with open("./config.json", "r") as f:
    config = json.load(f)
    config["RAW_OBS_PATH"] = (
        config["BASE_DATASETS_PATH"] + "/raw_stage/observation_tables"
    )
    config["DB_PATH"] = config["BASE_DATASETS_PATH"] + "/raw_data.db"
with open("./config.json", "w") as f:
    json.dump(config, f, indent=4)

df_demo = pd.read_csv(
    config["BASE_DATASETS_PATH"] + "/reference_data/general_table.csv",
    dtype={
        "patientid": int,
        "sex": str,
        "age": int,
        "discharge_status": str,
    },
    parse_dates=["admissiontime"],
)
display(df_demo.describe(include="all", datetime_is_numeric=True))

df_pid_part = pd.read_csv(
    config["RAW_OBS_PATH"] + "/observation_tables_index.csv",
    dtype={
        "patientid": int,
        "part": int,
    },
)
df_pid_part.sort_values(by=["part"], inplace=True)


Unnamed: 0,patientid,admissiontime,sex,age,discharge_status
count,33905.0,33905,33905,33905.0,33666
unique,,,2,,2
top,,,M,,alive
freq,,,21767,,31604
mean,16953.0,2151-03-13 08:23:47.234832384,,63.523964,
min,1.0,2102-11-27 06:45:00,,20.0,
25%,8477.0,2129-02-14 12:20:00,,55.0,
50%,16953.0,2151-05-23 19:30:00,,65.0,
75%,25429.0,2173-04-07 18:30:00,,75.0,
max,33905.0,2198-09-03 16:35:00,,90.0,


# Move Everything to SQLite

Skip if you already have the DB


In [3]:
for x in df_pid_part["part"].unique():
    if (x + 1) % 25 == 0:
        print((x + 1) / len(df_pid_part["part"].unique()))
    df_part_X = pd.read_csv(
        config["RAW_OBS_PATH"] + f"/csv/part-{x}.csv",
        dtype={
            "patientid": int,
            "status": int,
            "stringvalue": str,
            "type": str,
            "value": str,
            "variableid": str,
        },
        parse_dates=["datetime", "entertime"],
    )
    conn = sqlite3.connect(config["DB_PATH"])
    df_part_X.to_sql(name="raw_data", con=conn, if_exists="append", index=False)
    conn.close()

conn = sqlite3.connect(config["DB_PATH"])
cur = conn.cursor()
print("Creating index on datetime...")
cur.execute("CREATE INDEX index_datetime ON raw_data(datetime)")
print("Creating index on patientid...")
cur.execute("CREATE INDEX index_patient_id ON raw_data(patientid)")
print("Creating index on variableid...")
cur.execute("CREATE INDEX index_variable_id ON raw_data(variableid)")
conn.close()


0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
1.0
Creating index on datetime...
Creating index on patientid...
Creating index on variableid...


# Filtering

Skip if you already have the .txt filter files


## Filter Age >= 18 Years Old

No effect, patients inside HiRID are all >= 18 years old


In [4]:
df_demo = df_demo[df_demo["age"] >= 18]
df_demo.reset_index(drop=True, inplace=True)


## Filter Stay >= 1 Day


In [5]:
filtered_pids = list()

conn = sqlite3.connect(config["DB_PATH"])
df_raw = pd.read_sql(
    "SELECT patientid, MAX(datetime) as last_datetime FROM raw_data GROUP BY patientid",
    conn,
    parse_dates=["last_datetime"],
    chunksize=1000,
)
for df in df_raw:
    for tup in df.itertuples():
        # last datetime - admissiontime
        if (
            tup.last_datetime
            - df_demo[df_demo["patientid"] == tup.patientid].iloc[0]["admissiontime"]
        ) >= datetime.timedelta(days=1):
            filtered_pids.append(tup.patientid)
    print(len(filtered_pids))
conn.close()

with open("./results/patients_18+_1d+.txt", "w") as f:
    for x in filtered_pids:
        f.write(f"{x}\n")
print(len(filtered_pids))


511
991
1495
1988
2503
3003
3503
4000
4509
4999
5525
6013
6514
7020
7513
8000
8479
8979
9492
9994
10470
10995
11479
11983
12472
12962
13446
13919
14413
14901
15366
15853
16339
16787
16787


## Filter Lactate_Measurements >= 2

This should be useless as the next filter tests that lactate_measurements >= 2 in the first 2d (CHECK)

So obviously if there are 2+ lactate measurements in the first 2d then there are of course 2+ lactate measurements in general


In [6]:
filtered_pids = list()
previously_filtered = list()
with open("./results/patients_18+_1d+.txt", "r") as f:
    previously_filtered.extend(int(x) for x in f.readlines())

conn = sqlite3.connect(config["DB_PATH"])
lact_vars = "', '".join([str(x) for x in config["lactate_var_ids"]])
pids_checks = ", ".join([str(x) for x in previously_filtered])
df_raw = pd.read_sql(
    f"SELECT patientid, COUNT(*) as cnt_lactate_measurements FROM raw_data WHERE (variableid IN ('{lact_vars}')) AND (patientid IN ({pids_checks})) GROUP BY patientid",
    conn,
    chunksize=1000,
)
for df in df_raw:
    for tup in df.itertuples():
        # 2+ lactate measurements
        if tup.cnt_lactate_measurements >= 2:
            filtered_pids.append(tup.patientid)
    print(len(filtered_pids))
conn.close()

with open("./results/patients_18+_1d+_2+lactate_measurements.txt", "w") as f:
    for x in filtered_pids:
        f.write(f"{x}\n")
print(len(filtered_pids))


981
1960
2943
3924
4908
5885
6868
7845
8823
9806
10783
11770
12748
13728
14709
15249
15249


## Filter Lactate_Measurements >= 2 in First 2d of Stay


In [7]:
filtered_pids = list()
previously_filtered = list()
with open("./results/patients_18+_1d+_2+lactate_measurements.txt", "r") as f:
    previously_filtered.extend(int(x) for x in f.readlines())

conn = sqlite3.connect(config["DB_PATH"])
lact_vars = "', '".join([str(x) for x in config["lactate_var_ids"]])
for pid in previously_filtered:
    post_2d = df_demo[df_demo["patientid"] == pid].iloc[0][
        "admissiontime"
    ] + datetime.timedelta(days=2)
    df_raw = pd.read_sql(
        f"SELECT patientid, COUNT(*) AS cnt_lactate_measurements FROM raw_data WHERE (variableid IN ('{lact_vars}')) AND (patientid={pid}) AND (datetime<='{post_2d}')",
        conn,
    )
    for tup in df_raw.itertuples():
        # 2+ lactate measurements in first 2d
        if tup.cnt_lactate_measurements >= 2:
            filtered_pids.append(pid)
    if len(filtered_pids) % 1000 == 0:
        print(len(filtered_pids))
conn.close()

with open("./results/patients_18+_1d+_2+lactate_measurements_in_first2d.txt", "w") as f:
    for x in filtered_pids:
        f.write(f"{x}\n")
print(len(filtered_pids))


1000
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
15099


# Create Single-Variable Files


In [8]:
patientids = list()
with open("./results/patients_18+_1d+_2+lactate_measurements_in_first2d.txt", "r") as f:
    patientids.extend(int(x) for x in f.readlines())

var_bindings = dict()
with open("./var_id_var_name_bindings.json", "r") as f:
    var_bindings = json.load(f)

os.makedirs("./results/distribution_plots/data", exist_ok=True)
# avoid duplicated columns
dynamic_columns = list(dict.fromkeys(var_bindings["dynamic"].values()).keys())
pids_checks = "', '".join([str(x) for x in patientids])
# skip patientid, datetime, entertime
for var in dynamic_columns[3:]:
    # remove invalid characters from filename
    filename = var
    for c in ("\\", "/", ":", "*", "?", '"', "<", ">", "|"):
        filename = filename.replace(c, "_")
    # get all HiRID var_ids connected to final variable name
    var_X_ids = [k for k, v in var_bindings["dynamic"].items() if v == var]
    print(var, var_X_ids)
    var_ids_checks = "', '".join(var_X_ids)
    conn = sqlite3.connect(config["DB_PATH"])
    df_var_raw = pd.read_sql(
        f"SELECT value FROM raw_data WHERE (patientid IN ('{pids_checks}')) AND (variableid IN ('{var_ids_checks}'))",
        conn,
    )
    df_var_raw.to_csv(f"./results/distribution_plots/data/{filename}.csv", index=False)
    conn.close()


Activated Partial Thromboplastin Time ['20004410']
Alanine aminotransferase ['20002600']
Albumin ['24000605']
Alkaline phosphatase ['20002700']
Anion Gap ['anion_gap_computed']
Aspartate aminotransferase ['24000330']
Base excess ['20001300']
Bicarbonate ['20004200']
Bilirubin.total ['20004300']
Body Height ['10000450']
Body Weight ['10000400']
Calcium ['20005100']
Calcium.ionized ['24000522']
Carbon dioxide [Partial pressure] ['20001200']
Central Venous Pressure ['700', '960', '15001441']
Chloride ['24000439', '24000521']
Creatinine ['20000600']
Diastolic Arterial Blood Pressure ['diastolic_arterial_bp_computed']
Fraction of Inspired Oxygen ['2010']
Glasgow Coma Score - Verbal Response ['10000100']
Glasgow Coma Score - Motor Response ['10000200']
Glasgow Coma Score - Eye Opening ['10000300']
Glasgow Coma Score - Total ['gcs-total_computed']
Glucose ['20005110', '24000523', '24000585']
Heart rate ['200']
Hemoglobin ['20000900', '24000548', '24000836']
Invasive Diastolic Arterial Blood P

# Run plots.R to Get Distribution Plots


# Create Ranges File


In [9]:
var_bindings = dict()
with open("./var_id_var_name_bindings.json", "r") as f:
    var_bindings = json.load(f)

tmp = dict()
try:
    with open("./var_ranges.json", "r") as f:
        tmp = json.load(f)
except Exception:
    pass
# avoid duplicated columns
dynamic_columns = list(dict.fromkeys(var_bindings["dynamic"].values()).keys())
# skip patientid, datetime, entertime
for var in dynamic_columns[3:]:
    if var not in tmp:
        tmp[var] = dict(min=None, max=None)

# load ranges' file taken from https://github.com/ratschlab/HIRID-ICU-Benchmark/
df_benchmark_ranges = pd.read_csv("../Materials/varref.tsv", sep="\t")
df_benchmark_ranges = df_benchmark_ranges.iloc[:, [2, 3, 4, 5, 6]]
df_benchmark_ranges["variableid"] = df_benchmark_ranges["variableid"].astype(int)
df_benchmark_ranges

for tup in df_benchmark_ranges.itertuples():
    var = (
        var_bindings["dynamic"][str(tup.variableid)]
        if str(tup.variableid) in var_bindings["dynamic"]
        else None
    )
    if var:
        if not pd.isna(tup.lowerbound):
            if tmp[var]["min"] is not None:
                # if saved min is higher than new min replace it
                if tmp[var]["min"] > tup.lowerbound:
                    tmp[var]["min"] = tup.lowerbound
            else:
                tmp[var]["min"] = tup.lowerbound
        if not pd.isna(tup.upperbound):
            if tmp[var]["max"] is not None:
                # if saved max is lower than new max replace it
                if tmp[var]["max"] < tup.upperbound:
                    tmp[var]["max"] = tup.upperbound
            else:
                tmp[var]["max"] = tup.upperbound

with open("./var_ranges.json", "w") as f:
    json.dump(tmp, f, indent=4)


# Apply Ranges to Single-Variable Files and Plot Again


In [10]:
var_bindings = dict()
with open("./var_id_var_name_bindings.json", "r") as f:
    var_bindings = json.load(f)

var_ranges = dict()
with open("./var_ranges.json", "r") as f:
    var_ranges = json.load(f)

# avoid duplicated columns
dynamic_columns = list(dict.fromkeys(var_bindings["dynamic"].values()).keys())
for var in dynamic_columns[3:]:
    # remove invalid characters from filename
    filename = var
    for c in ("\\", "/", ":", "*", "?", '"', "<", ">", "|"):
        filename = filename.replace(c, "_")
    print(var)
    df_var_raw = pd.read_csv(
        f"./results/distribution_plots/data/{filename}.csv"
    )
    if var in var_ranges.keys():
        filename = f"{filename} FILTERED"
        if var_ranges[var]["min"] is not None:
            df_var_raw = df_var_raw[df_var_raw["value"] >= var_ranges[var]["min"]]
        if var_ranges[var]["max"] is not None:
            df_var_raw = df_var_raw[df_var_raw["value"] <= var_ranges[var]["max"]]
    # plots
    f, (ax1, ax2) = plt.subplots(ncols=2, figsize=(20, 10))
    ax1.title.set_text(var)
    ax2.title.set_text(var)
    sns.boxplot(y=df_var_raw["value"], ax=ax1)
    sns.histplot(x=df_var_raw["value"], ax=ax2)
    ax1.set_ylabel("Value")
    ax2.set_xlabel("Value")
    # disable scientific notation
    plt.ticklabel_format(style="plain", axis="y")
    plt.tight_layout()
    plt.savefig(f"./results/distribution_plots/{filename}.png")
    plt.close()


Activated Partial Thromboplastin Time
Alanine aminotransferase
Albumin
Alkaline phosphatase
Anion Gap
Aspartate aminotransferase
Base excess
Bicarbonate
Bilirubin.total
Body Height
Body Weight
Calcium
Calcium.ionized
Carbon dioxide [Partial pressure]
Central Venous Pressure
Chloride
Creatinine
Diastolic Arterial Blood Pressure
Fraction of Inspired Oxygen
Glasgow Coma Score - Verbal Response
Glasgow Coma Score - Motor Response
Glasgow Coma Score - Eye Opening
Glasgow Coma Score - Total
Glucose
Heart rate
Hemoglobin
Invasive Diastolic Arterial Blood Pressure
Invasive Mean Arterial Blood Pressure
Invasive Systolic Arterial Blood Pressure
Lactate
Leukocytes
Lymphocytes
Magnesium
Mean Arterial Blood Pressure
Mean Cell Hemoglobin
Mean Cell Hemoglobin Concentration
Mean Corpuscular Volume
Non-Invasive Diastolic Arterial Blood Pressure
Non-Invasive Mean Arterial Blood Pressure
Non-Invasive Systolic Arterial Blood Pressure
Oxygen [Partial pressure]
Oxygen Saturation
pH
Phosphate
Platelets
Potas

# Extract Patients


In [11]:
patientids = list()
with open("./results/patients_18+_1d+_2+lactate_measurements_in_first2d.txt", "r") as f:
    patientids.extend(int(x) for x in f.readlines())
# patientids = [17786, 16503, 22946, 9465, 6586]
# create a folder for each patient
for pid in patientids:
    os.makedirs(f"./results/{pid}", exist_ok=True)
patientids


[2,
 4,
 5,
 7,
 9,
 11,
 14,
 24,
 27,
 29,
 30,
 36,
 37,
 38,
 40,
 43,
 44,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 59,
 60,
 61,
 63,
 67,
 69,
 72,
 76,
 80,
 81,
 83,
 88,
 90,
 91,
 96,
 97,
 99,
 100,
 104,
 105,
 107,
 109,
 110,
 112,
 117,
 118,
 122,
 124,
 126,
 127,
 129,
 133,
 135,
 136,
 137,
 138,
 139,
 141,
 144,
 147,
 150,
 151,
 152,
 160,
 161,
 166,
 167,
 171,
 172,
 175,
 178,
 179,
 182,
 184,
 188,
 190,
 191,
 193,
 194,
 195,
 197,
 199,
 200,
 205,
 206,
 209,
 210,
 211,
 212,
 218,
 219,
 220,
 222,
 225,
 226,
 227,
 230,
 231,
 234,
 235,
 237,
 239,
 240,
 241,
 243,
 246,
 248,
 250,
 253,
 254,
 258,
 260,
 261,
 263,
 264,
 266,
 267,
 271,
 272,
 275,
 276,
 278,
 280,
 281,
 283,
 288,
 290,
 291,
 293,
 294,
 295,
 296,
 298,
 299,
 301,
 303,
 305,
 306,
 307,
 309,
 310,
 312,
 313,
 314,
 319,
 320,
 324,
 326,
 328,
 330,
 331,
 333,
 336,
 338,
 339,
 340,
 345,
 348,
 350,
 351,
 352,
 353,
 355,
 356,
 357,
 359,
 362,
 363,
 364

# Split HiRID in Patients' Datasets

## Original Format


In [12]:
var_bindings = dict()
with open("./var_id_var_name_bindings.json", "r") as f:
    var_bindings = json.load(f)

for pid in patientids:
    os.makedirs(f"./results/{pid}", exist_ok=True)
    var_ids_checks = "', '".join(var_bindings["dynamic"].keys())
    conn = sqlite3.connect(config["DB_PATH"])
    df_raw = pd.read_sql(
        f"SELECT * FROM raw_data WHERE (patientid={pid}) AND (variableid IN ('{var_ids_checks}')) ORDER BY datetime, entertime",
        conn,
        parse_dates=["datetime", "entertime"],
    )
    df_raw.to_csv(f"./results/{pid}/{pid}_original.csv", index=False)
    conn.close()


## Static Format


In [13]:
var_bindings = dict()
with open("./var_id_var_name_bindings.json", "r") as f:
    var_bindings = json.load(f)

static_columns = list(var_bindings["static"].values())
for tup in df_demo[df_demo["patientid"].isin(patientids)].itertuples():
    last_datetime = pd.read_csv(
        f"./results/{tup.patientid}/{tup.patientid}_original.csv",
        dtype={
            "patientid": int,
            "status": int,
            "stringvalue": str,
            "type": str,
            "value": str,
            "variableid": str,
        },
        parse_dates=["datetime", "entertime"],
    ).iloc[-1]["datetime"]
    df_static = pd.DataFrame(columns=static_columns)
    df_static.loc[0] = [getattr(tup, x, None) for x in var_bindings["static"].keys()]
    # compute length of stay
    df_static.at[0, "Length of stay (days)"] = round(
        (last_datetime - df_static.iloc[0]["Admission Time"]).total_seconds()
        / 60
        / 60
        / 24,
        2,
    )
    df_static.to_csv(
        f"./results/{tup.patientid}/{tup.patientid}_static.csv", index=False
    )


## Dynamic Format


In [14]:
var_bindings = dict()
with open("./var_id_var_name_bindings.json", "r") as f:
    var_bindings = json.load(f)

var_ranges = dict()
with open("./var_ranges.json", "r") as f:
    var_ranges = json.load(f)

# avoid duplicated columns
dynamic_columns = list(dict.fromkeys(var_bindings["dynamic"].values()).keys())
var_ids_checks = var_bindings["dynamic"].keys()
for pid in patientids:
    df_pid_raw_data = pd.read_csv(
        f"./results/{pid}/{pid}_original.csv",
        dtype={
            "patientid": int,
            "status": int,
            "stringvalue": str,
            "type": str,
            "value": str,
            "variableid": str,
        },
        parse_dates=["datetime", "entertime"],
    )
    df_pid_raw_data = df_pid_raw_data[
        df_pid_raw_data["variableid"].isin(var_ids_checks)
    ]
    pid_admission_time = df_demo[df_demo["patientid"] == pid]["admissiontime"].iloc[0]
    # dynamic data with minutes of stay as key
    tmp_dict = dict()
    for tup in df_pid_raw_data.itertuples():
        row_key = round((tup.datetime - pid_admission_time).total_seconds() / 60, 2)
        if row_key not in tmp_dict:
            tmp_dict[row_key] = dict.fromkeys(dynamic_columns)
            tmp_dict[row_key][var_bindings["dynamic"]["datetime"]] = tup.datetime
            # entertime could be different for different values of the same record, maybe we should remove it
            # i.e. some values have the same datetime (observation) but different entertime (insertion into db)
            tmp_dict[row_key][var_bindings["dynamic"]["entertime"]] = tup.entertime
        var = var_bindings["dynamic"][tup.variableid]
        # lactate priority
        if var == "Lactate":
            # check lactate variables priority (use arterial, if not available use venous)
            if tup.variableid != "24000524":
                # not arterial lactate, check if there's some lactate value already present
                if tmp_dict[row_key][var] is not None:
                    # lactate value already present, skip since it's probably an arterial one
                    continue

        # check if value is in its validity range, if not skip value
        if var in var_ranges.keys() and tup.value:
            if var_ranges[var]["min"] is not None:
                if float(tup.value) < var_ranges[var]["min"]:
                    continue
            if var_ranges[var]["max"] is not None:
                if float(tup.value) > var_ranges[var]["max"]:
                    continue
        tmp_dict[row_key][var] = tup.value

        # create new "computed" vars for bp and set bp priority
        if var in (
            "Invasive Diastolic Arterial Blood Pressure",
            "Non-Invasive Diastolic Arterial Blood Pressure",
            "Invasive Mean Arterial Blood Pressure",
            "Non-Invasive Mean Arterial Blood Pressure",
            "Invasive Systolic Arterial Blood Pressure",
            "Non-Invasive Systolic Arterial Blood Pressure",
        ):
            if var in (
                "Invasive Diastolic Arterial Blood Pressure",
                "Non-Invasive Diastolic Arterial Blood Pressure",
            ):
                var = var_bindings["dynamic"]["diastolic_arterial_bp_computed"]
            elif var in (
                "Invasive Mean Arterial Blood Pressure",
                "Non-Invasive Mean Arterial Blood Pressure",
            ):
                var = var_bindings["dynamic"]["mean_arterial_bp_computed"]
            elif var in (
                "Invasive Systolic Arterial Blood Pressure",
                "Non-Invasive Systolic Arterial Blood Pressure",
            ):
                var = var_bindings["dynamic"]["systolic_arterial_bp_computed"]
            # check bp variables priority (use invasive, if not available use non-invasive)
            if tup.variableid not in ("100", "110", "120"):
                # non-invasive bp, check if there's some bp value already present
                if tmp_dict[row_key][var] is not None:
                    # bp value already present, skip since it's probably an invasive one
                    continue
            tmp_dict[row_key][var] = tup.value

    df_dynamic = pd.DataFrame(
        data=tmp_dict.values(),
        columns=dynamic_columns,
    )
    df_dynamic["Patient ID"] = pid
    # sum of the other gcs
    df_dynamic[var_bindings["dynamic"]["gcs-total_computed"]] = (
        pd.to_numeric(df_dynamic[var_bindings["dynamic"]["10000100"]])
        + pd.to_numeric(df_dynamic[var_bindings["dynamic"]["10000200"]])
        + pd.to_numeric(df_dynamic[var_bindings["dynamic"]["10000300"]])
    )
    # https://www.thecalculator.co/health/Anion-Gap-Calculator-678.html
    df_dynamic[var_bindings["dynamic"]["anion_gap_computed"]] = pd.to_numeric(
        df_dynamic[var_bindings["dynamic"]["20000400"]]
    ) - (
        pd.to_numeric(df_dynamic[var_bindings["dynamic"]["24000439"]])
        + pd.to_numeric(df_dynamic[var_bindings["dynamic"]["20004200"]])
    )

    df_dynamic.set_index(
        pd.Index(tmp_dict.keys(), name="Minutes of Stay"),
        drop=True,
        inplace=True,
    )
    df_dynamic.to_csv(f"./results/{tup.patientid}/{tup.patientid}_dynamic.csv")


# Bin Patients' Data


In [15]:
var_bindings = dict()
with open("./var_id_var_name_bindings.json", "r") as f:
    var_bindings = json.load(f)


HYPER_bin_time = 2  # hours
HYPER_categ_agg = "mode"  # possible values: mode, last
HYPER_numer_agg = "mean"  # possible values: mean, median, last


def custom_mode(x):
    m = pd.Series.mode(x)
    return m.iloc[0] if not m.empty else None


def get_agg_method(x):
    if x == "last" or x == "mean":
        return x
    elif x == "mode":
        return custom_mode
    elif x == "median":
        return np_nanmedian


for pid in patientids:
    df_dynamic = pd.read_csv(f"./results/{pid}/{pid}_dynamic.csv")
    # remove datetime and entertime as no longer needed
    df_dynamic.drop(
        [var_bindings["dynamic"]["datetime"], var_bindings["dynamic"]["entertime"]],
        axis=1,
        inplace=True,
    )

    # bin minutes of stay
    df_dynamic.loc[:, "Minutes of Stay"] = (
        df_dynamic["Minutes of Stay"]
        // (HYPER_bin_time * 60)  # floor division by two hours (obtain hours)
        * (HYPER_bin_time * 60)  # multiply again to obtain minutes
    )
    # extract categorical/numerical variables
    categorical_vars = (
        "Patient ID",
        "Glasgow Coma Score - Verbal Response",
        "Glasgow Coma Score - Motor Response",
        "Glasgow Coma Score - Eye Opening",
        "Glasgow Coma Score - Total",
        "APACHE II Group",
        "APACHE IV Group",
        "Circadian rhythm",
        "Richmond agitation-sedation scale",
        "Train of four count",
        "Ventilator Airway Code",
        "Ventilator mode",
    )
    numerical_vars = [
        x
        for x in df_dynamic.columns
        if x != "Minutes of Stay" and x not in categorical_vars
    ]
    agg_dict = {x: get_agg_method(HYPER_categ_agg) for x in categorical_vars}
    agg_dict.update({x: get_agg_method(HYPER_numer_agg) for x in numerical_vars})
    # apply aggregation methods
    df_tmp = df_dynamic.groupby(by=["Minutes of Stay"]).agg(agg_dict)
    df_tmp.reset_index(inplace=True)

    # check if patient still has 2+ lactate measures, if not skip patient
    if (
        df_tmp[var_bindings["dynamic"][config["lactate_var_ids"][0]]].notnull().sum()
        < 2
    ):
        continue

    # filter first two days of stay
    df_tmp = df_tmp[df_tmp["Minutes of Stay"] <= 60 * 24 * 2]
    # filter data between first and last lactate measurements
    df_tmp = df_tmp.iloc[
        # first lactate index
        df_tmp[pd.notna(df_tmp[var_bindings["dynamic"][config["lactate_var_ids"][0]]])]
        .iloc[0]
        .name :
        # last lactate index + 1
        df_tmp[pd.notna(df_tmp[var_bindings["dynamic"][config["lactate_var_ids"][0]]])]
        .iloc[-1]
        .name
        + 1
    ]
    # reapply bp priority
    for tup in df_tmp.itertuples():
        df_tmp.loc[
            tup.Index, var_bindings["dynamic"]["diastolic_arterial_bp_computed"]
        ] = (
            df_tmp.loc[
                tup.Index, ["Invasive Diastolic Arterial Blood Pressure"]
            ].values[0]
            if pd.notna(
                df_tmp.loc[
                    tup.Index, ["Invasive Diastolic Arterial Blood Pressure"]
                ].values[0]
            )
            else (
                df_tmp.loc[
                    tup.Index, ["Non-Invasive Diastolic Arterial Blood Pressure"]
                ].values[0]
                if pd.notna(
                    df_tmp.loc[
                        tup.Index, ["Non-Invasive Diastolic Arterial Blood Pressure"]
                    ].values[0]
                )
                else None
            )
        )
        df_tmp.loc[tup.Index, var_bindings["dynamic"]["mean_arterial_bp_computed"]] = (
            df_tmp.loc[tup.Index, ["Invasive Mean Arterial Blood Pressure"]].values[0]
            if pd.notna(
                df_tmp.loc[tup.Index, ["Invasive Mean Arterial Blood Pressure"]].values[
                    0
                ]
            )
            else (
                df_tmp.loc[
                    tup.Index, ["Non-Invasive Mean Arterial Blood Pressure"]
                ].values[0]
                if pd.notna(
                    df_tmp.loc[
                        tup.Index, ["Non-Invasive Mean Arterial Blood Pressure"]
                    ].values[0]
                )
                else None
            )
        )
        df_tmp.loc[
            tup.Index, var_bindings["dynamic"]["systolic_arterial_bp_computed"]
        ] = (
            df_tmp.loc[tup.Index, ["Invasive Systolic Arterial Blood Pressure"]].values[
                0
            ]
            if pd.notna(
                df_tmp.loc[
                    tup.Index, ["Invasive Systolic Arterial Blood Pressure"]
                ].values[0]
            )
            else (
                df_tmp.loc[
                    tup.Index, ["Non-Invasive Systolic Arterial Blood Pressure"]
                ].values[0]
                if pd.notna(
                    df_tmp.loc[
                        tup.Index, ["Non-Invasive Systolic Arterial Blood Pressure"]
                    ].values[0]
                )
                else None
            )
        )

    df_tmp.set_index(
        "Minutes of Stay",
        drop=True,
        inplace=True,
    )
    df_tmp.to_csv(f"./results/{pid}/{pid}_dynamic_binned2h_48h.csv")


# Filter Lactate_Measurements >= 2 in First Binned 2d of Stay


In [16]:
filtered_pids = list()
previously_filtered = list()
with open("./results/patients_18+_1d+_2+lactate_measurements_in_first2d.txt", "r") as f:
    previously_filtered.extend(int(x) for x in f.readlines())

for pid in previously_filtered:
    for filename in glob.glob(f"./results/{pid}/*binned*.csv"):
        filtered_pids.append(pid)
        continue

with open(
    "./results/patients_18+_1d+_2+lactate_measurements_in_first2d_binned.txt", "w"
) as f:
    for x in filtered_pids:
        f.write(f"{x}\n")
print(len(filtered_pids))


14688
