In [1]:
import pandas as pd
import numpy as np
import warnings

# Silence pandas' PerformanceWarning for the remainder of the notebook.
warnings.simplefilter("ignore", pd.errors.PerformanceWarning)

In [2]:
# Lab item dictionary (D_LABITEMS); its ITEMID and LABEL columns are used further down.
labitems_path = "../V1.1.0/D_LABITEMS.csv"
d_labitem_df = pd.read_csv(labitems_path)

In [3]:
# ICU stay table (read from the temp preprocessing folder); the lab features
# computed in this notebook are appended to it as new columns.
icu_stays_path = "./temp_pp_data_files/icu_stay_mibi_NEW.csv"
icu_stay_df = pd.read_csv(icu_stays_path)

In [4]:
# Lab measurements; this notebook uses the ITEMID, HADM_ID and VALUENUM columns.
lab_events_path = "../V1.1.0/LABEVENTS.csv"
lab_events_df = pd.read_csv(lab_events_path)

In [5]:
# Per lab item: number of distinct hospital admissions (HADM_ID) with at least one row.
hadm_ids_per_item = lab_events_df.groupby("ITEMID")["HADM_ID"]
lab_grp_by_hadm = hadm_ids_per_item.nunique()

In [6]:
# Side note: the dataset contains 13449 different HADMs. Relate the number of
# admissions with at least one measurement of a lab item to that total and keep
# the items that were measured in more than 90% of all admissions.
n_hadm_total = 13449
hadm_coverage = lab_grp_by_hadm / n_hadm_total
most_common_values = lab_grp_by_hadm.loc[hadm_coverage > 0.90]

In [9]:
# Which lab values should be turned into features?
# d_labitem_df has already been loaded from D_LABITEMS.csv in the data-loading cell
# near the top of the notebook, so it is reused here instead of reading the file again.
#
# Change idx_lst to add new lab values; 5241 -> a02/fi02 -> like 0% values.
# To use every commonly measured item instead:
#     idx_lst = most_common_values.index.to_list()
idx_lst = [5241]
# LABEL entries of the selected items, grouped by ITEMID.
id_to_name_grp = d_labitem_df.query("ITEMID in @idx_lst").groupby("ITEMID")["LABEL"]

In [10]:
# Build the ITEMID -> lab name lookup. This is possible because one ITEMID always
# defines one single lab value; the most frequent LABEL of each group is used.
def _most_frequent_label(labels):
    """Return the LABEL that occurs most often within one ITEMID group."""
    return labels.value_counts().index[0]


id_to_name = id_to_name_grp.apply(_most_frequent_label).to_dict()

# Preprocess lab values outliers

To reduce the influence of extreme outliers while still including them in calculations (for example the average), the lab values in the original data are winsorized and the winsorized values are used for downstream analysis.

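1.) Calculate the upper and lower outer fence per column (Tukey's method): upper fence = Q3 + 3 * IQR and lower fence = Q1 - 3 * IQR, where Q1/Q3 are the 25%/75% quantiles and IQR = Q3 - Q1 is their difference. All values outside those fences are probably errors rather than genuine extreme values.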


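2.) For the upper and the lower fence of each column: determine the quantile whose value is closest to the respective fence. The candidates are the 90%, 92.5%, 95%, 97.5% and 99% quantiles for the upper fence and the 1%, 2.5%, 5%, 7.5% and 10% quantiles for the lower fence. The closest one is typically around 95% for the upper and about 5% for the lower fence.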

3.) Set all values above the chosen upper quantile (or below the chosen lower quantile) to that quantile's value.

In [12]:
# ITEMIDs of all selected lab values (the keys of the ITEMID -> name lookup).
lab_value_item_ids = list(id_to_name)

In [13]:
def uo_fence(df):
    """Compute the outer fences of ``df["VALUENUM"]`` (Tukey's box-plot method).

    The outer fences lie three interquartile ranges beyond the quartiles and mark
    the border between possible and actual outliers, which makes them a useful
    reference for winsorizing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``VALUENUM`` column.

    Returns
    -------
    tuple
        ``(upper_fence, lower_fence)``, i.e. ``(Q3 + 3 * IQR, Q1 - 3 * IQR)``.
    """
    values = df["VALUENUM"]
    lower_quartile = values.quantile(0.25)
    upper_quartile = values.quantile(0.75)
    margin = 3 * (upper_quartile - lower_quartile)
    return upper_quartile + margin, lower_quartile - margin

In [14]:
# Outer fences per lab value, rounded to two decimals and keyed by ITEMID.
upper_fences, lower_fences = {}, {}
for lab_val_id in lab_value_item_ids:
    item_rows = lab_events_df.loc[lab_events_df["ITEMID"] == lab_val_id]
    raw_upper, raw_lower = uo_fence(item_rows)
    upper_fences[lab_val_id] = round(raw_upper, 2)
    lower_fences[lab_val_id] = round(raw_lower, 2)

In [15]:
# For every lab value, choose the upper winsorizing cap: the candidate quantile
# whose value lies closest to that lab value's upper outer fence.
upper_quantile_levels = [0.90, 0.925, 0.95, 0.975, 0.99]
k_dict_up = {}
for lab_val_id, upper_fence in upper_fences.items():
    item_values = lab_events_df.loc[lab_events_df["ITEMID"] == lab_val_id, "VALUENUM"]
    # One vectorised call returns the values of all candidate quantiles. Working on
    # an ndarray keeps the subtraction below valid even if the fence is a plain
    # Python float (a Python list minus a float would raise TypeError).
    candidates = item_values.quantile(upper_quantile_levels).to_numpy()
    # argmin returns the first index on ties, so the lowest candidate wins.
    closest_idx = np.abs(candidates - upper_fence).argmin()
    k_dict_up[lab_val_id] = round(candidates[closest_idx], 2)

In [16]:
# For every lab value, choose the lower winsorizing cap: the candidate quantile
# whose value lies closest to that lab value's lower outer fence.
lower_quantile_levels = [0.01, 0.025, 0.05, 0.075, 0.10]
k_dict_low = {}
for lab_val_id, lower_fence in lower_fences.items():
    item_values = lab_events_df.loc[lab_events_df["ITEMID"] == lab_val_id, "VALUENUM"]
    # One vectorised call returns the values of all candidate quantiles. Working on
    # an ndarray keeps the subtraction below valid even if the fence is a plain
    # Python float (a Python list minus a float would raise TypeError).
    candidates = item_values.quantile(lower_quantile_levels).to_numpy()
    # argmin returns the first index on ties, so the lowest candidate wins.
    closest_idx = np.abs(candidates - lower_fence).argmin()
    k_dict_low[lab_val_id] = round(candidates[closest_idx], 2)

In [17]:
# Winsorize every selected lab value in place: measurements above the upper cap are
# set to the upper cap, measurements below the lower cap are set to the lower cap.
# ITEMID 5211 could be negative (Base Excess), so its lower cap is used as is; for
# every other lab value the lower replacement is floored at 0.
base_excess_item_id = 5211
for lab_val_id in lab_value_item_ids:
    upper_cap = k_dict_up[lab_val_id]
    lower_cap = k_dict_low[lab_val_id]
    is_item = lab_events_df["ITEMID"].eq(lab_val_id)

    above_upper = is_item & (lab_events_df["VALUENUM"] > upper_cap)
    lab_events_df.loc[above_upper, "VALUENUM"] = upper_cap

    # NaN never satisfies "<", so missing measurements stay untouched. The former
    # `VALUENUM != np.nan` guard was always True (NaN compares unequal to everything,
    # itself included) and therefore filtered nothing, so it has been dropped.
    below_lower = is_item & (lab_events_df["VALUENUM"] < lower_cap)
    if lab_val_id == base_excess_item_id:
        lower_replacement = lower_cap
    else:
        lower_replacement = lower_cap if lower_cap > 0 else 0
    lab_events_df.loc[below_lower, "VALUENUM"] = lower_replacement

## Labevents
Add the most common blood values as well. To reduce the influence of missing data and of blood values being measured a different number of times, we will use the following strategy:

* Get the most common blood values across all HADM IDs (the threshold should ideally be above 80-90%, i.e. every value included in the initial analysis is measured at least once during the hospital stay for at least that share of stays). Keep only the values that are clinically relevant across a large population of ICU patients. This includes renal, liver and BGA parameters.

* Include the max, min and average measurement for each such value, plus a flag that indicates whether a value is in the normal range of values or not.

In [18]:
# Build per-admission summary features (min / max / average) for every selected lab
# value and attach them to icu_stay_df as new columns, looked up via HADM_ID.
for item_id, lab_name in id_to_name.items():
    # String aggregation names select the NaN-skipping groupby min / max / mean that
    # pandas has been dispatching np.min / np.max / np.nanmean to; passing the NumPy
    # callables is deprecated in recent pandas 2.x releases (FutureWarning).
    kwargs_dict = {
        f"{lab_name}_min": ("VALUENUM", "min"),
        f"{lab_name}_max": ("VALUENUM", "max"),
        f"{lab_name}_avg": ("VALUENUM", "mean"),
    }
    lab_events_df_temp = lab_events_df[lab_events_df["ITEMID"] == item_id]
    max_lab_value_per_hadm = lab_events_df_temp.groupby("HADM_ID").agg(**kwargs_dict)
    # Note on pd.merge(icu_stay_df, max_lab_value_per_hadm, on="HADM_ID"): merge
    # defaults to an inner join, which drops every ICU stay without a measurement of
    # this lab value. Mapping by HADM_ID keeps all rows and leaves NaN instead.
    for col in kwargs_dict:
        # Series.map looks every HADM_ID up in the index of the aggregated column;
        # admissions without a match become NaN.
        icu_stay_df[col] = icu_stay_df["HADM_ID"].map(max_lab_value_per_hadm[col])
        # round avg column: 10.12345 -> 10.12
        if col.endswith("_avg"):
            icu_stay_df[col] = icu_stay_df[col].round(2)
    # Possible follow-up (not implemented): a f"{lab_name}_measured" flag indicating
    # whether the lab value has been measured at all. It could be derived from the min
    # column, because a row without a minimum (or maximum or avg) has no measurement.

In [21]:
# Persist the ICU stay table, now extended by the lab value features.
output_path = "./temp_pp_data_files/data_02_2023_NEW_1.csv"
icu_stay_df.to_csv(output_path, index=False)