# Data Preprocessing
- Build a data processing pipeline that takes the raw data and transforms them into a NN trainable dataset

## Imports

In [1]:
# data processing
import numpy as np
import pandas as pd

# data viz
import matplotlib.pyplot as plt
import seaborn as sn 

# gui
from tqdm.auto import tqdm
tqdm.pandas()


#funtions

def doing(string):
    print(string+"...", end="")

def done():
    print("✅")
    #print(" \033[91m\033[1mDone.\033[0m")
    
def choose_target_period(patition,target_name="ICP_Vital", mins_before_first_icp = 1440, min_icp_val=-10, max_icp_val=105):
    '''
    1. Filter ICP outliers taht are outside the 0.999 quantile (-10,105)
    2. Drop steps after last ICP measurement
    3. Shift relative time to start with 0 after removing rows
    
        Parameters:
                patition (DataFrame): The Dataframe we want to transform
                target_name(str): The column name of the ICP measurement value
                mins_before_first_icp (int): The number of minutes before the first ICP we want to capture
                min_icp_val (int): The minimum icp value that will be kept in the dataframe
                max_icp_val (int): The maximum icp value that will be kept in the dataframe                
                
        Return:
            patition (DataFrame): The transformed dataframe that was passed to the function
    
    '''
    # DF with only rows that have ICP as "Maßnahme"to check when the first and last ICP happened
    target_rows = patition[patition["Maßnahme"] == target_name]
    
    # create a mask that is False where values are outside specified range
    outlier_mask = (target_rows["Wert"] >= min_icp_val) & (target_rows["Wert"] <= min_icp_val)
    target_rows = target_rows[outlier_mask]
    
    
    # Check is patition has steps
    if len(target_rows) == 0:
        return patition.iloc[0:0]
    
    # kick out targets beyond thresholds
    min_time_threashold = min(target_rows["rel_time"]) # since target steps only has ICP measurements, this is the first ICP measurement for the patient
    min_time_threashold = min_time_threashold - mins_before_first_icp # But we want to have 1440 minutes before the first measurement, so we need to substract it
    max_time_threashold = max(target_rows["rel_time"]) # We want to remove rows that occure after the last ICP
    
    # kick out (basically set to NaN) measurements beyond interesting region
    patition = patition[patition["rel_time"] >= min_time_threashold]
    patition = patition[patition["rel_time"] <= max_time_threashold]
    
    # subtract the minimum time value from al values, to shift the first row to have a rel_time of 0
    patition["rel_time"] -= min(patition["rel_time"])


    return patition

def rename_nbd(name):
    '''
    merge "NBD" (non-invasive ways of measuring blood pressure) with invasive ways as they measure the same thing
    
        Parameters:
            name(string): the name of the measurement we want to rename
        Return:
            name(string): the name after we have altered it to the generel name
    '''
    if "syst" in name:
        name = "syst_Vital"
    elif "diast" in name:
        name = "diast_Vital"
    elif "mittl" in name:
        name = "mittl_Vital"
        
    return name

# Loade the dataset 
doing("Loading Dataset")
datenbank_werte_df = pd.read_csv("../icp_prediction/data/Datenbank_Werte.csv")
done()## Create ICP data set 
Can be skipped if file already exists.

In [2]:
# Loade the dataset 
doing("Loading Dataset")
datenbank_werte_df = pd.read_csv("../icp_prediction/data/Datenbank_Werte.csv")
done()

Loading Dataset...

  exec(code_obj, self.user_global_ns, self.user_ns)


✅


In [6]:
datenbank_werte_df.head(10)

Unnamed: 0,Pat_ID,ID,Maßnahme,Maßnahme_norm,rel_time,Wert,DB
0,123456798,BGA,Glu,Glu,-2880.0,100.0,UKE
1,474097135,BGA,Ca,Ca,419.0,1.23,UKE
2,474097135,BGA,Ca,Ca,492.0,1.28,UKE
3,474097135,BGA,Ca,Ca,558.0,1.29,UKE
4,474097135,BGA,Ca,Ca,675.0,1.31,UKE
5,474097135,BGA,Ca,Ca,764.0,1.31,UKE
6,474097135,BGA,Ca,Ca,956.0,1.34,UKE
7,474097135,BGA,Ca,Ca,1158.0,1.36,UKE
8,474097135,BGA,Ca,Ca,1240.0,1.36,UKE
9,474097135,BGA,Ca,Ca,1469.0,1.35,UKE


In [8]:
datenbank_werte_df[datenbank_werte_df.Maßnahme == 'Glu'].head(50)

Unnamed: 0,Pat_ID,ID,Maßnahme,Maßnahme_norm,rel_time,Wert,DB
0,123456798,BGA,Glu,Glu,-2880.0,100.0,UKE
94,474097135,BGA,Glu,Glu,419.0,109.0,UKE
95,474097135,BGA,Glu,Glu,492.0,123.0,UKE
96,474097135,BGA,Glu,Glu,558.0,115.0,UKE
97,474097135,BGA,Glu,Glu,675.0,115.0,UKE
98,474097135,BGA,Glu,Glu,764.0,131.0,UKE
99,474097135,BGA,Glu,Glu,956.0,86.0,UKE
100,474097135,BGA,Glu,Glu,1158.0,121.0,UKE
101,474097135,BGA,Glu,Glu,1240.0,131.0,UKE
102,474097135,BGA,Glu,Glu,1469.0,125.0,UKE


In [64]:
# Loade the dataset 
doing("Loading Dataset")
datenbank_werte_df = pd.read_csv("../icp_prediction/data/Datenbank_Werte.csv")
done()

# remove the eICU values from the dataset
doing("Remove eICU values")
datenbank_werte_df = datenbank_werte_df[datenbank_werte_df["DB"] != "eICU"]
done()

# add the type of the Masßnahme (ID = ['BGA' 'Labor' 'Vital' 'Med']) to the Masßnahme value and drop the ID column
doing("Combine Maßnahme and ID column")
datenbank_werte_df["Maßnahme"] += "_" + datenbank_werte_df["ID"]
datenbank_werte_df = datenbank_werte_df.drop(columns="ID")
done()

# drop measurements where we have a "NAN"
nan_values = datenbank_werte_df['Wert'].isnull().sum()
doing("Dropping {} nan values".format(nan_values))
datenbank_werte_df = datenbank_werte_df.dropna(subset=["Wert"])
done()

# drop duplicates
doing("Dropping duplicates")
datenbank_werte_df = datenbank_werte_df.drop_duplicates(subset=["Pat_ID", "Maßnahme", "rel_time", "Wert"])
done()

# groupby the Pat ID and apply choose_target_period to each patition
doing("Remove outliers and shift relative time to start with 0")
datenbank_werte_df = datenbank_werte_df.groupby("Pat_ID").apply(lambda patition: choose_target_period(patition))
done()

# resetting the DF index, becasue we have removed many rows
doing("Reset index")
datenbank_werte_df = datenbank_werte_df.reset_index(drop=True)
done()

# renaming NBDs because multiple measures measure the same thing
doing("Renaming NBDs")
datenbank_werte_df["Maßnahme"] = datenbank_werte_df["Maßnahme"].apply(rename_nbd)
done()

# save the DF to a .csv file
file_name = "data/Datenbank_icp.csv"
doing("Saving dataframe to '{}'".format(file_name))
datenbank_werte_df.to_csv(file_name, index=False)
done()

Loading Dataset...✅
Remove eICU values...✅
Combine Maßnahme and ID column...✅
Dropping 4271 nan values...✅
Dropping duplicates...✅
Remove outliers and shift relative time to start with 0...✅
Reset index...✅
Renaming NBDs...✅
Saving dataframe to 'data/Datenbank_icp.csv'...✅


## Loade the 'Datenbank_icp.csv' file
- determine columns that have few missings
- select high count vitals 

In [66]:
# load the 'Datenbank_icp.csv' file
icp_df = pd.read_csv("data/Datenbank_icp.csv")

In [None]:
icp_df_anton = pd.read_csv("../icp_prediction/data/Datenbank_icp.csv")

In [67]:
print(len(icp_df["Pat_ID"]))
print(len(icp_df_anton["Pat_ID"]))

612768
81746137
