# Data Cleaning and Selection Patientendaten

## Imports and Read

In [1]:
import pandas as pd
import datetime
import numpy as np
import re

In [2]:
print("Start Clean and Preprocessing data patient")

Start Clean and Preprocessing data patient


In [3]:
df = pd.read_excel(r'../raw_data/Hypophysenpatienten.xlsx',sheet_name='no duplicate PID')

In [4]:
df.columns

Index(['%ID', 'Fall Nr.', 'Datum/Zeit', 'Modalität', 'Exam Code', 'Exam Name',
       'Abteilung', 'Arbeitsplatz.Kürzel', 'Aufnahmeart', 'PID', 'Grösse',
       'Ausfälle prä', 'Prolaktin', 'IGF1', 'Cortisol', 'fT4',
       'weiteres Labor', 'Qualität', 'ED', 'OP Datum', 'Ausfälle post',
       'Diagnose', 'Kategorie', 'Patient Alter', 'Zuweiser',
       'AnforderungDatum', 'ÜberweiserIntern.Bereich',
       'ÜberweiserIntern.Klinik'],
      dtype='object')

## Basic Cleaning, Column Selection, Anomaly Correction and Format definition

In [5]:
# define needed columns
column_list = ['PID',"Datum/Zeit","Arbeitsplatz.Kürzel",'Grösse',
       'Ausfälle prä', 'Qualität', 'ED','OP Datum', 'Ausfälle post',
       'Diagnose', 'Kategorie', 'Patient Alter',
       'Prolaktin',"IGF1", 'Cortisol','fT4','weiteres Labor']


### Data Type Definition


In [6]:
#TODO: check Tristan
# not parseable correct values corrected
df.loc[3,'ED'] = datetime.datetime(2006,1,1,0,0,0,0)
df.loc[12,'ED'] = datetime.datetime(2008,1,1,0,0,0,0)

#TODO: check Tristan
# correct a value which is not datetime parseable
df.loc[df['OP Datum'] == '2006, 2009', 'OP Datum'] = datetime.datetime(2006,1,1,0,0,0,0)

In [7]:
# make datetime values
df["Datum/Zeit"] = pd.to_datetime(df["Datum/Zeit"])
df["ED"] = pd.to_datetime(df["ED"])
df["OP Datum"] = pd.to_datetime(df["OP Datum"])

In [8]:
# TODO: anomaly? check tristan
# rows where Entry Date is after Operationdate?
df[df['OP Datum'] < df['ED']][['ED','OP Datum']]

Unnamed: 0,ED,OP Datum
20,2023-09-02,2009-02-18


### Impute NAN Lab Data

In [9]:
# TODO: set impute value
df.loc[df["Prolaktin"].isna(),'Prolaktin'] = '0'
df.loc[df["IGF1"].isna(),'IGF1'] = '0'
df.loc[df["Cortisol"].isna(),'Cortisol'] = '0'
df.loc[df["fT4"].isna(),'fT4'] = '0'


### Unit Conversion Lab Data

In [10]:
df["Prolaktin"].unique()

array(['0', '173mU/l', '743mU/l', '687mU/l', '7.8 ug/l'], dtype=object)

In [11]:
# get indices which need to be converted
indices_to_divide = df.loc[df["Prolaktin"].str.contains('ug/l'),'Prolaktin'].index 
# remove units and strings
df['Prolaktin'] = df['Prolaktin'].str.rstrip(r'mU/l')
df['Prolaktin'] = df['Prolaktin'].str.rstrip(r'ug/l')
df['Prolaktin'] = df['Prolaktin'].astype(float)
# ug/l -> mU/l (ug/l * 21.2)
df.loc[indices_to_divide,'Prolaktin'] = df.loc[indices_to_divide,'Prolaktin'] * 21.2


In [12]:
df["IGF1"].unique()

array(['0', '6.3nmol/l', '20.2nmol/l', '75.4ng/ml', '208 ng/ml'],
      dtype=object)

In [13]:
# get indices which need to be converted
indices_to_divide = df.loc[df["IGF1"].str.contains('ng/ml'),'IGF1'].index 
# remove units and strings
df['IGF1'] = df['IGF1'].str.rstrip(r'nmol/l')
df['IGF1'] = df['IGF1'].str.rstrip(r'ng/ml')
df['IGF1'] = df['IGF1'].astype(float)
# ng/ml -> nmol/l (ng/ml / 7.65)
df.loc[indices_to_divide,'IGF1'] = df.loc[indices_to_divide,'IGF1'] / 7.65


In [14]:
df["Cortisol"].unique()

array(['0', 329, 271, 110, '311 nmol/l'], dtype=object)

In [15]:
# # get indices which need to be converted
# indices_to_divide = df.loc[df["Cortisol"].str.contains('ng/ml'),'Cortisol'].index 
# # remove units and strings
# df['Cortisol'] = df['Cortisol'].str.rstrip(r'nmol/l')
# df['Cortisol'] = df['IGF1'].str.rstrip(r'ng/ml')
# df['Cortisol'] = df['Cortisol'].astype(float)
# # ng/ml -> nmol/l (ng/ml *  27.59)
# df.loc[indices_to_divide,'Cortisol'] = df.loc[indices_to_divide,'Cortisol'] / 7.65


In [16]:
df["fT4"].unique()

array(['0', 10.1, 8.4, 7.3, '14.6 pmol/l'], dtype=object)

In [17]:
# # get indices which need to be converted
# indices_to_divide = df.loc[df["fT4"].str.contains('ng/ml'),'fT4'].index 
# # remove units and strings
# df['fT4'] = df['fT4'].str.rstrip(r'nmol/l')
# df['fT4'] = df['fT4'].str.rstrip(r'ng/ml')
# df['fT4'] = df['fT4'].astype(float)
# # ng/ml -> nmol/l (ng/ml / 7.65)
# df.loc[indices_to_divide,'fT4'] = df.loc[indices_to_divide,'fT4'] / 7.65


In [18]:
# Patient ID Duplicate Check
assert len(df[df["PID"].duplicated()]) == 0

# Case Nr Duplicate Check
assert len(df[df["Fall Nr."].duplicated()]) == 0

In [19]:
# select and rename columns
df = df[column_list]
df= df.rename(columns={"Fall Nr.": "Case_ID","PID": "Patient_ID",
                       "Datum/Zeit": "Date_MRI","ED": "Entry_date", "OP Datum": "Operation_date",
                       "Arbeitsplatz.Kürzel":"ID_MRI_Machine","Grösse": "Adenoma_size","Qualität": "Label_Quality",
                       "Patient Alter":"Patient_age","Kategorie":"Category","Diagnose":"Diagnosis",
                       "Prolaktin":"Prolactin","weiteres Labor":"Lab_additional"})

In [20]:
# set category data type in pandas, check datatypes
df['ID_MRI_Machine'] = df['ID_MRI_Machine'].astype('category')
df['Adenoma_size'] = df['Adenoma_size'].astype('category')
df['Label_Quality'] = df['Label_Quality'].astype('category')
df['Diagnosis'] = df['Diagnosis'].astype('category')
df['Category'] = df['Category'].astype('category')
df.dtypes

Patient_ID                 int64
Date_MRI          datetime64[ns]
ID_MRI_Machine          category
Adenoma_size            category
Ausfälle prä              object
Label_Quality           category
Entry_date        datetime64[ns]
Operation_date    datetime64[ns]
Ausfälle post             object
Diagnosis               category
Category                category
Patient_age                int64
Prolactin                float64
IGF1                     float64
Cortisol                  object
fT4                       object
Lab_additional            object
dtype: object

In [21]:
# replace and correct wrong namings from labelers
df["Ausfälle prä"]= df["Ausfälle prä"].str.replace("intak","intakt")
df["Ausfälle prä"]= df["Ausfälle prä"].str.replace("goando","gonado")
df["Ausfälle post"]= df["Ausfälle post"].str.replace("goando","gonado")
df["Ausfälle post"]= df["Ausfälle post"].str.replace("adh","ADH")

## One Hot Encode Categorical Values

To use and analyse the categorical data we need to one-hot encode them. This is done by splitting the comma separated strings into single strings and then create a one-hot-encoded column of each individual value. This column is then added to the original dataframe.

In [22]:
# Split the 'Ausfälle prä' column into separate strings
df['Ausfälle prä'] = df['Ausfälle prä'].str.split(', ')
# Create a set to store all unique disfunctions
unique_disfunctions = set()

# Iterate over the 'Ausfälle prä' column to gather unique disfunctions
for value in df['Ausfälle prä']:
    if isinstance(value, list):
        unique_disfunctions.update(value)
    elif isinstance(value, str):
        unique_disfunctions.add(value)

# Iterate over the unique disfunctions and create one-hot encoded columns
for disfunction in unique_disfunctions:
    df["Pre_OP_hormone_"+ disfunction] = df['Ausfälle prä'].apply(lambda x: 1 if (isinstance(x, list) and disfunction in x) or (x == disfunction) else 0)
# drop the original 'Ausfälle prä' column
df = df.drop('Ausfälle prä', axis=1)

In [23]:
# Split the 'Ausfälle post' column into separate strings
df['Ausfälle post'] = df['Ausfälle post'].str.split(', ')

# Create a set to store all unique disfunctions
unique_disfunctions = set()

# Iterate over the 'Ausfälle post' column to gather unique disfunctions
for value in df['Ausfälle post']:
    if isinstance(value, list):
        unique_disfunctions.update(value)
    elif isinstance(value, str):
        unique_disfunctions.add(value)

# Iterate over the unique disfunctions and create one-hot encoded columns
for disfunction in unique_disfunctions:
    df["Post_OP_hormone_"+ disfunction] = df['Ausfälle post'].apply(lambda x: 1 if (isinstance(x, list) and disfunction in x) or (x == disfunction) else 0)

# drop the original 'Ausfälle post' column
df = df.drop('Ausfälle post', axis=1)

In [24]:
df.columns

Index(['Patient_ID', 'Date_MRI', 'ID_MRI_Machine', 'Adenoma_size',
       'Label_Quality', 'Entry_date', 'Operation_date', 'Diagnosis',
       'Category', 'Patient_age', 'Prolactin', 'IGF1', 'Cortisol', 'fT4',
       'Lab_additional', 'Pre_OP_hormone_', 'Pre_OP_hormone_coritco',
       'Pre_OP_hormone_hypothyreo', 'Pre_OP_hormone_morbus cushing',
       'Pre_OP_hormone_hyperprolaktin', 'Pre_OP_hormone_gondao',
       'Pre_OP_hormone_gonado', 'Pre_OP_hormone_cortico',
       'Pre_OP_hormone_thyreo', 'Pre_OP_hormone_SIADH',
       'Pre_OP_hormone_hyperprolakin stressinduziert',
       'Pre_OP_hormone_hypogonado', 'Pre_OP_hormone_intakt',
       'Pre_OP_hormone_somato', 'Pre_OP_hormone_keine',
       'Pre_OP_hormone_hyperprolaktin stressbedingt', 'Pre_OP_hormone_inaktiv',
       'Pre_OP_hormone_Kompression',
       'Post_OP_hormone_LH und FSH immunohistoch. Expression',
       'Post_OP_hormone_ADH', 'Post_OP_hormone_hyperprolaktin',
       'Post_OP_hormone_gonado', 'Post_OP_hormone_cortic

In [25]:
df= df[['Patient_ID', 'Date_MRI', 'ID_MRI_Machine','Entry_date', 'Operation_date', 'Adenoma_size',
         'Diagnosis',
       'Category', 'Patient_age', 'Prolactin',"IGF1", 'Cortisol','fT4','Lab_additional',
       'Pre_OP_hormone_cortico', 'Pre_OP_hormone_gonado',
       'Pre_OP_hormone_somato', 'Pre_OP_hormone_thyreo',
       'Pre_OP_hormone_hyperprolaktin', 'Pre_OP_hormone_keine',
       'Pre_OP_hormone_intakt', 'Post_OP_hormone_cortico',
       'Post_OP_hormone_gonado', 'Post_OP_hormone_somato',
       'Post_OP_hormone_ADH', 'Post_OP_hormone_thyreo',
       'Post_OP_hormone_hyperprolaktin', 'Post_OP_hormone_keine',
       'Post_OP_hormone_intakt', 'Label_Quality',]]

In [26]:
df.to_csv(r'../raw_data/label_data.csv',index=False)

In [27]:
print("End Clean and Preprocessing patient data")

End Clean and Preprocessing patient data


# Data Cleaning and Selection Labor data

In [28]:
print("Start Clean and Preprocessing labor data")

Start Clean and Preprocessing labor data


## Read

In [47]:
labor_data = pd.read_excel("../raw_data/extract_pit.xlsx")

In [48]:
ids = {'Ã¼': 'ü', 'Ã¤': 'ä', "Ã„":"Ä","√§":"ä"}

for column in labor_data.columns[labor_data.columns.isin(["FALL_NR","PATIENT_NR","Datum_Resultat","Auftragsdatum"]) == False]:
    for old, new in ids.items():
        labor_data[column] = labor_data[column].str.replace(old, new, regex=False)
clean_result = lambda result: re.sub(r'(?<!\d)\.', '', re.sub(r'[^\d.]', '', str(result))) #clean < zahl / > zahl / 1 A zahl
labor_data["Resultat"] = labor_data["Resultat"].apply(clean_result) 
labor_data = labor_data[labor_data["Resultat"] != ""]
labor_data["Resultat"] = labor_data["Resultat"].astype(float)
labor_data["Normwert"] = labor_data["Normwert"].str.lower().str.replace(" ", "").replace("",np.nan)
labor_data["Warnung"] = labor_data["Warnung"].replace('   ',np.nan)

In [49]:
assert labor_data["Datum_Resultat"].min() > pd.to_datetime("1995-01-01")

In [61]:
labor_data_for_modell = labor_data.drop(columns = ["Warnung","Einheit","Auftraggeber","Normwert","Analyse","Auftragsdatum","FALL_NR"])

In [62]:
# mean of results of same date
labor_data_for_modell = labor_data_for_modell.groupby(["PATIENT_NR","Analyse-ID","Datum_Resultat"])["Resultat"].agg([np.mean]).reset_index()

In [63]:
patient_data = pd.read_excel(r'../raw_data/Hypophysenpatienten.xlsx',sheet_name='w duplicates').loc[:,["PID","Datum/Zeit","Fall Nr."]].rename(columns={"PID":"PATIENT_NR"})

In [64]:
# if there are multiple
n_patients = len(patient_data)
patient_data = patient_data.groupby(["PATIENT_NR","Fall Nr."])["Datum/Zeit"].max().reset_index()
print(f"{n_patients-len(patient_data)} Fälle wurden gelöscht, weil sie mehrfach vorkommen.")

69 Fälle wurden gelöscht, weil sie mehrfach vorkommen.


In [65]:
labor_data_for_modell = pd.merge(labor_data_for_modell,patient_data,on="PATIENT_NR",how = "right")
labor_data_for_modell = labor_data_for_modell[labor_data_for_modell["Datum/Zeit"] >= labor_data_for_modell["Datum_Resultat"]].drop(columns="Datum/Zeit")

In [67]:
# Compute minimum date for each patient and analysis
max_dates = labor_data_for_modell.groupby(['PATIENT_NR', "Analyse-ID","Fall Nr."])['Datum_Resultat'].max().reset_index()
# Merge with the original DataFrame to filter rows with minimum dates
labor_data_for_modell = pd.merge(labor_data_for_modell, max_dates, on=['PATIENT_NR', 'Analyse-ID', 'Datum_Resultat',"Fall Nr."])

In [69]:
# check for any duplicate Values
assert len(labor_data_for_modell.loc[:,["Fall Nr.","Analyse-ID"]].drop_duplicates()) == len(labor_data_for_modell)

In [70]:
labor_data_for_modell = labor_data_for_modell.pivot(index=["PATIENT_NR","Fall Nr."],values = ['mean'], columns = ['Analyse-ID'])
labor_data_for_modell.columns = labor_data_for_modell.columns.droplevel()
labor_data_for_modell = labor_data_for_modell.reset_index()

In [72]:
print(f"Sparsity of labordata: {round(labor_data_for_modell.isna().mean().mean(),3)} % (nur von Fällen mit Laborwerten)")
print(f"Von {len(patient_data)-len(labor_data_for_modell)} Fällen gibt es keine Laborwerte.")

Sparsity of labordata: 0.254 % (nur von Fällen mit Laborwerten)
Von 541 Fällen gibt es keine Laborwerte.


In [73]:
labor_data_for_modell.to_csv(r'../raw_data/labor_data_preprocessed.csv',index=False)

In [46]:
print("End Clean and Preprocessing labor data")

End Clean and Preprocessing labor data
