# Data Cleaning

# Imports and Read

In [1]:
import pandas as pd
import datetime
import numpy as np
import re

# Data Cleaning and Selection Patient-data

In [2]:
print("Start Clean and Preprocessing patients-data")

Start Clean and Preprocessing patients-data


In [3]:
df = pd.read_excel(r'../raw_data/Hypophysenpatienten.xlsx',sheet_name='no duplicate PID')

In [4]:
df.columns

Index(['%ID', 'Fall Nr.', 'Datum/Zeit', 'Modalität', 'Exam Code', 'Exam Name',
       'Abteilung', 'Arbeitsplatz.Kürzel', 'Aufnahmeart', 'PID', 'Grösse',
       'Ausfälle prä', 'Prolaktin', 'IGF1', 'Cortisol', 'fT4',
       'weiteres Labor', 'Qualität', 'ED', 'OP Datum', 'Ausfälle post',
       'Diagnose', 'Kategorie', 'Patient Alter', 'Zuweiser',
       'AnforderungDatum', 'ÜberweiserIntern.Bereich',
       'ÜberweiserIntern.Klinik'],
      dtype='object')

## Basic Cleaning, Column Selection, Anomaly Correction and Format definition

In [5]:
# define needed columns
column_list = ['PID','Fall Nr.',"Datum/Zeit","Arbeitsplatz.Kürzel",'Grösse',
       'Ausfälle prä', 'Qualität', 'ED','OP Datum', 'Ausfälle post',
       'Diagnose', 'Kategorie', 'Patient Alter',
       'Prolaktin',"IGF1", 'Cortisol','fT4','weiteres Labor']


### Data Type Definition


In [6]:
#TODO: check Tristan
# not parseable correct values corrected
df.loc[3,'ED'] = datetime.datetime(2006,1,1,0,0,0,0)
df.loc[12,'ED'] = datetime.datetime(2008,1,1,0,0,0,0)

#TODO: check Tristan
# correct a value which is not datetime parseable
df.loc[df['OP Datum'] == '2006, 2009', 'OP Datum'] = datetime.datetime(2006,1,1,0,0,0,0)

In [7]:
# make datetime values
df["Datum/Zeit"] = pd.to_datetime(df["Datum/Zeit"])
df["ED"] = pd.to_datetime(df["ED"])
df["OP Datum"] = pd.to_datetime(df["OP Datum"])

In [8]:
# TODO: anomaly? check tristan
# rows where Entry Date is after Operationdate?
df[df['OP Datum'] < df['ED']][['ED','OP Datum']]

Unnamed: 0,ED,OP Datum


In [9]:
# Patient ID Duplicate Check
assert len(df[df["PID"].duplicated()]) == 0

# Case Nr Duplicate Check
assert len(df[df["Fall Nr."].duplicated()]) == 0

In [10]:
# select and rename columns
df = df[column_list]
df= df.rename(columns={"Fall Nr.": "Case_ID","PID": "Patient_ID",
                       "Datum/Zeit": "Date_MRI","ED": "Entry_date", "OP Datum": "Operation_date",
                       "Arbeitsplatz.Kürzel":"ID_MRI_Machine","Grösse": "Adenoma_size","Qualität": "Label_Quality",
                       "Patient Alter":"Patient_age","Kategorie":"Category","Diagnose":"Diagnosis",
                       "Prolaktin":"Prolactin","weiteres Labor":"Lab_additional"})

In [11]:
# set category data type in pandas, check datatypes
df['ID_MRI_Machine'] = df['ID_MRI_Machine'].astype('category')
df['Adenoma_size'] = df['Adenoma_size'].astype('category')
df['Label_Quality'] = df['Label_Quality'].astype('category')
df['Diagnosis'] = df['Diagnosis'].astype('category')
df['Category'] = df['Category'].astype('category')
df.dtypes

Patient_ID                 int64
Case_ID                    int64
Date_MRI          datetime64[ns]
ID_MRI_Machine          category
Adenoma_size            category
Ausfälle prä              object
Label_Quality           category
Entry_date        datetime64[ns]
Operation_date    datetime64[ns]
Ausfälle post             object
Diagnosis               category
Category                category
Patient_age                int64
Prolactin                 object
IGF1                      object
Cortisol                  object
fT4                       object
Lab_additional            object
dtype: object

In [12]:
# replace and correct wrong namings from labelers
df["Ausfälle prä"]= df["Ausfälle prä"].str.replace("intak","intakt")
df["Ausfälle prä"]= df["Ausfälle prä"].str.replace("intaktt","intakt")
df["Ausfälle prä"]= df["Ausfälle prä"].str.replace("goando","gonado")
df["Ausfälle post"]= df["Ausfälle post"].str.replace("goando","gonado")
df["Ausfälle post"]= df["Ausfälle post"].str.replace("adh","ADH")

## One Hot Encode Categorical Values

To use and analyse the categorical data we need to one-hot encode them. This is done by splitting the comma separated strings into single strings and then create a one-hot-encoded column of each individual value. This column is then added to the original dataframe.

In [13]:
# Split the 'Ausfälle prä' column into separate strings
df['Ausfälle prä'] = df['Ausfälle prä'].str.split(', ')
# Create a set to store all unique disfunctions
unique_disfunctions = set()

# Iterate over the 'Ausfälle prä' column to gather unique disfunctions
for value in df['Ausfälle prä']:
    if isinstance(value, list):
        unique_disfunctions.update(value)
    elif isinstance(value, str):
        unique_disfunctions.add(value)

# Iterate over the unique disfunctions and create one-hot encoded columns
for disfunction in unique_disfunctions:
    df["Pre_OP_hormone_"+ disfunction] = df['Ausfälle prä'].apply(lambda x: 1 if (isinstance(x, list) and disfunction in x) or (x == disfunction) else 0)
# drop the original 'Ausfälle prä' column
df = df.drop('Ausfälle prä', axis=1)

In [14]:
# Split the 'Ausfälle post' column into separate strings
df['Ausfälle post'] = df['Ausfälle post'].str.split(', ')

# Create a set to store all unique disfunctions
unique_disfunctions = set()

# Iterate over the 'Ausfälle post' column to gather unique disfunctions
for value in df['Ausfälle post']:
    if isinstance(value, list):
        unique_disfunctions.update(value)
    elif isinstance(value, str):
        unique_disfunctions.add(value)

# Iterate over the unique disfunctions and create one-hot encoded columns
for disfunction in unique_disfunctions:
    df["Post_OP_hormone_"+ disfunction] = df['Ausfälle post'].apply(lambda x: 1 if (isinstance(x, list) and disfunction in x) or (x == disfunction) else 0)

# drop the original 'Ausfälle post' column
df = df.drop('Ausfälle post', axis=1)

In [15]:
df.columns

Index(['Patient_ID', 'Case_ID', 'Date_MRI', 'ID_MRI_Machine', 'Adenoma_size',
       'Label_Quality', 'Entry_date', 'Operation_date', 'Diagnosis',
       'Category', 'Patient_age', 'Prolactin', 'IGF1', 'Cortisol', 'fT4',
       'Lab_additional', 'Pre_OP_hormone_', 'Pre_OP_hormone_prolaktin',
       'Pre_OP_hormone_hyperprolakin stressinduziert',
       'Pre_OP_hormone_hypogonado', 'Pre_OP_hormone_keine',
       'Pre_OP_hormone_somatotrop', 'Pre_OP_hormone_chiasma',
       'Pre_OP_hormone_morbus cushing', 'Pre_OP_hormone_hypothyreo',
       'Pre_OP_hormone_ADH', 'Pre_OP_hormone_coritco', 'Pre_OP_hormone_somato',
       'Pre_OP_hormone_intakt', 'Pre_OP_hormone_gonado',
       'Pre_OP_hormone_gondao', 'Pre_OP_hormone_SIADH',
       'Pre_OP_hormone_thyreo', 'Pre_OP_hormone_inaktiv',
       'Pre_OP_hormone_inakiv', 'Pre_OP_hormone_hyperprolaktin stressbedingt',
       'Pre_OP_hormone_cortico', 'Pre_OP_hormone_hyperprolaktin',
       'Pre_OP_hormone_Kompression', 'Post_OP_hormone_thyreo',
  

In [16]:
df= df[['Patient_ID','Case_ID', 'Date_MRI', 'ID_MRI_Machine','Entry_date', 'Operation_date', 'Adenoma_size',
         'Diagnosis',
       'Category', 'Patient_age', 'Prolactin',"IGF1", 'Cortisol','fT4','Lab_additional',
       'Pre_OP_hormone_cortico', 'Pre_OP_hormone_gonado',
       'Pre_OP_hormone_somato', 'Pre_OP_hormone_thyreo',
       'Pre_OP_hormone_hyperprolaktin', 'Pre_OP_hormone_keine',
       'Pre_OP_hormone_intakt', 'Post_OP_hormone_cortico',
       'Post_OP_hormone_gonado', 'Post_OP_hormone_somato',
       'Post_OP_hormone_ADH', 'Post_OP_hormone_thyreo',
       'Post_OP_hormone_hyperprolaktin', 'Post_OP_hormone_keine',
       'Post_OP_hormone_intakt', 'Label_Quality',]]

In [17]:
df.to_csv(r'../raw_data/label_data.csv',index=False)

In [18]:
print("End Clean and Preprocessing patient data")

End Clean and Preprocessing patient data


# Data Cleaning and Selection Lab-data

In [41]:
print("Start Clean and Preprocessing lab-data")

Start Clean and Preprocessing lab-data


## Read

In [62]:
lab_data = pd.read_excel("../raw_data/extract_pit.xlsx").rename(columns={"PATIENT_NR":"Patient_ID","FALL_NR":"Case_ID","Analyse-ID":"Lab_ID"})

In [63]:
lab_data.columns

Index(['Case_ID', 'Patient_ID', 'Analyse', 'Lab_ID', 'Auftraggeber',
       'Datum_Resultat', 'Auftragsdatum', 'Resultat', 'Einheit', 'Normwert',
       'Fallart', 'Warnung'],
      dtype='object')

In [72]:
lab_data['Lab_ID'] = lab_data['Lab_ID'].replace({20396:'IGF1',24382:'PROL',24384:'PROL',24383:'PROL'})

In [73]:
lab_data['Lab_ID'].unique()

array(['LH', 'FSH', 'FT4', 'PROL', 'TEST', 'IGF1', 'TBILHB', 'COR60',
       'COR30', 'ABTEST'], dtype=object)

In [74]:
ids = {'Ã¼': 'ü', 'Ã¤': 'ä', "Ã„":"Ä","√§":"ä"}

for column in lab_data.columns[lab_data.columns.isin(["Case_ID","Patient_ID","Datum_Resultat","Auftragsdatum"]) == False]:
    for old, new in ids.items():
        lab_data[column] = lab_data[column].str.replace(old, new, regex=False)
clean_result = lambda result: re.sub(r'(?<!\d)\.', '', re.sub(r'[^\d.]', '', str(result))) #clean < zahl / > zahl / 1 A zahl
lab_data["Resultat"] = lab_data["Resultat"].apply(clean_result) 
lab_data = lab_data[lab_data["Resultat"] != ""]
lab_data["Resultat"] = lab_data["Resultat"].astype(float)
lab_data["Normwert"] = lab_data["Normwert"].str.lower().str.replace(" ", "").replace("",np.nan)
lab_data["Warnung"] = lab_data["Warnung"].replace('   ',np.nan)

In [75]:
assert lab_data["Datum_Resultat"].min() > pd.to_datetime("1995-01-01")

In [76]:
lab_data['Lab_ID'].unique()

array(['LH', 'FSH', 'FT4', 'PROL', 'TEST', 'IGF1', 'COR60', 'COR30',
       'TBILHB'], dtype=object)

In [77]:
lab_data_model = lab_data.drop(columns = ["Warnung","Einheit","Auftraggeber","Normwert","Analyse","Auftragsdatum"])

In [78]:
lab_data

Unnamed: 0,Case_ID,Patient_ID,Analyse,Lab_ID,Auftraggeber,Datum_Resultat,Auftragsdatum,Resultat,Einheit,Normwert,Fallart,Warnung
0,41505731,23613,LH (luteinisierendes Hormon),LH,ENDOKRINOLOGIE,2023-09-08,20230908,139.0,U/l,"1,50-9,30",ambulant,ausserhalb Norm
1,41919082,300065854,FSH (Follikel-stimulierendes Hormon),FSH,P√ÑD.ENDOKRINO/DIABET.,2023-07-10,20230710,225.0,U/l,"1,40-18,1",ambulant,
2,41643796,628910,LH (luteinisierendes Hormon),LH,GYN.ENDOKR.,2023-01-16,20230116,399.0,U/l,zyklusabhängig,ambulant,
3,41424235,444742,FSH (Follikel-stimulierendes Hormon),FSH,P√ÑD.ENDOKRINO/DIABET.,2023-03-31,20230331,300.0,U/l,"1,40-18,1",ambulant,
4,41777042,500857,LH (luteinisierendes Hormon),LH,ENDOKRINOLOGIE,2023-03-07,20230307,223.0,U/l,zyklusabhängig,ambulant,
...,...,...,...,...,...,...,...,...,...,...,...,...
93563,40580475,416229,fT4 (freies Thyroxin),FT4,ENDOKRINOLOGIE,2018-09-26,20180926,109.0,pmol/l,"9,9-19,3",ambulant,
93565,40351451,300077868,Testosteron,TEST,KINDERKLINIK,2018-07-04,20180704,563.0,nmol/l,<32,ambulant,
93566,10111455,484286,FSH (Follikel-stimulierendes Hormon),FSH,SPIELMANN-SANDMEIER,2018-02-08,20180208,620.0,U/l,,no,
93567,40556611,313571,fT4 (freies Thyroxin),FT4,NUKLEARMEDIZIN,2018-10-08,20181008,163.0,pmol/l,"9,9-19,3",ambulant,


In [79]:
# mean of results of same date
lab_data_model = lab_data_model.groupby(["Patient_ID","Lab_ID","Datum_Resultat"])["Resultat"].agg([np.mean]).reset_index()

  lab_data_model = lab_data_model.groupby(["Patient_ID","Lab_ID","Datum_Resultat"])["Resultat"].agg([np.mean]).reset_index()


In [80]:
lab_data_model['Lab_ID'].unique()

array(['FT4', 'TEST', 'PROL', 'COR60', 'FSH', 'LH', 'IGF1', 'COR30',
       'TBILHB'], dtype=object)

In [81]:
patient_data = pd.read_excel(r'../raw_data/Hypophysenpatienten.xlsx',sheet_name='w duplicates').loc[:,["PID","Datum/Zeit","Fall Nr."]].rename(columns={"PID":"Patient_ID","Fall Nr.":"Case_ID"})

In [82]:
# if there are multiple
n_patients = len(patient_data)
patient_data = patient_data.groupby(["Patient_ID","Case_ID"])["Datum/Zeit"].max().reset_index()
print(f"{n_patients-len(patient_data)} Cases were deleted, because they were duplicates.")

69 Cases were deleted, because they were duplicates.


In [83]:
lab_data_model = pd.merge(lab_data_model,patient_data,on="Patient_ID",how = "right")
lab_data_model = lab_data_model[lab_data_model["Datum/Zeit"] >= lab_data_model["Datum_Resultat"]].drop(columns="Datum/Zeit")

In [84]:
# Compute minimum date for each patient and analysis
max_dates = lab_data_model.groupby(['Patient_ID', "Lab_ID","Case_ID"])['Datum_Resultat'].max().reset_index()
# Merge with the original DataFrame to filter rows with minimum dates
lab_data_model = pd.merge(lab_data_model, max_dates, on=['Patient_ID', 'Lab_ID', 'Datum_Resultat',"Case_ID"])

In [85]:
# check for any duplicate Values
assert len(lab_data_model.loc[:,["Case_ID","Lab_ID"]].drop_duplicates()) == len(lab_data_model)

In [86]:
lab_data_model = lab_data_model.pivot(index=["Patient_ID","Case_ID"],values = ['mean'], columns = ['Lab_ID'])
lab_data_model.columns = lab_data_model.columns.droplevel()
lab_data_model = lab_data_model.reset_index()

### Create LabData from label data

In [98]:
df_additional_lab = pd.read_csv(r'../raw_data/label_data.csv').rename(columns={'Cortisol':'COR60','fT4':'FT4','Prolactin':'PROL'})[['Patient_ID','Case_ID','COR60','FT4','PROL','IGF1']]
df_additional_lab.columns
df_additional_lab = df_additional_lab.dropna(subset=['PROL','IGF1','COR60','FT4',]).reset_index(drop=True)
df_additional_lab

Unnamed: 0,Patient_ID,Case_ID,COR60,FT4,PROL,IGF1
0,300228153,41707994,329,10.1,173mU/l,6.3nmol/l
1,300312446,41718174,271,8.4,743mU/l,20.2nmol/l
2,36127,41579190,110,7.3,687mU/l,75.4ng/ml
3,300291886,41169249,311 nmol/l,14.6 pmol/l,7.8 ug/l,208 ng/ml
4,560863,40469555,607,11.4,269ug/l,22.7nmol
5,459429,40603831,703,13.2,7ug/l,32.5nmol/l
6,17081,40573077,766,14.3,13.5ug/L,16.6nmol
7,112374,40541632,334,11,381ug/l,15.9nmol
8,113792,40525843,1380,11.8,366ug/l,14.9nmol
9,242880,40419128,1213,17,954ug/l,22nmol


In [None]:
# get indices which need to be converted
indices_to_divide = df.loc[df["Prolaktin"].str.contains('ug/l'),'Prolaktin'].index 
# remove units and strings
df['Prolaktin'] = df['Prolaktin'].str.rstrip(r'mU/l')
df['Prolaktin'] = df['Prolaktin'].str.rstrip(r'ug/l')
df['Prolaktin'] = df['Prolaktin'].astype(float)
# ug/l -> mU/l (ug/l * 21.2)
df.loc[indices_to_divide,'Prolaktin'] = df.loc[indices_to_divide,'Prolaktin'] * 21.2


In [None]:
lab_data_model

In [None]:
print(f"Sparsity of labordata: {round(lab_data_model.isna().mean().mean(),3)} % (nur von Fällen mit Laborwerten)")
print(f"Von {len(patient_data)-len(lab_data_model)} Fällen gibt es keine Laborwerte.")

In [None]:
lab_data_model.to_csv(r'../raw_data/labor_data_preprocessed.csv',index=False)

In [None]:
print("End Clean and Preprocessing labor data")