# Data Partitioning

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
random_state = 123

In [2]:
print("Start Data Partitioning")

Start Data Partitioning


In [3]:
# read cleaned data
df = pd.read_csv(r'../raw_data/label_data.csv')
df.head()

Unnamed: 0,Patient_ID,Date_MRI,ID_MRI_Machine,Entry_date,Operation_date,Adenoma_size,Diagnosis,Category,Patient_age,Prolactin,...,Pre_OP_hormone_intakt,Post_OP_hormone_cortico,Post_OP_hormone_gonado,Post_OP_hormone_somato,Post_OP_hormone_ADH,Post_OP_hormone_thyreo,Post_OP_hormone_hyperprolaktin,Post_OP_hormone_keine,Post_OP_hormone_intakt,Label_Quality
0,300146159,2023-05-11 09:00:00,MRI3,2021-09-01,2021-09-17,makro,inaktiv (gonado),non-prolaktinom,57,0.0,...,0,0,1,0,0,0,0,0,0,
1,762512,2023-05-06 08:12:00,MRI3,2018-09-01,2018-09-19,makro,gh,non-prolaktinom,66,0.0,...,0,0,0,0,0,1,0,0,0,
2,365189,2023-05-05 14:19:00,MRI3,,,,,,32,0.0,...,0,0,0,0,0,0,0,0,0,keine daten
3,543641,2023-05-05 07:54:00,MRI2,2006-01-01,2009-06-04,mikro,acth,non-prolaktinom,39,0.0,...,0,0,0,0,0,0,0,1,0,
4,300302329,2023-05-02 12:04:00,MRI3,2021-10-01,2022-01-12,makro,inaktiv,non-prolaktinom,56,0.0,...,1,0,0,0,0,0,0,0,1,


## Train/Test Split Strategy
We will Split the Data into a Train- and Testset. 
We are splitting each patient fully into either the train or the test set to avoid data leakage.
This is ensured checking for Patient ID duplicates (assert statement beneath) in the Dataframe. If each row only corresponds to one patient we can savely split the dataframe.


Also we are gonna stratify the split on the binary labels to ensure that the train- and testset include about the same of each class.

In [4]:
# drop patients without a label
df= df[~df["Category"].isna()]

In [5]:
# Patient ID Duplicate Check
assert len(df[df["Patient_ID"].duplicated()]) == 0

In [6]:
# split columns into features and labels
X = df.drop(columns=["Category"])
y = df["Category"]
# stratified train/test split on labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=random_state)

In [7]:
print("Total Dataframe Train rows:", len(X_train))
print("Total Dataframe Test rows:", len(X_test))

Total Dataframe Train rows: 138
Total Dataframe Test rows: 35


In [8]:
label_diff = ((y_train.value_counts(normalize=True) - y_test.value_counts(normalize=True)) *100).iloc[1]

In [9]:
print(f"Label Distribution relative Difference between Train- and Testset:\n","±",np.round(label_diff,3),"%")

Label Distribution relative Difference between Train- and Testset:
 ± 0.166 %


In [10]:
# Create dataframes for training and test data
train_data = pd.DataFrame(X_train)
train_data['Category'] = y_train

test_data = pd.DataFrame(X_test)
test_data['Category'] = y_test

In [27]:
# read cleaned labor data
labor_data = pd.read_csv(r'../raw_data/labor_data_preprocessed.csv')
labor_data.head()

Unnamed: 0,Case_ID,20396,24382,24383,24384,COR30,COR60,FSH,FT4,LH,PROL,TBILHB,TEST
0,#2017851,,,,,,,,13.3,,,,
1,#2024754,,,,,,,,28.540909,,,,24.088636
2,#2024755,28.238095,,,,,,26.431818,,38.0,22.327273,,
3,#2025084,,,,,,,,,,28.15,,
4,#2025918,,,,,,,,15.0,,,,


In [35]:
df_more_data = pd.read_excel(r'../raw_data/Hypophysenpatienten.xlsx',sheet_name='w duplicates')
df_more_data =df_more_data[["Fall Nr.","PID"]]
df_more_data= df_more_data.rename(columns={"Fall Nr.": "Case_ID","PID": "Patient_ID",})
df_more_data

Unnamed: 0,Case_ID,Patient_ID
0,0041835743,300146159
1,0041708812,762512
2,0041892695,365189
3,0041725372,543641
4,0041843364,300302329
...,...,...
1193,0004213315,112374
1194,0004211936,153807
1195,0004180070,719666
1196,0004139115,313269


In [40]:
train_data_merged = train_data.merge(df_more_data,how='right',on='Patient_ID')
train_data_merged["Case_ID"] = train_data_merged["Case_ID"].str.lstrip("0")
train_data_merged = train_data_merged.merge(labor_data,how='left',on='Case_ID')
test_data_merged = test_data.merge(df_more_data,how='right',on='Patient_ID')
test_data_merged = test_data_merged.merge(labor_data,how='left',on='Case_ID')

In [None]:
train_data_merged = train_data.merge(df_more_data,how='right',on='Patient_ID')
train_data_merged["Case_ID"] = train_data_merged["Case_ID"].apply(lambda x: x[2:])
train_data_merged = train_data_merged.merge(labor_data,how='left',on='Case_ID')

In [59]:
labor_values = []
for case_id in train_data_merged["Case_ID"]:
    try:
        float(case_id)
    except:
        labor_values.append(case_id)

In [60]:
labor_values

['3365461-8']

In [47]:
labor_values = []
for case_id in labor_data["Case_ID"]:
    try:
        float(case_id)
    except:
        labor_values.append(case_id)

In [48]:
labor_values

['#2017851',
 '#2024754',
 '#2024755',
 '#2025084',
 '#2025918',
 '#2025919',
 '#2026549',
 '#2027244',
 '#2028285',
 '#2028768',
 '#2029295',
 '#2031882',
 '#2031883',
 '#2032539',
 '#2032667',
 '#2033033',
 '#2033097',
 '#2033621',
 '#2033709',
 '#2033800',
 '#2034467',
 '#2034741',
 '#2035923',
 '#2036892',
 '#2036980',
 '#2037460',
 '#2038756',
 '#2040819',
 '#2040976',
 '#2041008',
 '#2041378',
 '#2041571',
 '#2041641',
 '#2041643',
 '#2041790',
 '#2042051',
 '#2042066',
 '#2042067',
 '#2042095',
 '#2042106',
 '#2043592',
 '#2043694',
 '#2044438',
 '#2045608',
 '#2045632',
 '#2046592',
 '#2050781',
 '#2053066',
 '#2053322',
 '#2053962']

In [46]:
labor_values

['3365461-8']

In [None]:
# Save the training and test data to CSV files
train_data_merged.to_csv(r'../data/train_data.csv', index=False)
test_data_merged.to_csv(r'../data/test_data.csv', index=False)

In [None]:
assert len(train_data_merged.Patient_ID.unique()) == len(train_data.Patient_ID.unique())
assert len(test_data_merged.Patient_ID.unique()) == len(test_data.Patient_ID.unique())

AssertionError: 

In [None]:
print("End Data Partitioning")

End Data Partitioning
