# Data Partitioning

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,StratifiedKFold
random_state = 123

In [2]:
print("Start Data Partitioning")

Start Data Partitioning


In [3]:
# read cleaned data
df = pd.read_csv(r'../raw_data/label_data.csv')
df.head()

Unnamed: 0,Patient_ID,Case_ID,Date_Case,ID_MRI_Machine,Adenoma_size,Label_Quality,Entry_date,Operation_date,Category,Patient_age,...,Diagnosis_supprimiertesprolaktin,Diagnosis_null-zell,Diagnosis_inaktiv(cortico),Diagnosis_intraundsupraselläreraumforderung,Diagnosis_apoplektiform,Diagnosis_emptysella,Diagnosis_tumorapoplex,Diagnosis_inaktiv(gh),Diagnosis_supprimiert,Diagnosis_gonado
0,300071920,40323241,2018-01-16 08:00:00,MRI3,mikro,"extern Zug, postmenopausal",2017-07-01,2017-09-06,non-prolaktinom,57,...,0,0,0,0,0,0,0,0,0,0
1,666184,40681080,2019-04-02 12:24:00,MRI2,makro,verstorben,2012-09-14,2012-09-19,non-prolaktinom,69,...,0,0,0,0,0,0,0,0,0,0
2,543641,41725372,2023-05-05 07:54:00,MRI2,mikro,,2006-01-01,2009-06-04,non-prolaktinom,39,...,0,0,0,0,0,0,0,0,0,0
3,300038107,41668452,2023-04-11 07:34:00,MRI4,mikro,,2023-09-01,2023-09-14,non-prolaktinom,49,...,0,0,0,0,0,0,0,0,0,0
4,287347,41704243,2023-03-30 09:15:00,MRI1,mikro,,2022-06-01,2022-12-20,non-prolaktinom,25,...,0,0,0,0,0,0,0,0,0,0


## Train/Test Split Strategy
We will Split the Data into a Train- and Testset. 
We are splitting each patient fully into either the train or the test set to avoid data leakage.
This is ensured by checking for Patient ID duplicates (assert statement beneath) in the Dataframe. If each row only corresponds to one patient we can savely split the dataframe.


Also we are gonna stratify the split on the binary labels to ensure that the train- and testset include about the same of each class.

In [4]:
# drop patients without a label
df= df[~df["Category"].isna()]

In [5]:
# Patient ID Duplicate Check
assert len(df[df["Patient_ID"].duplicated()]) == 0

In [6]:
# remove unnecessary fields
df=df[['Patient_ID','Category']]

In [7]:
# split columns into features and labels
X = df.drop(columns=["Category"])
y = df["Category"]
# stratified train/test split on labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=random_state)

In [8]:
print("Total Patients Train set:", len(X_train))
print("Total Patients Test set:", len(X_test))

Total Patients Train set: 292
Total Patients Test set: 73


In [9]:
label_diff = ((y_train.value_counts(normalize=True) - y_test.value_counts(normalize=True)) *100).iloc[1]

In [10]:
print(f"Label Distribution relative Difference between Train- and Testset:\n","±",np.round(np.abs(label_diff),3),"%")

Label Distribution relative Difference between Train- and Testset:
 ± 0.342 %


In [11]:
# Create dataframes for training and test data
train_data = pd.DataFrame(X_train)
train_data['Category'] = y_train

test_data = pd.DataFrame(X_test)
test_data['Category'] = y_test

In [12]:
df_more_data = pd.read_csv(r'../raw_data/data_imputed.csv')

In [13]:
train_data_merged = train_data.merge(df_more_data,how='inner',on=['Patient_ID','Category'])
test_data_merged = test_data.merge(df_more_data,how='inner',on=['Patient_ID','Category'])

In [14]:
print("Total Dataframe Train rows:", len(train_data_merged))
print("Total Dataframe Test rows:", len(test_data_merged))

Total Dataframe Train rows: 410
Total Dataframe Test rows: 100


In [15]:
label_diff = ((train_data_merged['Category'].value_counts(normalize=True) - test_data_merged['Category'].value_counts(normalize=True)) *100).iloc[1]

In [16]:
print(f"Label Distribution relative Difference between Train- and Testset:\n","±",np.round(np.abs(label_diff),3),"%")

Label Distribution relative Difference between Train- and Testset:
 ± 1.951 %


In [17]:
# Split your data into features (X) and target (y)
X_crossval = train_data_merged.drop('Category', axis=1)
y_crossval = train_data_merged['Category']

# Perform Stratified Cross-Validation with fold numbers
n_splits = 5  # Number of folds
stratified_kf = StratifiedKFold(n_splits=n_splits, random_state=random_state, shuffle=True)

# Create a new DataFrame to store the fold number
train_data_merged['fold'] = -1  # Initialize with -1
label_distributions = []
fold_number = 0  # Initialize fold number

for _, test_index in stratified_kf.split(X_crossval, y_crossval):
    y_test = y_crossval.iloc[test_index]

    # Update the fold number for the corresponding rows in the new DataFrame
    train_data_merged.loc[test_index, 'fold'] = fold_number
    label_distribution_fold = y_test.value_counts(normalize=True).to_dict()
    label_distributions.append(label_distribution_fold)

    fold_number += 1  # Increment the fold number
    print(f"Total Dataframe Fold {fold_number} rows:", len(test_index))
    
    # Calculate and print the relative label differences
for i in range(n_splits):
    for j in range(i + 1, n_splits):
        label_diff = sum(abs(label_distributions[i][k] - label_distributions[j][k]) for k in label_distributions[i])
        print(f"Label Distribution relative Difference between Fold {i} vs. Fold {j}:\n","±",np.round(label_diff,3),"%")


Total Dataframe Fold 1 rows: 82
Total Dataframe Fold 2 rows: 82
Total Dataframe Fold 3 rows: 82
Total Dataframe Fold 4 rows: 82
Total Dataframe Fold 5 rows: 82
Label Distribution relative Difference between Fold 0 vs. Fold 1:
 ± 0.024 %
Label Distribution relative Difference between Fold 0 vs. Fold 2:
 ± 0.024 %
Label Distribution relative Difference between Fold 0 vs. Fold 3:
 ± 0.024 %
Label Distribution relative Difference between Fold 0 vs. Fold 4:
 ± 0.024 %
Label Distribution relative Difference between Fold 1 vs. Fold 2:
 ± 0.0 %
Label Distribution relative Difference between Fold 1 vs. Fold 3:
 ± 0.0 %
Label Distribution relative Difference between Fold 1 vs. Fold 4:
 ± 0.0 %
Label Distribution relative Difference between Fold 2 vs. Fold 3:
 ± 0.0 %
Label Distribution relative Difference between Fold 2 vs. Fold 4:
 ± 0.0 %
Label Distribution relative Difference between Fold 3 vs. Fold 4:
 ± 0.0 %


In [18]:
# Save the training and test data to CSV files
train_data_merged.to_csv(r'../data/train_data.csv', index=False)
test_data_merged.to_csv(r'../data/test_data.csv', index=False)

In [19]:
print("End Data Partitioning")

End Data Partitioning
