# Data Partitioning

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold
# Single seed used both for NumPy's global RNG (row sampling) and for the
# StratifiedGroupKFold splitters, so repeated runs produce the same partition.
random_state = 123
np.random.seed(random_state)

In [None]:
# Progress marker: printed when the partitioning notebook starts running.
print("Start Data Partitioning")

## Train/Test Split Strategy
We split the data into a train set and a test set.
Each patient is assigned entirely to either the train set or the test set to avoid data leakage.
This is ensured by checking the DataFrame for duplicate Patient IDs (see the assert statement below). If each row corresponds to exactly one patient, we can safely split the DataFrame.


We also stratify the split on the binary labels so that the train and test sets contain roughly the same proportion of each class.

In [None]:
# Read the raw lab table, the raw MRI table and the patient label table.
# The label table is joined onto the other two in the next cell.
lab_train_data = pd.read_csv(r'../raw_data/lab_data_only_dirty.csv')
mri_train_data = pd.read_csv(r'../raw_data/mri_data_only_dirty.csv')
df_patients = pd.read_csv(r'../raw_data/label_data.csv')

# Report how many rows each raw table contains.
for modality, frame in (("LAB", lab_train_data), ("MRI", mri_train_data)):
    print(f"Total Dataframe {modality} rows:", len(frame))

In [None]:
# Attach each patient's 'Category' to the lab and MRI rows by joining on
# 'Patient_ID'. The default (inner) join is used, so rows whose Patient_ID
# has no entry in the label table are dropped.
patient_labels = df_patients[['Patient_ID', 'Category']]
lab_train_data = pd.merge(lab_train_data, patient_labels, on='Patient_ID')
mri_train_data = pd.merge(mri_train_data, patient_labels, on='Patient_ID')

In [None]:
# Downsample the lab table so that it has at most as many rows as the MRI table.
# The sample size is capped at the number of available lab rows: drawing without
# replacement cannot return more rows than exist and would raise a ValueError.
num_indexes = min(len(lab_train_data), len(mri_train_data))
random_indexes = np.random.choice(np.arange(len(lab_train_data)), size=num_indexes, replace=False)

# Keep only the sampled rows. reset_index() renumbers the rows 0..n-1, which the
# split cell relies on when it writes fold ids via .loc with positional indices;
# the previous row labels are kept as an extra column.
lab_train_data = lab_train_data.iloc[random_indexes].reset_index()

In [None]:
# SPLIT LAB

# Features (X_crossval), stratification target (y_crossval) and grouping key (groups).
# The groups are taken from the lab rows themselves so that every lab row is grouped
# by its own patient; grouping by another table's Patient_ID column would pair rows
# with unrelated patients and defeat the patient-level separation of the folds.
X_crossval = lab_train_data.drop('Category', axis=1)
y_crossval = lab_train_data['Category']
groups = lab_train_data['Patient_ID']

# Two splitters: a fixed 5-fold split stored in 'fold', and a finer split with
# roughly 50 rows per fold stored in 'fold_learning_curve'.
n_splits = 5  # Number of cross-validation folds
n_splits_lc = len(lab_train_data) // 50
stratified_kf = StratifiedGroupKFold(n_splits=n_splits, random_state=random_state, shuffle=True)
stratified_kf_lc = StratifiedGroupKFold(n_splits=n_splits_lc, random_state=random_state, shuffle=True)

# Initialise both fold columns with -1 so they stay integer-typed and any row
# that is never assigned to a fold is easy to spot.
lab_train_data['fold'] = -1
lab_train_data['fold_learning_curve'] = -1
label_distributions = []

# The test indices of the folds partition the rows, so writing the fold id for
# each test part labels every row exactly once.
for fold_number, (_, test_index) in enumerate(stratified_kf.split(X_crossval, y_crossval, groups)):
    lab_train_data.loc[test_index, 'fold'] = fold_number
    fold_labels = y_crossval.iloc[test_index]
    label_distributions.append(fold_labels.value_counts(normalize=True).to_dict())
    # Print the same 0-based id that is stored in the column and used below.
    print(f"Total Dataframe Fold {fold_number} rows:", len(test_index))

# Compare the class proportions of every pair of folds. A class that is missing
# from a fold counts as proportion 0; the summed absolute difference is scaled
# by 100 so that the printed number really is in percent.
for i in range(n_splits):
    for j in range(i + 1, n_splits):
        dist_i, dist_j = label_distributions[i], label_distributions[j]
        classes = sorted(dist_i.keys() | dist_j.keys())
        label_diff = sum(abs(dist_i.get(k, 0.0) - dist_j.get(k, 0.0)) for k in classes)
        print(f"Label Distribution relative Difference between Fold {i} vs. Fold {j}:\n","±",np.round(100 * label_diff,3),"%")

# Assign the finer fold ids.
for fold_number, (_, test_index) in enumerate(stratified_kf_lc.split(X_crossval, y_crossval, groups)):
    lab_train_data.loc[test_index, 'fold_learning_curve'] = fold_number

# Binary target: 1 for the category "prolaktinom", 0 for everything else.
lab_train_data["label"] = (lab_train_data["Category"]=="prolaktinom").astype(int)
# Save the lab table, including the fold assignments and the label, to CSV.
lab_train_data.to_csv(r'../data/train/train_lab_data.csv', index=False)

In [None]:
# SPLIT MRI

# Features (X_crossval), stratification target (y_crossval) and grouping key (groups).
# Grouping by Patient_ID keeps all rows of one patient inside a single fold.
X_crossval = mri_train_data.drop('Category', axis=1)
y_crossval = mri_train_data['Category']
groups = mri_train_data['Patient_ID']

# Two splitters: a fixed 5-fold split stored in 'fold', and a finer split with
# roughly 50 rows per fold stored in 'fold_learning_curve'.
n_splits = 5  # Number of cross-validation folds
n_splits_lc = len(mri_train_data) // 50
stratified_kf = StratifiedGroupKFold(n_splits=n_splits, random_state=random_state, shuffle=True)
stratified_kf_lc = StratifiedGroupKFold(n_splits=n_splits_lc, random_state=random_state, shuffle=True)

# Initialise both fold columns with -1 so they stay integer-typed and any row
# that is never assigned to a fold is easy to spot.
mri_train_data['fold'] = -1
mri_train_data['fold_learning_curve'] = -1
label_distributions = []

# The test indices of the folds partition the rows, so writing the fold id for
# each test part labels every row exactly once.
for fold_number, (_, test_index) in enumerate(stratified_kf.split(X_crossval, y_crossval, groups)):
    mri_train_data.loc[test_index, 'fold'] = fold_number
    fold_labels = y_crossval.iloc[test_index]
    label_distributions.append(fold_labels.value_counts(normalize=True).to_dict())
    # Print the same 0-based id that is stored in the column and used below.
    print(f"Total Dataframe Fold {fold_number} rows:", len(test_index))

# Compare the class proportions of every pair of folds. A class that is missing
# from a fold counts as proportion 0; the summed absolute difference is scaled
# by 100 so that the printed number really is in percent.
for i in range(n_splits):
    for j in range(i + 1, n_splits):
        dist_i, dist_j = label_distributions[i], label_distributions[j]
        classes = sorted(dist_i.keys() | dist_j.keys())
        label_diff = sum(abs(dist_i.get(k, 0.0) - dist_j.get(k, 0.0)) for k in classes)
        print(f"Label Distribution relative Difference between Fold {i} vs. Fold {j}:\n","±",np.round(100 * label_diff,3),"%")

# Assign the finer fold ids.
for fold_number, (_, test_index) in enumerate(stratified_kf_lc.split(X_crossval, y_crossval, groups)):
    mri_train_data.loc[test_index, 'fold_learning_curve'] = fold_number

# Binary target: 1 for the category "prolaktinom", 0 for everything else.
mri_train_data["label"] = (mri_train_data["Category"]=="prolaktinom").astype(int)
# Save the MRI table, including the fold assignments and the label, to CSV.
mri_train_data.to_csv(r'../data/train/train_mri_data.csv', index=False)

In [None]:
# Progress marker: printed once all partitioning cells have run.
print("End Data Partitioning")