# Data Partitioning

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold
# Fixed seed passed to the shuffling cross-validation splitter so that the
# fold assignment is reproducible between runs.
random_state = 123

In [None]:
# Progress marker: printed when the partitioning notebook starts executing.
print("Start Data Partitioning")

## Train/Test Split Strategy
We will split the data into a train set and a test set. 
Each patient is assigned entirely to either the train set or the test set to avoid data leakage.
This is ensured by checking for Patient ID duplicates (assert statement beneath) in the DataFrame. If each row corresponds to exactly one patient, we can safely split the DataFrame.


We also stratify the split on the binary labels so that the train and test sets contain approximately the same proportion of each class.

In [None]:
# Read the full merged dataset from disk (presumably lab data already joined
# with the patient data -- no merging happens in this cell) and report its size.
train_data = pd.read_csv(r'../raw_data/data_full_merge.csv')
print("Total Dataframe rows:", train_data.shape[0])

In [None]:
# Assign every row of `train_data` to one cross-validation fold.
#
# StratifiedGroupKFold keeps all rows sharing a Patient_ID in the same fold
# (no patient leaks across folds) while trying to keep the 'Category' label
# proportions similar in every fold. The resulting 0-based fold number is
# written to a new 'fold' column; rows keep -1 only if they were never
# placed in a test split.
X_crossval = train_data.drop('Category', axis=1)  # features
y_crossval = train_data['Category']               # stratification target
groups = train_data['Patient_ID']                 # grouping key (one patient -> one fold)

n_splits = 5  # Number of folds
stratified_kf = StratifiedGroupKFold(n_splits=n_splits, random_state=random_state, shuffle=True)

train_data['fold'] = -1  # Sentinel: "not assigned yet"
# split() yields *positional* indices, so write through .iloc; label-based
# .loc is only equivalent when the frame has a default RangeIndex.
fold_column = train_data.columns.get_loc('fold')
label_distributions = []  # one {label: relative frequency} dict per fold

for fold_number, (_, test_index) in enumerate(stratified_kf.split(X_crossval, y_crossval, groups)):
    y_test = y_crossval.iloc[test_index]

    # The test part of each split is exactly the set of rows forming this fold.
    train_data.iloc[test_index, fold_column] = fold_number
    label_distributions.append(y_test.value_counts(normalize=True).to_dict())

    # Report with the same 0-based number that is stored in the 'fold' column.
    print(f"Total Dataframe Fold {fold_number} rows:", len(test_index))

# Compare the label distributions of every pair of folds.
for i in range(n_splits):
    for j in range(i + 1, n_splits):
        dist_i = label_distributions[i]
        dist_j = label_distributions[j]
        # Use the union of classes: a class missing from one fold counts as
        # frequency 0 instead of raising KeyError or being silently ignored.
        classes = dist_i.keys() | dist_j.keys()
        label_diff = sum(abs(dist_i.get(k, 0.0) - dist_j.get(k, 0.0)) for k in classes)
        # Frequencies are fractions in [0, 1]; scale by 100 so the value
        # matches the "%" unit in the message.
        print(f"Label Distribution relative Difference between Fold {i} vs. Fold {j}:\n","±",np.round(label_diff * 100,3),"%")


In [None]:
# Write the complete DataFrame -- including the 'fold' column that records
# each row's cross-validation fold -- to a single CSV file. The DataFrame
# index is not written (index=False).
train_data.to_csv(r'../data/data_pairs.csv', index=False)

In [None]:
# Progress marker: printed once all partitioning cells have run.
print("End Data Partitioning")