# Data Partitioning - Additional Dirty Data

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,StratifiedKFold
# Fixed seed passed to train_test_split and StratifiedKFold below so the partitioning is reproducible
random_state = 123

In [None]:
# Progress marker: printed before any data is read
print("Start Data Partitioning")

In [None]:
# Load the label table; the cells below use its Patient_ID and Category columns
label_csv = r'../raw_data/label_data.csv'
df = pd.read_csv(label_csv)
# Preview the first rows (last expression of the cell, so the notebook displays it)
df.head()

## Train/Test Split Strategy
We split the data into a train set and a test set.
Each patient is assigned entirely to either the train set or the test set to avoid data leakage.
This is ensured by checking for duplicate Patient IDs in the DataFrame (assert statement below). If each row corresponds to exactly one patient, we can safely split the DataFrame row-wise.


We also stratify the split on the binary labels to ensure that the train set and the test set contain about the same proportion of each class.

In [None]:
# Keep only the rows whose Category label is present
has_label = ~df["Category"].isna()
df = df[has_label]

In [None]:
# Patient ID Duplicate Check: every Patient_ID may occur only once, otherwise a
# row-wise split could place the same patient in both the train and the test set.
duplicate_rows = df[df["Patient_ID"].duplicated()]
# The message names the offending IDs so a failing check can be acted on directly
assert len(duplicate_rows) == 0, (
    f"{len(duplicate_rows)} rows repeat an earlier Patient_ID, "
    f"e.g. {duplicate_rows['Patient_ID'].head(10).tolist()}"
)

In [None]:
# Only the patient identifier and its label are needed for the partitioning
df = df[['Patient_ID', 'Category']]
# Labels go into y; the remaining column (Patient_ID) forms the feature frame X
y = df["Category"]
X = df.drop(columns=["Category"])
# Hold out 10% of the rows as test set, stratified on the label
split_parts = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    random_state=random_state,
)
X_train, X_test, y_train, y_test = split_parts
print("Total Patients Train set:", len(X_train))
print("Total Patients Test set:", len(X_test))

In [None]:
# Class shares (as fractions) of the train and test labels
train_share = y_train.value_counts(normalize=True)
test_share = y_test.value_counts(normalize=True)
# Difference of the shares in percent; the entry at position 1 of the
# index-aligned result is the one reported
label_diff = ((train_share - test_share) * 100).iloc[1]
print(f"Label Distribution relative Difference between Train- and Testset:\n","±",np.round(np.abs(label_diff),3),"%")

In [None]:
# Create dataframes for training and test data
# pd.DataFrame(X_train) does not copy DataFrame input by default, so adding the
# label column to it could also alter X_train (depending on the pandas version).
# assign() returns a new frame with the extra column and leaves X_train / X_test
# untouched; the labels are aligned to the rows by index.
train_data = X_train.assign(Category=y_train)

test_data = X_test.assign(Category=y_test)

In [None]:
# Load the lab dataset and attach its rows to the train and test patients
df_more_data = pd.read_csv(r'../raw_data/data_dirty_merge.csv')
# Inner join on both keys: a row is kept only when Patient_ID and Category match
merge_options = {'how': 'inner', 'on': ['Patient_ID', 'Category']}
train_data_merged = train_data.merge(df_more_data, **merge_options)
test_data_merged = test_data.merge(df_more_data, **merge_options)
print("Total Dataframe Train rows:", len(train_data_merged))
print("Total Dataframe Test rows:", len(test_data_merged))

In [None]:
# Class shares (as fractions) of the labels in the merged train and test rows
train_share_merged = train_data_merged['Category'].value_counts(normalize=True)
test_share_merged = test_data_merged['Category'].value_counts(normalize=True)
# Difference of the shares in percent; the entry at position 1 of the
# index-aligned result is the one reported
label_diff = ((train_share_merged - test_share_merged) * 100).iloc[1]
print(f"Label Distribution relative Difference between Train- and Testset:\n","±",np.round(np.abs(label_diff),3),"%")

In [None]:
# Assign every row of the merged training data to one of n_splits stratified
# cross-validation folds (stored in the 'fold' column) and report how much the
# label distributions of the folds differ.

# Split the merged training rows into features (X_crossval) and target (y_crossval)
X_crossval = train_data_merged.drop('Category', axis=1)
y_crossval = train_data_merged['Category']

# Perform Stratified Cross-Validation with fold numbers
n_splits = 5  # Number of folds
stratified_kf = StratifiedKFold(n_splits=n_splits, random_state=random_state, shuffle=True)

# NOTE(review): folds are assigned per row. If a patient has several rows in the
# merged data, that patient's rows can end up in different folds — confirm
# whether a patient-grouped splitter (e.g. StratifiedGroupKFold) is wanted here.

# Fold number per row, addressed by position; -1 marks "not assigned"
fold_assignment = np.full(len(train_data_merged), -1, dtype=np.int64)
label_distributions = []

for fold_number, (_, fold_index) in enumerate(stratified_kf.split(X_crossval, y_crossval)):
    # split() yields positional indices, so they are applied to a positional
    # array / iloc rather than to the label-based .loc
    fold_assignment[fold_index] = fold_number

    # Separate name so the hold-out labels y_test from the train/test split
    # are not overwritten by this loop
    y_fold = y_crossval.iloc[fold_index]
    label_distributions.append(y_fold.value_counts(normalize=True).to_dict())

    # Report the same 0-based fold number that is stored in the 'fold' column
    print(f"Total Dataframe Fold {fold_number} rows:", len(fold_index))

train_data_merged['fold'] = fold_assignment

# Calculate and print the label distribution difference for every pair of folds
for i in range(n_splits):
    for j in range(i + 1, n_splits):
        # Union of classes with a 0 default: a class missing from one fold
        # counts with share 0 instead of raising KeyError
        classes = set(label_distributions[i]) | set(label_distributions[j])
        # Shares are fractions; scale to percent to match the printed "%" unit.
        # The value is the absolute share difference summed over all classes.
        label_diff = 100 * sum(
            abs(label_distributions[i].get(k, 0.0) - label_distributions[j].get(k, 0.0))
            for k in classes
        )
        print(f"Label Distribution relative Difference between Fold {i} vs. Fold {j}:\n","±",np.round(label_diff,3),"%")


In [None]:
# Write the merged train and test frames to CSV files, omitting the index column
output_files = (
    (train_data_merged, r'../data/train_data_dirty_not_imputed.csv'),
    (test_data_merged, r'../data/test_data_dirty_not_imputed.csv'),
)
for frame, csv_path in output_files:
    frame.to_csv(csv_path, index=False)

In [None]:
# Progress marker: printed after both CSV files have been written
print("End Data Partitioning")