In [3]:
import os
import pandas as pd
from sklearn.model_selection import KFold

# Load the cluster-annotated dataset and the dataset that carries participant IDs.
clustered_file_path = "processed_dataset_with_clusters.csv"
participant_file_path = "combined_data_imputed.csv"

df_clustered = pd.read_csv(clustered_file_path)
df_participants = pd.read_csv(participant_file_path)

# The participant IDs are attached by row position below, so both files must
# have the same number of rows. Raise explicitly instead of using `assert`,
# which is skipped entirely when Python runs with -O.
# NOTE(review): only the row count is checked; this assumes both files list
# their rows in the same order -- confirm against how the files were produced.
if len(df_clustered) != len(df_participants):
    raise ValueError(
        "Mismatch in row numbers between datasets: "
        f"{len(df_clustered)} rows in {clustered_file_path!r} vs "
        f"{len(df_participants)} rows in {participant_file_path!r}"
    )

# Add participant IDs to the clustered dataset. to_numpy() drops the index so
# the copy is purely positional rather than depending on index alignment.
df_clustered["participant_id"] = df_participants["participant_id"].to_numpy()

# Create output directory if it doesn't exist
output_dir = "5fold_splits"
os.makedirs(output_dir, exist_ok=True)

# Perform a 5-fold split over participants (not over rows), so that all rows
# belonging to one participant end up on the same side of every fold.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# The set of distinct participant IDs does not change between folds; compute
# it once instead of once for the splitter and twice more per fold.
participant_ids = df_participants["participant_id"].unique()

for fold, (train_idx, test_idx) in enumerate(kf.split(participant_ids)):
    # KFold yields positions into participant_ids; map them back to actual IDs.
    train_participants = participant_ids[train_idx]
    test_participants = participant_ids[test_idx]

    # Select every row whose participant falls in the respective ID group.
    train_data = df_clustered[df_clustered["participant_id"].isin(train_participants)]
    test_data = df_clustered[df_clustered["participant_id"].isin(test_participants)]

    train_file = os.path.join(output_dir, f"train_fold_{fold}.csv")
    test_file = os.path.join(output_dir, f"test_fold_{fold}.csv")

    train_data.to_csv(train_file, index=False)
    test_data.to_csv(test_file, index=False)

    print(f"Saved fold {fold}: train ({len(train_data)}) and test ({len(test_data)})")


Saved fold 0: train (4163) and test (1188)
Saved fold 1: train (4316) and test (1035)
Saved fold 2: train (4249) and test (1102)
Saved fold 3: train (4320) and test (1031)
Saved fold 4: train (4356) and test (995)
