SPLITTING DATA FOR K-FOLD CROSS-VALIDATION

In [2]:
import os
import pandas as pd
from sklearn.model_selection import KFold


In [3]:


def kfold_split_and_save_dataset(filepath, n_splits=5, random_state=42, shuffle=True):
    """Split a CSV dataset into K folds and save each fold's train/test CSVs.

    The CSV at ``filepath`` is read with pandas and partitioned row-wise with
    scikit-learn's ``KFold``. For every fold, the training rows are written to
    ``../datasets/train/train_fold<k>_<filename>`` and the held-out rows to
    ``../datasets/test/test_fold<k>_<filename>`` (paths are relative to the
    current working directory), without the DataFrame index. Fold numbers
    start at 1. Progress is printed for every fold.

    Args:
        filepath: Path to the input CSV file.
        n_splits: Number of folds passed to ``KFold``.
        random_state: Seed for the shuffle. Only forwarded to ``KFold`` when
            ``shuffle`` is true.
        shuffle: Whether rows are shuffled before being split into folds.

    Returns:
        None. Results are written to disk.
    """
    # Only the base name is reused for the output files.
    filename = os.path.basename(filepath)

    df = pd.read_csv(filepath)

    # KFold rejects a non-None random_state when shuffle is False (ValueError
    # in current scikit-learn), so the seed is dropped in that case instead of
    # making shuffle=False unusable with the default random_state.
    kf = KFold(
        n_splits=n_splits,
        shuffle=shuffle,
        random_state=random_state if shuffle else None,
    )

    # The output directories are the same for every fold, so they are
    # created once, up front.
    train_dir = os.path.join("..", "datasets", "train")
    test_dir = os.path.join("..", "datasets", "test")
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(test_dir, exist_ok=True)

    for fold, (train_index, test_index) in enumerate(kf.split(df), start=1):
        train_df = df.iloc[train_index]
        test_df = df.iloc[test_index]

        print(f"Fold {fold}:")
        print(f"  Training rows: {len(train_df)}")
        print(f"  Testing rows: {len(test_df)}")

        # File names carry the fold number so folds do not overwrite each other.
        train_file = f"train_fold{fold}_{filename}"
        test_file = f"test_fold{fold}_{filename}"

        train_df.to_csv(os.path.join(train_dir, train_file), index=False)
        test_df.to_csv(os.path.join(test_dir, test_file), index=False)

        print(f"  Saved: {train_file} and {test_file}")

# Example usage: write 5 train/test fold pairs for the preprocessed dataset.
dataset_path = "../datasets/preprocessed_student_depression.csv"
kfold_split_and_save_dataset(dataset_path, n_splits=5)


Fold 1:
  Training rows: 22320
  Testing rows: 5580
  Saved: train_fold1_preprocessed_student_depression.csv and test_fold1_preprocessed_student_depression.csv
Fold 2:
  Training rows: 22320
  Testing rows: 5580
  Saved: train_fold2_preprocessed_student_depression.csv and test_fold2_preprocessed_student_depression.csv
Fold 3:
  Training rows: 22320
  Testing rows: 5580
  Saved: train_fold3_preprocessed_student_depression.csv and test_fold3_preprocessed_student_depression.csv
Fold 4:
  Training rows: 22320
  Testing rows: 5580
  Saved: train_fold4_preprocessed_student_depression.csv and test_fold4_preprocessed_student_depression.csv
Fold 5:
  Training rows: 22320
  Testing rows: 5580
  Saved: train_fold5_preprocessed_student_depression.csv and test_fold5_preprocessed_student_depression.csv
