In [1]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

def stratified_kfold(X, y, n_splits=5, random_state=None):
    """Split sample indices into stratified folds for cross-validation.

    The indices of each class in ``y`` are shuffled and cut into
    ``n_splits`` nearly equal chunks with ``np.array_split``; chunk ``i`` of
    every class goes to fold ``i``.  When a class size is not divisible by
    ``n_splits`` the earlier folds receive the extra samples.

    Args:
        X: Feature data.  Not read by this function; accepted so the call
            signature mirrors the usual ``(X, y)`` splitter convention.
        y: Class labels, one per sample (any array-like).
        n_splits: Number of folds; must be at least 2.
        random_state: Seed for the shuffling.  ``None`` gives a
            non-reproducible shuffle.

    Returns:
        A list of ``n_splits`` tuples ``(test_indices, train_indices)`` --
        note the order: test first, train second.  Both are integer index
        arrays, even when a fold is empty.

    Raises:
        ValueError: If ``n_splits`` is smaller than 2.
    """
    if n_splits < 2:
        raise ValueError(f"n_splits must be at least 2, got {n_splits}")

    # Accept lists/tuples too: `list == scalar` would not compare elementwise.
    y = np.asarray(y)

    # A private generator yields the same shuffles as seeding the global
    # legacy RNG, but does not clobber the caller's np.random state.
    rng = np.random.RandomState(random_state)

    fold_chunks = [[] for _ in range(n_splits)]
    for cls in np.unique(y):
        cls_indices = np.where(y == cls)[0]
        rng.shuffle(cls_indices)
        for chunks, chunk in zip(fold_chunks, np.array_split(cls_indices, n_splits)):
            chunks.append(chunk)

    # Keep an integer dtype even when there are no classes at all, so every
    # returned array is usable for indexing.
    no_indices = np.empty(0, dtype=np.intp)
    folds = [
        np.concatenate(chunks) if chunks else no_indices
        for chunks in fold_chunks
    ]

    return [
        (folds[i], np.concatenate(folds[:i] + folds[i + 1:]))
        for i in range(n_splits)
    ]

# Load the Iris sample dataset and pull out features and labels
data = load_iris()
X, y = data.data, data.target

# Reduce to a binary problem (class 0 vs. class 1) by dropping class 2;
# one mask is applied to both arrays so they stay aligned
keep = y != 2
X = X[keep]
y = y[keep]

# Build the stratified folds
n_splits = 3
folds = stratified_kfold(X, y, n_splits=n_splits, random_state=42)

# Report the class balance of every fold.
# Each entry of `folds` is (test indices, train indices), in that order.
for i, (test_idx, train_idx) in enumerate(folds):
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    report = [
        f"Fold {i+1}:",
        f"  Train class distribution: {np.bincount(y_train)}",
        f"  Test class distribution: {np.bincount(y_test)}\n",
    ]
    print("\n".join(report))


Fold 1:
  Train class distribution: [33 33]
  Test class distribution: [17 17]

Fold 2:
  Train class distribution: [33 33]
  Test class distribution: [17 17]

Fold 3:
  Train class distribution: [34 34]
  Test class distribution: [16 16]

