Stratified K-Fold Cross-Validation is a variation of k-fold cross-validation that preserves the class distribution in each fold, ensuring that each subset has a similar proportion of each class as the original dataset. This is especially useful for imbalanced datasets, where it gives a more reliable model evaluation than plain k-fold splitting.

In [1]:
 

 # Import necessary libraries
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
# Load the Iris dataset
iris = load_iris()
X = iris.data  # Features
y = iris.target  # Target labels

# Initialize the scaler and model once; both are re-fitted on each fold's
# training split inside the loop below.
scaler = StandardScaler()
model = DecisionTreeClassifier(criterion='gini', random_state=42)

# Initialize StratifiedKFold with 5 folds (shuffled, with a fixed seed so the
# splits are reproducible)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Accuracy score of each fold, in fold order
fold_accuracies = []

# Perform Stratified K-Fold Cross-Validation.
# enumerate(..., start=1) provides the 1-based fold number used in the report.
for fold_no, (train_index, test_index) in enumerate(skf.split(X, y), start=1):
    # Split data into training and testing sets
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Fit the scaler on the training split only, then apply the same
    # transformation to the test split so no test statistics leak into training.
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train the model on this fold's training data
    model.fit(X_train_scaled, y_train)

    # Predict the held-out split
    y_pred = model.predict(X_test_scaled)

    # Calculate and record accuracy for this fold
    accuracy = accuracy_score(y_test, y_pred)
    fold_accuracies.append(accuracy)

    print(f'Fold {fold_no} Accuracy: {accuracy:.4f}')

# Calculate and print the average accuracy across all folds.
# np.std is called with its defaults, i.e. the population standard deviation.
average_accuracy = np.mean(fold_accuracies)
print(f'\nAverage Accuracy across {skf.n_splits} folds: {average_accuracy:.4f}')
print(f'Standard Deviation of Accuracy: {np.std(fold_accuracies):.4f}')

Fold 1 Accuracy: 1.0000
Fold 2 Accuracy: 0.9667
Fold 3 Accuracy: 0.9000
Fold 4 Accuracy: 1.0000
Fold 5 Accuracy: 0.9000

Average Accuracy across 5 folds: 0.9533
Standard Deviation of Accuracy: 0.0452


In [2]:
from sklearn.datasets import load_wine
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Load the Wine dataset (bound to `wine` so it neither mislabels the data nor
# overwrites the Iris object loaded in the previous cell)
wine = load_wine()
X, y = wine.data, wine.target

# Define the Stratified K-Fold cross-validator with 5 folds (shuffled, with a
# fixed seed so the splits are reproducible)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize a RandomForest classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Perform stratified cross-validation; `scores` holds one accuracy value per fold
scores = cross_val_score(model, X, y, cv=skf, scoring='accuracy')

# Print accuracy scores for each fold and mean accuracy
print(f"Accuracy for each fold: {scores}")
print(f"Mean accuracy: {np.mean(scores):.4f}")

Accuracy for each fold: [0.97222222 1.         0.97222222 0.94285714 1.        ]
Mean accuracy: 0.9775
