In [5]:
import time
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# Step 1: Load the dataset
print("Fetching MNIST dataset...")
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)
y = y.astype(int)

X, y = X[:10000], y[:10000]  # Select first 10,000 images for processing
print(f"Using first 10,000 images: {X.shape[0]} training samples.")

# Step 2: Normalize the data
print("Normalizing data...")
X = X / 255.0  # Scale pixel values to [0, 1]

# Step 3: Standardize the features
print("Standardizing features...")
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Step 4: Define the Random Forest model
n_estimators = 100  # Number of trees in the forest
print(f"Applying 10-fold cross-validation with Random Forest (n_estimators={n_estimators})...")
rf_clf = RandomForestClassifier(n_estimators=n_estimators, random_state=42)

# Step 5: Perform 10-fold cross-validation on the entire dataset
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)  # 10-fold cross-validation
fold_scores = []

start_time = time.time()
for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X, y), 1):
    # Split the data for the current fold
    print(f"Processing fold {fold_idx}...")  # Console output indicating the current fold
  
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Train the model
    rf_clf.fit(X_train, y_train)

    # Evaluate the model
    score = rf_clf.score(X_test, y_test)
    fold_scores.append(score)
    print(f"Fold {fold_idx}: Accuracy = {score * 100:.2f}%%")

end_time = time.time()

# Step 6: Report overall results
print(f"\nCross-validation completed in {end_time - start_time:.2f} seconds.")
print(f"Accuracy for each fold: {[f'{score * 100:.2f}%' for score in fold_scores]}")
print(f"Mean accuracy: {np.mean(fold_scores) * 100:.2f}%")
print(f"Standard deviation: {np.std(fold_scores) * 100:.2f}%")


Fetching MNIST dataset...
Using first 10,000 images: 10000 training samples.
Normalizing data...
Standardizing features...
Applying 10-fold cross-validation with Random Forest (n_estimators=100)...
Processing fold 1...
Fold 1: Accuracy = 93.50%%
Processing fold 2...
Fold 2: Accuracy = 94.90%%
Processing fold 3...
Fold 3: Accuracy = 94.50%%
Processing fold 4...
Fold 4: Accuracy = 95.10%%
Processing fold 5...
Fold 5: Accuracy = 94.70%%
Processing fold 6...
Fold 6: Accuracy = 93.90%%
Processing fold 7...
Fold 7: Accuracy = 95.50%%
Processing fold 8...
Fold 8: Accuracy = 96.00%%
Processing fold 9...
Fold 9: Accuracy = 95.30%%
Processing fold 10...
Fold 10: Accuracy = 94.90%%

Cross-validation completed in 278.09 seconds.
Accuracy for each fold: ['93.50%', '94.90%', '94.50%', '95.10%', '94.70%', '93.90%', '95.50%', '96.00%', '95.30%', '94.90%']
Mean accuracy: 94.83%
Standard deviation: 0.70%
