In [4]:
import time
from sklearn.datasets import fetch_openml
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
import numpy as np

# Step 1: Load the dataset
print("Fetching MNIST dataset...")
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)
y = y.astype(int)

# Step 2: Normalize the dataset
print("Normalizing data...")
X = X / 255.0  # Scale pixel values to [0, 1]

# Step 3: Standardize the features
print("Standardizing features...")
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Step 4: Manually split the data into 7k for training and 3k for testing
X_train, X_test, y_train, y_test = X[:7000], X[7000:10000], y[:7000], y[7000:10000]

# Step 5: Set up the SVM with an RBF kernel
C = 1.0  # Regularization parameter
gamma = 'scale'  # Kernel coefficient for RBF
clf = SVC(kernel='rbf', C=C, gamma=gamma)

# Step 6: Apply 10-fold cross-validation on the training set and track time and accuracy
kf = KFold(n_splits=10, shuffle=True, random_state=42)  # 10-fold cross-validation
accuracies = []
fold_times = []

for fold, (train_index, val_index) in enumerate(kf.split(X_train, y_train)):
    print(f"Processing fold {fold + 1}...")  # Console output indicating the current fold
    start_time = time.time()
    
    # Split the data for this fold
    X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]
    
    # Train the model on the fold
    clf.fit(X_train_fold, y_train_fold)
    
    # Evaluate the model on the validation set
    accuracy = clf.score(X_val_fold, y_val_fold)
    accuracies.append(accuracy)
    
    end_time = time.time()
    fold_time = end_time - start_time
    fold_times.append(fold_time)
    
    print(f"Fold {fold + 1}: Accuracy = {accuracy * 100:.2f}%, Time = {fold_time:.2f} seconds")

# Step 7: Compute mean, variance, and best accuracy from cross-validation
mean_accuracy = np.mean(accuracies)
accuracy_variance = np.var(accuracies)
total_time = np.sum(fold_times)
best_accuracy = np.max(accuracies)

print(f"\nMean accuracy: {mean_accuracy*100:.4f}")
print(f"Accuracy variance: {accuracy_variance*100:.4f}")
print(f"Total time for all folds: {total_time:.2f} seconds")
print(f"Best accuracy: {best_accuracy*100:.4f}")

# Step 8: Evaluate the model on the separate test set
print("\nEvaluating the model on the test set...")
clf.fit(X_train, y_train)  # Train on the entire training set
accuracy = clf.score(X_test, y_test)  # Evaluate on the test set
print(f"Accuracy on the test set: {accuracy*100:.4f}%")


Fetching MNIST dataset...
Normalizing data...
Standardizing features...
Processing fold 1...
Fold 1: Accuracy = 92.43%, Time = 32.29 seconds
Processing fold 2...
Fold 2: Accuracy = 95.57%, Time = 37.40 seconds
Processing fold 3...
Fold 3: Accuracy = 93.43%, Time = 28.86 seconds
Processing fold 4...
Fold 4: Accuracy = 94.14%, Time = 37.35 seconds
Processing fold 5...
Fold 5: Accuracy = 93.71%, Time = 35.99 seconds
Processing fold 6...
Fold 6: Accuracy = 93.43%, Time = 34.67 seconds
Processing fold 7...
Fold 7: Accuracy = 93.57%, Time = 26.98 seconds
Processing fold 8...
Fold 8: Accuracy = 93.29%, Time = 27.16 seconds
Processing fold 9...
Fold 9: Accuracy = 92.14%, Time = 23.69 seconds
Processing fold 10...
Fold 10: Accuracy = 93.29%, Time = 23.69 seconds

Mean accuracy: 93.5000
Accuracy variance: 0.0078
Total time for all folds: 308.09 seconds
Best accuracy: 95.5714

Evaluating the model on the test set...
Accuracy on the test set: 92.7667%
