In [6]:
import time
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split, KFold
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
import numpy as np

# Step 1: Load the dataset
print("Fetching MNIST dataset...")
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)
y = y.astype(int)

# Step 2: Use only the first 10,000 images
X, y = X[:10000], y[:10000]  # Select first 10,000 images for processing
print(f"Using first 10,000 images: {X.shape[0]} training samples.")

# Step 3: Normalize the dataset
print("Normalizing data...")
X = X / 255.0  # Scale pixel values to [0, 1]
print("Normalizing data completed")

# Step 4: Standardize the features
print("Standardizing the features...")
scaler = StandardScaler()
X = scaler.fit_transform(X)
print("Standardization completed.")

# Step 5: Train the SVM with a linear kernel
C = 1.0  # Regularization parameter
clf = SVC(kernel='linear', C=C)

print("Starting 10-fold cross-validation and tracking time and accuracy...")

# Step 6: Apply 10-fold cross-validation and track time and accuracy
kf = KFold(n_splits=10, shuffle=True, random_state=42)  # 10-fold cross-validation
accuracies = []
fold_times = []

for fold, (train_index, val_index) in enumerate(kf.split(X, y)):
    print(f"Processing fold: {fold + 1}...")
    start_time = time.time()
    
    # Split the data for this fold
    X_train_fold, X_val_fold = X[train_index], X[val_index]
    y_train_fold, y_val_fold = y[train_index], y[val_index]
    
    # Train the model on the fold
    clf.fit(X_train_fold, y_train_fold)
    
    # Evaluate the model on the validation set
    accuracy = clf.score(X_val_fold, y_val_fold)
    accuracies.append(accuracy)
    
    end_time = time.time()
    fold_time = end_time - start_time
    fold_times.append(fold_time)
    
    print(f"Fold {fold + 1}: Accuracy = {accuracy * 100:.2f}%, Time = {fold_time:.2f} seconds")

# Step 7: Compute mean, variance, and best accuracy
mean_accuracy = np.mean(accuracies)
accuracy_variance = np.var(accuracies)
total_time = np.sum(fold_times)
best_accuracy = np.max(accuracies)

print(f"\nMean accuracy: {mean_accuracy*100:.4f}")
print(f"Accuracy variance: {accuracy_variance*100:.4f}")
print(f"Total time for all folds: {total_time:.2f} seconds")
print(f"Best accuracy: {best_accuracy*100:.4f}%")



Fetching MNIST dataset...
Using first 10,000 images: 10000 training samples.
Normalizing data...
Normalizing data completed
Standardizing the features...
Standardization completed.
Starting 10-fold cross-validation and tracking time and accuracy...
Processing fold: 1...
Fold 1: Accuracy = 91.90%, Time = 25.92 seconds
Processing fold: 2...
Fold 2: Accuracy = 91.20%, Time = 27.60 seconds
Processing fold: 3...
Fold 3: Accuracy = 91.30%, Time = 22.72 seconds
Processing fold: 4...
Fold 4: Accuracy = 90.00%, Time = 26.62 seconds
Processing fold: 5...
Fold 5: Accuracy = 92.20%, Time = 27.06 seconds
Processing fold: 6...
Fold 6: Accuracy = 91.70%, Time = 29.50 seconds
Processing fold: 7...
Fold 7: Accuracy = 90.90%, Time = 27.31 seconds
Processing fold: 8...
Fold 8: Accuracy = 91.30%, Time = 25.37 seconds
Processing fold: 9...
Fold 9: Accuracy = 91.60%, Time = 19.94 seconds
Processing fold: 10...
Fold 10: Accuracy = 91.70%, Time = 20.45 seconds

Mean accuracy: 91.3800
Accuracy variance: 0.0034