In [1]:
import time
from sklearn.datasets import fetch_openml
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
import numpy as np

# 10-fold stratified cross-validation of an RBF-kernel SVM on a 10,000-sample
# MNIST subset. The cell is self-contained: every name used by the loop below
# is defined here, so it runs on a fresh kernel.
print("Test case : testing SVM with RBF kernel")

# Step 1: Load the dataset
print("Fetching MNIST dataset...")
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)
y = y.astype(int)  # labels are converted to integers before stratification

# Step 2: Normalize the data
print("Normalizing data...")
X = X / 255.0  # Scale pixel values to [0, 1]

# Step 3: Select 10,000 samples randomly
# This step was previously commented out while the loop below still used
# X_subset / y_subset, which raised NameError on a fresh kernel.
X_subset, _, y_subset, _ = train_test_split(X, y, train_size=10000, random_state=42)
print(f"Reduced dataset size: {len(X_subset)}")

# Step 4: Standardization is applied inside the cross-validation loop (below),
# so the scaler is fitted on each fold's training split only and statistics of
# the held-out split never leak into training.

# Step 5: Define the SVM model
C = 1.0  # Regularization parameter
print(f"Applying 10-fold cross-validation with SVM (C={C})...")
clf = SVC(kernel='rbf', C=C)

# Step 6: Perform 10-fold cross-validation with detailed partitions
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
fold_scores = []      # accuracy of each fold, in fold order
fold_partitions = []  # (train_idx, test_idx) index arrays of each fold

start_time = time.time()
for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X_subset, y_subset), 1):
    # Split the data for the current fold
    X_train, X_test = X_subset[train_idx], X_subset[test_idx]
    y_train, y_test = y_subset[train_idx], y_subset[test_idx]

    # Standardize using statistics of the training split only
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Train the model (fit() re-estimates the model from scratch on each call)
    clf.fit(X_train, y_train)

    # Evaluate the model
    score = clf.score(X_test, y_test)
    fold_scores.append(score)

    # Log partition details
    fold_partitions.append((train_idx, test_idx))
    print(f"Fold {fold_idx}:")
    print(f"  Training set size: {len(train_idx)}, Testing set size: {len(test_idx)}")
    print(f"  Accuracy: {score * 100:.2f}%")

end_time = time.time()

# Step 7: Identify the best fold
best_fold_index = int(np.argmax(fold_scores))  # 0-based index of the highest-accuracy fold
best_fold_accuracy = fold_scores[best_fold_index]

# Step 8: Report overall results
print(f"\nCross-validation completed in {end_time - start_time:.2f} seconds.")
print(f"Accuracy for each fold: {[f'{score * 100:.2f}%' for score in fold_scores]}")
print(f"Mean accuracy: {np.mean(fold_scores) * 100:.2f}%")
print(f"Standard deviation: {np.std(fold_scores) * 100:.2f}%")
print(f"Best fold: Fold {best_fold_index + 1} with accuracy {best_fold_accuracy * 100:.2f}%")


Test case : testing SVM with RBF kernel
Fetching MNIST dataset...
Normalizing data...
Reduced dataset size: 10000
Applying 10-fold cross-validation with SVM (C=1.0)...
Fold 1:
  Training set size: 9000, Testing set size: 1000
  Accuracy: 94.20%
Fold 2:
  Training set size: 9000, Testing set size: 1000
  Accuracy: 94.90%
Fold 3:
  Training set size: 9000, Testing set size: 1000
  Accuracy: 94.10%
Fold 4:
  Training set size: 9000, Testing set size: 1000
  Accuracy: 93.70%
Fold 5:
  Training set size: 9000, Testing set size: 1000
  Accuracy: 94.00%
Fold 6:
  Training set size: 9000, Testing set size: 1000
  Accuracy: 93.60%
Fold 7:
  Training set size: 9000, Testing set size: 1000
  Accuracy: 92.60%
Fold 8:
  Training set size: 9000, Testing set size: 1000
  Accuracy: 93.50%
Fold 9:
  Training set size: 9000, Testing set size: 1000
  Accuracy: 93.60%
Fold 10:
  Training set size: 9000, Testing set size: 1000
  Accuracy: 94.20%

Cross-validation completed in 180.83 seconds.
Accuracy for e

In [1]:
import time
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# Hold-out evaluation of a linear-kernel SVM on MNIST.

# --- 1. Data loading -------------------------------------------------------
print("Fetching MNIST dataset...")
mnist_features, mnist_labels = fetch_openml(
    'mnist_784', version=1, return_X_y=True, as_frame=False
)
X = mnist_features
y = mnist_labels.astype(int)

# --- 2. Pixel scaling and train/test split ---------------------------------
print("Normalizing and downsampling data...")
X = X / 255.0  # pixel values end up in [0, 1]

# test_size=0.20 holds out 20% of the rows for evaluation; the remaining 80%
# are used for training.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)
n_train, n_test = len(X_train), len(X_test)
print(f"Training set size: {n_train}, Test set size: {n_test}")

# --- 3. Feature standardization --------------------------------------------
# The scaler is fitted on the training rows only and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# --- 4. Model fitting (linear kernel) --------------------------------------
C = 1.0  # regularization strength passed to SVC
print(f"Training SVM with linear kernel (C={C})...")
clf = SVC(kernel='linear', C=C)
start_time = time.time()
clf.fit(X_train, y_train)
end_time = time.time()
elapsed = end_time - start_time  # wall-clock seconds spent in fit()
print(f"Training completed in {elapsed:.2f} seconds.")

# --- 5. Evaluation on the held-out split -----------------------------------
print("Evaluating the model...")
accuracy = clf.score(X_test, y_test)
print(f"Accuracy on the test set: {accuracy:.4f}")


Fetching MNIST dataset...
Normalizing and downsampling data...
Training set size: 56000, Test set size: 14000
Training SVM with linear kernel (C=1.0)...
Training completed in 406.53 seconds.
Evaluating the model...
Accuracy on the test set: 0.9211


In [5]:
import time
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# Hold-out evaluation of a polynomial-kernel SVM on MNIST.

# Load the data and convert the labels to integers.
print("Fetching MNIST dataset...")
dataset = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)
X, y = dataset
y = y.astype(int)

# Rescale pixel intensities so they lie in [0, 1].
print("Normalizing and downsampling data...")
X = X / 255.0

# test_size=0.80 keeps only 20% of the rows for training (to shorten fitting);
# the other 80% become the evaluation set.
split = train_test_split(X, y, test_size=0.80, random_state=42)
X_train, X_test, y_train, y_test = split
print(f"Training set size: {len(X_train)}, Test set size: {len(X_test)}")

# Standardize features with statistics learned from the training rows only.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Configure and fit the polynomial-kernel SVM, timing only the fit() call.
C = 1.0      # regularization parameter
degree = 3   # degree of the polynomial kernel
print(f"Training SVM with polynomial kernel (C={C}, degree={degree})...")
clf = SVC(
    kernel='poly',
    C=C,
    degree=degree,
    coef0=1,
)
start_time = time.time()
clf.fit(X_train, y_train)
end_time = time.time()
fit_seconds = end_time - start_time
print(f"Training completed in {fit_seconds:.2f} seconds.")

# Score the fitted model on the held-out rows.
print("Evaluating the model...")
accuracy = clf.score(X_test, y_test)
print(f"Accuracy on the test set: {accuracy:.4f}")


Test case: testing SVM with Linear kernel
Fetching MNIST dataset...
Normalizing data...
Reduced dataset size: 10000
Applying 10-fold cross-validation with SVM (C=1.0, Linear kernel)...
Fold 1:
  Accuracy: 92.80%
Fold 2:
  Accuracy: 92.80%
Fold 3:
  Accuracy: 91.90%
Fold 4:
  Accuracy: 91.90%
Fold 5:
  Accuracy: 92.00%
Fold 6:
  Accuracy: 91.00%
Fold 7:
  Accuracy: 90.50%
Fold 8:
  Accuracy: 92.80%
Fold 9:
  Accuracy: 90.10%
Fold 10:
  Accuracy: 90.70%

Cross-validation completed in 52.29 seconds.
Accuracy for each fold: ['92.80%', '92.80%', '91.90%', '91.90%', '92.00%', '91.00%', '90.50%', '92.80%', '90.10%', '90.70%']
Mean accuracy: 91.65%
Standard deviation: 0.96%
Best fold: Fold 1 with accuracy 92.80%
