## Task completed individually by Yasmin Ebrahimi -  98.33% accuracy achieved with 23 features

## Goal
- Build an end-to-end ML pipeline using the minimum number of features  
- Achieve over 98% accuracy on the test set of the `digits` dataset

## Result
- Test accuracy: **98.33%**  
- Model: SVM (C=7, γ=0.012, RBF kernel)  
- Features used: 23 principal components  


In [1]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split, GridSearchCV

# Dimensionality reduction
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Classifiers
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
#-----

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import numpy as np

In [2]:
# Load the digits dataset
digits = load_digits()
X = digits.data        # Features (1797 samples, 64 features each)
y = digits.target      # Labels (digits 0 through 9)

# Split into training and test sets (e.g., 70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [3]:
top_k = 23

In [4]:
# Normalize data before PCA (pixel values to [0, 1])
scaler_pre = MinMaxScaler()
X_normalized = scaler_pre.fit_transform(X)

# Apply PCA for 23 components with whitening
pca = PCA(n_components=top_k, whiten=True, random_state=42)
X_pca = pca.fit_transform(X_normalized)

# Split into training and test sets (70% train, 30% test)
X_train_pca, X_test_pca, y_train, y_test = train_test_split(X_pca, y, test_size=0.3, random_state=42, stratify=y)

# Scale PCA components
scaler = StandardScaler()
X_train_pca_scaled = scaler.fit_transform(X_train_pca)
X_test_pca_scaled = scaler.transform(X_test_pca)

In [5]:
# Ultra-fine SVM hyperparameter grid
param_grid_svm = {
    'C': [3, 4, 5, 6, 7],  # Regularization strength: higher → softer margin
    'gamma': [0.008, 0.01, 0.012, 0.015, 'scale'], # Kernel coefficient: controls influence radius of support vectors
    'kernel': ['rbf']  # Radial Basis Function kernel
}
svm = SVC(probability=True)
grid_search_svm = GridSearchCV(svm, param_grid_svm, cv=5, n_jobs=-1, scoring='accuracy')
grid_search_svm.fit(X_train_pca_scaled, y_train)

# Best SVM model
best_svm = grid_search_svm.best_estimator_
print(f"Best SVM parameters: {grid_search_svm.best_params_}")

# Predict and evaluate SVM
y_pred_svm = best_svm.predict(X_test_pca_scaled)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"Test set accuracy with 23 PCA components (SVM): {accuracy_svm:.4f}")

Best SVM parameters: {'C': 7, 'gamma': 0.012, 'kernel': 'rbf'}
Test set accuracy with 23 PCA components (SVM): 0.9833
