# Handwritten Digit Classification
### Gaussian Process Classification

#### Dataset Description ([Link to Data](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html))

Each entry corresponds to one handwritten digit on an 8x8 pixel grid. The dataset contains 1797 samples, with about 180 samples for each of the 10 classes (0-9). We perform Gaussian Process Classification using the Jensen-Shannon Metric with an exponential kernel, where 10 models are trained in a One-vs-Rest approach. We use PCA for direction optimization and use the probit link function to convert the regression outputs to classes. This example should be replicable without too much difficulty or time. Instead of training with global optimization, we use MCMC with 1000 iterations.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sys

from fvgp import GP
from fvgp.gp_kernels import exponential_kernel

from sklearn.datasets import load_digits
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import shuffle
from sklearn.decomposition import PCA

from scipy.stats import wasserstein_distance
from scipy.spatial.distance import cdist
from scipy.stats import norm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

import ot

In [3]:
# 1. Load and Preprocess the Digits Dataset
digits = load_digits()
# Keep only every 10th sample (and its label) to shrink the problem
X = digits.data[::10]
y = digits.target[::10]

# Hold out 10% of the subsample for testing, stratified on the digit label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    random_state=42,
)

# Rescale every image, in place, into a strictly positive vector that sums
# to one so it can be treated as a discrete probability distribution:
# shift the minimum to zero, add a tiny offset, then divide by the total.
for split in (X_train, X_test):
    for image in split:
        image -= image.min()
        image += 1e-8
        image /= image.sum()

In [4]:
# 2. Define Sinkhorn Distance Function
def sinkhorn_distance(p, q, M, reg):
    """
    Evaluate ``ot.sinkhorn2`` for a pair of histograms.

    Parameters
    ----------
    p, q : array-like
        The two distributions to compare; each is converted to a float64
        NumPy array before being handed to the solver.
    M : array-like
        Ground cost matrix, passed through unchanged.
    reg : float
        Regularization strength, passed through unchanged.

    Returns
    -------
    Whatever ``ot.sinkhorn2(p, q, M, reg)`` returns (presumably the
    entropy-regularized transport cost; verify against the installed POT
    version).
    """
    source, target = (np.asarray(hist, dtype=np.float64) for hist in (p, q))
    return ot.sinkhorn2(source, target, M, reg)

In [5]:
# 3. Compute Pairwise Sinkhorn Distance Matrices (precomputed once, reused by the kernel)
def compute_Sinkhorn_matrix(X1, X2, M, reg):
    """
    Build the matrix of pairwise Sinkhorn distances between two sample sets.

    Parameters
    ----------
    X1, X2 : arrays indexed by sample along the first axis
        Entry ``[i, j]`` of the result compares ``X1[i]`` with ``X2[j]``.
    M : array-like
        Ground cost matrix forwarded to ``sinkhorn_distance``.
    reg : float
        Regularization strength forwarded to ``sinkhorn_distance``.

    Returns
    -------
    numpy.ndarray of shape ``(X1.shape[0], X2.shape[0])``.
    """
    shape = (X1.shape[0], X2.shape[0])
    distances = np.zeros(shape)
    # np.ndindex walks the index pairs in row-major order, i.e. the same
    # order as a nested "for i ... for j ..." loop.
    for row, col in np.ndindex(*shape):
        distances[row, col] = sinkhorn_distance(X1[row], X2[col], M, reg)
    return distances

In [6]:
# 4. Prepare Cost Matrix for Sinkhorn Distance
# Positions of the pixels in the 8x8 grid
# Enumerate the (row, column) coordinate of every pixel, row by row
positions = np.array(
    [[row, col] for row in range(8) for col in range(8)]
)  # shape (64, 2)

In [7]:
# Ground cost for optimal transport: squared Euclidean distance between
# every pair of pixel coordinates
M = cdist(positions, positions, metric='sqeuclidean')  # shape (64, 64)

# Regularization parameter for Sinkhorn distance
reg = 0.1

print("Computing Sinkhorn distance matrices...")
# (left samples, right samples, progress message) for each matrix, in the
# order they are computed
_sinkhorn_jobs = (
    (X_train, X_train, "Finished train x train"),   # Training vs. Training
    (X_test, X_test, "Finished test x test"),       # Testing vs. Testing
    (X_train, X_test, "Finished train x test"),     # Training vs. Testing
)
_sinkhorn_results = []
for _lhs, _rhs, _done_message in _sinkhorn_jobs:
    _sinkhorn_results.append(compute_Sinkhorn_matrix(_lhs, _rhs, M, reg))
    print(_done_message)
Sinkhorn_X_train, Sinkhorn_X_test, Sinkhorn_X_train_test = _sinkhorn_results
print("Sinkhorn distance matrices computed.")


Computing Sinkhorn distance matrices...
Finished train x train
Finished test x test
Finished train x test
Sinkhorn distance matrices computed.


In [8]:
# 5. Define the GP Kernel Function Using Precomputed Sinkhorn Matrices
def Sinkhorn_kernel(X1, X2, hyperparameters):
    """
    Kernel callback that looks up a precomputed Sinkhorn distance matrix.

    The inputs themselves are never compared here: only their lengths are
    used to decide which of the module-level matrices (train/train,
    test/test, train/test or its transpose) applies. The branches are
    checked in that order, so the first match wins.

    Parameters
    ----------
    X1, X2 : sized containers
        Sample sets whose lengths must each equal the number of rows of the
        global ``X_train`` or ``X_test``.
    hyperparameters : sequence
        Only element 0 is read; it is used as the length scale.

    Returns
    -------
    The result of ``exponential_kernel(distance_matrix, length_scale)``.

    Raises
    ------
    ValueError
        If the pair of lengths matches none of the four supported cases.
    """
    length_scale = hyperparameters[0]
    n_train, n_test = X_train.shape[0], X_test.shape[0]
    sizes = (len(X1), len(X2))

    if sizes == (n_train, n_train):
        return exponential_kernel(Sinkhorn_X_train, length_scale)
    if sizes == (n_test, n_test):
        return exponential_kernel(Sinkhorn_X_test, length_scale)
    if sizes == (n_train, n_test):
        return exponential_kernel(Sinkhorn_X_train_test, length_scale)
    if sizes == (n_test, n_train):
        # Same matrix as above with the roles of rows and columns swapped
        return exponential_kernel(Sinkhorn_X_train_test.T, length_scale)
    raise ValueError("Invalid input sizes for X1 and X2.")

In [9]:
# 6. Initialize Hyperparameters and Bounds
initial_length_scale = 1.0
# The kernel reads a single hyperparameter, so the vector has length one
init_hyperparameters = np.full(1, initial_length_scale)

# One (lower, upper) row per hyperparameter; here only the length scale
length_scale_bounds = np.array([0.1, 10.0]).reshape(1, 2)

In [14]:
# 7. Train GP Models Using One-vs-Rest Strategy
num_classes = 10  # Digits 0-9
gp_models = []

print("Training GP models...")
for class_label in range(num_classes):
    print(f"Training GP model for class {class_label}...")

    # One-vs-rest target: 1.0 where the sample is this digit, 0.0 elsewhere
    y_train_binary = np.where(y_train == class_label, 1.0, 0.0)

    # NOTE(review): the recorded run of this cell stopped at class 0 with
    # "LinAlgError: 135-th leading minor of the array is not positive
    # definite". Presumably the Sinkhorn-based kernel matrix plus this small
    # constant noise term is not positive definite -- TODO confirm.
    gp_model = GP(
        X_train,
        y_train_binary,
        init_hyperparameters=init_hyperparameters,
        noise_variances=np.full(len(y_train_binary), 1e-6),  # Noise variance
        gp_kernel_function=Sinkhorn_kernel,
    )

    # Fit the length scale within its bounds using the MCMC method
    gp_model.train(
        method='mcmc',
        hyperparameter_bounds=length_scale_bounds,
        max_iter=1000,
        tolerance=1e-3,
    )

    gp_models.append(gp_model)
    print(f"GP model for class {class_label} trained.")

print("All GP models trained.")

Training GP models...
Training GP model for class 0...


LinAlgError: 135-th leading minor of the array is not positive definite

In [None]:
# 8. Define the Probit Link Function (Prefer over Logit, Gaussian Assumptions)
def probit(mu, sigma2):
    """
    Map a Gaussian mean/variance pair to a probability via the normal CDF.

    Computes ``norm.cdf(mu / sqrt(1 + sigma2))``; a larger variance shrinks
    the argument towards zero and therefore pulls the result towards 0.5.

    Parameters
    ----------
    mu : float or array-like
        Mean value(s).
    sigma2 : float or array-like
        Variance value(s), combined elementwise with ``mu``.

    Returns
    -------
    Value(s) of the standard normal CDF at the scaled mean.
    """
    scale = np.sqrt(sigma2 + 1)
    return norm.cdf(mu / scale)

# Predict Probabilities Using the Trained GP Models
def predict_probs(X_test, gp_models):
    """
    Turn the posterior of each GP model into a per-sample probability.

    Parameters
    ----------
    X_test : array with samples along the first axis
        Points at which every model is queried.
    gp_models : sequence
        Models exposing ``posterior_mean`` and ``posterior_covariance``; the
        i-th model fills column i of the output.

    Returns
    -------
    Array of shape ``(X_test.shape[0], len(gp_models))`` obtained by applying
    ``probit`` to the stacked posterior means and variances.
    """
    shape = (X_test.shape[0], len(gp_models))
    means = np.zeros(shape)
    variances = np.zeros(shape)

    for column, model in enumerate(gp_models):
        # Posterior mean at the query points, stored under the "f(x)" key
        mean_result = model.posterior_mean(X_test)
        means[:, column] = mean_result["f(x)"].flatten()

        # Posterior variances only, stored under the "v(x)" key
        cov_result = model.posterior_covariance(X_test, variance_only=True)
        variances[:, column] = cov_result["v(x)"].flatten()

    # The variance-aware probit link converts the moments to probabilities
    return probit(means, variances)

In [None]:
# 9. Predict Class Labels and Evaluate the Classifier
probabilities = predict_probs(X_test, gp_models)

# For each test sample, pick the class whose model gives the highest probability
y_pred = probabilities.argmax(axis=1)

accuracy = 100 * accuracy_score(y_test, y_pred)
print(f'\nAccuracy: {accuracy:.0f}%')
print('Classification Report:')
print(classification_report(y_test, y_pred))