In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sys

from fvgp import GP
from fvgp.gp_kernels import exponential_kernel

from sklearn.datasets import load_digits
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import shuffle

from scipy.stats import wasserstein_distance

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset


In [2]:
# 1. Load and Preprocess the Digits Dataset
digits = load_digits()
X, y = digits.data, digits.target

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)


In [3]:
# 2. Create Sparse Datasets by Selecting Every 5th Sample
X_train_sparse = X_train[::5]
X_test_sparse = X_test[::5]
y_train_sparse = y_train[::5]
y_test_sparse = y_test[::5]

# Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_sparse)
X_test_scaled = scaler.transform(X_test_sparse)

In [13]:
# 3. Define the Sliced Wasserstein Exponential Kernel
def sliced_wasserstein_exponential_kernel(X1, X2, hyperparameters, n_directions=64):
    """
    Computes the sliced Wasserstein exponential kernel between two datasets.

    Parameters:
    - X1: (n1, d) ndarray
    - X2: (n2, d) ndarray
    - hyperparameters: ndarray, contains [length_scale]
    - n_directions: int, number of random projections

    Returns:
    - kernel_matrix: (n1, n2) ndarray
    """
    length_scale = hyperparameters[0]
    
    n1, d = X1.shape
    n2, _ = X2.shape

    # Generate random directions (unit vectors)
    directions = np.random.randn(n_directions, d)
    directions /= np.linalg.norm(directions, axis=1, keepdims=True)  # Normalize to unit vectors

    # Initialize kernel matrix
    kernel_matrix = np.zeros((n1, n2))

    # Iterate over each direction
    for dir_idx in range(n_directions):
        direction = directions[dir_idx]

        # Project the data onto the current direction
        proj_X1 = X1.dot(direction)  # Shape: (n1,)
        proj_X2 = X2.dot(direction)  # Shape: (n2,)

        # Compute the absolute differences between projections
        # Broadcasting to compute pairwise |x_i - y_j|
        abs_diff = np.abs(proj_X1[:, np.newaxis] - proj_X2[np.newaxis, :])  # Shape: (n1, n2)

        # Apply the exponential kernel
        kernel_matrix += exponential_kernel(abs_diff, length_scale)

    # Average over all directions
    kernel_matrix /= n_directions

    # Add jitter for numerical stability
    jitter = 1e-3
    if X1.shape[0] == X2.shape[0]:
        kernel_matrix += jitter * np.eye(X1.shape[0])

    return kernel_matrix

In [5]:
def sliced_wasserstein_exponential_kernel_wrapper(x1, x2, hyperparameters):
    """
    Wrapper function to match the expected signature for the GP kernel.

    Parameters:
    - x1: (n1, d) ndarray
    - x2: (n2, d) ndarray
    - hyperparameters: ndarray, contains [length_scale]

    Returns:
    - kernel_matrix: (n1, n2) ndarray
    """
    return sliced_wasserstein_exponential_kernel(x1, x2, hyperparameters, n_directions=64)

In [6]:
# 5. Initialize Hyperparameters and Bounds
initial_length_scale = 1.0  # Initial guess for length scale
init_hyperparameters = np.array([initial_length_scale])

# Define bounds for the length scale (e.g., between 0.1 and 10)
length_scale_bounds = np.array([[0.1, 10.0]])

In [7]:
# 6. Initialize and Train GP Models for Each Class using One-vs-Rest Strategy
gp_models_wasserstein = []
num_classes = 10  # Digits 0-9

for class_label in range(num_classes):
    print(f"Training GP model for class {class_label}")

    # Binary labels for the current class
    y_train_binary = (y_train_sparse == class_label).astype(float)

    # Initialize GP model
    gp_model = GP(
        X_train_scaled,
        y_train_binary,
        init_hyperparameters=init_hyperparameters,
        gp_kernel_function=sliced_wasserstein_exponential_kernel_wrapper,
        noise_variances=np.ones_like(y_train_binary) * 0.25 + 1e-6  # Noise variance
    )

    # Train the GP model using MCMC
    gp_model.train(
        hyperparameter_bounds=length_scale_bounds,
        method='mcmc',
        max_iter=100,
        tolerance=1e-3,  
    )

    gp_models_wasserstein.append(gp_model)
    print(f"GP model for class {class_label} trained.\n")


Training GP model for class 0
GP model for class 0 trained.

Training GP model for class 1
GP model for class 1 trained.

Training GP model for class 2
GP model for class 2 trained.

Training GP model for class 3
GP model for class 3 trained.

Training GP model for class 4
GP model for class 4 trained.

Training GP model for class 5
GP model for class 5 trained.

Training GP model for class 6
GP model for class 6 trained.

Training GP model for class 7
GP model for class 7 trained.

Training GP model for class 8
GP model for class 8 trained.

Training GP model for class 9
GP model for class 9 trained.



In [8]:
# 7. Define Prediction Function
def predict_probs(X_test, gp_models):
    """
    Predicts class probabilities for the test set using trained GP models.

    Parameters:
    - X_test: (n_test, d) ndarray
    - gp_models: list of trained GP models

    Returns:
    - probabilities: (n_test, num_classes) ndarray
    """
    num_classes = len(gp_models)
    n_test = X_test.shape[0]
    means = np.zeros((n_test, num_classes))

    for class_label, gp_model in enumerate(gp_models):
        # Compute the posterior mean for the test data
        posterior = gp_model.posterior_mean(X_test)
        mean = posterior["f(x)"]  # Extract mean predictions
        means[:, class_label] = mean.flatten()

    # Apply softmax to convert means to probabilities
    probabilities = softmax(means)
    return probabilities

def softmax(x):
    """
    Applies the softmax function to each row of the input array.

    Parameters:
    - x: (n, num_classes) ndarray

    Returns:
    - softmaxed: (n, num_classes) ndarray
    """
    e_x = np.exp(x - np.max(x, axis=1, keepdims=True))
    return e_x / e_x.sum(axis=1, keepdims=True)


In [11]:
# 8. Make Predictions on the Test Set
gp_probabilities_wasserstein = predict_probs(X_test_scaled, gp_models_wasserstein)
gp_predictions_wasserstein = np.argmax(gp_probabilities_wasserstein, axis=1)
gp_accuracy_wasserstein = accuracy_score(y_test_sparse, gp_predictions_wasserstein)


In [49]:
# 9. Evaluate the Classifier
print(f'GP Classifier with Sliced Wasserstein Kernel – Accuracy: {gp_accuracy_wasserstein:.4f}\n')
print('Classification Report:')
print(classification_report(y_test_sparse, gp_predictions_wasserstein))

GP Classifier with Sliced Wasserstein Kernel – Accuracy: 0.8333

Classification Report:
              precision    recall  f1-score   support

           0       0.57      1.00      0.73         4
           1       1.00      0.80      0.89         5
           2       1.00      0.67      0.80         3
           3       1.00      1.00      1.00         2
           4       1.00      1.00      1.00         3
           5       1.00      1.00      1.00         3
           6       0.83      1.00      0.91         5
           7       0.67      0.67      0.67         3
           8       0.80      0.57      0.67         7
           9       1.00      1.00      1.00         1

    accuracy                           0.83        36
   macro avg       0.89      0.87      0.87        36
weighted avg       0.86      0.83      0.83        36



Next Steps:
- include the multiplication kernel
- standardize the projected directions
- kernel choice

In [95]:
import numpy as np
from sklearn.metrics import accuracy_score, classification_report

# Define the Sigmoid (Logit) Function
def sigmoid(x):
    """
    Applies the sigmoid function element-wise to the input array.
    
    Parameters:
    - x: ndarray
    
    Returns:
    - sigmoided: ndarray
    """
    return 1 / (1 + np.exp(-x))

# Define Prediction Function using Sigmoid (Logit)
def predict_probs_sigmoid(X_test, gp_models):
    """
    Predicts class probabilities for the test set using trained GP models with sigmoid activation.
    
    Parameters:
    - X_test: (n_test, d) ndarray
    - gp_models: list of trained GP models
    
    Returns:
    - probabilities: (n_test, num_classes) ndarray
    """
    num_classes = len(gp_models)
    n_test = X_test.shape[0]
    logits = np.zeros((n_test, num_classes))

    for class_label, gp_model in enumerate(gp_models):
        # Compute the posterior mean for the test data
        posterior = gp_model.posterior_mean(X_test)
        mean = posterior["f(x)"]  # Extract mean predictions
        logits[:, class_label] = mean.flatten()

    # Apply sigmoid to convert logits to probabilities
    probabilities = sigmoid(logits)
    return probabilities

# Predict Probabilities using Sigmoid Link Function
gp_probabilities_sigmoid = predict_probs_sigmoid(X_test_scaled, gp_models_wasserstein)

# Predict Class Labels by Selecting the Class with the Highest Probability
gp_predictions_sigmoid = np.argmax(gp_probabilities_sigmoid, axis=1)

# Calculate Accuracy
gp_accuracy_sigmoid = accuracy_score(y_test_sparse, gp_predictions_sigmoid)

# 9. Evaluate the Classifier
print(f'GP Classifier with Sigmoid Link Function – Accuracy: {gp_accuracy_sigmoid:.4f}\n')
print('Classification Report:')
print(classification_report(y_test_sparse, gp_predictions_sigmoid))


GP Classifier with Sigmoid Link Function – Accuracy: 0.8611

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       1.00      0.60      0.75         5
           2       0.75      1.00      0.86         3
           3       1.00      1.00      1.00         2
           4       1.00      0.67      0.80         3
           5       0.50      1.00      0.67         3
           6       1.00      1.00      1.00         5
           7       1.00      1.00      1.00         3
           8       1.00      0.71      0.83         7
           9       0.50      1.00      0.67         1

    accuracy                           0.86        36
   macro avg       0.88      0.90      0.86        36
weighted avg       0.92      0.86      0.87        36



In [96]:
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from scipy.stats import norm

# Define the Probit Function
def probit(x):
    """
    Applies the probit (Gaussian CDF) function element-wise to the input array.
    
    Parameters:
    - x: ndarray
    
    Returns:
    - probited: ndarray
    """
    return norm.cdf(x)

# Define Prediction Function using Probit Link
def predict_probs_probit(X_test, gp_models):
    """
    Predicts class probabilities for the test set using trained GP models with probit activation.
    
    Parameters:
    - X_test: (n_test, d) ndarray
    - gp_models: list of trained GP models
    
    Returns:
    - probabilities: (n_test, num_classes) ndarray
    """
    num_classes = len(gp_models)
    n_test = X_test.shape[0]
    logits = np.zeros((n_test, num_classes))

    for class_label, gp_model in enumerate(gp_models):
        # Compute the posterior mean for the test data
        posterior = gp_model.posterior_mean(X_test)
        mean = posterior["f(x)"]  # Extract mean predictions
        logits[:, class_label] = mean.flatten()

    # Apply probit (Gaussian CDF) to convert logits to probabilities
    probabilities = probit(logits)
    return probabilities

# Predict Probabilities using Probit Link Function
gp_probabilities_probit = predict_probs_probit(X_test_scaled, gp_models_wasserstein)

# Predict Class Labels by Selecting the Class with the Highest Probability
gp_predictions_probit = np.argmax(gp_probabilities_probit, axis=1)

# Calculate Accuracy
gp_accuracy_probit = accuracy_score(y_test_sparse, gp_predictions_probit)

# 9. Evaluate the Classifier
print(f'GP Classifier with Probit Link Function – Accuracy: {gp_accuracy_probit:.4f}\n')
print('Classification Report:')
print(classification_report(y_test_sparse, gp_predictions_probit))


GP Classifier with Probit Link Function – Accuracy: 0.8333

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       0.80      0.80      0.80         5
           2       0.60      1.00      0.75         3
           3       1.00      0.50      0.67         2
           4       1.00      1.00      1.00         3
           5       1.00      1.00      1.00         3
           6       0.83      1.00      0.91         5
           7       0.50      0.67      0.57         3
           8       1.00      0.57      0.73         7
           9       1.00      1.00      1.00         1

    accuracy                           0.83        36
   macro avg       0.87      0.85      0.84        36
weighted avg       0.87      0.83      0.83        36



In [110]:
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from scipy.stats import norm

# Define the Probit Function with Variance Incorporation
def probit_with_variance(mu, sigma2):
    """
    Applies the probit function with variance adjustment.
    
    Parameters:
    - mu: ndarray of shape (n_test, num_classes)
      Posterior means for each class.
    - sigma2: ndarray of shape (n_test, num_classes)
      Posterior variances for each class.
      
    Returns:
    - probabilities: ndarray of shape (n_test, num_classes)
      Adjusted probabilities for each class.
    """
    # Adjust the mean by incorporating variance
    adjusted_mu = mu / np.sqrt(1 + sigma2)
    return norm.cdf(adjusted_mu)

# Define Prediction Function using Probit Link with Variance
def predict_probs_probit_with_variance(X_test, gp_models):
    """
    Predicts class probabilities for the test set using trained GP models
    with probit activation, incorporating posterior variance.
    
    Parameters:
    - X_test: (n_test, d) ndarray
      Test data features.
    - gp_models: list of trained GP models
      One GP model per class.
      
    Returns:
    - probabilities: (n_test, num_classes) ndarray
      Class probabilities for each test sample.
    """
    num_classes = len(gp_models)
    n_test = X_test.shape[0]
    
    # Initialize arrays to store means and variances
    means = np.zeros((n_test, num_classes))
    variances = np.zeros((n_test, num_classes))
    
    for class_label, gp_model in enumerate(gp_models):
        # Compute the posterior mean for the test data
        posterior_mean = gp_model.posterior_mean(X_test)
        mean = posterior_mean["f(x)"]  # Extract mean predictions
        means[:, class_label] = mean.flatten()
        
        # Compute the posterior variance for the test data
        posterior_cov = gp_model.posterior_covariance(X_test, variance_only=True)
        variance = posterior_cov["v(x)"]  # Extract variances
        variances[:, class_label] = variance.flatten()
    
    # Apply probit with variance to convert means and variances to probabilities
    probabilities = probit_with_variance(means, variances)
    return probabilities

# Predict Probabilities using Probit Link with Variance
gp_probabilities_probit_var = predict_probs_probit_with_variance(X_test_scaled, gp_models_wasserstein)

# Predict Class Labels by Selecting the Class with the Highest Probability
gp_predictions_probit_var = np.argmax(gp_probabilities_probit_var, axis=1)

# Calculate Accuracy
gp_accuracy_probit_var = accuracy_score(y_test_sparse, gp_predictions_probit_var)

# 9. Evaluate the Classifier
print(f'GP Classifier with Probit Link Function (Incorporating Variance) – Accuracy: {gp_accuracy_probit_var:.4f}\n')
print('Classification Report:')
print(classification_report(y_test_sparse, gp_predictions_probit_var))




GP Classifier with Probit Link Function (Incorporating Variance) – Accuracy: 0.8611

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       1.00      0.40      0.57         5
           2       0.40      0.67      0.50         3
           3       1.00      1.00      1.00         2
           4       1.00      1.00      1.00         3
           5       1.00      1.00      1.00         3
           6       1.00      1.00      1.00         5
           7       0.75      1.00      0.86         3
           8       0.86      0.86      0.86         7
           9       1.00      1.00      1.00         1

    accuracy                           0.86        36
   macro avg       0.90      0.89      0.88        36
weighted avg       0.90      0.86      0.86        36



In [121]:
gp_probabilities_sigmoid[0]

array([0.4502325 , 0.45134926, 0.47431008, 0.4927964 , 0.41762249,
       0.5635634 , 0.45153889, 0.71764992, 0.59673628, 0.5305073 ])

In [122]:
gp_probabilities_probit_var[0]

array([0.45999514, 0.44703695, 0.43432159, 0.48614555, 0.54774741,
       0.50127841, 0.55273951, 0.83564407, 0.58452505, 0.59076567])

In [126]:
gp_probabilities_sigmoid[1]

array([0.53731936, 0.53973398, 0.55362314, 0.4734236 , 0.51683638,
       0.53324227, 0.42161375, 0.54093926, 0.6400128 , 0.57499343])

In [125]:
gp_probabilities_probit_var[1]

array([0.56458416, 0.64465083, 0.54540548, 0.47490549, 0.5892383 ,
       0.45076306, 0.4923338 , 0.58474147, 0.75611995, 0.54313679])