In [119]:
import numpy as np

# Load the data

For $k = 0, 1, 2$ we have the following files:
* Xtrk.csv - the training sequences.
* Xtek.csv - the test sequences.
* Ytrk.csv - labels for the training sequences

In [None]:
Xtr0_mat100 = np.genfromtxt("data/Xtr0_mat100.csv", delimiter='')
Ytr0 = np.genfromtxt("data/Ytr0.csv", delimiter=',', skip_header=1)

Xtr1_mat100 = np.genfromtxt("data/Xtr1_mat100.csv", delimiter='')
Ytr1 = np.genfromtxt("data/Ytr1.csv", delimiter=',', skip_header=1)

Xtr2_mat100 = np.genfromtxt("data/Xtr2_mat100.csv", delimiter='')
Ytr2 = np.genfromtxt("data/Ytr2.csv", delimiter=',', skip_header=1)


In [None]:
Xtr0_mat100.shape

# Implementing some kernels

## Gaussian Kernel

In [None]:
def gaussian(x,y, sigma):
    exp_term = np.linalg.norm(x-y)**2 /(2*sigma)
    return(np.exp(-exp_term))

# Naive computation of the gaussian kernel that can be easily improved
def gaussian_kernel(X,sigma):
    n = X.shape[0]
    K = np.eye(n) # One along the diagonals because K(x,x) = exp(0) = 1
    for i in range(n):
        for j in range(i+1,n):
            val = gaussian(X[i], X[j], sigma)
            K[i,j] = val
            K[j,i] = val
    return(K)
    

## Polynomial Kernel

## Kernel Ridge Regression

* Consider RKHS $\mathcal H$, associated to a p.d. kernel K on $\mathcal X$
* Let $y = (y_1, \dots, y_n)^T \in \mathbb R ^n$
* Let $\alpha = (\alpha_1, \dots, \alpha_n)^T \in \mathbb R ^n$
* Let $K$ be the $n\times n$ Gram Matrix such that $K_{i,j} = K(x_i, x_j)$
* We can then write
$$
(\hat f(x_1), \dots, \hat f(x_n))^T = K\alpha
$$
* The norm is $||\hat f||^2_{\mathcal H} = \alpha^T K \alpha$
* KRR $\leftrightarrow \text{argmin}_{\alpha \in \mathbb R^n} \frac{1}{n} (K\alpha - y)^T(K\alpha - y) + \lambda \alpha^T K \alpha$
* Solution for $\lambda > 0$:
$$
\alpha = (K+\lambda nI)^{-1}y
$$


In [None]:
def KRR(K, y, lambd):
    """
    takes the kernel matrix as an input and computes the MSE and the predictions for each value in lambd (list)
    """
    assert K.shape[0] == y.shape[0]
    assert len(lambd) > 0
    
    y_preds = []
    loss = []
    for l in lambd:
        assert l >= 0
        # find the parameter alpha
        alpha = np.linalg.solve((K + l*n*np.eye(n)), y)
        # predict
        loss_lambda = MSE(y, l, alpha, K)
        print(f"The MSE for lambda = {l:.2f} is : {loss_lambda:.4f}")
        y_preds += [K @ alpha]
        loss += [loss_lambda]
    return(y_preds, loss)
    

In [None]:
def MSE(y, lambd, alpha, K):
    n = y.shape[0]
    data_term = (np.linalg.norm(np.dot(K, alpha.reshape(-1,1)) - y)**2)/n
    reg_term = alpha @ K @ alpha
    return(data_term + lambd * reg_term)

In [None]:
K_tr0 = gaussian_kernel(Xtr0_mat100,0.5)

In [120]:
lambdas = np.linspace(0,0.1,10)
pred_tr0, loss_tr0 = KRR(K_tr0, Ytr0[:,1], lambdas)

The MSE for lambda = 0.00 is : 998.5560
The MSE for lambda = 0.01 is : 499.4957
The MSE for lambda = 0.02 is : 499.5742
The MSE for lambda = 0.03 is : 499.8317
The MSE for lambda = 0.04 is : 500.1999
The MSE for lambda = 0.06 is : 500.6640
The MSE for lambda = 0.07 is : 501.2154
The MSE for lambda = 0.08 is : 501.8478
The MSE for lambda = 0.09 is : 502.5553
The MSE for lambda = 0.10 is : 503.3328


## Kernel Logistic Regression

- Binary Classificaiton setup: $\mathcal Y = \{-1, 1\}$
- $\mathcal l_{0/1}(f(x),y) = \mathbb 1\{yf(x) < 0 \}$ (0 if $y = \text{sign}f(x)$, 1 otherwise)
- $\mathcal l_{\text{logistic}}(f(x),y) = -\log p(y|f(x)) = \log(1 + e^{-yf(x)})$ where $p(y|f(x)) = \sigma(y(f(x))$
- solve WKRR