In [179]:
import numpy as np

# Load the data

For $k = 0, 1, 2$ we have the following files:
* Xtrk.csv - the training sequences.
* Xtek.csv - the test sequences.
* Ytrk.csv - labels for the training sequences

In [281]:
Xtr0_mat100 = np.genfromtxt("data/Xtr0_mat100.csv", delimiter='')
Ytr0 = np.genfromtxt("data/Ytr0.csv", delimiter=',', skip_header=1)

Xtr1_mat100 = np.genfromtxt("data/Xtr1_mat100.csv", delimiter='')
Ytr1 = np.genfromtxt("data/Ytr1.csv", delimiter=',', skip_header=1)

Xtr2_mat100 = np.genfromtxt("data/Xtr2_mat100.csv", delimiter='')
Ytr2 = np.genfromtxt("data/Ytr2.csv", delimiter=',', skip_header=1)


In [282]:
def accuracy(y_true,y_pred):
    n = y_true.shape[0]
    predictions = np.zeros(n)
    predictions[y_pred >= 0.5] = 1
    return np.sum(y_true == predictions) / n

# Implementing some kernels

## Linear Kernel

In [283]:
def linear_kernel(X):
    return(X @ X.T)

## Gaussian Kernel

In [284]:
def gaussian(x,y, sigma):
    exp_term = np.linalg.norm(x-y)**2 /(2*sigma)
    return(np.exp(-exp_term))

# Naive computation of the gaussian kernel that can be easily improved
def gaussian_kernel(X,sigma):
    n = X.shape[0]
    K = np.eye(n) # One along the diagonals because K(x,x) = exp(0) = 1
    for i in range(n):
        for j in range(i+1,n):
            val = gaussian(X[i], X[j], sigma)
            K[i,j] = val
            K[j,i] = val
    return(K)
    

## Polynomial Kernel

## Kernel Ridge Regression

* Consider RKHS $\mathcal H$, associated to a p.d. kernel K on $\mathcal X$
* Let $y = (y_1, \dots, y_n)^T \in \mathbb R ^n$
* Let $\alpha = (\alpha_1, \dots, \alpha_n)^T \in \mathbb R ^n$
* Let $K$ be the $n\times n$ Gram Matrix such that $K_{i,j} = K(x_i, x_j)$
* We can then write
$$
(\hat f(x_1), \dots, \hat f(x_n))^T = K\alpha
$$
* The norm is $||\hat f||^2_{\mathcal H} = \alpha^T K \alpha$
* KRR $\leftrightarrow \text{argmin}_{\alpha \in \mathbb R^n} \frac{1}{n} (K\alpha - y)^T(K\alpha - y) + \lambda \alpha^T K \alpha$
* Solution for $\lambda > 0$:
$$
\alpha = (K+\lambda nI)^{-1}y
$$


In [300]:
def KRR(K, y, Kval, yval, lambd):
    """
    takes the kernel matrix as an input and computes the MSE and the predictions for each value in lambd (list)
    """
    assert K.shape[0] == y.shape[0]
    assert len(lambd) > 0
    n = K.shape[0]
    
    loss = []
    acc = []
    
    loss_val = []
    acc_val = []
    alphas = []
    
    for l in lambd:
        
        assert l >= 0
        # find the parameter alpha
        alpha = np.linalg.solve((K + l*n*np.eye(n)), y)
        # predict
        
        loss_lambda = MSE(K, y, l, alpha)
        acc_lambda = accuracy(y,K@alpha)
        
        loss_lambdaval = MSE(Kval, yval, l, alpha)
        acc_lambdaval = accuracy(yval,Kval@alpha)

        print(f"***********lambda = {l}***********")
        print(f"Training: loss = {loss_lambda:.4f}, accuracy = {acc_lambda:.6f}")
        print(f"Validation: loss = {loss_lambdaval:.4f}, accuracy = {acc_lambdaval:.6f}")
        
        loss += [loss_lambda]
        acc += [acc_lambda]
        
        loss_val += [loss_lambdaval]
        acc_val += [acc_lambdaval]
        
        
        alphas +=[alpha]
        
    return(alphas, loss, acc, loss_val, acc_val)
    

In [286]:
def MSE(K, y, lambd, alpha):
    n = y.shape[0]
    data_term = (np.linalg.norm(np.dot(K, alpha.reshape(-1,1)) - y)**2)/n
    reg_term = alpha @ K @ alpha
    return(data_term + lambd * reg_term)

## Kernel Logistic Regression

- Binary Classificaiton setup: $\mathcal Y = \{-1, 1\}$
- $\mathcal l_{\text{logistic}}(f(x),y) = -\log p(y|f(x)) = \log(1 + e^{-yf(x)})$ where $p(y|f(x)) = \sigma(y(f(x))$

Objective:
\begin{align*}
\hat f &= \text{argmin}_{f\in \mathcal H} \frac{1}{n} \sum_{i=1}^n \log(1+e^{-y_if(x_i)}) + \frac{\lambda}{2}||f||^2_{\mathcal H}\\
\alpha &= \text{argmin}_{\alpha \in \mathbb R^n} \frac{1}{n} \sum_{i=1}^n \log(1+e^{-y_i[K\alpha]_i}) + \frac{\lambda}{2} \alpha^T K \alpha
\end{align*}

We define the following fonctions and vectors:
* $\mathcal l _\text{logistic}(u) = \log(1+e^{-u})$
* $\mathcal l' _\text{logistic}(u) = -\sigma(-u)$
* $\mathcal l'' _\text{logistic}(u) = \sigma(u)\sigma(-u)$

* for $i = 1, \dots, n$, $P_i(\alpha) = \mathcal l' _\text{logistic}(y_i[K\alpha]_i)$
* for $i = 1, \dots, n$, $W_i(\alpha) = \mathcal l'' _\text{logistic}(y_i[K\alpha]_i)$




\begin{align*}
J(\alpha) &= \frac{1}{n} \sum_{i=1}^n \log(1+e^{-y_i[K\alpha]_i}) + \frac{\lambda}{2} \alpha^T K \alpha\\
\nabla J(\alpha) &= \frac{1}{n} KP(\alpha) y + \lambda K \alpha \quad \text{where } P(\alpha) = \text{diag}(P_1(\alpha), \dots, P_n(\alpha))\\
\nabla^2 J(\alpha) &= \frac{1}{n}KW(\alpha)K+\lambda K \quad \text{where } W(\alpha) = \text{diag}(W_1(\alpha), \dots, W_n(\alpha))
\end{align*}

We are interested in the quadratic approximation of $J$ near a point $\alpha_0$:
\begin{align*}
J_q(\alpha) &= J(\alpha_0) + (\alpha - \alpha_0)^T \nabla J(\alpha_0) + \frac{1}{2} (\alpha - \alpha_0)^T \nabla^2 J(\alpha_0)(\alpha - \alpha_0)\\
2J_q(\alpha) &= -\frac{2}{n} \alpha^T KW(K\alpha_0-W^{-1}Py)+\frac{1}{n}\alpha^TKWK\alpha+ \lambda\alpha^TK\alpha +C\\
&= \frac{1}{n} (K\alpha - z)^TW(K\alpha - z) + \lambda\alpha^TK\alpha + C \quad \text{where} z = K\alpha_0 - W^{-1} P y
\end{align*}

The WKRR problem is presented as:
$$
\text{argmin}_{\alpha \in \mathbb R^n} \frac{1}{n}(K\alpha - y)^TW(K\alpha - y) + \lambda \alpha^TK\alpha
$$
and has as :
$$
\alpha = W^{1/2} (W^{1/2}KW^{1/2}+n\lambda I)^{-1} W^{1/2}y
$$

So, in order to solve KRL, we use IRLS on a WKRR problem until convergence:
$$\alpha^{t+1} \gets \text{solveWKRR}(K, W^t, z^t)$$
With the updates for $W^t$ and $z^t$ from $\alpha^t$ are:
- $m_i \gets [K\alpha^t]_i$
- $P_i^t \gets -\sigma(-y_im_i)$
- $W_i^t \gets \sigma(m_i)\sigma(-m_i)$
- $z_i^t \gets m_i + y_i / \sigma(-y_im_i)$

In [287]:
def solveWKRR(K,W,z,y,lambd):
    
    assert np.all(W >= 0)
    
    W_sq = np.sqrt(W)
    n = K.shape[0]
    inv_matrix = np.linalg.solve((W_sq @ K @ W_sq + n * lambd * np.eye(n)), W_sq @ y)
    alpha = W_sq @ inv_matrix
    
    return alpha
    

In [288]:
def sigmoid(x):
    return 1/(1+np.exp(-x))

def logistic_loss(y_true, y_pred):
    n = y_true.shape[0]
    log_term = np.log(sigmoid(y_true*y_pred))
    return(-np.sum(log_term)/n)
    

In [295]:
def KLR(K, y, lambd, maxIter = 100, tresh = 1e-8):
    
    # initialize the values
    assert K.shape[0] == y.shape[0]
    n = K.shape[0]
    
    y_preds = []
    loss = []
    accuracies = []
    alphas = []
    
    for l in lambd :
        cnt = 0
        
        P_t, W_t = np.eye(n), np.eye(n)
        z_t = K@ np.ones(n) - y
        alpha_t = np.ones(n)
        diff_alpha = np.inf


        while (diff_alpha > tresh) and (cnt < maxIter):

            old_alpha = alpha_t
            alpha_t = solveWKRR(K, W_t, z_t, y, l)

            m_t = K@alpha_t
            sigma_m = sigmoid(m_t)
            sigma_my = sigmoid(-y*m_t)

            P_t = - np.diag(sigma_my)
            W_t = np.diag(sigma_m * (1-sigma_m))

            z_t = m_t - (P_t@y)/(sigma_m * (1-sigma_m))

            diff_alpha = np.linalg.norm(alpha_t - old_alpha)
            cnt+=1
            if cnt % 10 == 0:
                print(l, cnt)

        pred_l = K@alpha_t
        y_preds += [pred_l]
        loss_l = logistic_loss(y, pred_l)
        loss += [loss_l]
        accuracy_l = accuracy(y, pred_l)
        accuracies += [accuracy_l]
        alphas +=[alpha_t]
        print(f"The logistic loss and accuracy for lambda = {l} are : {loss_l:.4f}, {accuracy_l:.6f} ")
        
    
    return alphas, loss, accuracies
        

# Testing the accuracy

## Splitting data

In [303]:
from sklearn.model_selection import train_test_split

Xtr0, Xval0, ytr0, yval0 = train_test_split(Xtr0_mat100, Ytr0, test_size=0.5, random_state=42)
Xtr1, Xval1, ytr1, yval1 = train_test_split(Xtr1_mat100, Ytr1, test_size=0.5, random_state=42)
Xtr2, Xval2, ytr2, yval2 = train_test_split(Xtr2_mat100, Ytr2, test_size=0.5, random_state=42)

## Create the kernel matrices

In [304]:
K_tr0 = gaussian_kernel(Xtr0,1)
K_tr1 = gaussian_kernel(Xtr1,1)
K_tr2 = gaussian_kernel(Xtr2,1)

K_val0 = gaussian_kernel(Xval0,1)
K_val1 = gaussian_kernel(Xval1,1)
K_val2 = gaussian_kernel(Xval2,1)

## Testing KRR

In [305]:
lambdas = [0] + [10**i for i in range(-10,2)]
print("*************KRR for dataset 0*************\n")
alphas_tr0, loss_tr0, acc_0, loss_val0, acc_val0 = KRR(K_tr0, ytr0[:,1], K_val0, yval0[:,1], lambdas)
print("*************KRR for dataset 1*************\n")
alphas_tr1, loss_tr1, acc_1, loss_val1, acc_val1 = KRR(K_tr1, ytr1[:,1], K_val1, yval1[:,1],lambdas)
print("*************KRR for dataset 2*************\n")
alphas_tr2, loss_tr2, acc_2, loss_val2, acc_val2 = KRR(K_tr2, ytr2[:,1], K_val2, yval2[:,1],lambdas)

*************KRR for dataset 0*************

***********lambda = 0***********
Training: loss = 497.5500, accuracy = 1.000000
Validation: loss = 3708479274.0792, accuracy = 0.484000
***********lambda = 1e-10***********
Training: loss = 496.5732, accuracy = 1.000000
Validation: loss = 3651845485.1501, accuracy = 0.484000
***********lambda = 1e-09***********
Training: loss = 488.3212, accuracy = 1.000000
Validation: loss = 3216798580.0829, accuracy = 0.484000
***********lambda = 1e-08***********
Training: loss = 435.7197, accuracy = 1.000000
Validation: loss = 1447588604.8629, accuracy = 0.490000
***********lambda = 1e-07***********
Training: loss = 334.5539, accuracy = 0.956000
Validation: loss = 109421856.4602, accuracy = 0.479000
***********lambda = 1e-06***********
Training: loss = 293.8359, accuracy = 0.763000
Validation: loss = 2141724.4853, accuracy = 0.480000
***********lambda = 1e-05***********
Training: loss = 283.1786, accuracy = 0.698000
Validation: loss = 33924.7992, accuracy

## Testing KLR

In [267]:
lambdas = [0] + [10**i for i in range(-10,0)]
print("*************KLR for dataset 0*************\n")
alphas_klr_tr0, loss_klr_tr0, accuracies_klr_0 = KLR(K_tr0, Ytr0[:,1], lambdas, tresh=1e-5)
print("*************KLR for dataset 1*************\n")
alphas_klr_tr1, loss_klr_tr1, accuracies_klr_1 = KLR(K_tr1, Ytr1[:,1], lambdas, tresh=1e-5)
print("*************KLR for dataset 2*************\n")
alphas_klr_tr2, loss_klr_tr2, accuracies_klr_2 = KLR(K_tr2, Ytr2[:,1], lambdas, tresh=1e-5)

*************KLR for dataset 0*************

0 10
0 20
0 30
0 40
0 50
0 60
0 70
0 80
0 90
0 100
The logistic loss and accuracy for lambda = 0 are : 0.5104, 1.000000 
1e-10 10
1e-10 20
1e-10 30
1e-10 40
1e-10 50
1e-10 60
1e-10 70
1e-10 80
1e-10 90
1e-10 100
The logistic loss and accuracy for lambda = 1e-10 are : 0.5126, 1.000000 
1e-09 10
1e-09 20
1e-09 30
1e-09 40
1e-09 50


KeyboardInterrupt: 

## Making predictions

### First create the kernels for each testing set with the chosen parameters

In [292]:
Xte0 = np.genfromtxt("data/Xte0_mat100.csv", delimiter='')
Xte1 = np.genfromtxt("data/Xte1_mat100.csv", delimiter='')
Xte2 = np.genfromtxt("data/Xte2_mat100.csv", delimiter='')

In [293]:
K_te0 = gaussian_kernel(Xte0,1)
K_te1 = gaussian_kernel(Xte1,1)
K_te2 = gaussian_kernel(Xte2,1)

In [319]:
def write_predictions_csv(test_kernels, test_alphas):
    
    predictions = np.zeros((3000,2), dtype=int)
    predictions[:,0] = np.arange(0,3000)
    
    for i in range(3):
        y_pred = test_kernels[0] @ test_alphas[0]
        y_pred[y_pred >= 0.5] = 1
        y_pred[y_pred < 0.5] = 0
        
        predictions[1000*i:1000*(i+1), 1] = y_pred
    predictions = predictions.astype(int)
    print("saving predictions")
    np.savetxt("data/Ytest_KRR.csv", predictions, header = "Id, Bound", delimiter =",")
    print("saved predictions")
        

Example

In [320]:
test_kernels = [K_te0, K_te1, K_te2]
test_alphas = [alphas_tr0[0], alphas_tr1[0], alphas_tr2[0]] # il faut choisir l'alpha associé à un bon lambda!

write_predictions_csv(test_kernels, test_alphas)

saving predictions
saved predictions
