# Gaussian Kernel Ridge Regression (KRR)

In [1]:
# Imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import cvxpy
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from tqdm import tqdm

## Implementation

In [3]:
# KRR implementation for Gaussian Kernel
def findBestParamsAndPredict(x_train, x_test, y_train, y_test, x_candidate):
    """Grid-search a Gaussian-kernel ridge classifier, then label new samples.

    For every (lambda, sigma) pair of a fixed 25 x 25 logarithmic grid the
    dual coefficients are obtained by solving
    ``(K + lambda * n * I) alpha = y_train`` with
    ``K[i, j] = exp(-||x_i - x_j||^2 / sigma^2)``. A sample is labelled 1
    when its regression score is strictly greater than 1/2 and 0 otherwise.
    The pair with the highest accuracy on ``(x_test, y_test)`` is selected
    (the first one in grid order on ties), its parameters and accuracies are
    printed, and the model fitted with it is used to label ``x_candidate``.

    Parameters
    ----------
    x_train : ndarray of shape (n_train, d)
        Samples used to fit the model.
    x_test : ndarray of shape (n_test, d)
        Held-out samples. They drive the hyper-parameter selection, so the
        printed "test accuracy" is a model-selection score, not an unbiased
        estimate.
    y_train : ndarray of shape (n_train,)
        Training labels. They are compared with the 0/1 predictions, so they
        are expected to be encoded as 0/1.
    y_test : ndarray of shape (n_test,)
        Held-out labels, same encoding as ``y_train``.
    x_candidate : ndarray of shape (n_candidate, d)
        Samples to label with the selected model.

    Returns
    -------
    ndarray
        Integer 0/1 predictions for ``x_candidate`` (one per row when
        ``y_train`` is one-dimensional).
    """
    # Set param grid for lambda (L) and sigma (S)
    L, S = np.meshgrid(np.logspace(-5,-2,25), np.logspace(-1.5,0.5,25))
    L, S = L.flatten(), S.flatten()

    # Pairwise squared distances, computed without the (n, m, d) difference
    # tensor that plain broadcasting would allocate.
    dist = _squared_distances(x_train, x_train)
    # A sample is at distance exactly 0 from itself; remove round-off there.
    np.fill_diagonal(dist, 0.0)
    dist_test = _squared_distances(x_test, x_train)

    # Bound once here: it is needed both inside the grid loop and for the
    # final refit after it.
    n = dist.shape[0]
    identity = np.eye(n)
    preds_tr = []
    preds_tt = []

    # Compute accuracy for all params
    indices = tqdm(range(len(L)), position = 0, leave = True)
    cached_sigma = None
    for i in indices:
        lambd, sigma = L[i], S[i]

        # The flattened grid keeps sigma constant over consecutive entries
        # while lambda varies, and the kernels depend on sigma only, so they
        # are rebuilt only when sigma actually changes.
        if cached_sigma is None or sigma != cached_sigma:
            K = np.exp(-dist/sigma**2)
            K_test = np.exp(-dist_test/sigma**2)
            cached_sigma = sigma

        # Train (compute alpha)
        alpha = np.linalg.solve(K + lambd * n * identity, y_train)

        # Predict
        preds_train = 1 * ((K @ alpha) > 1/2)
        preds_test = 1 * ((K_test @ alpha) > 1/2)
        preds_tr.append(np.sum(preds_train == y_train)/x_train.shape[0])
        preds_tt.append(np.sum(preds_test == y_test)/x_test.shape[0])

    # Extract optimal params (argmax returns the first maximum on ties)
    idx = np.argmax(preds_tt)
    opt_lambda, opt_sigma = L[idx], S[idx]
    print("Optimal parameters: lambda = {}, sigma = {}".format(opt_lambda, opt_sigma))
    print("Training accuracy is: {}".format(preds_tr[idx]))
    print("Test accuracy is: {}".format(preds_tt[idx]))

    # Refit with the selected parameters and predict on the candidates
    K = np.exp(-dist/opt_sigma**2)
    alpha = np.linalg.solve(K + opt_lambda * n * identity, y_train)
    dist_candidate = _squared_distances(x_candidate, x_train)
    K_candidate = np.exp(-dist_candidate/opt_sigma**2)
    predictions = 1 * ((K_candidate @ alpha) > 1/2)
    return predictions


def _squared_distances(a, b):
    """Return the (len(a), len(b)) matrix of squared Euclidean distances between rows."""
    # ||a_i - b_j||^2 = ||a_i||^2 + ||b_j||^2 - 2 <a_i, b_j>: only 2-D
    # arrays are created, instead of a (len(a), len(b), d) tensor.
    sq_dist = (a * a).sum(axis=1)[:, None] + (b * b).sum(axis=1)[None, :] - 2.0 * (a @ b.T)
    # Cancellation in the expansion can leave tiny negative values.
    sq_dist[sq_dist < 0] = 0.0
    return sq_dist

In [5]:
# Format data for submission
def parse_output(predictions, filename):
    '''
        Write predictions to `filename` as a two-column "Id,Bound" CSV.

        predictions : list of predictions; each value is reduced to its sign
                      and -1 is mapped to 0 before being written as an int
        filename    : path of the file to (over)write

        Ids are the 0-based positions of the predictions.
    '''
    labels = np.sign(predictions)
    # Map the negative class from -1 to 0; 0 and 1 are left untouched.
    is_negative = labels == -1
    labels[is_negative] = 0

    with open(filename, 'w') as out:
        out.write("Id,Bound\n")
        out.writelines(
            "{},{}\n".format(row_id, int(labels[row_id]))
            for row_id in range(labels.shape[0])
        )

## Prediction on each dataset

In [None]:
# Compute predictions: one model is selected and fitted per dataset
preds = []
for k in range(3):
    print("Dataset n°{}".format(k))
    # Data loading. Paths are handed to np.loadtxt directly so that NumPy
    # opens and closes the files itself (no dangling file handles).
    train = np.loadtxt("data/Xtr{}_mat100.csv".format(k), delimiter=" ")
    test = np.loadtxt("data/Xte{}_mat100.csv".format(k), delimiter=" ")
    Y = pd.read_csv('data/Ytr{}.csv'.format(k))['Bound'].values

    # Hold out 20% of the labelled data; the hold-out drives the
    # hyper-parameter selection inside findBestParamsAndPredict.
    x_train, x_test, y_train, y_test = train_test_split(train, Y, test_size=0.2, random_state=42)

    # Find best parameters and predict
    predictions = findBestParamsAndPredict(x_train, x_test, y_train, y_test, test)
    preds.append(predictions)

# Export for submission: predictions are concatenated in dataset order
parse_output(np.concatenate(preds), 'test.csv')

## Global Training and Prediction

In [None]:
# Load different datasets. Paths are handed to np.loadtxt directly so that
# NumPy opens and closes the files itself (no dangling file handles).
train_parts, test_parts, label_parts = [], [], []
for k in range(3):
    train_parts.append(np.loadtxt("data/Xtr{}_mat100.csv".format(k), delimiter=" "))
    test_parts.append(np.loadtxt("data/Xte{}_mat100.csv".format(k), delimiter=" "))
    label_parts.append(pd.read_csv('data/Ytr{}.csv'.format(k))['Bound'].values)

# Combine datasets (rows stay in dataset order)
train = np.concatenate(train_parts)
test = np.concatenate(test_parts)
Y = np.concatenate(label_parts)

# Create train/test sets
x_train, x_test, y_train, y_test = train_test_split(train, Y, test_size=0.2, random_state=42)

# Find best parameters and predict
predictions = findBestParamsAndPredict(x_train, x_test, y_train, y_test, test)

# Export for submission: write the predictions of the globally trained model
# computed just above (not the per-dataset `preds` list of the previous
# section). Note that this overwrites 'test.csv'.
parse_output(predictions, 'test.csv')