# 0. Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
'split data'
'first 21 columns are input variables and last 7 columns are output variables'

train_data = pd.read_csv('./data/SARCOSTst.csv', header=None)
test_data = pd.read_csv('./data/SARCOSTrn.csv', header=None)

X_train, X_valid, Y_train, Y_valid = train_test_split(train_data.iloc[:,:21], train_data.iloc[:,-7:], test_size=0.2)
X_test, Y_test = test_data.iloc[:,:21], test_data.iloc[:,-7:]

print('train set: ', X_train.shape, Y_train.shape)
print('valid set: ', X_valid.shape, Y_valid.shape)
print('test set: ', X_test.shape, Y_test.shape)

train set:  (31587, 21) (31587, 7)
valid set:  (7897, 21) (7897, 7)
test set:  (5000, 21) (5000, 7)


# 1. Nonlinear GP regression
- Gaussian kernel

In [3]:
class GaussianKernel:
    """
    isotropic Gaussian kernel
    :input: X1 (N*D), X2 (M*D)
    :output: covariance matrix (N*M)
    """
    def __init__(self, sigma_k=1):
        self.sigma_k = sigma_k # kernel variance (Hyperparameter !)

    def __call__(self, X1, X2):
        return np.exp(-(np.sum(X1**2, axis=1).values.reshape(-1, 1) +
                        np.sum(X2**2, axis=1).values.reshape(1, -1) - 2*X1@X2.T) / pow(self.sigma_k, 2))


In [4]:
class GaussianProcessRegression:
    def __init__(self, X_train, Y_train, K=GaussianKernel(), sigma_n=1):
        self.X_train = X_train
        self.Y_train = Y_train
        self.K = K
        self.sigma_n = sigma_n  # noise variance (Hyperparameter !)
        self.predictive_distribution = None
        self.temp = None

    def predict(self, X_test):
        D = self.Y_train.shape[1]
        C = np.identity(D) # coregionalisation matrix (used for multi-output model) # 7x7
        K_X_X = self.K(self.X_train, self.X_train)  # 31587x31587
        K_X_X = np.kron(C, K_X_X)
        K_X_Xt = self.K(self.X_train, X_test); 
        K_X_Xt = np.kron(C, K_X_Xt)
        K_Xt_X = self.K(X_test, self.X_train); 
        K_Xt_X = np.kron(C, K_Xt_X)
        K_Xt_Xt = self.K(X_test, X_test); 
        K_Xt_Xt = np.kron(C, K_Xt_Xt)
        Sigma = self.sigma_n * np.identity(D)
        I = np.identity(len(self.X_train))
        y_concat = self.Y_train.T.stack(level=-1).values

        # predictive mean, covariance, variance
        mean = K_Xt_X @ np.linalg.inv( K_X_X + np.kron(Sigma,I) ) @ y_concat
        cov = K_Xt_Xt - K_Xt_X @ np.linalg.inv( K_X_X + np.kron(Sigma,I) ) @ K_X_Xt
        var = np.diag(cov)

        self.predictive_distribution = {'mean': mean, 'cov':cov, 'var':var}
        self.temp = [C, K_X_X, K_X_Xt, K_Xt_X, K_Xt_Xt, Sigma, I, y_concat]

        return pd.DataFrame(mean.reshape(D, -1)).T


In [5]:
model = GaussianProcessRegression(X_train, Y_train)
pred = model.predict(X_test)

MSE = ((pred.values - Y_test.values)**2).mean(axis=1).mean() # MSE

C = model.temp[0]
K_X_X = model.temp[1]
K_X_Xt = model.temp[2]
K_Xt_X = model.temp[3]
K_Xt_Xt = model.temp[4]
Sigma = model.temp[5]
I = model.temp[6]
y_concat = model.temp[7]


MemoryError: Unable to allocate 364. GiB for an array with shape (49, 997738569) and data type float64

# 2.Linear GP regression 
- Standard dot product kernel
    - k(a,b) = a^\top b

In [None]:
class GaussianKernel:
    """
    isotropic Gaussian kernel
    :input: X1 (N*D), X2 (M*D)
    :output: covariance matrix (N*M)
    """

    def __init__(self, sigma_k=1):
        self.sigma_k = sigma_k  # kernel variance (Hyperparameter !)

    def __call__(self, X1, X2):
        return np.exp(-(np.sum(X1**2, axis=1).values.reshape(-1, 1) +
                        np.sum(X2**2, axis=1).values.reshape(1, -1) - 2*X1@X2.T) / pow(self.sigma_k, 2))
