# 0. Data

In [37]:
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
device = torch.device("cuda")
from sklearn.model_selection import train_test_split


In [38]:
'split data'
'first 21 columns are input variables and last 7 columns are output variables'

train_data = pd.read_csv('./data/SARCOSTst.csv', header=None)[:1000]
test_data = pd.read_csv('./data/SARCOSTrn.csv', header=None)[:100]

X_train, X_valid, Y_train, Y_valid = train_test_split(train_data.iloc[:,:21], train_data.iloc[:,-7:], test_size=0.2)
X_test, Y_test = test_data.iloc[:,:21], test_data.iloc[:,-7:]

X_train = torch.tensor(X_train.values).to(device)
X_valid = torch.tensor(X_valid.values).to(device)
Y_train = torch.tensor(Y_train.values).to(device)
Y_valid = torch.tensor(Y_valid.values).to(device)
X_test = torch.tensor(X_test.values).to(device)
Y_test = torch.tensor(Y_test.values).to(device)

print('train set: ', X_train.shape, Y_train.shape)
print('valid set: ', X_valid.shape, Y_valid.shape)
print('test set: ', X_test.shape, Y_test.shape)

train set:  torch.Size([800, 21]) torch.Size([800, 7])
valid set:  torch.Size([200, 21]) torch.Size([200, 7])
test set:  torch.Size([100, 21]) torch.Size([100, 7])


# 1. Kernel

In [39]:
class LinearKernel:
    """
    standard dot product kernel k(a,b) = a^\top b
    :input: X1 (N*D), X2 (M*D)
    :output: covariance matrix (N*M)
    """
    def __init__(self):
        pass

    def __call__(self, X1, X2):
        # return torch.tensor([[torch.dot(x1, x2) for x1 in X1] for x2 in X2]) 
        return X1 @ X2.T 


In [40]:
class GaussianKernel:
    """
    isotropic Gaussian kernel
    :input: X1 (N*D), X2 (M*D)
    :output: covariance matrix (N*M)
    """
    def __init__(self, sigma_k=1):
        self.sigma_k = sigma_k # isotropic Gaussian kernel variance (Hyperparameter !)

    def __call__(self, X1, X2):
        # return np.exp(-(np.sum(X1**2, axis=1).values.reshape(-1, 1) +
        #                 np.sum(X2**2, axis=1).values.reshape(1, -1) - 2*X1@X2.T) / pow(self.sigma_k, 2))
        return torch.exp(-(torch.sum(X1**2, axis=1).reshape(-1, 1) +
                           torch.sum(X2**2, axis=1).reshape(1, -1) - 2*X1@X2.T) / pow(self.sigma_k, 2))


In [41]:
class SquaredExponentialKernel:
    pass

# 2. GP regression

In [44]:
class GP_Regression(nn.Module):
    def __init__(self, K, sigma_n, device):
        super().__init__()
        self.K = K
        self.sigma_n = nn.Parameter(torch.tensor(sigma_n), requires_grad=True)  # noise variance (Hyperparameter)
        self.device = device
        self.Sigma = torch.diag(torch.ones(7)) * self.sigma_n # output dim (=7)
    
    def fit(self, X_train, Y_train, X_test):
        self.X_train = X_train
        self.Y_train = Y_train
        self.X_test = X_test
        self.N = X_train.shape[0]
        self.M = X_test.shape[0]
        self.D = Y_train.shape[1] # output dim (=7)
        self.I = torch.eye(self.N)
        # sufficient statistics
        self.K_X_X = self.K(self.X_train, self.X_train)
        self.K_X_X = torch.block_diag(*[self.K_X_X]*self.D) # coregionalisation matrix (C) is an identity matrix with DxD
        self.K_Xt_X = self.K(self.X_test, self.X_train)
        self.K_Xt_X = torch.block_diag(*[self.K_Xt_X]*self.D)
        self.K_X_Xt = self.K_Xt_X.T
        self.K_Xt_Xt = self.K(self.X_test, self.X_test)
        self.K_Xt_Xt = torch.block_diag(*[self.K_Xt_Xt]*self.D)
        self.vec_Y = self.Y_train.T.reshape(self.D*self.N) # Y concat

    def predict(self):
        # calculate predictive mean
        mean = self.K_Xt_X @ torch.linalg.inv( self.K_X_X + torch.kron(self.Sigma, self.I).to(self.device) ) @ self.vec_Y
        return mean.reshape(self.D, -1).T # to compare with Y_test

    def __NLL_term_1__(self):
        return -0.5*(self.M*self.D) * torch.log(torch.tensor([2*torch.pi]))

    def __NLL_term_2__(self, CR): # Omega <- I
        Sigma = torch.diag(torch.ones(self.D)) * self.sigma_n
        SI = torch.kron(Sigma, self.I).to(self.device) 
        K = CR + SI
        return -0.5 * torch.log(torch.det(K))

    def __NLL_term_3__(self, CR): # Omega <- I
        Sigma = torch.diag(torch.ones(self.D)) * self.sigma_n
        SI = torch.kron(Sigma, self.I).to(self.device)
        K = CR + SI
        vec_Y = self.Y_train.T.reshape(self.D*self.N)  # Y concat
        return -0.5 * vec_Y.T @ torch.linalg.inv(K) @ vec_Y
    
    def calculate_NLL(self):
        K_X_X = self.K(self.X_train, self.X_train)
        CR = torch.block_diag(*[K_X_X]*self.D) 
        return self.__NLL_term_1__().to(self.device) + self.__NLL_term_2__(CR).to(self.device) + self.__NLL_term_3__(CR).to(self.device)


In [45]:
model = GP_Regression(K=GaussianKernel(), sigma_n=0.05, device=device)
model.fit(X_train, Y_train, X_test)
# model.predict() # 이거 한번 거쳐야 self.vec_Y 사용 가능

optimizer = torch.optim.SGD(model.parameters(), lr=0.00002)
n_iter=50
print(f'Initial Parameter: {list(model.parameters())[0].item()}')
print('')
for i in range(n_iter):
    print(f'[Iteration {i}]')
    nll = model.calculate_NLL()
    optimizer.zero_grad()
    nll.backward() # calculate derivatives
    optimizer.step() # update parameters
    print(f"NLL: {nll.item()}")
    for i in range(len(list(model.parameters()))):
        print('Paremeter: ', list(model.parameters())[i].item())
    pred = model.predict()
    mse = nn.MSELoss()(pred, Y_test).item()
    print(f"MSE:", mse)
    print("")


Initial Parameter: 0.05000000074505806

[Iteration 0]
NLL: -970830.0
Paremeter:  -18.234878540039062
MSE: 376.5684566220583

[Iteration 1]
NLL: -inf
Paremeter:  nan
MSE: 376.5684566220583

[Iteration 2]
NLL: nan
Paremeter:  nan
MSE: 376.5684566220583

[Iteration 3]
NLL: nan
Paremeter:  nan
MSE: 376.5684566220583

[Iteration 4]


KeyboardInterrupt: 

# 3. Main