In [1]:
import torch
import numpy as np
from matplotlib import pyplot as plt
from scipy import ndimage
import os, sys
import math
import pickle
import notebook_utils as nbutils
import data_utils as datutil
import datetime as dt
import hmc
from models import *
import gpytorch
from notebook_utils import *

In [2]:
def learning_rate_mod_factor(epoch, lr_init, lr_end, end_epoch):
    lr_ratio = lr_end / lr_init
    t = (epoch) / (end_epoch*1.0)
    if t < 0.2:
        factor = 1.0
    elif t <= 0.9:
        factor = 1.0 - (1.0 - lr_ratio) * (t - 0.4) / 0.5
    else:
        factor = lr_ratio
    return factor

In [3]:
class GP_type1(gpytorch.models.AdditiveGridInducingVariationalGP):
    '''
    Gaussian Process layer with inducing point approximation of GP

    num_dim: incoming feature size
    grid_bounds: value bounds for inducing point features
    grid_size: number of inducing points to use
    '''
    def __init__(self, num_dim, grid_bounds=(-10., 10.), grid_size=64):
        super(GP_type1, self).__init__(grid_size=grid_size, grid_bounds=[grid_bounds],
                                                   num_dim=num_dim, mixing_params=False, sum_output=False)
        self.covar_module = gpytorch.kernels.ScaleKernel(
            # change the kernel to something else at this point
            gpytorch.kernels.RBFKernel(ard_num_dims=num_dim,  # this option sets dimension-wise lengthscale
                lengthscale_prior=gpytorch.priors.SmoothedBoxPrior(
                    math.exp(-1), math.exp(1), sigma=0.1, transform=torch.exp
                )
            )
        )
        self.mean_module = gpytorch.means.ConstantMean()
        self.grid_bounds = grid_bounds

    def forward(self, x):
        mean = self.mean_module(x)
        covar = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean, covar)


class GP_type2(gpytorch.models.AbstractVariationalGP):
    '''
    Gaussian Process layer with inducing point approximation of GP

    num_dim: incoming feature size
    grid_bounds: value bounds for inducing point features
    grid_size: number of inducing points to use
    '''
    def __init__(self, num_dim, grid_bounds=(-10., 10.), grid_size=64):
        variational_distribution = gpytorch.variational.CholeskyVariationalDistribution(
            num_inducing_points=grid_size, batch_size=num_dim
        )
        variational_strategy = gpytorch.variational.AdditiveGridInterpolationVariationalStrategy(
            self, grid_size=grid_size, grid_bounds=[grid_bounds], num_dim=num_dim,
            variational_distribution=variational_distribution, mixing_params=False, sum_output=False
        )
        super().__init__(variational_strategy)

        self.covar_module = gpytorch.kernels.ScaleKernel(
            gpytorch.kernels.RBFKernel(ard_num_dims=num_dim,
                lengthscale_prior=gpytorch.priors.SmoothedBoxPrior(
                    math.exp(-1), math.exp(1), sigma=0.1, transform=torch.exp
                )
            )
        )

In [4]:
class GPNet(gpytorch.Module):

    def __init__(self, device, feature_extractor, num_classes=10, grid_bounds=(-10., 10.), grid_size=64):
        '''
        Wrapper class for sequentializing a feature extractor and a GP layer
        
        device: cuda device to place everything into
        feature_extractor: base feature extractor, should be set to Identity()
                if features are encoded
        num_classes: number of classes in classification task
        grid_bounds, grid_size: refer to input for GP layer
        '''
        super(GPNet, self).__init__()
        self.feature_extractor = feature_extractor
        self.num_dim = feature_extractor.fc_architecture[-1]
        self.gp_layer = GP_type1(num_dim=self.num_dim, grid_bounds=grid_bounds, grid_size=grid_size)
        self.grid_bounds = grid_bounds
        self.device = device
        self.feature_extractor.to(device)
        self.gp_layer.to(device)

    def forward(self, x):
        features = self.feature_extractor(x)
        features = gpytorch.utils.grid.scale_to_bounds(features, self.grid_bounds[0], self.grid_bounds[1])
        res = self.gp_layer(features)
        return res


In [5]:
class Identity(nn.Module):
    '''
    A dummy empty class to place whenever we
    do not need any nn block but have to put something
    '''
    def __init__(self):
        super(Identity, self).__init__()
        
    def forward(self, x):
        return x

class feature(nn.Module):
    '''
    Wrapper class for feature extractor then a series of dense layers.

    base_feature: base feature extractor (maybe resnet till before dense)
                set it to Identity class when you have encoded feature.
                Otherwise feature extractor parameters will be jointly learned
    fc_layers: array containing dense layer lengths starting with base feature dim
                e.g. [256, 100, 10] will expect a 256 dimensional input and then
                place linear(256, 100) and then linear(100, 10) sequentially
    device: cuda device in which (local as well) parameters will be put.
    '''
    def __init__(self, base_feature, fc_layers=[]):
        super(feature, self).__init__()
        self.base_layer = base_feature
        self.fc_architecture = fc_layers
        assert len(fc_layers) > 0, 'FC layer can not be empty, at minimum it needs to be [input_feature_size]'
        linear_list = [Identity()]
        for comp_idx in range(2*len(fc_layers)-3):
            if comp_idx%2==0:
                idx = comp_idx // 2
                linear_list.append(nn.Linear(fc_layers[idx], fc_layers[idx+1]))
            else:
                linear_list.append(nn.ReLU())

        self.fc_list = nn.Sequential(*linear_list)

        
    def forward(self, x):
        x = self.base_layer(x)
        if len(self.fc_architecture) > 0:
            x = self.fc_list(x)
        
        return x

In [6]:
# Data loader initialization
trainloader = datutil.generate_dataloaders('ENCODED256_D164_CIFAR10_TRAIN', batch_size=300, shuffle=False, num_workers=2)
testloader = datutil.generate_dataloaders('ENCODED256_D164_CIFAR10_TEST', batch_size=200, shuffle=False, num_workers=2)
# trainloader = datutil.generate_dataloaders('Fast_resnet30000MixupCutout_train', batch_size=300, shuffle=False, num_workers=2)
# validloader = datutil.generate_dataloaders('Fast_resnet30000MixupCutout_train', batch_size=300, 
#                                            shuffle=False, num_workers=2, start=45000)
# testloader = datutil.generate_dataloaders('Fast_resnet30000MixupCutout_test', batch_size=200, shuffle=False, num_workers=2)
# trainloader = datutil.generate_dataloaders('Encoded_DR_train', batch_size=300, shuffle=False, num_workers=2)
# testloader = datutil.generate_dataloaders('Encoded_DR_test', batch_size=200, shuffle=False, num_workers=2)

device = torch.device('cuda:1')

In [7]:
num_classes = 10
fc_layer_setup = [256] #[256, 160]
weight_decay = 0.0003
gp_weight_decay = 0.0003
lr = 0.1
lr_end = 0.0001
grid_size = 64

# base_model = PreResNet(num_classes=10, depth=164)
# base_model.fc = Identity()
base_model = Identity()
feature_extractor = feature(base_model, fc_layer_setup)
final_model = GPNet(device=device, feature_extractor=feature_extractor, num_classes=num_classes, grid_size=grid_size)
print(final_model)

# likelihood, marginal log-likelihood for GP layers
likelihood = gpytorch.likelihoods.SoftmaxLikelihood(fc_layer_setup[-1], num_classes)
likelihood.to(device)
mll = gpytorch.mlls.VariationalELBO(likelihood, final_model.gp_layer, num_data=len(trainloader.dataset))
optimizer = torch.optim.SGD([
            {'params': final_model.feature_extractor.parameters(), 'weight_decay': weight_decay},
            {'params': final_model.gp_layer.hyperparameters(), 'lr': lr * 0.01, 'weight_decay': gp_weight_decay},
            {'params': final_model.gp_layer.variational_parameters(), 'weight_decay': gp_weight_decay},
            {'params': likelihood.parameters()}], lr=lr, momentum=0.9)


GPNet(
  (feature_extractor): feature(
    (base_layer): Identity()
    (fc_list): Sequential(
      (0): Identity()
    )
  )
  (gp_layer): GP_type1(
    (variational_strategy): AdditiveGridInterpolationVariationalStrategy(
      (variational_distribution): CholeskyVariationalDistribution()
    )
    (covar_module): ScaleKernel(
      (base_kernel): RBFKernel(
        (lengthscale_prior): SmoothedBoxPrior()
        (raw_lengthscale_constraint): Positive()
      )
      (raw_outputscale_constraint): Positive()
    )
    (mean_module): ConstantMean()
  )
)


In [8]:
running_loss = 0
epoch_length = 50

for epoch in range(0, epoch_length):  # loop over the dataset multiple times

    factor = learning_rate_mod_factor(epoch, lr, lr_end, epoch_length)
    for i, g in enumerate(optimizer.param_groups):
        print("Learning rate for param %d is currently %.4f" %(i, lr * factor))
        g['lr'] = lr * factor

    for i, data in enumerate(trainloader):

        inputs, labels = data
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        output = final_model(inputs)
        loss = -mll(output, labels)
        loss.sum().backward()
        optimizer.step()
        running_loss = 0.9*running_loss + 0.1*loss.item() if running_loss != 0 else loss.item()

        if i% (len(trainloader) // 1) == 0:
            print('[%d, %5d] loss: %.3f' %(epoch + 1, i, running_loss))

    print("=== Accuracy using SGD params ===")
    accuracy, ece, sce = nbutils.validate(model=final_model, likelihood=likelihood, dataloader=testloader, device=device)


Learning rate for param 0 is currently 0.1000
Learning rate for param 1 is currently 0.1000
Learning rate for param 2 is currently 0.1000
Learning rate for param 3 is currently 0.1000




[1,     0] loss: 2.308
=== Accuracy using SGD params ===






Accuracy statistics
Overall accuracy : 10.3 %


KeyboardInterrupt: 

In [None]:
savefile = 'GP_notebook_model_file'
savedir = 'saved_models/'
checkpoint = {'model_state': final_model.state_dict(),
              'likelihood_state' : likelihood.state_dict(),
              'optim_state': optimizer.state_dict(),
              'acc': 1}
curtime = dt.datetime.now()
tm = curtime.strftime("%Y-%m-%d-%H.%M")
torch.save(checkpoint, savedir + savefile + '-' + tm + '.model')

In [None]:
print(len(trainloader.dataset))
accuracy, ece, sce = nbutils.validate(model=final_model, likelihood=likelihood, dataloader=testloader, device=device)
