In [1]:
import gpytorch
import torch
import torchvision
from torch import nn
from torch.autograd import Variable
from torchvision import transforms

gpytorch.functions.use_toeplitz = False

In [2]:
class FeatureExtractor(nn.Sequential):
    
    def __init__(self):
        super(FeatureExtractor, self).__init__(nn.Conv2d(1, 32, kernel_size=5, padding=2),
                                 nn.BatchNorm2d(32),
                                 nn.ReLU(),
                                 nn.MaxPool2d(2, 2),
                                 nn.Conv2d(32, 64, kernel_size=5, padding=2),
                                 nn.BatchNorm2d(64),
                                 nn.ReLU(),
                                 nn.MaxPool2d(2, 2))
        
class Bottleneck(nn.Sequential):
    
    def __init__(self):
        super(Bottleneck, self).__init__(nn.Linear(64*7*7, 128),
                                         nn.BatchNorm1d(128),
                                 nn.ReLU(),
                                 nn.Linear(128, 128),
                                 nn.BatchNorm1d(128),
                                 nn.ReLU(),
                                 nn.Linear(128,64),
                                 nn.BatchNorm1d(64))

class LeNet(nn.Module):
    
    def __init__(self):
        super(LeNet, self).__init__()
        self.feature_extractor = FeatureExtractor()
        self.bottleneck = Bottleneck()
        self.final_layer = nn.Sequential(
                                 nn.ReLU(),
                                 nn.Linear(64,10))
    
    def forward(self, x):
        features = self.feature_extractor(x)
        bottlenecked_features = self.bottleneck(features.view(-1, 64 * 7 * 7))
        classification = self.final_layer(bottlenecked_features)
        return classification
        

In [3]:
train_mnist = torchvision.datasets.MNIST('/tmp', train=True,
                                         download=True, transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ]))                                              
test_mnist = torchvision.datasets.MNIST('/tmp', train=False,
                                        download=True, transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ]))

In [4]:
train_data_loader = torch.utils.data.DataLoader(train_mnist, shuffle=True, pin_memory=True, batch_size=256)

In [5]:
criterion = nn.CrossEntropyLoss().cuda()

In [6]:
model = LeNet().cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

In [7]:
num_epochs = 1
for i in range(num_epochs):
    for x, y in train_data_loader:
        optimizer.zero_grad()
        x = Variable(x.cuda())
        y = Variable(y.cuda())
        output = model(x)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()
    print("Loss: ", loss.data[0])
    

KeyboardInterrupt: 

In [8]:
list(model.bottleneck.modules())[-1].weight.data.fill_(1)


 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
[torch.cuda.FloatTensor of size 64 (GPU 0)]

In [9]:
from gpytorch.kernels import RBFKernel, GridInterpolationKernel

class DeepKernel(gpytorch.Module):
    def __init__(self, model):
        super(DeepKernel, self).__init__()
        self.feature_extractor = model.feature_extractor
        self.bottleneck = model.bottleneck
        self.gp_layer = GPLayer()
        
    def forward(self, x):
        features = self.feature_extractor(x)
        bottlenecked_features = self.bottleneck(features.view(-1, 64 * 7 * 7))
        gp_output = self.gp_layer(bottlenecked_features)
        return gp_output
    
    
class LatentFunction(gpytorch.AdditiveGridInducingPointModule):
    def __init__(self):
        super(LatentFunction, self).__init__(grid_size=128, grid_bounds=[(-10, 10)],
                                             n_components=64, mixing_params=True)
        cov_module = RBFKernel()
        cov_module.initialize(log_lengthscale=2)
        self.cov_module = cov_module
        
    def forward(self, x):
        mean = Variable(x.data.new(len(x)).zero_())
        covar = self.cov_module(x)
        return gpytorch.random_variables.GaussianRandomVariable(mean, covar)

    
class GPLayer(gpytorch.GPModel):
    def __init__(self, n_dims=64):
        super(GPLayer, self).__init__(gpytorch.likelihoods.BernoulliLikelihood())
        self.latent_function = LatentFunction()
    
    def forward(self, x):
        res = self.latent_function(x)
        return res
    

In [10]:
len(train_mnist)

60000

In [11]:
deep_kernel = DeepKernel(model).cuda()
gp_data_loader = torch.utils.data.DataLoader(train_mnist, batch_size=2048., pin_memory=True, shuffle=True)

In [12]:
# Find optimal model hyperparameters
deep_kernel.train()
optimizer = torch.optim.Adam(deep_kernel.gp_layer.parameters(), lr=0.01)
optimizer.n_iter = 0
for i in range(10):
    for j, (train_x_batch, train_y_batch) in enumerate(gp_data_loader):
        train_x_batch = Variable(train_x_batch).cuda()
        train_y_batch = Variable(train_y_batch.fmod(2) * 2 - 1).cuda()
        optimizer.zero_grad()
        output = deep_kernel(train_x_batch)
        loss = -deep_kernel.gp_layer.marginal_log_likelihood(output, train_y_batch.float(), n_data=len(train_mnist))
        loss.backward()
        optimizer.n_iter += 1
        print('Iter %d/200 - Loss: %.3f' % (
            i + 1, loss.data[0],
        ))
        optimizer.step()
        
    deep_kernel.eval()
    test_data_loader = torch.utils.data.DataLoader(test_mnist, shuffle=False, pin_memory=True, batch_size=256)

    avg = 0.
    i = 0.
    for test_batch_x, test_batch_y in test_data_loader:
        predictions = deep_kernel(Variable(test_batch_x).cuda()).probability.round()
        test_batch_y = Variable(test_batch_y.fmod(2)).cuda().float()
        avg += torch.eq(predictions, test_batch_y).float().mean().data[0]
        i += 1.

    print('Score')
    print(avg / i)
    deep_kernel.train()


Iter 1/200 - Loss: 222.398
Iter 1/200 - Loss: 214.856
Iter 1/200 - Loss: 190.638
Iter 1/200 - Loss: 141.696
Iter 1/200 - Loss: 98.469
Iter 1/200 - Loss: 81.781
Iter 1/200 - Loss: 105.759
Iter 1/200 - Loss: 84.105
Iter 1/200 - Loss: 98.800
Iter 1/200 - Loss: 55.308
Iter 1/200 - Loss: 55.503
Iter 1/200 - Loss: 29.140
Iter 1/200 - Loss: 54.601
Iter 1/200 - Loss: 36.172
Iter 1/200 - Loss: 24.872
Iter 1/200 - Loss: 24.862
Iter 1/200 - Loss: 14.737
Iter 1/200 - Loss: 12.124
Iter 1/200 - Loss: 19.757
Iter 1/200 - Loss: 15.063
Iter 1/200 - Loss: 20.375
Iter 1/200 - Loss: 11.287
Iter 1/200 - Loss: 21.851
Iter 1/200 - Loss: 8.822
Iter 1/200 - Loss: 11.276
Iter 1/200 - Loss: 9.714
Iter 1/200 - Loss: 7.737
Iter 1/200 - Loss: 7.760
Iter 1/200 - Loss: 5.808
Iter 1/200 - Loss: 8.790
Score
0.9015625
Iter 2/200 - Loss: 12.486
Iter 2/200 - Loss: 14.123
Iter 2/200 - Loss: 3.902
Iter 2/200 - Loss: 12.070
Iter 2/200 - Loss: 4.662
Iter 2/200 - Loss: 3.900
Iter 2/200 - Loss: 5.129
Iter 2/200 - Loss: 9.649
It

KeyboardInterrupt: 

In [13]:
deep_kernel.eval()
test_data_loader = torch.utils.data.DataLoader(test_mnist, shuffle=False, pin_memory=True, batch_size=256)

avg = 0.
i = 0.
for test_batch_x, test_batch_y in test_data_loader:
    predictions = deep_kernel(Variable(test_batch_x).cuda()).probability.round()
    test_batch_y = Variable(test_batch_y.fmod(2)).cuda().float()
    avg += torch.eq(predictions, test_batch_y).float().mean().data[0]
    i += 1.

print(avg / i)


0.98251953125
