In [1]:
import gpytorch
import torch
from torch.autograd import Variable

# Switch off gpytorch's module-level `use_toeplitz` flag before any model is built.
# NOTE(review): presumably this disables a Toeplitz-structured fast path — confirm
# against the installed gpytorch version.
gpytorch.functions.use_toeplitz = False

In [2]:
import os
from torchvision import transforms, datasets
from torch.utils.data import DataLoader

# Preprocessing shared by both MNIST splits: convert each image to a tensor,
# then normalise it with the constants (0.1307,) / (0.3081,).
mnist_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])

# Both splits are downloaded to (or read back from) /tmp.
train_dataset = datasets.MNIST('/tmp', train=True, download=True, transform=mnist_transform)
test_dataset = datasets.MNIST('/tmp', train=False, download=True, transform=mnist_transform)

# Only the training loader shuffles; the evaluation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False, pin_memory=True)

## Define the feature extractor for our deep kernel

In [3]:
from collections import OrderedDict
from torch import nn
from torch.nn import functional as F

class LeNetFeatureExtractor(nn.Module):
    """LeNet-style convolutional network that maps an image to 64 features.

    Two conv -> batch-norm -> ReLU -> 2x2 max-pool stages are followed by a
    fully connected layer (again with batch-norm and ReLU), so every output
    feature is non-negative.  The flatten step hard-codes 32 maps of 7x7,
    which is what two 2x2 poolings produce for single-channel 28x28 inputs.
    """

    def __init__(self):
        super(LeNetFeatureExtractor, self).__init__()
        # Stage 1: 1 -> 16 channels; a 5x5 kernel with padding=2 keeps the spatial size.
        self.conv1 = nn.Conv2d(1, 16, kernel_size=5, padding=2)
        self.norm1 = nn.BatchNorm2d(16)
        # Stage 2: 16 -> 32 channels, same kernel/padding.
        self.conv2 = nn.Conv2d(16, 32, kernel_size=5, padding=2)
        self.norm2 = nn.BatchNorm2d(32)
        # Head: flattened 32x7x7 maps -> 64 features.
        self.fc3 = nn.Linear(32 * 7 * 7, 64)
        self.norm3 = nn.BatchNorm1d(64)

    def forward(self, x):
        """Return the (batch, 64) feature matrix for a batch of images ``x``."""
        stage1 = F.relu(self.norm1(self.conv1(x)))
        stage1 = F.max_pool2d(stage1, 2)

        stage2 = F.relu(self.norm2(self.conv2(stage1)))
        stage2 = F.max_pool2d(stage2, 2)

        # Collapse channel and spatial dimensions into one vector per sample.
        flat = stage2.view(-1, 32 * 7 * 7)
        return F.relu(self.norm3(self.fc3(flat)))
    
# Build the extractor and move its parameters to the GPU.  The training and
# evaluation functions in this notebook also call .cuda() on every batch, so a
# CUDA device is required to run it.
feature_extractor = LeNetFeatureExtractor().cuda()

### Pretrain the feature extractor a bit

In [15]:
# Linear head (64 features -> 10 classes) used by pretrain()/pretest().
classifier = nn.Linear(64, 10).cuda()

# The extractor and the head are optimised jointly: extractor parameters first,
# then the head's, in one flat list handed to SGD with momentum.
params = list(feature_extractor.parameters())
params.extend(classifier.parameters())
optimizer = torch.optim.SGD(params, momentum=0.9, lr=0.1)

def pretrain(epoch):
    """Run one epoch of supervised pretraining of the feature extractor.

    Each batch from the module-level ``train_loader`` is pushed through
    ``feature_extractor`` and the linear ``classifier``; the NLL of the
    log-softmax output is minimised with the module-level ``optimizer``.

    Args:
        epoch: Epoch number, used only in the progress message.

    Prints the gradient norm of ``fc3.weight`` after every backward pass
    (debug trace) and the sample-weighted mean training loss at the end.
    """
    feature_extractor.train()
    train_loss = 0.
    for data, target in train_loader:
        data, target = data.cuda(), target.cuda()
        optimizer.zero_grad()
        features = feature_extractor(data)
        output = F.log_softmax(classifier(features), 1)
        loss = F.nll_loss(output, target)
        loss.backward()
        # Debug trace.  Read the gradient from the extractor directly: the DKL
        # `model` wrapper is only created in a later cell, so referencing it
        # here raised NameError on a fresh kernel.
        print(feature_extractor.fc3.weight.grad.norm().item())
        optimizer.step()
        # nll_loss returns the batch mean; re-weight by batch size so the
        # epoch average is exact even when the last batch is smaller.
        train_loss += loss.item() * len(data)
    print('Train Epoch: %d\tLoss: %.6f' % (epoch, train_loss / len(train_dataset)))

def pretest():
    """Evaluate the pretraining classifier on the test set.

    Runs ``feature_extractor`` (in eval mode) and ``classifier`` over the
    module-level ``test_loader`` without building an autograd graph, then
    prints the average per-sample NLL and the classification accuracy.
    """
    feature_extractor.eval()
    test_loss = 0.
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.cuda(), target.cuda()
            features = feature_extractor(data)
            output = F.log_softmax(classifier(features), 1)
            # Sum (not average) over the batch so dividing by the dataset size
            # below yields the true per-sample mean.
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            # Index of the max log-probability is the predicted class.
            pred = output.max(1, keepdim=True)[1]
            # .item() keeps `correct` a plain Python int, so the percentage
            # below is a float division and formats cleanly.
            correct += pred.eq(target.view_as(pred)).sum().item()
    n_test = len(test_loader.dataset)
    test_loss /= n_test
    print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.3f}%)'.format(
        test_loss, correct, n_test,
        100. * correct / n_test))

# Supervised warm-up: for each of n_epochs epochs (numbered from 1), run one
# pretraining pass over the training set and then evaluate on the test set.
n_epochs = 3
for epoch in range(1, n_epochs + 1):
    pretrain(epoch)
    pretest()

0.599850893021
0.54921990633
0.40523019433
0.24218775332
0.133994534612
0.0876626372337
0.0635821446776
0.0592243224382
0.0479586273432
0.0417336821556
0.0514233969152
0.0439690388739
0.0434920154512
0.0456688664854
0.046168204397
0.0448490232229
0.0424641892314
0.0398730300367


KeyboardInterrupt: 

## Define the deep kernel GP

In [36]:
def print_grad(grad):
    """Debugging hook: print the norm of ``grad`` and pass it through unchanged.

    Intended for ``register_hook``; because the same object is returned, the
    gradient flowing through the hooked tensor is not modified.
    """
    grad_norm = grad.data.norm()
    print('woo', grad_norm)
    return grad

class DKLModel(gpytorch.Module):
    """Deep-kernel model: a feature extractor feeding ``LatentFunctions``.

    Args:
        feature_extractor: Module mapping a batch of inputs to a
            (batch, n_features) feature matrix.
        n_features: Number of features; forwarded to ``LatentFunctions`` and
            kept on the instance as ``n_features``.
        grid_bounds: (lower, upper) pair; features are rescaled into this
            range and the same pair is forwarded to ``LatentFunctions``.
    """

    def __init__(self, feature_extractor, n_features=64, grid_bounds=(-10., 10.)):
        super(DKLModel, self).__init__()
        self.feature_extractor = feature_extractor
        self.latent_functions = LatentFunctions(n_features=n_features, grid_bounds=grid_bounds)

        self.grid_bounds = grid_bounds
        self.n_features = n_features

    def forward(self, x):
        """Extract features from ``x``, rescale them and apply the latent functions."""
        features = self.feature_extractor(x)
        # Scale to fit inside grid bounds
        features = gpytorch.utils.scale_to_bounds(features, self.grid_bounds[0], self.grid_bounds[1])
        # Add a trailing singleton dimension before handing off to the GP layer.
        features = features.unsqueeze(-1)
        # Debug instrumentation.  register_hook raises on a tensor that does
        # not require grad (e.g. under torch.no_grad()), so only attach the
        # gradient hook when a gradient can actually flow.
        if features.requires_grad:
            features.register_hook(print_grad)
        print(features.requires_grad)
        res = self.latent_functions(features)
        return res
    
    
class LatentFunctions(gpytorch.models.AdditiveGridInducingVariationalGP):
    """Additive grid-inducing variational GP with a zero mean and an RBF kernel.

    One component is created per feature (``n_components=n_features``), with
    ``mixing_params`` and ``sum_output`` both disabled.

    Args:
        n_features: Number of additive components.
        grid_bounds: (lower, upper) pair, passed to the base class as a
            one-element list and stored on the instance.
        grid_size: Grid size forwarded to the base class.
    """

    def __init__(self, n_features=64, grid_bounds=(-10., 10.), grid_size=128):
        super(LatentFunctions, self).__init__(
            grid_size=grid_size,
            grid_bounds=[grid_bounds],
            n_components=n_features,
            mixing_params=False,
            sum_output=False,
        )
        # RBF kernel whose log-lengthscale starts at 0.
        rbf_kernel = gpytorch.kernels.RBFKernel()
        rbf_kernel.initialize(log_lengthscale=0)
        self.cov_module = rbf_kernel
        self.grid_bounds = grid_bounds

    def forward(self, x):
        """Return the Gaussian prior (zero mean, RBF covariance) evaluated at ``x``."""
        # Debug trace: does the input carry gradient information?
        print(x.requires_grad)
        # Zero mean vector with one entry per row of x, allocated like x's data.
        zero_mean = Variable(x.data.new(len(x)).zero_())
        covar = self.cov_module(x)
        # Debug hook on the covariance's underlying variable.
        covar.var.register_hook(print_grad)
        return gpytorch.random_variables.GaussianRandomVariable(zero_mean, covar)
    
    
# Wrap the existing feature extractor in the deep-kernel model and build a
# 10-class softmax likelihood over the model's n_features outputs; both are
# moved to the GPU.
model = DKLModel(feature_extractor).cuda()
likelihood = gpytorch.likelihoods.SoftmaxLikelihood(n_features=model.n_features, n_classes=10).cuda()

In [37]:
# NOTE: this cell rebinds the module-level `train_loader` and `optimizer` that
# pretrain() also reads, so re-running the pretraining cell afterwards would
# pick up these objects instead of the originals.

# Larger batches (2048) for the GP training phase.
train_loader = DataLoader(train_dataset, batch_size=2048, shuffle=True, pin_memory=True)

# A single Adam optimiser over the model's parameters followed by the likelihood's.
optimizer = torch.optim.Adam(
    [param for module in (model, likelihood) for param in module.parameters()],
    lr=0.01,
)

def train(epoch):
    """Run one epoch of deep-kernel GP training.

    For every batch of the module-level ``train_loader`` the negative
    marginal log likelihood of ``model.latent_functions`` is minimised with
    the module-level ``optimizer``.

    Args:
        epoch: Epoch number, used only in the progress message.

    Prints, per batch, the gradient norm of the extractor's ``fc3.weight``
    (debug trace) and the batch loss.
    """
    model.train()
    likelihood.train()

    n_batches = len(train_loader)
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.cuda(), target.cuda()
        optimizer.zero_grad()
        output = model(data)
        loss = -model.latent_functions.marginal_log_likelihood(likelihood, output, target, n_data=len(train_dataset))
        loss.backward()
        # Debug trace: how much gradient reaches the feature extractor.  The
        # gradient can be None when nothing flows back to fc3 and zero_grad()
        # cleared it, so report that explicitly instead of crashing.
        fc3_grad = model.feature_extractor.fc3.weight.grad
        print(None if fc3_grad is None else fc3_grad.norm().item())
        optimizer.step()
        # .item() extracts the scalar; indexing a 0-dim tensor is an error.
        print('Train Epoch: %d [%03d/%03d], Loss: %.6f' % (epoch, batch_idx + 1, n_batches, loss.item()))

def test():
    """Evaluate the deep-kernel model on the test set and print its accuracy.

    Puts ``model`` and ``likelihood`` in eval mode, predicts every batch of
    the module-level ``test_loader`` and counts matches against the targets.
    No loss is reported: the original never accumulated one, so the
    "Average loss" it printed was always 0.
    """
    model.eval()
    likelihood.eval()

    correct = 0
    # NOTE(review): inference still builds an autograd graph.  The removed
    # `volatile=True` flag has no effect in current PyTorch; wrapping this loop
    # in torch.no_grad() should be checked against the gradient hooks that the
    # model's forward passes register before it is introduced.
    for data, target in test_loader:
        data, target = data.cuda(), target.cuda()
        output = likelihood(model(data))
        pred = output.argmax()
        # int() keeps the running count a plain Python integer so the
        # percentage below is an ordinary float division.
        correct += int(pred.eq(target.view_as(pred)).sum())
    n_test = len(test_loader.dataset)
    print('Test set: Accuracy: {}/{} ({:.3f}%)'.format(
        correct, n_test, 100. * correct / n_test))

# GP training: for each of n_epochs epochs (numbered from 1), time one training
# pass with the IPython %time magic, then evaluate on the test set.
n_epochs = 10
for epoch in range(1, n_epochs + 1):
    %time train(epoch)
    test()

True
False
('woo', 54470.25390625)
0.0
Train Epoch: 1 [001/030], Loss: 43.634186
True
False
('woo', 15751.7099609375)
0.0
Train Epoch: 1 [002/030], Loss: 50.805923
True
False
('woo', 9784.4853515625)
0.0
Train Epoch: 1 [003/030], Loss: 65.788666
True
False
('woo', 14001.9326171875)
0.0
Train Epoch: 1 [004/030], Loss: 67.677086
True
False
('woo', 13221.333984375)
0.0
Train Epoch: 1 [005/030], Loss: 36.372559
True
False
('woo', 7209.7607421875)
0.0
Train Epoch: 1 [006/030], Loss: 47.233997
True
False
('woo', 7517.69384765625)
0.0
Train Epoch: 1 [007/030], Loss: 55.449318
True
False
('woo', 44494.05859375)
0.0
Train Epoch: 1 [008/030], Loss: 26.321396


KeyboardInterrupt: 