In [1]:
import gpytorch
import torch
import torchvision
from torch import nn
from torch.autograd import Variable
from torchvision import transforms

gpytorch.functions.use_toeplitz = False

In [2]:
ways = 20
shots = 5
train_dir_str = "way%dshot%d" %(ways, shots)
test_dir_str = "way%dtest" %ways

In [3]:
train_base_omni = torchvision.datasets.ImageFolder('/scratch/bw462/omni_data/general', transform=transforms.Compose([
                        transforms.Scale((28,28)),
                        transforms.ToTensor()
                   ]))                                              
"""
test_mnist = torchvision.datasets.ImageFolder('/tmp', split='test',
                                        download=True, transform=transforms.Compose([
                       transforms.ToTensor()
                   ]))
"""



"\ntest_mnist = torchvision.datasets.ImageFolder('/tmp', split='test',\n                                        download=True, transform=transforms.Compose([\n                       transforms.ToTensor()\n                   ]))\n"

In [4]:
class FeatureExtractor(nn.Sequential):
    
    def __init__(self):
        super(FeatureExtractor, self).__init__(nn.Conv2d(1, 32, kernel_size=5, padding=2),
                                 nn.BatchNorm2d(32),
                                 nn.ReLU(),
                                 nn.MaxPool2d(2, 2),
                                 nn.Conv2d(32, 64, kernel_size=5, padding=2),
                                 nn.BatchNorm2d(64),
                                 nn.ReLU(),
                                 nn.MaxPool2d(2, 2))
        
class Bottleneck(nn.Sequential):
    
    def __init__(self):
        super(Bottleneck, self).__init__(nn.Linear(64*7*7, 128),
                                         nn.BatchNorm1d(128),
                                 nn.ReLU(),
                                 nn.Linear(128, 128),
                                 nn.BatchNorm1d(128),
                                 nn.ReLU(),
                                 nn.Linear(128,64),
                                 nn.BatchNorm1d(64))

class LeNet(nn.Module):
    
    def __init__(self):
        super(LeNet, self).__init__()
        self.feature_extractor = FeatureExtractor()
        self.bottleneck = Bottleneck()
        self.final_layer = nn.Sequential(
                                 nn.ReLU(),
                                 nn.Linear(64,1319))
    
    def forward(self, x):
        input_x = x[:,0,:,:].unsqueeze(1)
        features = self.feature_extractor(input_x)
        bottlenecked_features = self.bottleneck(features.view(-1, 64 * 7 * 7))
        classification = self.final_layer(bottlenecked_features)
        return classification
        

In [5]:
train_data_loader = torch.utils.data.DataLoader(train_base_omni, shuffle=True, pin_memory=True, batch_size=256)

In [6]:
criterion = nn.CrossEntropyLoss().cuda()

In [7]:
model = LeNet().cuda() 
optimizer = torch.optim.SGD(model.parameters(), lr=0.5)

In [8]:
num_epochs = 0
if num_epochs > 0:
    model.train()
    for i in range(num_epochs):
        for x, y in train_data_loader:
            optimizer.zero_grad()
            x = Variable(x).cuda()
            y = Variable(y).cuda()
            output = model(x)
            loss = criterion(output, y)
            loss.backward()
            optimizer.step()
        print("Loss: %.3f" % loss.data[0])
    torch.save(model.state_dict(), '/scratch/bw462/omni_net.dat')
else:
    model.load_state_dict(torch.load('/scratch/bw462/omni_net.dat'))


In [9]:
model.eval()
#test_data_loader = torch.utils.data.DataLoader(test_mnist, shuffle=False, pin_memory=True, batch_size=256)
avg = 0.
i = 0.
for test_batch_x, test_batch_y in train_data_loader:
    predictions = model(Variable(test_batch_x).cuda()).max(-1)[1]
    test_batch_y = Variable(test_batch_y).cuda()
    avg += torch.eq(predictions, test_batch_y).float().mean().data[0]
    i += 1.
print('Accuracy: %.4f' % (avg / i))

Accuracy: 0.9992


In [10]:
list(model.bottleneck.modules())[-1].weight.data.fill_(1)
None

In [11]:
from gpytorch.kernels import RBFKernel, GridInterpolationKernel

class DeepKernel(gpytorch.Module):
    def __init__(self, model):
        super(DeepKernel, self).__init__()
        self.feature_extractor = model.feature_extractor
        self.bottleneck = model.bottleneck
        self.gp_layer = GPLayer()
        
    def forward(self, x):
        features = self.feature_extractor(x[:,0,:,:].unsqueeze(1))
        bottlenecked_features = self.bottleneck(features.view(-1, 64 * 7 * 7))
        gp_output = self.gp_layer(0.1*bottlenecked_features)
        return gp_output
    
    
class LatentFunction(gpytorch.AdditiveGridInducingPointModule):
    def __init__(self):
        super(LatentFunction, self).__init__(grid_size=256, grid_bounds=[(-7, 7)],
                                             n_components=64, mixing_params=False, sum_output=False)
        cov_module = RBFKernel()
        cov_module.initialize(log_lengthscale=2)
        self.cov_module = cov_module
        
    def forward(self, x):
        mean = Variable(x.data.new(len(x)).zero_())
        covar = self.cov_module(x)
        return gpytorch.random_variables.GaussianRandomVariable(mean, covar)

    
class GPLayer(gpytorch.GPModel):
    def __init__(self, n_dims=64):
        super(GPLayer, self).__init__(gpytorch.likelihoods.SoftmaxLikelihood(n_features=64, n_classes=1319))
        self.latent_function = LatentFunction()
    
    def forward(self, x):
        res = self.latent_function(x)
        return res
    

In [12]:
deep_kernel = DeepKernel(model).cuda()

In [13]:
# Find optimal model hyperparameters
optimizer = torch.optim.Adam(deep_kernel.gp_layer.parameters(), lr=0.01)
optimizer.n_iter = 0
num_epochs = 0
if num_epochs > 0:
    deep_kernel.train()
    for i in range(num_epochs):
        for j, (train_x_batch, train_y_batch) in enumerate(train_data_loader):
            train_x_batch = Variable(train_x_batch).cuda()
            train_y_batch = Variable(train_y_batch).cuda()
            optimizer.zero_grad()
            output = deep_kernel(train_x_batch)
            loss = -deep_kernel.gp_layer.marginal_log_likelihood(output, train_y_batch, n_data=len(train_base_omni))
            loss.backward()
            optimizer.n_iter += 1
            print('Iter %d/%d - Loss: %.3f' % (
                i + 1, num_epochs, loss.data[0],
            ))
            optimizer.step()
    torch.save(deep_kernel.state_dict(), '/scratch/bw462/omni_gp.dat')
else:
    deep_kernel.load_state_dict(torch.load('/scratch/bw462/omni_gp.dat'))
    
    
deep_kernel.eval()
avg = 0.
i = 0.
for test_batch_x, test_batch_y in train_data_loader:
    predictions = deep_kernel(Variable(test_batch_x).cuda()).argmax()
    test_batch_y = Variable(test_batch_y).cuda()
    avg += torch.eq(predictions, test_batch_y).float().mean().data[0]
    i += 1.

print('Score')
print(avg / i)
deep_kernel.train()


  softmax = nn.functional.softmax(mixed_fs.t()).view(n_data, n_samples, self.n_classes)


Score
0.999430816721


DeepKernel(
  (feature_extractor): FeatureExtractor(
    (0): Conv2d (1, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), dilation=(1, 1))
    (4): Conv2d (32, 64, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (5): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True)
    (6): ReLU()
    (7): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), dilation=(1, 1))
  )
  (bottleneck): Bottleneck(
    (0): Linear(in_features=3136, out_features=128)
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True)
    (2): ReLU()
    (3): Linear(in_features=128, out_features=128)
    (4): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True)
    (5): ReLU()
    (6): Linear(in_features=128, out_features=64)
    (7): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True)
  )
  (gp_layer): GPLayer(
    (likelihood): SoftmaxLikelihood(
    )
    (latent_functi

In [14]:
deep_kernel.eval()
for test_batch_x, test_batch_y in train_data_loader:
    predictions = deep_kernel(Variable(test_batch_x).cuda()).representation().max(-1)[1]
    test_batch_y = Variable(test_batch_y).cuda()
    avg += torch.eq(predictions, test_batch_y).float().mean().data[0]
    i += 1.
print('Accuracy: %.4f' % (avg / i))


Accuracy: 0.9995


In [15]:
train_shots_omni = torchvision.datasets.ImageFolder('/scratch/bw462/omni_data/' + train_dir_str, transform=transforms.Compose([
                        transforms.Scale((28,28)),
                        transforms.ToTensor()
                   ]))    

In [None]:
oneshot_model = DeepKernel(model).cuda()
oneshot_model.load_state_dict(deep_kernel.state_dict())
oneshot_model.gp_layer.likelihood = gpytorch.likelihoods.SoftmaxLikelihood(n_features=64, n_classes=1319).cuda()
oneshot_model.gp_layer.likelihood.mixing_weights.data.copy_(deep_kernel.gp_layer.likelihood.mixing_weights.data)
shots_loader = torch.utils.data.DataLoader(train_shots_omni, batch_size=512., pin_memory=True, shuffle=True)

In [None]:
# Find optimal model hyperparameters
oneshot_model.train()
oneshot_model.feature_extractor.eval()
oneshot_model.bottleneck.eval()

optimizer = torch.optim.Adam(oneshot_model.gp_layer.likelihood.parameters(), lr=0.01)
optimizer.n_iter = 0
for i in range(200):
    for j, (train_x_batch, train_y_batch) in enumerate(shots_loader):
        train_x_batch = Variable(train_x_batch).cuda()
        train_y_batch = Variable(train_y_batch).cuda()
        optimizer.zero_grad()
        output = oneshot_model(train_x_batch)
        loss = -oneshot_model.gp_layer.marginal_log_likelihood(output, train_y_batch, n_data=len(train_shots_omni))
        #kl = oneshot_model.gp_layer.likelihood.kl_div() / len(train_mnist)
        #loss = loss + kl
        loss.backward()
        optimizer.n_iter += 1
        print('Iter %d/200 - Loss: %.3f' % (
            i + 1, loss.data[0],
        ))
        optimizer.step()

oneshot_model.eval()
test_shots_omni = torchvision.datasets.ImageFolder('/scratch/bw462/omni_data/' + test_dir_str, transform=transforms.Compose([
                        transforms.Scale((28,28)),
                        transforms.ToTensor()
                   ]))    
test_shots_loader = torch.utils.data.DataLoader(test_shots_omni, batch_size=512., pin_memory=True, shuffle=True)
avg = 0.
i = 0.
for test_batch_x, test_batch_y in test_shots_loader:
    predictions = oneshot_model(Variable(test_batch_x).cuda()).argmax()
    test_batch_y = Variable(test_batch_y).cuda()
    avg += torch.eq(predictions, test_batch_y).float().mean().data[0]
    i += 1.

#for pred, y in zip(predictions.data, test_batch_y.data):
#    print(pred, y)
print('Score')
print(avg / i)

Iter 1/200 - Loss: -6.428
Iter 2/200 - Loss: 0.400
Iter 3/200 - Loss: -5.932
Iter 4/200 - Loss: -12.092
Iter 5/200 - Loss: -10.318
Iter 6/200 - Loss: -4.788
Iter 7/200 - Loss: -28.008
Iter 8/200 - Loss: -1.009
Iter 9/200 - Loss: -25.432
Iter 10/200 - Loss: -12.824
Iter 11/200 - Loss: -17.991
Iter 12/200 - Loss: -19.355
Iter 13/200 - Loss: -2.389
Iter 14/200 - Loss: -13.802
Iter 15/200 - Loss: -9.664
Iter 16/200 - Loss: -18.831
Iter 17/200 - Loss: -12.354
Iter 18/200 - Loss: -13.188
Iter 19/200 - Loss: -5.409
Iter 20/200 - Loss: -15.509
Iter 21/200 - Loss: -8.623
Iter 22/200 - Loss: -7.873
Iter 23/200 - Loss: -6.253
Iter 24/200 - Loss: -21.786
Iter 25/200 - Loss: -21.526
Iter 26/200 - Loss: -9.331
Iter 27/200 - Loss: -20.266
Iter 28/200 - Loss: -2.299
Iter 29/200 - Loss: -8.111
Iter 30/200 - Loss: -14.811
Iter 31/200 - Loss: -13.430
Iter 32/200 - Loss: -11.535
Iter 33/200 - Loss: -5.491
Iter 34/200 - Loss: -10.419
Iter 35/200 - Loss: -15.463
Iter 36/200 - Loss: -17.764
Iter 37/200 - Los

In [None]:
deep_kernel.eval()
avg = 0.
i = 0.
for test_batch_x, test_batch_y in train_data_loader:
    predictions = deep_kernel(Variable(test_batch_x).cuda()).argmax()
    test_batch_y = Variable(test_batch_y).cuda()
    avg += torch.eq(predictions, test_batch_y).float().mean().data[0]
    i += 1.

print('Score')
print(avg / i)

In [None]:
# 5 way 5 shot  93% accuracy
# 5 way 1 shot  56% accuracy
# 20 way 5 shot 
# 20 way 1 shot 57% accuracy
