Bayesian CNN (Bayes by Backprop) reference: https://medium.com/neuralspace/bayesian-convolutional-neural-networks-with-bayes-by-backprop-c84dcaaf086e

In [23]:
%matplotlib inline
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pyro
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from pyro.distributions import Normal, Categorical
from pyro.infer import SVI, Trace_ELBO
from pyro.optim import Adam
from torchvision import models
from torchvision.transforms import ToPILImage

In [2]:
def test_accuracy(net, dataloader):
  ########TESTING PHASE###########
  
    #check accuracy on whole test set
    correct = 0
    total = 0
    net.eval() #important for deactivating dropout and correctly use batchnorm accumulated statistics
    with torch.no_grad():
        for data in dataloader:
            images, labels = data
            images = images.cuda()
            labels = labels.cuda()
            outputs = net(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    accuracy = 100 * correct / total
    print('Accuracy of the network on the test set: %d %%' % (
    accuracy))
    return accuracy

In [19]:
n_classes = 10

# Training pipeline: light augmentation (pad-like upscale to 40x40, random
# crop back to 32x32, random horizontal flip), then upscale to 224x224 and
# normalise each channel from [0, 1] to [-1, 1].
transform_train = transforms.Compose(
    [
     transforms.Resize((40,40)), #not a problem as CIFAR is 32x32 anyway.
     transforms.RandomResizedCrop(32,scale=(32/40,32/40)),
     transforms.RandomHorizontalFlip(p=0.5),
     transforms.Resize((224,224)),
     transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
     ])

# Evaluation pipeline: deterministic. Random crops/flips are training-time
# augmentation only; applying them here would make the reported test accuracy
# change from run to run. Images are upscaled to 224x224 and normalised
# exactly like the training images.
transform_test = transforms.Compose(
    [
     transforms.Resize((224,224)),
     transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
     ])

# batch size reduced to 2 due to lack of memory in my GPU

trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform_train)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=2,
                                          shuffle=True, num_workers=4,drop_last=True)

testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform_test)
# drop_last=False: every test image must be scored, including a final
# partial batch.
testloader = torch.utils.data.DataLoader(testset, batch_size=2,
                                         shuffle=False, num_workers=4,drop_last=False)

Files already downloaded and verified
Files already downloaded and verified


In [4]:
# Backbone: torchvision's VGG16, constructed with its default arguments.
# Other torchvision architectures that were considered here: resnet18,
# alexnet, squeezenet1_0, densenet161, inception_v3, googlenet,
# shufflenet_v2_x1_0, mobilenet_v2, resnext50_32x4d.
net = models.vgg16()

# Replace the final classifier layer with a fresh linear head that emits one
# score per class.
net.classifier[6] = nn.Linear(4096, n_classes, bias=True)

# Move the whole network to the GPU; as the last expression this also
# displays the module structure.
net.cuda()

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (17): Conv2d

In [7]:
# Names of all parameters of `net`, in the order `named_parameters()` yields them.
layer_names = [param_name for param_name, _ in net.named_parameters()]
layer_names

['features.0.weight',
 'features.0.bias',
 'features.2.weight',
 'features.2.bias',
 'features.5.weight',
 'features.5.bias',
 'features.7.weight',
 'features.7.bias',
 'features.10.weight',
 'features.10.bias',
 'features.12.weight',
 'features.12.bias',
 'features.14.weight',
 'features.14.bias',
 'features.17.weight',
 'features.17.bias',
 'features.19.weight',
 'features.19.bias',
 'features.21.weight',
 'features.21.bias',
 'features.24.weight',
 'features.24.bias',
 'features.26.weight',
 'features.26.bias',
 'features.28.weight',
 'features.28.bias',
 'classifier.0.weight',
 'classifier.0.bias',
 'classifier.3.weight',
 'classifier.3.bias',
 'classifier.6.weight',
 'classifier.6.bias']

In [5]:
# Parameter tensors of `net`, collected the same way (and therefore in the
# same order) as `layer_names`. Deriving the list from the network instead of
# spelling out every `net.features[i]` / `net.classifier[i]` entry keeps the
# two lists aligned if the architecture is ever changed.
layers = [param for _, param in net.named_parameters()]

# The parameters live wherever `net` lives, so no per-tensor `.cuda()` call is
# needed; this displays whether the first one is on the GPU.
layers[0].is_cuda

True

In [8]:
# Sanity check: every parameter name must have a matching tensor in `layers`,
# because `model` and `guide` pair the two lists up by position.
len(layer_names) == len(layers)

True

In [9]:
log_softmax = nn.LogSoftmax(dim=1)


def model(x_data, y_data):
    """Pyro model: network with Normal(0, 1) priors on every parameter.

    Each parameter tensor of ``net`` gets an element-wise Normal prior with
    zero mean and unit scale. A network is sampled from those priors, applied
    to ``x_data``, and ``y_data`` is observed under a Categorical distribution
    whose logits are the log-softmax of the network output.

    Args:
        x_data: batch of inputs passed to the sampled network.
        y_data: labels observed at the "obs" sample site.
    """
    # One prior per named parameter, shaped like the parameter itself.
    priors = {
        name: Normal(loc=torch.zeros_like(layer).cuda(),
                     scale=torch.ones_like(layer).cuda())
        for name, layer in zip(layer_names, layers)
    }

    # Lift the module's parameters to random variables drawn from the priors,
    # then draw one concrete network.
    sample_network = pyro.random_module("module", net, priors)
    sampled_net = sample_network().cuda()

    class_log_probs = log_softmax(sampled_net(x_data))

    pyro.sample("obs", Categorical(logits=class_log_probs), obs=y_data)

In [10]:
def normal_distribution_layer(layer, name):
    """Build a Normal distribution with learnable parameters shaped like ``layer``.

    Two Pyro parameters are registered, ``<name>_mu`` and ``<name>_sigma``,
    both initialised from standard-normal noise. The scale of the returned
    distribution is the softplus of ``<name>_sigma``, which keeps it positive.

    Args:
        layer: tensor whose shape the distribution parameters copy.
        name: prefix for the two Pyro parameter names.

    Returns:
        A ``Normal`` with ``loc=<name>_mu`` and ``scale=softplus(<name>_sigma)``.
    """
    # Random starting values (mean first, then the pre-softplus scale).
    init_loc = torch.randn_like(layer).cuda()
    init_raw_scale = torch.randn_like(layer).cuda()

    loc = pyro.param(name + "_mu", init_loc).cuda()
    raw_scale = pyro.param(name + "_sigma", init_raw_scale)
    scale = softplus(raw_scale).cuda()

    return Normal(loc=loc, scale=scale)


In [11]:
softplus = torch.nn.Softplus()

def guide(x_data, y_data):
    """Pyro guide: learnable Normal distribution over every parameter of ``net``.

    For each named parameter a Normal with learnable mean and scale is built
    by ``normal_distribution_layer``; the network is then lifted with those
    distributions and one concrete network is sampled.

    Args:
        x_data: unused; present so the signature matches ``model``.
        y_data: unused; present so the signature matches ``model``.

    Returns:
        A sampled copy of the network, moved to the GPU.
    """
    dists = {
        name: normal_distribution_layer(layer, name)
        for name, layer in zip(layer_names, layers)
    }

    # Build the lifted module once, after all distributions exist. (It used
    # to be rebuilt on every loop iteration and was undefined when there were
    # no layers.)
    lifted_module = pyro.random_module("module", net, dists)

    return lifted_module().cuda()

In [13]:
# Pyro's Adam wrapper. Deliberately not called `optim`: that name is already
# bound to `torch.optim` by the imports and should not be shadowed.
pyro_optimizer = Adam({"lr": 0.001})

# Stochastic variational inference over `model`/`guide`, minimising the
# negative ELBO.
svi = SVI(model, guide, pyro_optimizer, loss=Trace_ELBO())


In [12]:
# svi.__dict__

In [20]:
num_iterations = 20
# The per-sample normaliser does not change between epochs.
normalizer_train = len(trainloader.dataset)

for j in range(num_iterations):
    t0 = time.time()
    loss = 0
    for batch_id, (x, y) in enumerate(trainloader):
        x, y = x.cuda(), y.cuda()
        # svi.step computes the loss for this batch and takes a gradient step
        loss += svi.step(x, y)

    # Epoch loss averaged over the size of the training set.
    total_epoch_loss_train = loss / normalizer_train

    print("Epoch ", j, " Loss ", total_epoch_loss_train, "done in ", time.time()-t0)

RuntimeError: CUDA out of memory. Tried to allocate 392.00 MiB (GPU 0; 5.94 GiB total capacity; 4.54 GiB already allocated; 72.75 MiB free; 70.85 MiB cached)
           Trace Shapes:              
            Param Sites:              
    features.0.weight_mu  64   3 3   3
 features.0.weight_sigma  64   3 3   3
      features.0.bias_mu            64
   features.0.bias_sigma            64
    features.2.weight_mu  64  64 3   3
 features.2.weight_sigma  64  64 3   3
      features.2.bias_mu            64
   features.2.bias_sigma            64
    features.5.weight_mu 128  64 3   3
 features.5.weight_sigma 128  64 3   3
      features.5.bias_mu           128
   features.5.bias_sigma           128
    features.7.weight_mu 128 128 3   3
 features.7.weight_sigma 128 128 3   3
      features.7.bias_mu           128
   features.7.bias_sigma           128
   features.10.weight_mu 256 128 3   3
features.10.weight_sigma 256 128 3   3
     features.10.bias_mu           256
  features.10.bias_sigma           256
   features.12.weight_mu 256 256 3   3
features.12.weight_sigma 256 256 3   3
     features.12.bias_mu           256
  features.12.bias_sigma           256
   features.14.weight_mu 256 256 3   3
features.14.weight_sigma 256 256 3   3
     features.14.bias_mu           256
  features.14.bias_sigma           256
   features.17.weight_mu 512 256 3   3
features.17.weight_sigma 512 256 3   3
     features.17.bias_mu           512
  features.17.bias_sigma           512
   features.19.weight_mu 512 512 3   3
features.19.weight_sigma 512 512 3   3
     features.19.bias_mu           512
  features.19.bias_sigma           512
   features.21.weight_mu 512 512 3   3
features.21.weight_sigma 512 512 3   3
     features.21.bias_mu           512
  features.21.bias_sigma           512
   features.24.weight_mu 512 512 3   3
features.24.weight_sigma 512 512 3   3
     features.24.bias_mu           512
  features.24.bias_sigma           512
   features.26.weight_mu 512 512 3   3
features.26.weight_sigma 512 512 3   3
     features.26.bias_mu           512
  features.26.bias_sigma           512
   features.28.weight_mu 512 512 3   3
features.28.weight_sigma 512 512 3   3
     features.28.bias_mu           512
  features.28.bias_sigma           512
           Sample Sites:              

In [14]:
# torch.save does not create missing directories, so make sure it exists.
os.makedirs("models", exist_ok=True)

# The file name is historical: `net` here is the VGG16 built above and trained
# on CIFAR-10. The path is left unchanged so existing checkpoints still match.
# NOTE(review): this writes only `net`'s own weights. The `*_mu` / `*_sigma`
# values registered through `pyro.param` presumably live in Pyro's parameter
# store and are not saved here -- confirm whether they should be persisted too.
torch.save(net.state_dict(), "models/Cifar100Resnet18bayesians")


In [None]:

# Rebuild the same architecture that was saved (VGG16 with an
# `n_classes`-way final layer) so the state-dict keys and shapes match.
net = models.vgg16()
net.classifier[6] = nn.Linear(in_features=4096, out_features=n_classes, bias=True)

# Load from the exact path the save cell writes to.
net.load_state_dict(torch.load("models/Cifar100Resnet18bayesians"))

# The evaluation code feeds CUDA tensors, so the network must be on the GPU.
net.cuda()
net.eval()

In [15]:
# Top-1 accuracy (in percent) of the plain, non-sampled `net` on the test loader.
test_accuracy(net, testloader)


Accuracy of the network on the test set: 1 %


1.05

In [16]:
def predict(x, num_samples=10):
    """Predict class indices for ``x`` by averaging sampled networks.

    ``num_samples`` networks are drawn from the guide, each is applied to
    ``x``, the outputs are averaged over the samples, and the index of the
    largest averaged output is returned for every input.

    Args:
        x: batch of inputs accepted by the sampled networks.
        num_samples: how many networks to sample from the guide.

    Returns:
        Tensor with one predicted class index per row of ``x``.
    """
    yhats = []
    # No gradients are needed for prediction; skipping them saves memory.
    with torch.no_grad():
        for _ in range(num_samples):
            # Sample, use and release one network at a time: keeping every
            # sampled copy alive simultaneously multiplies GPU memory use by
            # num_samples.
            sampled_model = guide(None, None)
            sampled_model.eval()  # same inference mode as test_accuracy
            yhats.append(sampled_model(x))
            del sampled_model

    mean = torch.mean(torch.stack(yhats), 0)

    _, predicted = torch.max(mean, 1)
    return predicted

# Accuracy of `predict` (averaged sampled networks) over the whole test loader.
print('Prediction when network is forced to predict')
total = 0
correct = 0
for j, (images, labels) in enumerate(testloader):
    images, labels = images.cuda(), labels.cuda()
    predicted = predict(images)
    correct += (predicted == labels).sum().item()
    total += labels.size(0)
print("accuracy: %d %%" % (100 * correct / total))

Prediction when network is forced to predict
accuracy: 0 %
