<h1>BatchEnsemble: An Alternative Approach To Efficient Ensemble and Lifelong Learning</h1>

Yeming Wen, Dustin Tran & Jimmy Ba

<h2>Classification with Wide ResNet and CIFAR10</h2>

In [None]:
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
import torch.nn.functional as F
# NOTE(review): `m` is rebound to nn.LogSoftmax(dim=1) in a later cell and is
# not referenced by any of the code visible in this notebook.
m = nn.Softplus()

In [None]:
# Mount Google Drive inside the Colab runtime; the dataset and checkpoint
# paths used by the following cells all live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import torchvision
import torchvision.transforms as transforms
# Folder on the mounted Drive that CIFAR-10 is read from / downloaded into.
data_dir = '/content/drive/My Drive/AALTO/cs4875-research/data/'
# The same preprocessing pipeline is used for both splits (no augmentation).
transform = transforms.Compose([
    transforms.ToTensor(),  # Transform to tensor
    transforms.Normalize((0.5,), (0.5,))  # Min-max scaling to [-1, 1]
])

# download=True fetches the archive only if it is not already under data_dir.
trainset = torchvision.datasets.CIFAR10(root=data_dir, train=True, download=True, transform=transform)
testset = torchvision.datasets.CIFAR10(root=data_dir, train=False, download=True, transform=transform)

# Human-readable class names; not used by the training code in this notebook.
classes = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']

# Training batches are shuffled; the test loader keeps dataset order.
# At eval time the first ensemble layer replicates each batch num_models times,
# so the effective test batch is larger than batch_size.
trainloader = torch.utils.data.DataLoader(trainset, batch_size=128, shuffle=True)
testloader = torch.utils.data.DataLoader(testset, batch_size=5, shuffle=False)

Files already downloaded and verified
Files already downloaded and verified


In [None]:
class Cov2dEnsemble(nn.Module):
    """2-D convolution with BatchEnsemble-style per-member rank-1 scaling.

    One ``nn.Conv2d`` kernel (no bias) is shared by all ensemble members.
    Member ``i`` multiplies the input channels by ``alpha[i]`` and the output
    channels by ``gamma[i]``. The batch is split into ``num_models``
    contiguous chunks and chunk ``i`` is processed with member ``i``.

    Args:
        in_channels: Number of input channels.
        out_channels: Number of output channels.
        kernel_size: Convolution kernel size.
        stride: Convolution stride.
        padding: Convolution zero padding.
        num_models: Number of ensemble members.
        first_layer: If True, the input batch is replicated ``num_models``
            times in eval mode so that every member sees every example.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, num_models=4, first_layer=False):
        super(Cov2dEnsemble, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.num_models = num_models
        self.first_layer = first_layer
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, bias=False)
        # Per-member scaling vectors, initialised around 1.
        self.alpha = nn.Parameter(torch.Tensor(num_models, in_channels))
        self.gamma = nn.Parameter(torch.Tensor(num_models, out_channels))
        nn.init.normal_(self.alpha, mean=1., std=0.5)
        nn.init.normal_(self.gamma, mean=1., std=0.5)

    def forward(self, x):
        """Apply the shared convolution with per-member input/output scaling.

        Args:
            x: Input whose dim 0 is the batch and dim 1 has ``in_channels``
                entries; two trailing dims are assumed (the scaling vectors
                are reshaped to ``(rows, channels, 1, 1)``).

        Returns:
            The scaled convolution output. In eval mode with
            ``first_layer=True`` its batch dim is ``num_models`` times the
            input batch.
        """
        if not self.training and self.first_layer:
            # Layout after replication: [batch, batch, ...], one copy per member.
            x = torch.cat([x] * self.num_models, dim=0)
        examples_per_model = x.size(0) // self.num_models
        # Rows left over when the batch is not a multiple of num_models.
        # This was previously read without ever being defined (NameError).
        extra = x.size(0) - examples_per_model * self.num_models
        # cat along dim 1 + view repeats each member's row examples_per_model
        # times consecutively: [a0, a0, ..., a1, a1, ...].
        alpha = torch.cat([self.alpha] * examples_per_model, dim=1).view([-1, self.in_channels, 1, 1])
        gamma = torch.cat([self.gamma] * examples_per_model, dim=1).view([-1, self.out_channels, 1, 1])
        if extra != 0:
            # Leftover rows reuse the first `extra` rows of the expanded scales.
            alpha = torch.cat([alpha, alpha[:extra]], dim=0)
            gamma = torch.cat([gamma, gamma[:extra]], dim=0)
        return self.conv1(x * alpha) * gamma

class DenseEnsemble(nn.Module):
    """Linear layer with BatchEnsemble-style per-member rank-1 scaling.

    One bias-free ``nn.Linear`` is shared by all members. Member ``i``
    multiplies the input features by ``alpha[i]`` and the output features by
    ``gamma[i]``. The batch is split into ``num_models`` contiguous chunks.

    Args:
        in_channels: Number of input features.
        out_channels: Number of output features.
        kernel_size: Accepted but not used by this layer.
        stride: Accepted but not used by this layer.
        padding: Accepted but not used by this layer.
        num_models: Number of ensemble members.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, num_models=4):
        super(DenseEnsemble, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.num_models = num_models
        self.fc = nn.Linear(in_channels, out_channels, bias=False)
        # Per-member scaling vectors, initialised around 1.
        self.alpha = nn.Parameter(torch.Tensor(num_models, in_channels))
        self.gamma = nn.Parameter(torch.Tensor(num_models, out_channels))
        nn.init.normal_(self.alpha, mean=1., std=0.5)
        nn.init.normal_(self.gamma, mean=1., std=0.5)

    def forward(self, x):
        """Apply the shared linear map with per-member scaling.

        Args:
            x: Tensor of shape ``(batch, in_channels)``.

        Returns:
            Tensor of shape ``(batch, out_channels)``.
        """
        examples_per_model = x.size(0) // self.num_models
        # Rows left over when the batch is not a multiple of num_models.
        # This was previously read without ever being defined (NameError).
        extra = x.size(0) - examples_per_model * self.num_models
        # Each member's row is repeated examples_per_model times consecutively.
        alpha = torch.cat([self.alpha] * examples_per_model, dim=1).view([-1, self.in_channels])
        gamma = torch.cat([self.gamma] * examples_per_model, dim=1).view([-1, self.out_channels])
        if extra != 0:
            # Leftover rows reuse the first `extra` rows of the expanded scales.
            alpha = torch.cat([alpha, alpha[:extra]], dim=0)
            gamma = torch.cat([gamma, gamma[:extra]], dim=0)
        return self.fc(x * alpha) * gamma


In [None]:
class BlockBatchEnsemble(nn.Module):
    """Residual block built from two 3x3 ensemble convolutions.

    Each convolution is preceded by batch norm and ReLU. Dropout follows the
    first convolution and the stride is applied by the second one. The
    shortcut is the identity unless the stride or channel count changes, in
    which case a 1x1 ensemble convolution projects the input.
    """

    def __init__(self, in_channels, out_channels, dropout_rate, stride=1, num_models=4):
        super(BlockBatchEnsemble, self).__init__()
        self.bn1 = nn.BatchNorm2d(in_channels)
        self.conv1 = Cov2dEnsemble(in_channels, out_channels, 3, stride=1, padding=1, num_models=num_models)
        self.dropout = nn.Dropout(p=dropout_rate)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.conv2 = Cov2dEnsemble(out_channels, out_channels, 3, stride=stride, padding=1, num_models=num_models)
        self.num_models = num_models

        # An empty Sequential acts as the identity shortcut.
        shortcut_layers = []
        needs_projection = stride != 1 or in_channels != out_channels
        if needs_projection:
            shortcut_layers.append(
                Cov2dEnsemble(in_channels, out_channels, 1, stride=stride, padding=0, num_models=num_models)
            )
        self.skip = nn.Sequential(*shortcut_layers)

    def forward(self, x):
        """Return the residual branch output added to the shortcut of ``x``."""
        hidden = self.conv1(F.relu(self.bn1(x)))
        hidden = self.dropout(hidden)
        hidden = F.relu(self.bn2(hidden))
        out = self.conv2(hidden)
        out += self.skip(x)
        return out

class GroupBlockBatchEnsemble(nn.Module):
    """A sequence of ``n_blocks`` BlockBatchEnsemble modules.

    Only the first block uses ``stride`` and changes the channel count; the
    remaining blocks use stride 1 and map ``out_channels`` to
    ``out_channels``.

    Args:
        in_channels: Channels entering the first block.
        out_channels: Channels produced by every block.
        n_blocks: Number of blocks (converted with ``int``).
        dropout_rate: Dropout probability passed to every block.
        stride: Stride of the first block.
        num_models: Ensemble size passed to every block.
    """

    def __init__(self, in_channels, out_channels, n_blocks, dropout_rate, stride=1, num_models=4):
        super(GroupBlockBatchEnsemble, self).__init__()
        block_strides = [stride] + [1] * (int(n_blocks) - 1)
        self.in_channels = in_channels
        blocks = []

        for block_stride in block_strides:
            # num_models used to be dropped here, so the blocks always fell
            # back to their default ensemble size.
            blocks.append(
                BlockBatchEnsemble(self.in_channels, out_channels, dropout_rate, block_stride, num_models=num_models)
            )
            self.in_channels = out_channels

        self.group = nn.Sequential(*blocks)

    def forward(self, x):
        """Run ``x`` through all blocks in order."""
        return self.group(x)

class WideResNetBatchEnsemble(nn.Module):
    """Wide ResNet whose convolutions and classifier are BatchEnsemble layers.

    Args:
        depth: Network depth; must satisfy ``depth = 6n + 4``.
        widen_factor: Channel multiplier ``k`` for the three block groups.
        dropout_rate: Dropout probability used inside every residual block.
        num_classes: Number of output classes.
        num_models: Number of ensemble members.
    """

    def __init__(self, depth, widen_factor, dropout_rate, num_classes=10, num_models=4):
        super(WideResNetBatchEnsemble, self).__init__()
        assert ((depth-4)%6 == 0), "Depth should be 6n+4."
        # Integer division keeps the block count an int; the assert above
        # guarantees it is exact.
        blocks_per_group = (depth - 4) // 6
        k = widen_factor
        nStages = [16, 16*k, 32*k, 64*k]
        self.num_models = num_models
        self.num_classes = num_classes

        # first_layer=True makes this layer replicate the batch in eval mode.
        self.conv1 = Cov2dEnsemble(in_channels=3, out_channels=nStages[0], kernel_size=1, stride=1, padding=0, num_models=num_models, first_layer=True)
        self.group1 = GroupBlockBatchEnsemble(nStages[0], nStages[1], blocks_per_group, dropout_rate, stride=1, num_models=num_models)
        self.group2 = GroupBlockBatchEnsemble(nStages[1], nStages[2], blocks_per_group, dropout_rate, stride=2, num_models=num_models)
        self.group3 = GroupBlockBatchEnsemble(nStages[2], nStages[3], blocks_per_group, dropout_rate, stride=2, num_models=num_models)
        self.bn1 = nn.BatchNorm2d(nStages[3])

        self.relu = nn.ReLU(inplace=True)
        # num_models used to be passed as the third positional argument, which
        # both DenseEnsemble definitions in this notebook bind to an unused
        # slot (kernel_size / stride), so the head kept the default size.
        # The positional 1 fills that unused slot.
        self.fc = DenseEnsemble(nStages[3], num_classes, 1, num_models=num_models)
        self.nStage3 = nStages[3]

        # Initialize weights
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                fan_out = module.kernel_size[0] * module.kernel_size[1] * module.out_channels
                module.weight.data.normal_(0, np.sqrt(2. / fan_out))
            elif isinstance(module, nn.BatchNorm2d):
                module.weight.data.fill_(1)
                module.bias.data.zero_()

    def forward(self, x):
        """Classify a batch of images.

        Returns:
            In training mode, the classifier output for each input row. In
            eval mode, softmax probabilities averaged over the ``num_models``
            replicated copies of the batch.
        """
        x = self.conv1(x)
        x = self.group1(x)
        x = self.group2(x)
        x = self.group3(x)
        x = self.relu(self.bn1(x))
        # Assumes an 8x8 feature map here (e.g. 32x32 inputs) -- TODO confirm.
        x = F.avg_pool2d(x, 8)
        x = x.view(-1, self.nStage3)
        x = self.fc(x)
        if not self.training:
            x = F.softmax(x, dim=1)
            # Rows are laid out as num_models consecutive copies of the batch.
            return x.view([self.num_models, -1, self.num_classes]).mean(dim=0)
        return x


def compute_accuracy(net, testloader):
    """Return the fraction of correctly classified examples in ``testloader``.

    The model is switched to eval mode and left in that mode. Inputs are moved
    to the device of the model's parameters, so the function does not rely on
    a module-level ``device`` variable being defined first.

    Args:
        net: Model mapping a batch of inputs to per-class scores.
        testloader: Iterable yielding ``(images, labels)`` batches.

    Returns:
        Accuracy in ``[0, 1]``; ``0.0`` if the loader yields no examples.
    """
    net.eval()
    first_param = next(net.parameters(), None)
    target_device = first_param.device if first_param is not None else None
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in testloader:
            if target_device is not None:
                images, labels = images.to(target_device), labels.to(target_device)
            outputs = net(images)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    if total == 0:
        # Avoid ZeroDivisionError on an empty loader.
        return 0.0
    return correct / total

In [None]:
# Training configuration for the CIFAR-10 experiment.
# The device is hard-coded to the first GPU; there is no CPU fallback here.
device = torch.device('cuda:0')
loss_func = nn.CrossEntropyLoss()
# NOTE(review): `m` is not referenced by the training code visible in this notebook.
m = nn.LogSoftmax(dim=1)
# Learning rate handed to the Adam optimizer created further down.
learning_rate = 0.01

def compute_brier_score(p, y):
    """Return the mean multi-class Brier score of a batch.

    The previous version compared the arg-max class index with the label
    index, which measures the numeric distance between class ids rather than
    the quality of the predicted probabilities.

    Args:
        p: Unnormalised class scores (logits) of shape ``(batch, classes)``;
            a softmax over dim 1 is applied here.
        y: Integer class labels of shape ``(batch,)``.

    Returns:
        A 0-dim tensor detached from the autograd graph, in ``[0, 2]``: the
        batch mean of ``sum_k (prob_k - onehot_k) ** 2``.
    """
    # Metric only: no_grad keeps the result free of autograd history so
    # callers can convert it to NumPy / a Python float.
    with torch.no_grad():
        probs = torch.softmax(p.float(), dim=1)
        one_hot = torch.nn.functional.one_hot(y.long(), num_classes=probs.size(1)).to(probs.dtype)
        return ((probs - one_hot) ** 2).sum(dim=1).mean()

def ensembleInBatch(model, optimizer):
    """Train ``model`` on ``trainloader`` for ``numEpochs`` epochs.

    Relies on the module-level names ``numEpochs``, ``trainloader``,
    ``device``, ``loss_func`` and ``compute_brier_score``.

    Args:
        model: Network to optimise; it is put into training mode.
        optimizer: Optimizer already bound to ``model``'s parameters.

    Returns:
        ``(running_loss, brier)``: the loss of the final mini-batch of the
        last epoch, and the per-example mean Brier score of the last epoch.
        Both are ``0.0`` if ``numEpochs`` is 0.
    """
    running_loss = 0.0
    epoch_brier = 0.0
    model.train()
    for epoch in range(numEpochs):
        brier_sum = 0.0
        total = 0
        for x, y in trainloader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            output = model(x)
            loss = loss_func(output, y)
            loss.backward()
            optimizer.step()

            batch_size = y.size(0)
            # compute_brier_score returns a batch mean, so weight it by the
            # batch size before dividing by the number of examples. Summing
            # the means directly underestimated the score by roughly the
            # batch size.
            brier_sum += compute_brier_score(output, y).item() * batch_size
            total += batch_size
        epoch_brier = brier_sum / total if total else 0.0
        if epoch == (numEpochs-1):
            running_loss = loss.item()
        print('Loss at epoch {} is {}'.format(epoch, loss.item()))
        print('Brier score at epoch {} is {}'.format(epoch, epoch_brier))
    return running_loss, epoch_brier


# Train the BatchEnsemble Wide ResNet on CIFAR-10 and report metrics.
numEpochs = 40
t0 = time.time()
# depth=28, widen_factor=4, dropout_rate=0.5; num_models keeps its default of 4.
model = WideResNetBatchEnsemble(28, 4, 0.5)
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# `loss` is the loss of the final mini-batch of the last epoch, not an epoch average.
loss, brier = ensembleInBatch(model, optimizer)
# Measured before evaluation, so this covers model construction and training only.
time_one = time.time() - t0
accuracy = compute_accuracy(model, testloader)

print('Accuracy of the network on the test images: %.3f' % accuracy)
print('NLL Loss is {}'.format(loss))
print('Brier score is {}'.format(brier))
print('Training time: {} seconds'.format(time_one))


In [None]:
# Persist only the learned parameters (state_dict) to the mounted Drive.
torch.save(model.state_dict(), '/content/drive/My Drive/AALTO/cs4875-research/archive/batch_ensemble.pth')
# NOTE(review): the message prints just the file name, not the full path used above.
print('Model saved to %s.' % ('batch_ensemble.pth'))

<h2>Time series prediction with MIMIC3 and LSTM</h2>

In [None]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from mydatasets import calculate_num_features, VisitSequenceWithLabelDataset, visit_collate_fn
# !pip3 install pickle5
import pickle5 as pickle
from torch.utils.data import DataLoader
import torch.optim as optim

Collecting pickle5
[?25l  Downloading https://files.pythonhosted.org/packages/f7/4c/5c4dd0462c8d3a6bc4af500a6af240763c2ebd1efdc736fc2c946d44b70a/pickle5-0.0.11.tar.gz (132kB)
[K     |██▌                             | 10kB 20.3MB/s eta 0:00:01[K     |█████                           | 20kB 15.8MB/s eta 0:00:01[K     |███████▍                        | 30kB 12.7MB/s eta 0:00:01[K     |██████████                      | 40kB 12.5MB/s eta 0:00:01[K     |████████████▍                   | 51kB 8.6MB/s eta 0:00:01[K     |██████████████▉                 | 61kB 9.2MB/s eta 0:00:01[K     |█████████████████▍              | 71kB 9.0MB/s eta 0:00:01[K     |███████████████████▉            | 81kB 9.9MB/s eta 0:00:01[K     |██████████████████████▎         | 92kB 9.6MB/s eta 0:00:01[K     |████████████████████████▉       | 102kB 8.3MB/s eta 0:00:01[K     |███████████████████████████▎    | 112kB 8.3MB/s eta 0:00:01[K     |█████████████████████████████▊  | 122kB 8.3MB/s eta 0:00:01

In [None]:
# Data preprocessing and training process refers to https://github.com/jiaweizhu830/Time-Series-Mortality-Prediction-in-ICU-via-PyTorch
# Train : test = 8:2

In [None]:

# Fix the RNG seeds so runs are repeatable.
torch.manual_seed(0)
if torch.cuda.is_available():
    torch.cuda.manual_seed(0)

# Set a correct path to the data files that you preprocessed
PATH_TRAIN_SEQS = "/content/drive/My Drive/AALTO/cs4875-research/data/features/train/mortality.seqs.train"
PATH_TRAIN_LABELS = "/content/drive/My Drive/AALTO/cs4875-research/data/features/train/mortality.labels.train"
PATH_TEST_SEQS = "/content/drive/My Drive/AALTO/cs4875-research/data/features/test/mortality.seqs.test"
PATH_TEST_LABELS = "/content/drive/My Drive/AALTO/cs4875-research/data/features/test/mortality.labels.test"
PATH_OUTPUT = "/content/drive/My Drive/AALTO/cs4875-research/output/"

NUM_EPOCHS = 1
BATCH_SIZE = 128
USE_CUDA = False  # Set 'True' if you want to use GPU
NUM_WORKERS = 0

# Data loading
print('===> Loading entire datasets')
# NOTE(security): pickle.load can execute arbitrary code; only load files you
# produced yourself. `with` closes each handle; they were previously leaked.
with open(PATH_TRAIN_SEQS, 'rb') as f:
    train_seqs = pickle.load(f)
with open(PATH_TRAIN_LABELS, 'rb') as f:
    train_labels = pickle.load(f)
with open(PATH_TEST_SEQS, 'rb') as f:
    test_seqs = pickle.load(f)
with open(PATH_TEST_LABELS, 'rb') as f:
    test_labels = pickle.load(f)
print('===> done Loading')
# The feature count is derived from the training split only and reused for test.
num_features = calculate_num_features(train_seqs)
print(num_features)

train_dataset = VisitSequenceWithLabelDataset(train_seqs, train_labels, num_features)
test_dataset = VisitSequenceWithLabelDataset(test_seqs, test_labels, num_features)
print('===> done datasets')

# drop_last=True discards the final incomplete batch of each split, so every
# batch has exactly batch_size examples (a multiple of the default 4 members).
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=visit_collate_fn, num_workers=NUM_WORKERS, drop_last=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False, collate_fn=visit_collate_fn, num_workers=NUM_WORKERS, drop_last=True)


===> Loading entire datasets
===> done Loading
5067
===> done datasets


In [None]:
class AverageMeter(object):
    """Track the most recent value and the running weighted mean of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set the last value, mean, weighted sum and count back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the running mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

def compute_brier_score(p, y):
    """Return the mean multi-class Brier score of a batch.

    The previous version compared the arg-max class index with the label
    index, which measures the numeric distance between class ids rather than
    the quality of the predicted probabilities.

    Args:
        p: Unnormalised class scores (logits) of shape ``(batch, classes)``;
            a softmax over dim 1 is applied here.
        y: Integer class labels of shape ``(batch,)``.

    Returns:
        A 0-dim tensor detached from the autograd graph, in ``[0, 2]``: the
        batch mean of ``sum_k (prob_k - onehot_k) ** 2``.
    """
    # Metric only: no_grad keeps the result free of autograd history so
    # callers can convert it to NumPy / a Python float.
    with torch.no_grad():
        probs = torch.softmax(p.float(), dim=1)
        one_hot = torch.nn.functional.one_hot(y.long(), num_classes=probs.size(1)).to(probs.dtype)
        return ((probs - one_hot) ** 2).sum(dim=1).mean()

def compute_batch_accuracy(output, target):
    """Return the percentage of rows in ``output`` whose top class equals ``target``."""
    with torch.no_grad():
        n_examples = target.size(0)
        predicted = output.max(1)[1]
        n_correct = (predicted == target).sum()
        return n_correct * 100.0 / n_examples

def train(model, device, data_loader, criterion, optimizer, epoch, print_freq=10):
    """Run one training epoch and return ``(average loss, average accuracy)``.

    Each batch from ``data_loader`` is ``((seqs, lengths), target)``. Progress
    is printed every ``print_freq`` batches. Averages are weighted by batch
    size.
    """
    log_template = ('Epoch: [{0}][{1}/{2}]\t'
                    'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                    'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                    'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                    'Accuracy {acc.val:.3f} ({acc.avg:.3f})')
    batch_time, data_time = AverageMeter(), AverageMeter()
    losses, accuracy = AverageMeter(), AverageMeter()

    model.train()

    tick = time.time()
    for step, (batch, target) in enumerate(data_loader):
        # Time spent waiting for the loader to produce this batch.
        data_time.update(time.time() - tick)

        seqs, lengths = batch
        seqs, target = seqs.to(device), target.to(device)

        optimizer.zero_grad()
        output = model(seqs, lengths)
        loss = criterion(output, target)
        assert not np.isnan(loss.item()), 'Model diverged with loss = NaN'

        loss.backward()
        optimizer.step()

        # Wall time for the whole step, data loading included.
        batch_time.update(time.time() - tick)
        tick = time.time()

        n_examples = target.size(0)
        losses.update(loss.item(), n_examples)
        accuracy.update(compute_batch_accuracy(output, target).item(), n_examples)

        if step % print_freq == 0:
            print(log_template.format(
                epoch, step, len(data_loader), batch_time=batch_time,
                data_time=data_time, loss=losses, acc=accuracy))

    return losses.avg, accuracy.avg

def advTrain(model, device, data_loader, criterion, optimizer, epoch, print_freq=10, eps=5067*0.01):
    """Run one epoch of training on clean plus sign-gradient perturbed inputs.

    For every batch the loss gradient with respect to the input is computed,
    the input is shifted by ``eps * sign(gradient)``, and the model is updated
    on the sum of the clean and the perturbed loss.

    Args:
        model: Network called as ``model(seqs, lengths)``.
        device: Device the inputs and targets are moved to.
        data_loader: Iterable of ``((seqs, lengths), target)`` batches.
        criterion: Loss function ``criterion(output, target)``.
        optimizer: Optimizer bound to ``model``'s parameters.
        epoch: Epoch index, used only in the progress message.
        print_freq: Print progress every this many batches.
        eps: Perturbation step size. The default is the value that used to be
            hard-coded in the body.

    Returns:
        ``(average loss, average accuracy, mean Brier score)`` over the
        epoch. Loss is the combined clean + perturbed loss; accuracy and
        Brier score are measured on the clean outputs.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    accuracy = AverageMeter()
    total = 0
    brier_sum = 0.0

    model.train()

    end = time.time()
    for i, (batch, target) in enumerate(data_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        seqs, lengths = batch
        # A fresh leaf tensor so that backward() populates seqs.grad.
        seqs = seqs.to(device).clone().detach().requires_grad_(True)
        target = target.to(device)

        # Pass 1: clean loss, used only to obtain the input gradient.
        optimizer.zero_grad()
        output = model(seqs, lengths)
        clean_loss = criterion(output, target)
        assert not np.isnan(clean_loss.item()), 'Model diverged with loss = NaN'
        # The graph is kept because the clean loss is back-propagated again
        # as part of the combined loss below.
        clean_loss.backward(retain_graph=True)

        # Detach so the perturbed input is a constant for the second pass.
        seqs_prime = (seqs + eps * torch.sign(seqs.grad)).detach()

        batch_size = target.size(0)
        # compute_brier_score returns a batch mean; weight it by the batch
        # size so the epoch value is a true per-example mean.
        brier_sum += compute_brier_score(output, target).item() * batch_size
        total += batch_size

        # Pass 2: discard the gradients from pass 1, then optimise the sum of
        # the clean and the perturbed loss.
        optimizer.zero_grad()
        output_prime = model(seqs_prime, lengths)
        loss = clean_loss + criterion(output_prime, target)
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        losses.update(loss.item(), batch_size)
        accuracy.update(compute_batch_accuracy(output, target).item(), batch_size)

        if i % print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Accuracy {acc.val:.3f} ({acc.avg:.3f})'.format(
                epoch, i, len(data_loader), batch_time=batch_time,
                data_time=data_time, loss=losses, acc=accuracy))

    mean_brier = brier_sum / total if total else 0.0
    return losses.avg, accuracy.avg, mean_brier

In [None]:
class DenseEnsemble(nn.Module):
    """Linear layer with BatchEnsemble-style per-member rank-1 scaling.

    One bias-free ``nn.Linear`` is shared by all members. Member ``i``
    multiplies the input features by ``alpha[i]`` and the output features by
    ``gamma[i]``. The batch is split into ``num_models`` contiguous chunks.

    Args:
        in_channels: Number of input features.
        out_channels: Number of output features.
        stride: Accepted but not used by this layer.
        padding: Accepted but not used by this layer.
        num_models: Number of ensemble members.
        first_layer: If True the input is treated as
            ``(batch, steps, in_channels)`` and the scales broadcast over the
            middle axis; otherwise the input is ``(batch, in_channels)``.
    """

    def __init__(self, in_channels, out_channels, stride=1, padding=0, num_models=4, first_layer=False):
        super(DenseEnsemble, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.num_models = num_models
        self.first_layer = first_layer
        self.fc = nn.Linear(in_channels, out_channels, bias=False)
        # Per-member scaling vectors, initialised around 1.
        self.alpha = nn.Parameter(torch.Tensor(num_models, in_channels))
        self.gamma = nn.Parameter(torch.Tensor(num_models, out_channels))
        nn.init.normal_(self.alpha, mean=1., std=0.5)
        nn.init.normal_(self.gamma, mean=1., std=0.5)

    def forward(self, x):
        """Apply the shared linear map with per-member scaling.

        Returns:
            Tensor shaped like ``x`` with the last dim replaced by
            ``out_channels``.
        """
        examples_per_model = x.size(0) // self.num_models
        extra = x.size(0) - examples_per_model * self.num_models
        # Leading dims of the expanded scales. The first-layer variant used to
        # request x.size(0) rows up front, which fails in view() whenever the
        # batch is not a multiple of num_models; rows are now inferred and the
        # leftover rows appended below.
        lead = [-1, 1] if self.first_layer else [-1]
        # Each member's row is repeated examples_per_model times consecutively.
        alpha = torch.cat([self.alpha] * examples_per_model, dim=1).view(lead + [self.in_channels])
        gamma = torch.cat([self.gamma] * examples_per_model, dim=1).view(lead + [self.out_channels])
        if extra != 0:
            # Leftover rows reuse the first `extra` rows of the expanded scales.
            alpha = torch.cat([alpha, alpha[:extra]], dim=0)
            gamma = torch.cat([gamma, gamma[:extra]], dim=0)
        return self.fc(x * alpha) * gamma

class LSTMEnsemble(nn.Module):
    """LSTM whose inputs are scaled per ensemble member before the recurrence.

    The batch is split into ``num_models`` contiguous chunks and chunk ``i``
    has its input features multiplied by ``alpha[i]``. A single ``nn.LSTM`` is
    shared by all members.

    NOTE(review): ``gamma`` is registered as a parameter but ``forward`` does
    not rescale the LSTM output with it (unchanged from the original code).
    Confirm whether the output-side scaling was meant to be applied.

    Args:
        in_channels: Input feature size.
        out_channels: LSTM hidden size.
        num_layers: Number of stacked LSTM layers. This was previously ignored
            and hard-coded to 1.
        stride: Accepted but not used by this layer.
        padding: Accepted but not used by this layer.
        num_models: Number of ensemble members.
        dropout: Dropout value passed to ``nn.LSTM``.
        batch_first: Accepted for signature compatibility; the layer always
            uses a batch-first layout.
    """

    def __init__(self, in_channels, out_channels, num_layers = 1, stride=1, padding=0, num_models=4, dropout = 0.1, batch_first = True):
        super(LSTMEnsemble, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.num_models = num_models
        self.lstm = nn.LSTM(input_size=in_channels, hidden_size=out_channels, num_layers=num_layers, dropout=dropout, batch_first=True)
        self.alpha = nn.Parameter(torch.Tensor(num_models, in_channels))
        self.gamma = nn.Parameter(torch.Tensor(num_models, out_channels))
        nn.init.normal_(self.alpha, mean=1., std=0.5)
        nn.init.normal_(self.gamma, mean=1., std=0.5)

    def forward(self, x, lengths, seq_len):
        """Run the scaled, padded batch through the LSTM.

        Args:
            x: Padded batch of shape ``(batch, steps, in_channels)``.
            lengths: Valid length of each sequence, as accepted by
                ``pack_padded_sequence`` with its default settings.
            seq_len: Time dimension the output is padded back to.

        Returns:
            Tensor of shape ``(batch, seq_len, out_channels)``.
        """
        examples_per_model = x.size(0) // self.num_models
        extra = x.size(0) - examples_per_model * self.num_models
        # Shape (rows, 1, features) so the scale broadcasts over time steps.
        # The row count is inferred: requesting x.size(0) rows here used to
        # fail whenever the batch was not a multiple of num_models.
        alpha = torch.cat([self.alpha] * examples_per_model, dim=1).view([-1, 1, self.in_channels])
        if extra != 0:
            # Leftover rows reuse the first `extra` rows of the expanded scale.
            alpha = torch.cat([alpha, alpha[:extra]], dim=0)
        scaled = x * alpha
        packed = pack_padded_sequence(scaled, lengths, batch_first = True)
        packed, _ = self.lstm(packed)
        out, _ = pad_packed_sequence(packed, batch_first = True, total_length = seq_len)
        return out

In [None]:
class MyLSTMEnsemble(nn.Module):
    """Sequence classifier: ensemble dense -> sigmoid -> ensemble LSTM -> ensemble dense.

    Args:
        dim_input: Number of features per time step.
        dropout_rate: Dropout value passed to the LSTM layer.
        num_models: Number of ensemble members used by every sub-layer.
    """

    def __init__(self, dim_input, dropout_rate=0.1, num_models=4):
        super(MyLSTMEnsemble, self).__init__()
        self.num_models = num_models
        self.dim_input = dim_input
        # num_models is passed by keyword: as the third positional argument it
        # was bound to DenseEnsemble's unused `stride`, and the LSTM layer
        # never received it, so all sub-layers kept their default size.
        self.fc1 = DenseEnsemble(dim_input, 64, num_models=num_models, first_layer=True)
        self.lstm = LSTMEnsemble(in_channels=64, out_channels=64, num_layers=1, num_models=num_models, dropout=dropout_rate, batch_first=True)
        self.fc2 = DenseEnsemble(64, 2, num_models=num_models)

    def forward(self, x, lengths):
        """Return two-class scores for each sequence in the batch.

        Args:
            x: Padded batch of shape ``(batch, steps, dim_input)``.
            lengths: Valid length of every sequence in ``x``.

        Returns:
            Tensor of shape ``(batch, 2)``.
        """
        lengths = lengths.long()
        batch_size, seq_len, _ = x.size()
        x = torch.sigmoid(self.fc1(x))
        x = self.lstm(x, lengths, seq_len)
        # Take the hidden state at each sequence's last valid step. Indexing
        # keeps the result on x's device; the previous CPU-allocated buffer
        # broke the forward pass when the model ran on a GPU.
        rows = torch.arange(batch_size, device=x.device)
        last_step = (lengths - 1).to(x.device)
        return self.fc2(x[rows, last_step])

In [None]:
# Train and evaluate four independently initialised models.
# NOTE: evaluate() is defined in a later cell of this notebook; that cell must
# have been executed before this one.
for i in range(1, 5):
    model = MyLSTMEnsemble(num_features)
    criterion = nn.CrossEntropyLoss()
    NUM_EPOCHS = 20
    device = torch.device("cuda" if torch.cuda.is_available() and USE_CUDA else "cpu")
    model.to(device)
    criterion.to(device)

    optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay = 0.0004)
    best_val_acc = 0.0
    train_losses, train_accuracies = [], []
    valid_losses, valid_accuracies = [], []
    training_brier = []
    t0 = time.time()
    for epoch in range(NUM_EPOCHS):
        # print_freq = len - 1 prints only the first and the last batch of an epoch.
        train_loss, train_accuracy, brier = advTrain(model, device, train_loader, criterion, optimizer, epoch, print_freq = len(train_loader)-1)
        train_losses.append(train_loss)
        training_brier.append(brier)
        train_accuracies.append(train_accuracy)
    time_one = time.time() - t0
    # Reported loss and Brier score are averages over all epochs of the run.
    print('NLL Loss is {}'.format(np.mean(train_losses)))
    print('Brier score is {}'.format(np.mean(training_brier)))
    print('Training time: {} seconds'.format(time_one))
    test_loss, test_accuracy, test_results = evaluate(model, device, test_loader, criterion, print_freq = len(test_loader)-1)
    print('test accuracy {}'.format(test_accuracy))
# A dangling fragment ('"num_layers={}".format(dropout, num_layers))') with an
# unbalanced parenthesis followed here and made the cell a SyntaxError; it was
# removed.


Epoch: [0][0/289]	Time 0.127 (0.127)	Data 0.006 (0.006)	Loss 1.5911 (1.5911)	Accuracy 58.594 (58.594)
Epoch: [0][288/289]	Time 0.154 (0.211)	Data 0.008 (0.006)	Loss 0.7438 (0.9786)	Accuracy 80.469 (78.549)
Epoch: [1][0/289]	Time 0.123 (0.123)	Data 0.006 (0.006)	Loss 0.7017 (0.7017)	Accuracy 82.812 (82.812)
Epoch: [1][288/289]	Time 0.492 (0.211)	Data 0.010 (0.006)	Loss 0.3206 (0.4896)	Accuracy 89.062 (83.410)
Epoch: [2][0/289]	Time 0.157 (0.157)	Data 0.006 (0.006)	Loss 0.3127 (0.3127)	Accuracy 89.062 (89.062)
Epoch: [2][288/289]	Time 0.341 (0.216)	Data 0.008 (0.006)	Loss 0.3448 (0.3329)	Accuracy 84.375 (86.097)
Epoch: [3][0/289]	Time 0.207 (0.207)	Data 0.008 (0.008)	Loss 0.3415 (0.3415)	Accuracy 85.156 (85.156)
Epoch: [3][288/289]	Time 0.160 (0.223)	Data 0.005 (0.006)	Loss 0.2658 (0.3190)	Accuracy 88.281 (86.821)
Epoch: [4][0/289]	Time 0.156 (0.156)	Data 0.006 (0.006)	Loss 0.2582 (0.2582)	Accuracy 90.625 (90.625)
Epoch: [4][288/289]	Time 0.170 (0.222)	Data 0.006 (0.006)	Loss 0.3220 (0.3

In [None]:

def evaluate(model, device, data_loader, criterion, print_freq=10):
    """Evaluate ``model`` on ``data_loader`` without gradient tracking.

    Returns:
        ``(average loss, average accuracy, results)`` where ``results`` is a
        list of ``(true_label, predicted_label)`` pairs in loader order.
    """
    log_template = ('Test: [{0}/{1}]\t'
                    'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                    'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                    'Accuracy {acc.val:.3f} ({acc.avg:.3f})')
    batch_time, losses, accuracy = AverageMeter(), AverageMeter(), AverageMeter()
    results = []

    model.eval()

    with torch.no_grad():
        tick = time.time()
        for step, (batch, target) in enumerate(data_loader):
            seqs, lengths = batch
            seqs, target = seqs.to(device), target.to(device)

            output = model(seqs, lengths)
            loss = criterion(output, target)

            # Wall time per batch, data loading included.
            batch_time.update(time.time() - tick)
            tick = time.time()

            n_examples = target.size(0)
            losses.update(loss.item(), n_examples)
            accuracy.update(compute_batch_accuracy(output, target).item(), n_examples)

            labels = target.detach().to('cpu').numpy().tolist()
            predictions = output.detach().to('cpu').max(1)[1].numpy().tolist()
            results.extend(zip(labels, predictions))

            if step % print_freq == 0:
                print(log_template.format(
                    step, len(data_loader), batch_time=batch_time, loss=losses, acc=accuracy))

    return losses.avg, accuracy.avg, results

def advEvaluate(model, device, data_loader, criterion, print_freq=10, eps=5067*0.01):
    """Evaluate ``model`` on sign-gradient perturbed inputs.

    Each batch is shifted by ``eps * sign(d loss / d input)``, the same
    perturbation ``advTrain`` uses, and the model is scored on the shifted
    batch. The previous version ran entirely under ``no_grad`` and added
    ``eps * sign(input)``, which does not depend on the model or the labels.

    Args:
        model: Network called as ``model(seqs, lengths)``.
        device: Device the inputs and targets are moved to.
        data_loader: Iterable of ``((seqs, lengths), target)`` batches.
        criterion: Loss function ``criterion(output, target)``.
        print_freq: Print progress every this many batches.
        eps: Perturbation step size. The default is the value that used to be
            hard-coded in the body.

    Returns:
        ``(average loss, average accuracy, results)`` on the perturbed inputs,
        where ``results`` is a list of ``(true_label, predicted_label)`` pairs.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    accuracy = AverageMeter()
    results = []

    model.eval()

    end = time.time()
    for i, (batch, target) in enumerate(data_loader):
        seqs, lengths = batch
        # A fresh leaf tensor so the loss can be differentiated w.r.t. the input.
        seqs = seqs.to(device).clone().detach().requires_grad_(True)
        target = target.to(device)

        # The input gradient needs autograd. cuDNN is disabled for this pass
        # because its RNN backward is only supported in training mode.
        # autograd.grad leaves the parameters' .grad fields untouched.
        with torch.enable_grad(), torch.backends.cudnn.flags(enabled=False):
            clean_loss = criterion(model(seqs, lengths), target)
            (input_grad,) = torch.autograd.grad(clean_loss, seqs)
        adv_seqs = (seqs + eps * torch.sign(input_grad)).detach()

        with torch.no_grad():
            output = model(adv_seqs, lengths)
            loss = criterion(output, target)

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        losses.update(loss.item(), target.size(0))
        accuracy.update(compute_batch_accuracy(output, target).item(), target.size(0))

        y_true = target.detach().to('cpu').numpy().tolist()
        y_pred = output.detach().to('cpu').max(1)[1].numpy().tolist()
        results.extend(list(zip(y_true, y_pred)))

        if i % print_freq == 0:
            print('Test: [{0}/{1}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Accuracy {acc.val:.3f} ({acc.avg:.3f})'.format(
                i, len(data_loader), batch_time=batch_time, loss=losses, acc=accuracy))

    return losses.avg, accuracy.avg, results

Reference: https://github.com/giannifranchi/LP_BNN/blob/d324ba8d0ade75e5bfe9a14c670fe71469f49db6/networks/batchensemble_layers.py