# Boilerplate

Package installation, loading, and dataloaders. There's also a simple model defined. You can change it your favourite architecture if you want.

In [42]:
# !pip install tensorboardX

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import time
import matplotlib.pyplot as plt

from torchvision import datasets, transforms
# from tensorboardX import SummaryWriter

use_cuda = False
device = torch.device("cuda" if use_cuda else "cpu")
batch_size = 64

np.random.seed(42)
torch.manual_seed(42)


## Dataloaders
train_dataset = datasets.MNIST('mnist_data/', train=True, download=True, transform=transforms.Compose(
    [transforms.ToTensor()]
))
test_dataset = datasets.MNIST('mnist_data/', train=False, download=True, transform=transforms.Compose(
    [transforms.ToTensor()]
))

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

## Simple NN. You can change this if you want. If you change it, mention the architectural details in your report.
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc = nn.Linear(28*28, 200)
        self.fc2 = nn.Linear(200,10)

    def forward(self, x):
        x = x.view((-1, 28*28))
        x = F.relu(self.fc(x))
        x = self.fc2(x)
        return x

class Normalize(nn.Module):
    def forward(self, x):
        return (x - 0.1307)/0.3081

# Add the data normalization as a first "layer" to the network
# this allows us to search for adverserial examples to the real image, rather than
# to the normalized image
model = nn.Sequential(Normalize(), Net())

model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
model.train()

Sequential(
  (0): Normalize()
  (1): Net(
    (fc): Linear(in_features=784, out_features=200, bias=True)
    (fc2): Linear(in_features=200, out_features=10, bias=True)
  )
)

# Implement the Attacks

Functions are given a simple useful signature that you can start with. Feel free to extend the signature as you see fit.

You may find it useful to create a 'batched' version of PGD that you can use to create the adversarial attack.

In [50]:
# The last argument 'targeted' can be used to toggle
# between a targeted and untargeted attack.
def fgsm(model, x, y, eps):
  """
  TODO: implement this as an intermediate step of PGD

  Notes: put the model in eval() mode for this function
  """
  model.eval()

  x.requires_grad = True
  output = model(x)

  loss = nn.CrossEntropyLoss()

  model.zero_grad()
  cost = loss(output, y)
  cost.backward()

  # Perform perturbation
  x_adv = x + eps*x.grad.sign()
  return x_adv


def pgd_untargeted(model, x, y, k, eps, eps_step):
  """
  TODO: implement this
  Notes: put the model in eval() mode for this function
  x: input image
  y: ground truth label for x
  k: steps of FGSM
  eps: projection region for PGD (note the need for normalization before projection, as eps values are for inputs in [0,1])
  eps_step: step for one iteration of FGSM
  """
  model.eval()
  x = x.to(device)
  y = y.to(device)

  x_t = x.data
  for i in range(k):
    # Perturbed images
    perturbation = fgsm(model, x, y, eps_step)
    # Isolate perturbation
    peturbation_data = torch.clamp(perturbation - x_t, -eps, eps)
    x = torch.clamp(x_t + peturbation_data, 0, 1).detach_()
  return x





# Implement Adversarial Training

In [40]:
def train_model(model, num_epochs, enable_defense=True, attack='pgd', eps=0.1):
    # TODO: implement this function that trains a given model on the MNIST dataset.
    # this is a general-purpose function for both standard training and adversarial training.
    # (toggle enable_defense parameter to switch between training schemes)
    model.train()
    loss = nn.CrossEntropyLoss()

    # If defense is enabled, add adversial examples to the training set
    for epoch in range(num_epochs):
      print(f'Epoch {epoch + 1}:')
      correct = 0
      total_samples = 0
      for i, data in enumerate(train_loader):
        inputs, labels = data

        if enable_defense:
          inputs = pgd_untargeted(model, inputs, labels, 10, eps, 0.01)

        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        cost = loss(outputs, labels)
        cost.backward()

        optimizer.step()


In [19]:
def test_model_on_attacks(model, attack='pgd', eps=0.1):
    # TODO: implement this function to test the robust accuracy of the given model
    # use pgd_untargeted() within this function
    model.eval()
    correct = 0
    total = 0
    for x, y in test_loader:
      images = pgd_untargeted(model, x, y, 10, eps, 0.01)
      labels = y
      outputs = model(images)
      _, predicted = torch.max(outputs.data, 1)
      total += labels.size(0)
      correct += (predicted == labels).sum().item()

    print(f'Robust accuracy: {(100 * correct / total):2f}%')



# Study Accuracy, Quality, etc.

Compare the various results and report your observations on the submission.

In [47]:
## train the original model
# model = nn.Sequential(Normalize(), Net())
# model = model.to(device)
# model.train()

train_model(model, 5, False)
torch.save(model.state_dict(), 'weights.pt')

Epoch 1:
Epoch 2:
Epoch 3:
Epoch 4:
Epoch 5:


In [48]:
model = nn.Sequential(Normalize(), Net())
model.load_state_dict(torch.load('weights.pt'))

correct = 0
total = 0
with torch.no_grad():
  for data in test_loader:
    images, labels = data

    images = images.to(device)
    labels = labels.to(device)

    outputs = model(images)
    # the class with the highest energy is what we choose as prediction
    _, predicted = torch.max(outputs.data, 1)
    total += labels.size(0)
    correct += (predicted == labels).sum().item()
    del images, labels, outputs

print(f'Standard accuracy: {100 * correct // total}%')

  model.load_state_dict(torch.load('weights.pt'))


Standard accuracy: 95%


In [49]:
## PGD attack
model = nn.Sequential(Normalize(), Net())
model.load_state_dict(torch.load('weights.pt'))

for eps in [0.05, 0.1, 0.15, 0.2]:
    test_model_on_attacks(model, attack='pgd', eps=eps)


  model.load_state_dict(torch.load('weights.pt'))


Robust accuracy: 95.550000%
Robust accuracy: 95.520000%
Robust accuracy: 95.520000%
Robust accuracy: 95.520000%


In [43]:
## PGD based adversarial training
# model = nn.Sequential(Normalize(), Net())
eps = 0.1
train_model(model, 5, True, 'pgd', eps)
torch.save(model.state_dict(), f'weights_AT_{eps}.pt')

Epoch 1:
Epoch 2:
Epoch 3:
Epoch 4:
Epoch 5:


In [44]:
## PGD based adversarial training attack
model = nn.Sequential(Normalize(), Net())
model.load_state_dict(torch.load('weights_AT_0.1.pt'))

for eps in [0.05, 0.1, 0.15, 0.2]:
    test_model_on_attacks(model, attack='pgd', eps=eps)


  model.load_state_dict(torch.load('weights_AT_0.1.pt'))


Robust accuracy: 91.070000%
Robust accuracy: 82.380000%
Robust accuracy: 82.380000%
Robust accuracy: 82.380000%


In [45]:
model = nn.Sequential(Normalize(), Net())
model.load_state_dict(torch.load('weights_AT_0.1.pt'))

correct = 0
total = 0
with torch.no_grad():
  for data in test_loader:
    images, labels = data

    images = images.to(device)
    labels = labels.to(device)

    outputs = model(images)
    # the class with the highest energy is what we choose as prediction
    _, predicted = torch.max(outputs.data, 1)
    total += labels.size(0)
    correct += (predicted == labels).sum().item()
    del images, labels, outputs

print(f'Standard accuracy: {100 * correct // total}%')

  model.load_state_dict(torch.load('weights_AT_0.1.pt'))


Standard accuracy: 95%
