In [None]:
%load_ext autoreload
%autoreload 2
import torch
from torch.utils.data import DataLoader
from torch.optim import Adam
from matplotlib import pyplot as plt
from utils import get_mnist_data
from models import ConvNN
from training_and_evaluation import train_model, predict_model
from attacks import fast_gradient_attack
from torch.nn.functional import cross_entropy
import os
# Make sure the checkpoint directory exists. `exist_ok=True` keeps the cell
# idempotent and avoids the check-then-create race of isdir() + mkdir().
os.makedirs("models", exist_ok=True)

# Part 1: Creating adversarial examples
In this notebook we train a basic convolutional neural network on MNIST and craft adversarial examples for it using a gradient-based attack (`fast_gradient_attack`).

In [None]:
# Seed PyTorch's global RNG before anything stochastic runs so that a
# "Restart Kernel -> Run All" gives comparable results (this covers torch-based
# weight initialisation and the default shuffling of a DataLoader). Bit-exact
# results on GPU are still not guaranteed.
seed = 0
torch.manual_seed(seed)

# MNIST train/test splits from the project helper.
mnist_trainset = get_mnist_data(train=True)
mnist_testset = get_mnist_data(train=False)

# Run on the GPU when one is available; append `and False` to force the CPU.
use_cuda = torch.cuda.is_available()

model = ConvNN()
if use_cuda:
    model = model.cuda()

# Hyper-parameters.
# NOTE(review): `epochs` is not passed to `train_model` in this notebook, so
# the number of epochs actually used is whatever `train_model` defaults to --
# confirm against its signature.
epochs = 1
batch_size = 128
test_batch_size = 1000
lr = 1e-3

opt = Adam(model.parameters(), lr=lr)

In [None]:
def loss_function(x, y, model):
    """
    Cross-entropy loss of `model` on one labelled batch.

    Parameters
    ----------
    x: torch.Tensor
        Input batch; passed to `model` unchanged.
    y: torch.Tensor
        Target class indices for `x`. May live on any device.
    model: callable
        Maps `x` to unnormalized class scores (logits).

    Returns
    -------
    loss: torch.Tensor
        Cross-entropy between the logits and `y` (default reduction).
    logits: torch.Tensor
        The model outputs, moved to the CPU.
    """
    logits = model(x).cpu()
    # The logits were just moved to the CPU, so bring the labels to the same
    # device. Without this, a caller that moved the batch to the GPU would hit
    # a device-mismatch error; when `y` is already on the CPU this is a no-op.
    loss = cross_entropy(logits, y.to(logits.device))
    return loss, logits

In [None]:
# Name of the device to compute on: the GPU if PyTorch can see one, else the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Fit the network on the training split. The two returned sequences are
# plotted below as training loss and training accuracy per iteration.
losses, accuracies = train_model(
    model,
    mnist_trainset,
    batch_size=batch_size,
    loss_function=loss_function,
    optimizer=opt,
)

In [None]:
# Persist the trained weights so that they can be reloaded without retraining.
trained_weights = model.state_dict()
torch.save(trained_weights, "models/standard_training.checkpoint")

In [None]:
# Read the checkpoint onto the CPU first, then copy the weights into the model.
checkpoint = torch.load("models/standard_training.checkpoint", map_location="cpu")
model.load_state_dict(checkpoint)

In [None]:
# Training curves side by side: loss on the left, accuracy on the right.
fig, (loss_ax, accuracy_ax) = plt.subplots(1, 2, figsize=(10, 3))

loss_ax.plot(losses)
loss_ax.set_xlabel("Iteration")
loss_ax.set_ylabel("Training Loss")

accuracy_ax.plot(accuracies)
accuracy_ax.set_xlabel("Iteration")
accuracy_ax.set_ylabel("Training Accuracy")

plt.show()

In [None]:
# Baseline: evaluate on the unperturbed test set (no attack function).
clean_accuracy = predict_model(
    model,
    mnist_testset,
    batch_size=test_batch_size,
    attack_function=None,
)

### Creating adversarial examples
#### $L_2$-bounded attacks
First, craft adversarial perturbations that have an $L_2$ norm of $ \| \tilde{\mathbf{x}} - \mathbf{x} \|_2 = \epsilon$ with $\epsilon=5$.

#### $L_\infty$-bounded attacks
Afterwards, craft adversarial perturbations with an $L_\infty$ norm of $ \| \tilde{\mathbf{x}} - \mathbf{x} \|_\infty = \epsilon$ with $\epsilon=0.3$.


In [None]:
# Keyword arguments for the two attack budgets described above:
# an L2 budget of 5 and an L_inf budget of 0.3.
attack_args_l2 = dict(epsilon=5, norm="2")
attack_args_linf = dict(epsilon=0.3, norm="inf")

### Qualitative evaluation

First, craft adversarial examples for 10 randomly selected test samples and inspect them by plotting them.

$L_2$ attack:

In [None]:
# Draw 10 random test samples for the qualitative inspection.
test_loader = DataLoader(mnist_testset, batch_size=10, shuffle=True)
x, y = next(iter(test_loader))

# Move the batch to the model's device FIRST and only then mark `x` as
# requiring gradients. Doing it the other way round makes the on-device `x` a
# non-leaf copy when the model lives on the GPU, and autograd does not populate
# `.grad` for non-leaf tensors, so the attack would have no input gradient.
device = model.device()
x = x.to(device).clone().detach().requires_grad_(True)
y = y.to(device)

logits = model(x)

# Perturb the samples within the L2 budget and record the model's new predictions.
x_pert_l2 = fast_gradient_attack(logits=logits, x=x, y=y, epsilon=attack_args_l2["epsilon"], norm=attack_args_l2["norm"],
                         loss_fn=torch.nn.functional.cross_entropy)

y_pert_l2 = torch.argmax(model(x_pert_l2).cpu(), dim=1)

$L_\infty$ attack:

In [None]:
# Start from a fresh gradient-tracking copy of the same samples so that this
# attack gets its own forward pass.
x = x.clone().detach().requires_grad_(True)
logits = model(x)

# `attack_args_linf` holds exactly the `epsilon` and `norm` keyword arguments.
x_pert_linf = fast_gradient_attack(
    logits=logits,
    x=x,
    y=y,
    loss_fn=torch.nn.functional.cross_entropy,
    **attack_args_linf,
)

# Class predicted by the model for each perturbed sample.
y_pert_linf = model(x_pert_linf).cpu().argmax(dim=1)

Visualize the adversarial examples and the model's prediction on them:

In [None]:
# One figure per sample, three panels:
# clean image with its label | L2-perturbed image | L_inf-perturbed image,
# the latter two titled with the model's prediction.
for ix in range(len(x)):
    sample_fig, (clean_ax, l2_ax, linf_ax) = plt.subplots(1, 3)

    clean_ax.imshow(x[ix,0].detach().cpu(), cmap="gray")
    clean_ax.set_title(f"Label: {y[ix]}")

    l2_ax.imshow(x_pert_l2[ix,0].detach().cpu(), cmap="gray")
    l2_ax.set_title(f"Predicted: {y_pert_l2[ix]}")

    linf_ax.imshow(x_pert_linf[ix,0].detach().cpu(), cmap="gray")
    linf_ax.set_title(f"Predicted: {y_pert_linf[ix]}")

    plt.show()

### Quantitative evaluation
Perturb each test sample and compare the clean and perturbed accuracies.

$L_2$ perturbations:

In [None]:
# Evaluate on the test set with every batch attacked under the L2 budget.
perturbed_accuracy_l2 = predict_model(
    model,
    mnist_testset,
    batch_size=test_batch_size,
    attack_function=fast_gradient_attack,
    attack_args=attack_args_l2,
)

$L_\infty$ perturbations:

In [None]:
# Evaluate on the test set with every batch attacked under the L_inf budget.
perturbed_accuracy_linf = predict_model(
    model,
    mnist_testset,
    batch_size=test_batch_size,
    attack_function=fast_gradient_attack,
    attack_args=attack_args_linf,
)

In [None]:
# Display the result on unperturbed test data.
clean_accuracy # 0.9519 -- value noted by the author, presumably from an earlier run; expect it to vary

In [None]:
# Display the result under the L2 attack.
perturbed_accuracy_l2 # 0.0974 -- value noted by the author, presumably from an earlier run; expect it to vary

In [None]:
# Display the result under the L_inf attack.
perturbed_accuracy_linf # 0.0274 -- value noted by the author, presumably from an earlier run; expect it to vary

In [None]:
# Extra experiment: the same evaluation with norm "1" and a budget of 5.
attack_args_l1 = dict(epsilon=5, norm="1")
perturbed_accuracy_l1 = predict_model(
    model,
    mnist_testset,
    batch_size=test_batch_size,
    attack_function=fast_gradient_attack,
    attack_args=attack_args_l1,
)
perturbed_accuracy_l1

#### In the remaining parts of this project we will be focusing on **$L_2$-based attacks only**.