# Reproduces all experiments in "Adversarial Examples as an Input-Fault Tolerance Problem"

__Note about version control__:
Select "Restart & Clear Output" from the "Kernel" tab before committing changes so that diffs are interpretable. Otherwise spurious changes like the vector graphics and cell output will be tracked, and are likely to change between every commit.

In [None]:
import os
import time
import numpy as np
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import torch.utils.data
from torch.autograd import Variable

import matplotlib.pyplot as plt
from skimage import transform as t

from ft_utils import arr21hot, mi, snr, build_targeted_dataset, evaluate_acc, evaluate_model
from ft_plot import (fault_tolerance_plot, fault_tolerance_unique_obj, grid_visual,
                     fault_tolerance_plot_rot_from_list_30, fault_tolerance_plot_rot_from_list)

In [None]:
GPU = '0'  # None or physical id as a string
SEED = 1

# Restrict which physical GPU this process may see.
if GPU is not None:
    os.environ['CUDA_VISIBLE_DEVICES'] = GPU

# Always bind DEVICE: fall back to the CPU both when no GPU was requested
# and when one was requested but CUDA is not available on this machine.
if GPU is not None and torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')
print(DEVICE)

# Seed both RNGs used by the notebook (torch and numpy).
torch.manual_seed(SEED)
np.random.seed(SEED)

To keep the repo light, yet self-contained, we've uploaded 10K samples from the SVHN test set. We will later
draw 1K samples from this set, but of course you are free to use all of it.

In [None]:
# Input geometry and label space for SVHN
img_rows, img_cols = 32, 32
channels = 1
num_classes = 10

##### ----- Data: Loading and Preprocessing ----- #####
DATASET = 'dataset/svhn_test_set_10k.npz'
dataset = np.load(DATASET)

# labels (plain and one-hot encoded)
test_y = dataset['b']
test_y_1hot = arr21hot(test_y, num_classes)

# images as float32 in NCHW layout; remember that shape for later
test_x = dataset['a'].reshape(-1, channels, img_rows, img_cols).astype('float32')
oshape_test = test_x.shape

# bring the raw values into [0, 1]
test_x /= 255.

# flatten every image to one row, then remove each image's own mean
test_x = test_x.reshape(oshape_test[0], -1)
per_example_mean = test_x.mean(axis=1, keepdims=True)
test_x -= per_example_mean

# scale every feature (pixel position) by its std over the whole set
feature_std = test_x.std(axis=0, ddof=0)
test_x /= feature_std

# restore the NCHW layout
test_x = test_x.reshape(oshape_test)

# Draw the evaluation subset (without replacement) and move it to DEVICE
N = 1000  # number of samples from test set
# np.random.seed(1234) # for testing different batches
indx_arr = np.random.choice(len(test_y), N, replace=False)
batch = torch.FloatTensor(test_x[indx_arr]).to(DEVICE)
labels = torch.LongTensor(test_y[indx_arr]).to(DEVICE)
labels_1hot = torch.LongTensor(arr21hot(test_y[indx_arr], num_classes)).to(DEVICE)

In [None]:
# ----- Hyper-params ----- #
# Builds the CNN, applies the custom initialiser and creates the loss and
# optimizer objects. The statements are order-sensitive: the model has to
# exist before `apply` and before the optimizer collects its parameters.

from cnn_model_pytorch import CNN, custom_init_params

# Some of these (e.g. L2_WD, MB_SIZE, LR, and EPS)
# don't matter as we are loading a pre-trained checkpoint,
# but you can see the settings that were used for training.

NUM_FILTERS = 32    # number of filters in the first conv layer
BATCH_NORM = None  # arg of "store true"-type; passed straight to CNN
MB_SIZE = 128       # mini-batch size
L2_WD = 1e-2      # L2 weight decay constant
LR = 1e-2      # learning rate
EPS = 50        # number of training epochs (not an attack epsilon)


##### ----- Define the model ----- #####

model = CNN(num_classes, NUM_FILTERS, channels, BATCH_NORM).to(DEVICE)
# presumably CNN is an nn.Module, so `apply` visits every sub-module — confirm
model.apply(custom_init_params)

# `loss_fnct` is the objective every attack cell below differentiates.
loss_fnct = nn.CrossEntropyLoss()  # reduction='none'
# The optimizer is created for completeness; no visible cell steps it.
optimizer = torch.optim.SGD(model.parameters(), lr=LR, weight_decay=L2_WD)

# Load model checkpoint

In [None]:
# Checkpoints are for models trained with inverse-frequency class weights on
# zero-mean (per-image mean subtracted) grayscale data, 604k training
# examples in total.
#MODEL_FILE = 'ckpt/final_model_eps_50_seed_1_wd_0e+00.ckpt'

# The ATTACK_ITERATIONS used below are calibrated to minimally sufficient
# defaults that show degradation in a reasonably short time, assuming the
# wd 1e-2 checkpoint. The model without wd is much more sensitive and thus
# doesn't require much change to the input to make wrong predictions with
# high confidence. For wd_0e+00 you will have to tune the iterations
# yourself; increasing them by at least one order of magnitude is a
# suggested starting point.
MODEL_FILE = 'ckpt/final_model_eps_50_seed_1_wd_1e-02.ckpt'

if not os.path.exists(MODEL_FILE):
    print('Model not found!')
else:
    # deserialize onto the CPU first, then move the model to DEVICE
    state_dict = torch.load(
        MODEL_FILE, map_location=lambda storage, loc: storage)
    model.load_state_dict(state_dict)
    model = model.to(DEVICE)

In [None]:
##### ----- Evaluate model on clean data ----- #####
# Switch the model to evaluation mode; every later cell relies on this.
model.eval()
# Accuracy on the unperturbed `batch`, as the reference point for the attacks.
test_accuracy = evaluate_acc(model, batch, labels)
# on the 1k random batch, accuracy should be 0.9400 for model without wd, or 0.9190 with wd
print('Prediction accuracy on test data: %.4f' % test_accuracy)

In [None]:
SAVE_PLOTS = False
# Checkpoint file name with the directory part removed and everything from
# the first '.' onwards dropped; used as a prefix for output file names.
basefilename = MODEL_FILE.rsplit('/', 1)[-1].partition('.')[0]
print(basefilename)

In [None]:
# Column positions inside every (iterations x 3) results array
SNR, ITY, ACC = 0, 1, 2

# turns a (batch, classes) array of logits into per-row probabilities
softmax = nn.Softmax(dim=1)

# where to save all outputs for plots
SAVE_DIR = "npdata/pytorch/mi/"
print(SAVE_DIR)

# Fault Tolerance

## BIM $L_2$ Attack with Misclassification Objective

This could take a long time to complete the whole range of SNR down to zero. 
In general, the model without weight decay takes more iterations due to 
vanishing gradients. Defaults are set so as to provide a suitable reward in a 
reasonable amount of time (< 10s on Titan Xp GPU).

In [None]:
# BIM-L2 attack, misclassification objective: repeatedly step the inputs
# along the per-example L2-normalized gradient of the loss on the true labels
# and record accuracy, SNR and I(T;Y) after every step.
ATTACK_ITER_L2 = 500

adv_img_l2 = batch.clone()
# one row per attack iteration; columns are addressed via SNR, ITY and ACC
ft_l2 = np.zeros((ATTACK_ITER_L2, 3))
# every axis except the batch axis, so each example gets its own gradient norm
red_ind = list(range(1, adv_img_l2.dim()))

start_time = time.time()
for i in range(ATTACK_ITER_L2):
    # Note that we increase epsilon here as a kind of momentum since we have
    # a log scale x-axis; can run fixed epsilon instead to see that results
    # are similar, but this takes longer.
    eps_ = np.clip(0.01 + (float(i) / 10**4), 0., 0.1)

    # fresh leaf tensor so backward() fills x_.grad for this step only
    x_ = adv_img_l2.detach().requires_grad_(True)

    loss = loss_fnct(model(x_), labels)
    loss.backward()
    loss_grad = x_.grad.data.clone()
    # floor the squared norm to prevent div by zero
    square = torch.clamp(
        torch.sum(loss_grad**2, dim=red_ind, keepdim=True), min=1e-12)
    normalized_loss_grad = loss_grad / torch.sqrt(square)
    # misclassification: move *up* the loss on the true labels
    adv_img_l2 = x_.detach() + eps_ * normalized_loss_grad

    # bookkeeping only, so do not record this forward pass in autograd
    with torch.no_grad():
        adv_preds_l2 = softmax(model(adv_img_l2))

    ft_l2[i, ACC] = evaluate_acc(model, adv_img_l2, labels)
    ft_l2[i, SNR] = snr(batch, adv_img_l2 - batch)
    ft_l2[i, ITY] = mi(torch.argmax(adv_preds_l2, 1), labels)

print(time.time() - start_time)
# may need to bump up max_snr, e.g. to 95 to see results for model w/out wd
fault_tolerance_plot(ft_l2, max_snr=65)

In [None]:
#np.save(SAVE_DIR + basefilename + '_ft_bim_l2_mcls.npy', ft_l2)

### Additive White Gaussian Noise 
This cell should also take about 5s.

In [None]:
# Additive white Gaussian noise baseline: sweep the noise standard deviation
# and record accuracy, SNR and I(T;Y) at every level.
ATTACK_ITER_GAUSS = 100
sigmas = np.linspace(1e-6, 10, ATTACK_ITER_GAUSS)
# one row per noise level; columns are addressed via SNR, ITY and ACC
ft_noise = np.zeros((ATTACK_ITER_GAUSS, 3))
start_time = time.time()
for i, s in enumerate(sigmas):
    noise_gauss = np.random.normal(loc=0.0, scale=float(s),
                                   size=(N, channels, img_rows, img_cols))
    noise_gauss = torch.FloatTensor(noise_gauss).to(DEVICE)
    # build the perturbed batch once and reuse it for both measurements
    noisy_batch = batch + noise_gauss

    # no gradients are needed here, so skip autograd tracking
    with torch.no_grad():
        noise_preds_np = softmax(model(noisy_batch))

    ft_noise[i, ACC] = evaluate_acc(model, noisy_batch, labels)
    ft_noise[i, SNR] = snr(batch, noise_gauss)
    ft_noise[i, ITY] = mi(torch.argmax(noise_preds_np, 1), labels)
print(time.time() - start_time)
fault_tolerance_plot(ft_noise, max_snr=65)

In [None]:
#np.save(SAVE_DIR + basefilename + '_ft_noise.npy', ft_noise)

## BIM $L_2$ Attack with Targeted Objective

There are two different targeted datasets that are interesting to visualize in terms of information $I(T; Y)$
1. Each class label is replaced with a specific target class (`labels_t`)
2. Each input is repeated `num_classes-1` times, once for each `class != original label`

### Case 1: 'one tgt.'

In [None]:
# construct targeted labels for case 1:
# Shifting every one-hot row by one position maps class k to (k + 1) mod
# num_classes. `labels_1hot` lives on DEVICE, so copy it to host memory
# before handing it to NumPy (a CUDA tensor cannot be converted implicitly).
labels_t_1hot = np.roll(labels_1hot.cpu().numpy(), 1, axis=1)  # target labels, 1-hot
labels_t = torch.LongTensor(
    np.argmax(labels_t_1hot, axis=1)).to(DEVICE)  # target labels

In [None]:
# BIM-L2 attack, single-target objective: step the inputs *down* the loss on
# the shifted target labels `labels_t`; metrics are still measured against
# the true labels.
ATTACK_ITER_L2 = 500

# one row per attack iteration; columns are addressed via SNR, ITY and ACC
ft_l2_t = np.zeros((ATTACK_ITER_L2, 3))
adv_img_l2 = batch.clone()
# every axis except the batch axis, so each example gets its own gradient norm
red_ind = list(range(1, adv_img_l2.dim()))

start_time = time.time()
for i in range(ATTACK_ITER_L2):

    # step size grows linearly with the iteration index
    eps_ = np.clip(0.01 + (float(i) / 10**4), 0., 0.1)

    # fresh leaf tensor so backward() fills x_.grad for this step only
    x_ = adv_img_l2.detach().requires_grad_(True)

    loss = loss_fnct(model(x_), labels_t)
    loss.backward()
    loss_grad = x_.grad.data.clone()
    # floor the squared norm to prevent div by zero
    square = torch.clamp(
        torch.sum(loss_grad**2, dim=red_ind, keepdim=True), min=1e-12)
    normalized_loss_grad = loss_grad / torch.sqrt(square)
    # targeted: descend the loss towards the target class
    adv_img_l2 = x_.detach() - eps_ * normalized_loss_grad

    # bookkeeping only, so do not record this forward pass in autograd
    with torch.no_grad():
        adv_preds_l2 = softmax(model(adv_img_l2))

    ft_l2_t[i, ACC] = evaluate_acc(model, adv_img_l2, labels)
    ft_l2_t[i, SNR] = snr(batch, adv_img_l2 - batch)
    ft_l2_t[i, ITY] = mi(torch.argmax(adv_preds_l2, 1), labels)

print(time.time() - start_time)
fault_tolerance_plot(ft_l2_t, max_snr=65)

In [None]:
# ft_l2_t[-1][0] = 0. # optionally extend to SNR 0 if accuracy/MI plateaus
#np.save(SAVE_DIR + basefilename + '_ft_bim_l2_one_tgt.npy', ft_l2_t)

### Case 2: 'all tgt.'
This cell takes longer because we repeat each of the `N` samples `num_classes - 1` times.

In [None]:
# Every sampled image appears once per wrong class, so each copy can be
# paired with a different attack target.
num_target_classes = num_classes - 1
orig_img = torch.FloatTensor(
    np.repeat(test_x[indx_arr], num_target_classes, axis=0)).to(DEVICE)
true_labels = torch.LongTensor(
    np.repeat(test_y[indx_arr], num_target_classes, axis=0)).to(DEVICE)

# one row [0, 1, ..., num_classes - 1] per sampled image ...
a = np.tile(np.arange(num_classes), (len(indx_arr), 1))
# ... from which the true class is masked out; the boolean selection
# flattens row by row, matching the ordering produced by np.repeat above
target_labels = torch.LongTensor(
    a[a != test_y[indx_arr][:, None]]).to(DEVICE)

In [None]:
# BIM-L2 attack, all-targets objective: same update as the single-target
# case, applied to the repeated dataset (`orig_img` / `target_labels`).
ATTACK_ITER_L2 = 200  # takes about 40 seconds

# one row per attack iteration; columns are addressed via SNR, ITY and ACC
ft_l2_t_all = np.zeros((ATTACK_ITER_L2, 3))
adv_img_l2_t = orig_img.clone()
# every axis except the batch axis, so each example gets its own gradient norm
red_ind = list(range(1, adv_img_l2_t.dim()))

start_time = time.time()
for i in range(ATTACK_ITER_L2):

    # step size grows linearly with the iteration index
    eps_ = np.clip(0.01 + (float(i) / 10**4), 0., 0.1)
    # fresh leaf tensor so backward() fills x_.grad for this step only
    x_ = adv_img_l2_t.detach().requires_grad_(True)

    loss = loss_fnct(model(x_), target_labels)
    loss.backward()
    loss_grad = x_.grad.data.clone()
    # floor the squared norm to prevent div by zero
    square = torch.clamp(
        torch.sum(loss_grad**2, dim=red_ind, keepdim=True), min=1e-12)
    normalized_loss_grad = loss_grad / torch.sqrt(square)
    # targeted: descend the loss towards the target class
    adv_img_l2_t = x_.detach() - eps_ * normalized_loss_grad

    # bookkeeping only, so do not record this forward pass in autograd
    with torch.no_grad():
        adv_preds_l2_t = softmax(model(adv_img_l2_t))

    ft_l2_t_all[i, ACC] = evaluate_acc(model, adv_img_l2_t, true_labels)
    ft_l2_t_all[i, SNR] = snr(orig_img, adv_img_l2_t - orig_img)
    ft_l2_t_all[i, ITY] = mi(torch.argmax(adv_preds_l2_t, 1), true_labels)
print(time.time() - start_time)
# ft_l2_t_all[-1][0] = 0. # optionally extend to SNR 0
fault_tolerance_plot(ft_l2_t_all, max_snr=65)

In [None]:
#np.save(SAVE_DIR + basefilename + '_ft_bim_l2_all_tgt.npy', ft_l2_t_all)

In [None]:
# Paper-style figure: AWGN baseline against the three BIM-L2 objectives,
# one curve per results array, in the order given by `labels`.
fault_tolerance_unique_obj(
    [ft_noise, ft_l2, ft_l2_t, ft_l2_t_all],
    labels=['awgn', 'mis-cls.', 'one tgt.', 'all tgt.'],
    legend=False,
    save=False,
    min_snr=0,
    max_snr=65,
    modelname=basefilename + '_bim_l2_n%d_' % N,
)

### BIM $L_\infty$ Attack with Misclassification Objective

The sign method is less efficient wrt a multi-layer network, so we don't require as many iterations to cause 
lots of degradation in signal quality and span most of the SNR curve.

In [None]:
# BIM-Linf attack, misclassification objective: step the inputs along the
# sign of the loss gradient and record accuracy, SNR and I(T;Y) per step.
ATTACK_ITER_INF = 500

# one row per attack iteration; columns are addressed via SNR, ITY and ACC
ft_inf = np.zeros((ATTACK_ITER_INF, 3))
adv_img_inf = batch.clone()

start_time = time.time()
for i in range(ATTACK_ITER_INF):

    # step size grows linearly with the iteration index
    eps_ = np.clip(1e-4 + (float(i) / 10**4), 0., 0.1)
    # fresh leaf tensor so backward() fills x_.grad for this step only
    x_ = adv_img_inf.detach().requires_grad_(True)

    loss = loss_fnct(model(x_), labels)
    loss.backward()
    loss_grad = x_.grad.data.clone()
    # the sign update needs no norm, hence no reduction axes are computed
    signed_grad = torch.sign(loss_grad)
    # misclassification: move *up* the loss on the true labels
    adv_img_inf = x_.detach() + eps_ * signed_grad

    # bookkeeping only, so do not record this forward pass in autograd
    with torch.no_grad():
        adv_preds_inf = softmax(model(adv_img_inf))

    ft_inf[i, ACC] = evaluate_acc(model, adv_img_inf, labels)
    ft_inf[i, SNR] = snr(batch, adv_img_inf - batch)
    ft_inf[i, ITY] = mi(torch.argmax(adv_preds_inf, 1), labels)
print(time.time() - start_time)
fault_tolerance_plot(ft_inf, max_snr=65)

In [None]:
#np.save(SAVE_DIR + basefilename + '_ft_bim_inf_mcls.npy', ft_inf)

### BIM $L_\infty$ Attack with Targeted Objective

There are two different targeted datasets that are interesting to visualize in terms of information $I(T; Y)$
1. Each class label is replaced with a specific target class (`labels_t`)
2. Each input is repeated `num_classes - 1` times, once for each `class != original label`

In [None]:
# BIM-Linf attack, single-target objective: step the inputs against the sign
# of the gradient of the loss on `labels_t`; metrics are still measured
# against the true labels.
ATTACK_ITER_INF = 500

# one row per attack iteration; columns are addressed via SNR, ITY and ACC
ft_inf_t = np.zeros((ATTACK_ITER_INF, 3))
adv_img_inf_one_tgt = batch.clone()

start_time = time.time()
for i in range(ATTACK_ITER_INF):

    # step size grows linearly with the iteration index
    eps_ = np.clip(1e-4 + (float(i) / 10**4), 0., 0.1)
    # fresh leaf tensor so backward() fills x_.grad for this step only
    x_ = adv_img_inf_one_tgt.detach().requires_grad_(True)

    loss = loss_fnct(model(x_), labels_t)
    loss.backward()
    loss_grad = x_.grad.data.clone()
    signed_grad = torch.sign(loss_grad)
    # targeted: descend the loss towards the target class
    adv_img_inf_one_tgt = x_.detach() - eps_ * signed_grad

    # bookkeeping only, so do not record this forward pass in autograd
    with torch.no_grad():
        adv_preds_inf_one_tgt = softmax(model(adv_img_inf_one_tgt))

    ft_inf_t[i, ACC] = evaluate_acc(model, adv_img_inf_one_tgt, labels)
    ft_inf_t[i, SNR] = snr(batch, adv_img_inf_one_tgt - batch)
    ft_inf_t[i, ITY] = mi(torch.argmax(adv_preds_inf_one_tgt, 1), labels)
print(time.time() - start_time)
# ft_inf_t[-1][0] = 0. # optionally extend to SNR 0
fault_tolerance_plot(ft_inf_t, max_snr=65)

In [None]:
#np.save(SAVE_DIR + basefilename + '_ft_bim_inf_one_tgt.npy', ft_inf_t)

### Case 2: 'all tgt.'
This cell takes longer because we repeat each of the `N` samples `num_classes - 1` times.

In [None]:
# BIM-Linf attack, all-targets objective: same update as the single-target
# case, applied to the repeated dataset (`orig_img` / `target_labels`).
ATTACK_ITER_INF = 100  # takes about 40 seconds

# one row per attack iteration; columns are addressed via SNR, ITY and ACC
ft_inf_t_all = np.zeros((ATTACK_ITER_INF, 3))
adv_img_inf_t = orig_img.clone()

start_time = time.time()
for i in range(ATTACK_ITER_INF):

    # step size grows linearly with the iteration index
    eps_ = np.clip(1e-4 + (float(i) / 10**4), 0., 0.1)
    # fresh leaf tensor so backward() fills x_.grad for this step only
    x_ = adv_img_inf_t.detach().requires_grad_(True)
    loss = loss_fnct(model(x_), target_labels)
    loss.backward()
    loss_grad = x_.grad.data.clone()
    signed_grad = torch.sign(loss_grad)
    # targeted: descend the loss towards the target class
    adv_img_inf_t = x_.detach() - eps_ * signed_grad

    # bookkeeping only, so do not record this forward pass in autograd
    with torch.no_grad():
        adv_preds_inf_t = softmax(model(adv_img_inf_t))

    ft_inf_t_all[i, ACC] = evaluate_acc(model, adv_img_inf_t, true_labels)
    ft_inf_t_all[i, SNR] = snr(orig_img, adv_img_inf_t - orig_img)
    ft_inf_t_all[i, ITY] = mi(torch.argmax(adv_preds_inf_t, 1), true_labels)
print(time.time() - start_time)
# ft_inf_t_all[-1][0] = 0. # optionally extend to SNR 0
fault_tolerance_plot(ft_inf_t_all, max_snr=65)

In [None]:
#np.save(SAVE_DIR + basefilename + '_ft_bim_inf_all_tgt.npy', ft_inf_t_all)

In [None]:
# AWGN baseline against the three BIM-Linf objectives, one curve per
# results array, in the order given by `labels`.
fault_tolerance_unique_obj(
    [ft_noise, ft_inf, ft_inf_t, ft_inf_t_all],
    labels=['awgn', 'mis-cls.', 'one tgt.', 'all tgt.'],
    legend=False,
    save=False,
    max_snr=65,
    modelname=basefilename + '_inf_n%d_' % N,
)

### Compare BIM-$L_2$ and BIM-$L_\infty$ Misclassification Attacks
The $L_\infty$ variant should be in between $L_2$ and awgn.

In [None]:
# Misclassification attacks side by side: noise baseline, BIM-L2, BIM-Linf.
fault_tolerance_unique_obj(
    [ft_noise, ft_l2, ft_inf],
    labels=['awgn', r'$L_2$', r'$L_\infty$'],
    legend=True,
    save=False,
    max_snr=65,
    modelname=basefilename,
)

# Targeted Attack Grid

In [None]:
# Pick the first `instances_per_class` test-set indices of every class,
# class by class, for the targeted-attack grid.
NT = 10
# optionally shuffle the data to generate new images
# rng_state = np.random.get_state()
# np.random.shuffle(test_y)
# np.random.set_state(rng_state)
# np.random.shuffle(test_x)
idxs_arr = np.zeros(NT, dtype='int')
instances_per_class = NT // num_classes
j = 0  # write position inside idxs_arr
for i in range(num_classes):
    class_members = np.where(test_y == i)[0]
    idxs_arr[j:j + instances_per_class] = class_members[:instances_per_class]
    j += instances_per_class

In [None]:
# Build the inputs for the targeted-attack grid from the selected indices.
# The next cell descends the loss on `adv_ys` and scores predictions against
# `unif_labels`.
# NOTE(review): presumably each selected image is repeated once per wrong
# class, with `adv_ys` holding those target classes — confirm in ft_utils.
unif_images, unif_labels, adv_ys = build_targeted_dataset(
    test_x, test_y, idxs_arr, num_classes, DEVICE)

In [None]:
# BIM-L2 targeted attack on the grid dataset with a fixed step size; the
# resulting `adv_img` is what the grid figure visualizes.
ATTACK_ITER_GRID = 75  # takes about 20 seconds
# one row per attack iteration; columns are addressed via SNR, ITY and ACC
ft_grid = np.zeros((ATTACK_ITER_GRID, 3))
adv_img = unif_images.clone()
# every axis except the batch axis, so each example gets its own gradient norm
red_ind = list(range(1, adv_img.dim()))
start_time = time.time()
eps_ = 0.1
for i in range(ATTACK_ITER_GRID):

    #eps_ = np.clip(0.1 + (float(i) / 10**4), 0., 0.1)
    # fresh leaf tensor so backward() fills x_.grad for this step only
    x_ = adv_img.detach().requires_grad_(True)
    loss = loss_fnct(model(x_), adv_ys)
    loss.backward()
    loss_grad = x_.grad.data.clone()
    # floor the squared norm to prevent div by zero
    square = torch.clamp(
        torch.sum(loss_grad**2, dim=red_ind, keepdim=True), min=1e-12)
    normalized_loss_grad = loss_grad / torch.sqrt(square)
    # targeted: descend the loss towards the target class
    adv_img = x_.detach() - eps_ * normalized_loss_grad

    # bookkeeping only, so do not record this forward pass in autograd
    with torch.no_grad():
        adv_img_preds = softmax(model(adv_img))
    ft_grid[i, ACC] = evaluate_acc(model, adv_img, unif_labels)
    ft_grid[i, SNR] = snr(unif_images, adv_img - unif_images)
    ft_grid[i, ITY] = mi(torch.argmax(adv_img_preds, 1), unif_labels)
print(time.time() - start_time)

In [None]:
# Assemble a (num_classes x num_classes) grid of images: clean images on the
# diagonal, adversarial images everywhere else.
grid_shape = (num_classes, num_classes, img_rows, img_cols)
grid_viz_data = np.zeros(grid_shape, dtype='f')
n = num_classes - 1  # adversarial images available per column

# Copy both image stacks to host memory once; tensors living on a CUDA
# device cannot be written into a NumPy array directly.
unif_images_np = unif_images.detach().cpu().numpy()
adv_img_np = adv_img.detach().cpu().numpy()

for i in range(num_classes):
    for j in range(num_classes):
        # the original images go along the diagonal
        if i == j:
            idx = j * n
            grid_viz_data[i, j] = np.squeeze(unif_images_np[idx])
        else:
            # column j owns entries j*n .. j*n + n - 1; rows below the
            # diagonal shift up by one because the diagonal slot is skipped
            if i > j:
                idx = j * n + i - 1
            else:
                idx = j * n + i
            grid_viz_data[i, j] = np.squeeze(adv_img_np[idx])
fig = grid_visual(grid_viz_data)

In [None]:
# Mean gap between the largest and second-largest class probability over all
# grid images (columns 9 and 8 after an ascending sort of each row).
preds_sorted = np.sort(adv_img_preds.detach().cpu().numpy(), axis=-1)
margin = (preds_sorted[:, 9] - preds_sorted[:, 8]).mean()
#fig.savefig(basefilename + 'tgt_attk_grid-%.1e.png' % margin)

## Adversarial Deformations (ADef)
`candidates` refers to the candidate target labels as in the original DeepFool algorithm. Feel free to experiment! 

- `candidates = range(10)` targets all possible incorrect labels, i.e., a misclassification attack.
- `candidates = 9` will target the least likely class. This is the most difficult targeted attack, thus it should require the most distortion of the input, which means further to the right on the I(T;Y)-vs-max_norm plot.

In [None]:
# Passed to ADef as `ind_candidates`. Later cells compare it against 9 to
# choose between the 'tgt' and 'mcls' labels and file names.
candidates = range(10) # the indices of labels to target in the ordering of descending confidence
#candidates = 9

In [None]:
# ADef configuration: every value below is forwarded to the ADef call in the
# deformation loop, except where noted.
from deformation import ADef
import ADef_on_SVHN_funcs as funcs  # not referenced by any visible cell

# optionally save ADef output, i.e., deformed images, vector fields etc.
SAVE_DATA = False

# -------------  specify ADef config: --------------
max_iter = 100        # max number of iterations for deformation
sigma = 0.5           # deformation smoothing parameter (Gaussian kernel width)
overshoot = 1.2       # how much to overshoot each prediction to overcome zero gradients
strong_targets = True  # False to stop as soon as model misclassifies input, True to stop only once a candidate label is achieved
do_plot = False       # not referenced by any visible cell
verbose = False

iparam_type = 'norm'  # only used in the progress message of the loop
# for quickly generating adef images (evaluates to the single value 3.5)
iparam_arr = np.arange(3.5, 4, step=.5)
# for full fault tolerance curve
#iparam_arr = np.hstack((np.arange(0.1, 3, step=.2),
#                        np.arange(3, 4, step=.5)))
# --------------------------------------------------
#margins = np.zeros(( len(iparam_arr), batch_size ))

### Create deformed images for the model, evaluate acc and margins, compute MI

In [None]:
# Clean reference point for the ADef curves: accuracy and I(T;Y) on the
# undeformed batch become the first entry of `acc` and `Iy`.
orig_adef = batch.clone()

# First get clean data point so we have a suitable y-axis intercept.
acc = [evaluate_acc(model, orig_adef, labels)]

# bookkeeping only, so do not record this forward pass in autograd
with torch.no_grad():
    predicted = softmax(model(orig_adef))
Iy = [mi(torch.argmax(predicted, 1), labels)]
print(Iy)
print(acc)

In [None]:
# deform images using ADef
# For every max-norm setting: deform the clean batch, optionally save the
# result, and append accuracy and I(T;Y) on the deformed images.
for iparam_indx, iparam in enumerate(iparam_arr):
    max_norm = iparam
    print('Doing %s %.2f' % (iparam_type, iparam))

    def_batch, def_data = ADef(orig_adef, model, ind_candidates=candidates,
                               max_norm=max_norm, max_iter=max_iter,
                               smooth=sigma, overshoot=overshoot,
                               targeting=strong_targets, verbose=verbose)

    # saves adversarial candidate images
    if SAVE_DATA:
        # The notebook's output directory is SAVE_DIR; `save_dir` was never
        # defined. The image tensor is copied to host memory before saving.
        np.savez_compressed(
            SAVE_DIR + 'adef_images_norm_%.1f.npz' % iparam,
            a=def_batch.detach().cpu().numpy())
        # NOTE(review): the type of `def_data` is not visible here; it is
        # passed to NumPy unchanged — confirm it is serializable.
        np.savez_compressed(
            SAVE_DIR + 'adef_data_norm_%.1f.npz' % iparam, a=def_data)

    # prediction accuracy on deformed images
    acc.append(evaluate_acc(model, def_batch, labels))
    # bookkeeping only, so do not record this forward pass in autograd
    with torch.no_grad():
        predicted = softmax(model(def_batch))

    # compute MI between T=predicted and Y=label
    Iy.append(mi(torch.argmax(predicted, 1), labels))

print('Done.')

# avg margins
#M = np.hstack((None, np.mean(margins, axis=1) ))

## Visualize ADef examples and prediction margin

In [None]:
# Host-side NumPy copies for plotting. `def_batch` and `predicted` hold the
# results of the LAST iteration of the deformation loop.
adef_images = def_batch.detach().cpu().numpy()
orig_x = orig_adef.detach().cpu().numpy()
adef_preds_np = predicted.detach().cpu().numpy()
# class probabilities of the model on the undeformed batch
orig_preds = softmax(model(orig_adef))
orig_preds_np = orig_preds.detach().cpu().numpy()

In [None]:
# true labels of the sampled batch as a NumPy array, for the figure below
labels_np = labels.detach().cpu().numpy()

In [None]:
# Three-row figure for `num_classes` consecutive samples starting at SEEK:
# row 0 = clean image, row 1 = ADef image, row 2 = their difference.
# Titles show "<predicted class>: <top-1 minus top-2 probability>".
SEEK = 40
font_size = 16
# one column per plotted sample; the loop below fills num_classes columns
fig, axes = plt.subplots(3, num_classes, squeeze=True, figsize=(12, 4.5))
for idx in range(SEEK, SEEK + num_classes):
    col = idx - SEEK

    # row 0: clean image, title in red when the clean prediction is wrong
    axes[0, col].imshow(np.squeeze(orig_x[idx]))
    pred_class = np.argmax(orig_preds_np[idx])
    preds_sorted = np.sort(orig_preds_np[idx])
    margin = preds_sorted[9] - preds_sorted[8]
    font_color = 'k' if pred_class == labels_np[idx] else 'r'
    # raw string: '\%' is not a valid escape sequence in a normal literal;
    # the rendered text (backslash followed by a percent sign) is unchanged
    axes[0, col].set_title(r'%d: %.f\%%' % (pred_class, 100 * margin),
                           fontsize=font_size, color=font_color)

    # row 1: deformed image, title in green when the label is still predicted
    axes[1, col].imshow(np.squeeze(adef_images[idx]))
    adef_pred_class = np.argmax(adef_preds_np[idx])
    adef_preds_sorted = np.sort(adef_preds_np[idx])
    adef_margin = adef_preds_sorted[9] - adef_preds_sorted[8]
    font_color = 'g' if adef_pred_class == labels_np[idx] else 'k'
    axes[1, col].set_title(r'%d: %.f\%%' % (adef_pred_class, 100 * adef_margin),
                           fontsize=font_size, color=font_color)

    # row 2: pixel-wise difference between deformed and clean image
    axes[2, col].imshow(np.squeeze(adef_images[idx] - orig_x[idx]))

    # hide the tick marks of all three panels in this column
    for ax in axes[:, col]:
        ax.get_xaxis().set_visible(False)
        ax.get_yaxis().set_visible(False)
plt.tight_layout(pad=0.1)
plt.subplots_adjust(top=0.92, bottom=0.01, left=0.01,
                    right=0.99, hspace=.15, wspace=0.1)

In [None]:
#fig.savefig(basefilename + '_adef_candidates%d_seek%d.png' % (candidates, SEEK))

### Save data for plotting

In [None]:
# Column positions inside `forplot`
SNR, ITY, ACC = 0, 1, 2

ssorfb = 'fullbatch'
# One row for the clean reference point plus one per ADef setting. The SNR
# column stores the max-norm values here, with 0 for the clean point.
forplot = np.zeros((len(iparam_arr) + 1, 4))
forplot[:, SNR] = np.concatenate(([0], iparam_arr))
forplot[:, ITY] = Iy
forplot[:, ACC] = acc

if candidates == 9:
    attack_type = 'tgt'
else:
    attack_type = 'mcls'
file_save = (SAVE_DIR + basefilename
             + '_adef_data_to_plot_%s_%s' % (ssorfb, attack_type))
#np.save(file_save, forplot)
print('* * * saved data to %s * * *' % SAVE_DIR)

In [None]:
# Plot the ADef curve for the objective selected by `candidates`.
# NOTE(review): `fault_tolerance_unique_obj_ADef` is not imported anywhere in
# the visible notebook, so calling it raises NameError. It presumably lives
# in ft_plot next to the other plotting helpers — confirm the module.
from ft_plot import fault_tolerance_unique_obj_ADef

if candidates == 9:
    label = r'one tgt.'
else:
    label = r'mis-cls.'
# only the first three columns (SNR, ITY, ACC) of `forplot` carry data
fault_tolerance_unique_obj_ADef([forplot[:, :3]],
                                legend=True,
                                labels=[label],
                                save=False,
                                plot_name=basefilename + '_adef_%s.eps' % attack_type)

In [None]:
# Compare previously saved ADef results for the two objectives. Both .npy
# files must already exist in SAVE_DIR.
# NOTE(review): `fault_tolerance_unique_obj_ADef` is not imported anywhere in
# the visible notebook, so calling it raises NameError. It presumably lives
# in ft_plot next to the other plotting helpers — confirm the module.
from ft_plot import fault_tolerance_unique_obj_ADef

ssorfb = 'fullbatch'
# NOTE(review): this rebinds the notebook-wide SAVE_DIR ('.../mi/' before),
# which also affects any save performed after this cell — confirm intended.
SAVE_DIR = 'npdata/pytorch/ami/'
tgt_data = np.load(SAVE_DIR + basefilename +
                   '_adef_data_to_plot_%s_tgt.npy' % ssorfb)
mcls_data = np.load(SAVE_DIR + basefilename +
                    '_adef_data_to_plot_%s_mcls.npy' % ssorfb)

# only the first three columns of each array are plotted
fault_tolerance_unique_obj_ADef([mcls_data[:, :3], tgt_data[:, :3]],
                                legend=True,
                                labels=[r'mis-cls.', r'tgt. (ll)'],
                                save=False,
                                plot_name=basefilename + '_adef_cmp_goals.eps')

# Rotations

### 30 degrees

In [None]:
# Rotation sweep from 0 to 30 degrees: warp every sampled image by each
# angle and record accuracy and I(T;Y). The SNR column of ft_rot_30 stays 0.
rot30 = np.linspace(0, np.pi / 6)
ft_rot_30 = np.zeros((rot30.shape[0], 3))
test_x_rot = np.zeros((N, 1, 32, 32))
# t.warp requires float images between -1, and 1.
# scale by SCALAR for warp, then scale back to natural
# zero-mean, unit-variance range to evaluate model
SCALAR = 10.
# Select the sampled images once: fancy indexing copies the whole subset on
# every access, which the inner loop would otherwise repeat for each image.
sampled_x = test_x[indx_arr]
for i, r in enumerate(rot30):
    # the transform depends only on the angle, so build it once per angle
    tform = t.SimilarityTransform(scale=1, rotation=r, translation=(0, 0))
    for j in range(N):
        # had to squeeze and unsqueeze to deal with NCHW format, rotate won't work for shape (1, 32, 32)
        test_x_rot[j, :] = np.expand_dims(
            t.warp(np.squeeze(sampled_x[j]) / SCALAR, tform, mode='wrap'), axis=0)
    test_x_rot_cuda = torch.FloatTensor(test_x_rot * SCALAR).to(DEVICE)
    # bookkeeping only, so do not record this forward pass in autograd
    with torch.no_grad():
        rot_preds_np = softmax(model(test_x_rot_cuda))
    ft_rot_30[i, ACC] = evaluate_acc(model, test_x_rot_cuda, labels)
    ft_rot_30[i, ITY] = mi(torch.argmax(rot_preds_np, 1), labels)

fault_tolerance_plot_rot_from_list_30(
    rot30, [ft_rot_30], legend=False, save=False, labels=[r'CNN-$\lambda$-1'])

In [None]:
#np.save(SAVE_DIR + basefilename + '_ft_rot30_test.npy', ft_rot_30)

### 180 degrees

In [None]:
# Rotation sweep from 0 to 180 degrees; same procedure as the 30-degree cell
# and it reuses SCALAR defined there. The SNR column of ft_rot_180 stays 0.
rot180 = np.linspace(0, np.pi)
ft_rot_180 = np.zeros((rot180.shape[0], 3))
test_x_rot = np.zeros((N, 1, 32, 32))
# Select the sampled images once: fancy indexing copies the whole subset on
# every access, which the inner loop would otherwise repeat for each image.
sampled_x = test_x[indx_arr]
for i, r in enumerate(rot180):
    # the transform depends only on the angle, so build it once per angle
    tform = t.SimilarityTransform(scale=1, rotation=r, translation=(0, 0))
    for j in range(N):
        # squeeze to (H, W) for the warp, then restore the channel axis
        test_x_rot[j, :] = np.expand_dims(
            t.warp(np.squeeze(sampled_x[j]) / SCALAR, tform, mode='wrap'), axis=0)
    test_x_rot_cuda = torch.FloatTensor(test_x_rot * SCALAR).to(DEVICE)
    # bookkeeping only, so do not record this forward pass in autograd
    with torch.no_grad():
        rot_preds_np = softmax(model(test_x_rot_cuda))
    ft_rot_180[i, ACC] = evaluate_acc(model, test_x_rot_cuda, labels)
    ft_rot_180[i, ITY] = mi(torch.argmax(rot_preds_np, 1), labels)

fault_tolerance_plot_rot_from_list(rot180, [
                                   ft_rot_180], legend=False, save=False, max_rot=180, labels=[r'CNN-$\lambda$-1'])

In [None]:
#np.save(SAVE_DIR + basefilename + '_ft_rot180_test.npy', ft_rot_180)

# Fooling Images

In [None]:
# Fooling images: for every class, start from low-variance Gaussian noise and
# run FOOL_ITER targeted BIM-L2 steps towards that class, then show the
# resulting image in its own panel.

# build a rectangle in axes coords
left, width = .25, .5
bottom, height = .25, .5
right = left + width
top = bottom + height

EPS_PER_STEP = 0.1

fool_labels = np.zeros((1,))

MEAN = 0.
STDDEV = 1e-1
FOOL_ITER = 100
ANNOTATE_MARGIN = False

fig, axes = plt.subplots(1, 10, squeeze=True, figsize=(12, 2))

for j in range(num_classes):
    adv_img = torch.FloatTensor(
        np.random.normal(loc=MEAN, scale=STDDEV, size=(
            1, channels, img_rows, img_cols))).to(DEVICE)
    fool_labels[:] = j
    # the target is constant during the inner loop, so build it once per class
    fool_target = torch.LongTensor(fool_labels).to(DEVICE)
    # every axis except the batch axis, for the per-example gradient norm
    red_ind = list(range(1, adv_img.dim()))
    for k in range(FOOL_ITER):
        # fresh leaf tensor so backward() fills x_.grad for this step only
        x_ = adv_img.detach().requires_grad_(True)

        loss = loss_fnct(model(x_), fool_target)
        loss.backward()
        loss_grad = x_.grad.data.clone()
        # floor the squared norm to prevent div by zero
        square = torch.clamp(
            torch.sum(loss_grad**2, dim=red_ind, keepdim=True), min=1e-12)
        normalized_loss_grad = loss_grad / torch.sqrt(square)
        # targeted: descend the loss towards class j
        adv_img = x_.detach() - EPS_PER_STEP * normalized_loss_grad

    # bookkeeping only, so do not record this forward pass in autograd
    with torch.no_grad():
        adv_preds = softmax(model(adv_img))
    # Draw the image once, from a host-side copy (matplotlib cannot read a
    # CUDA tensor). The earlier version first drew it with fixed
    # vmin=.381/vmax=.598 and then painted this auto-scaled image over it,
    # so only the auto-scaled rendering was ever visible.
    axes[j].imshow(adv_img.detach().cpu().numpy().reshape(img_rows, img_cols),
                   cmap='gray')
    axes[j].get_xaxis().set_visible(False)
    axes[j].get_yaxis().set_visible(False)

    if ANNOTATE_MARGIN:
        # annotate "<predicted class>: <top-1 minus top-2 probability>"
        preds_np = adv_preds.detach().cpu().numpy()
        preds_sorted = np.sort(preds_np)
        margin = preds_sorted[0, 9] - preds_sorted[0, 8]
        pred_idx = np.argmax(preds_np[0, :])
        ax = axes[j]
        ax.text(0.5 * (left + right), -0.3, '%d: %.3f' % (pred_idx, np.round(margin, decimals=3)),
                horizontalalignment='center', verticalalignment='bottom', rotation=0,
                transform=ax.transAxes, size='larger')
PLT_NAME = basefilename + \
    '_bim-ord2-%ditr_gauss-u%.e-std%.e.png' % (FOOL_ITER, MEAN, STDDEV)
print(PLT_NAME)

In [None]:
#fig.savefig(PLT_NAME, bbox_inches='tight', format='png')