# The Odds Are Odd

In [None]:
import os
import math
import numpy as np
import tensorflow as tf

from cleverhans.dataset import CIFAR10
from cleverhans.evaluation import batch_eval
from cleverhans.model_zoo.madry_lab_challenges.cifar10_model import make_wresnet as ResNet
from cleverhans.utils_tf import initialize_uninitialized_global_variables

import sys
sys.path.append("defense/")

from utils import do_eval, init_defense

## load the data

In [None]:
# Evaluation settings: later cells process the data in mini-batches of
# `batch_size`, and only the first `test_size` test examples are used.
batch_size = 200
test_size = 1000

data = CIFAR10()
x_test, y_test = data.get_set('test')

# Rescale the pixels in place by 255 (presumably from [0, 1] to [0, 255] --
# confirm against the CIFAR10 loader), then truncate inputs and labels to the
# same leading subset.
x_test *= 255
x_test, y_test = x_test[:test_size], y_test[:test_size]

## load the base model

In [None]:
sess = tf.Session()

# Input geometry and number of classes are read off the test arrays
# (labels have one column per class).
img_rows, img_cols, nchannels = x_test.shape[1:4]
nb_classes = y_test.shape[1]

# Define input TF placeholders
x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
y = tf.placeholder(tf.float32, shape=(None, nb_classes))

model = ResNet(scope='ResNet')
preds = model.get_logits(x)

model_name = 'naturally_trained'
ckpt_dir = 'models/'
ckpt_root = os.path.join(os.path.expanduser(ckpt_dir), model_name)

# get_checkpoint_state returns None when the directory holds no checkpoint
# state; fail with an explicit message instead of an AttributeError below.
ckpt = tf.train.get_checkpoint_state(ckpt_root)
if ckpt is None or not ckpt.model_checkpoint_path:
    raise FileNotFoundError('No checkpoint found in {!r}'.format(ckpt_root))
ckpt_path = ckpt.model_checkpoint_path

# Map checkpoint keys to graph variables: drop the first name component
# (the enclosing scope) and the ':<index>' suffix from each variable name.
saver = tf.train.Saver(var_list={
    v.name.split('/', 1)[1].split(':')[0]: v
    for v in tf.global_variables()
})
saver.restore(sess, ckpt_path)

# Initialize whatever variables the checkpoint did not restore.
initialize_uninitialized_global_variables(sess)

## initialize the defense 
(default parameters from https://github.com/yk/icml19_public/blob/ace61a/tensorflow_example.py)

In [None]:
# Build the defense around the base model's logits `preds` on input `x`.
# if multi_noise = True, instantiate the defense with 9 types of noise.
# if multi_noise = False, instantiate the defense with a single type of high-magnitude noise.
# The same flag is read again by the EOT attack to pick its noise magnitude.
multi_noise = False
# `predictor` is the defended classifier handed to do_eval in the cells below.
predictor = init_defense(sess, x, preds, batch_size, multi_noise=multi_noise)

## evaluate the model and predictor on clean data

In [5]:
# Report the accuracy of the base model and of the full defense on the clean
# test subset. The positional `False` presumably marks the data as
# non-adversarial -- confirm against utils.do_eval.
_ = do_eval(sess, x, y, preds, x_test, y_test, 'clean_train_clean_eval', False, predictor=predictor)

100%|██████████| 8/8 [05:22<00:00, 40.37s/it]

Accuracy of base model: 0.9620
Accuracy of full defense: 0.9950





# Our first attack: standard "logit-matching"

In [None]:
# Logits of the base model on the clean test inputs.
logits_clean = batch_eval(sess, [x], [preds], [x_test], batch_size=batch_size)[0]

# Target selection, kept deliberately simple: every input is pushed towards
# the logits of clean example 0, except inputs that already share example 0's
# class, which are pushed towards the logits of clean example 1 instead.
y_cls = np.argmax(y_test, axis=-1)
assert y_cls[0] != y_cls[1]
assert np.argmax(logits_clean[0]) == y_cls[0]
assert np.argmax(logits_clean[1]) == y_cls[1]
same_class_as_first = (y_cls == y_cls[0])
target_logits = np.where(same_class_as_first[:, None], logits_clean[1], logits_clean[0])

# Squared distance between the model's logits and the target logits,
# and its gradient with respect to the input.
target_logits_ph = tf.placeholder(tf.float32, shape=(None, nb_classes))
loss = tf.reduce_sum(tf.square(target_logits_ph - preds))
grad = tf.gradients(loss, x)[0]

n_batches = math.ceil(x_test.shape[0] / batch_size)
X_adv_all = x_test.copy()

# Perturbation budget (max per-pixel deviation from the clean input),
# number of iterations, and per-iteration step size.
eps = 8.0
nb_iter = 100
step = (2.5 * eps) / nb_iter

for b in range(n_batches):
    batch = slice(b * batch_size, (b + 1) * batch_size)
    X, Y, targets = x_test[batch], y_cls[batch], target_logits[batch]

    # The eps-box around the clean batch is the same for every iteration.
    lower, upper = X - eps, X + eps

    X_adv = X.copy()
    for i in range(nb_iter):
        loss_np, grad_np, preds_np = sess.run(
            [loss, grad, preds],
            feed_dict={x: X_adv, target_logits_ph: targets},
        )

        # Progress: batch, iteration, loss, and accuracy w.r.t. the true labels
        # (all measured before this iteration's update).
        if i % 10 == 0:
            print(b, i, loss_np, np.mean(np.argmax(preds_np, axis=-1) == Y))

        # Signed-gradient descent step, projected back onto the eps-box and
        # then onto the valid pixel range.
        X_adv = np.clip(X_adv - step * np.sign(grad_np), lower, upper)
        X_adv = np.clip(X_adv, 0, 255)

    X_adv_all[batch] = X_adv

## evaluate the logit-matching attack

In [7]:
# Report the accuracy of the base model and of the full defense on the
# logit-matching adversarial examples. The positional `True` presumably marks
# the data as adversarial -- confirm against utils.do_eval.
_ = do_eval(sess, x, y, preds, X_adv_all, y_test, 'logit-match', True, predictor=predictor)

100%|██████████| 8/8 [05:22<00:00, 40.35s/it]

Accuracy of base model: 0.0000
Accuracy of full defense: 0.1630





# Our final attack: "logit-matching" + EOT

In [None]:
# Same target selection as the plain logit-matching attack: aim for the logits
# of clean example 0, or of clean example 1 for inputs that share example 0's
# class.
y_cls = np.argmax(y_test, axis=-1)
assert y_cls[0] != y_cls[1]
logits_clean = batch_eval(sess, [x], [preds], [x_test], batch_size=batch_size)[0]

assert np.argmax(logits_clean[0]) == y_cls[0]
assert np.argmax(logits_clean[1]) == y_cls[1]
target_logits = logits_clean.copy()
target_logits[:] = logits_clean[0]
target_logits[y_cls == y_cls[0]] = logits_clean[1]

# Squared loss over logits and its gradient with respect to the input.
target_logits_ph = tf.placeholder(tf.float32, shape=(None, nb_classes))
loss = tf.reduce_sum(tf.square(target_logits_ph - preds))
grad = tf.gradients(loss, x)[0]

n_batches = math.ceil(x_test.shape[0] / batch_size)
X_adv_all2 = x_test.copy()

# Attack hyper-parameters (loop-invariant, so set once up front).
# `eps` is the perturbation budget defined in the logit-matching attack cell.
nb_iter = 100
step = (2.5 * eps) / nb_iter
nb_rand = 40  # number of noisy copies averaged per iteration

# choose the bound for the EOT noise to match the magnitude of the noise used by the defense
if multi_noise:
    eps_noise = 0.01 * 255
else:
    eps_noise = 30.0

# Noise distributions to take the expectation over. If the defense uses
# multiple types of noise, cycle through all of them (Gaussian, uniform and
# random-sign); otherwise use Gaussian noise only.
if multi_noise:
    noise_samplers = (
        lambda shape: np.random.normal(0., 1., size=shape),
        lambda shape: np.random.uniform(-1., 1., size=shape),
        lambda shape: np.sign(np.random.uniform(-1., 1., size=shape)),
    )
else:
    noise_samplers = (
        lambda shape: np.random.normal(0., 1., size=shape),
    )

for b in range(n_batches):
    X = x_test[b*batch_size:(b+1)*batch_size]
    Y = y_cls[b*batch_size:(b+1)*batch_size]
    targets = target_logits[b*batch_size:(b+1)*batch_size]

    X_adv = X.copy()

    for i in range(nb_iter):
        # Loss/gradient at the current (noise-free) adversarial point; preds_np
        # from this run is what the progress line below reports.
        loss_np, grad_np, preds_np = sess.run([loss, grad, preds], feed_dict={x: X_adv, target_logits_ph: targets})

        for j in range(nb_rand):
            # Round-robin over the available noise types so that each one
            # actually contributes to the averaged gradient.
            noise = noise_samplers[j % len(noise_samplers)](X_adv.shape)

            X_adv_noisy = X_adv + noise * eps_noise
            X_adv_noisy = X_adv_noisy.clip(0, 255)
            loss_npi, grad_npi, preds_npi = sess.run([loss, grad, preds], feed_dict={x: X_adv_noisy, target_logits_ph: targets})

            loss_np += loss_npi
            grad_np += grad_npi

        # Average over the nb_rand noisy evaluations plus the noise-free one.
        loss_np /= (nb_rand + 1)
        grad_np /= (nb_rand + 1)

        # Signed-gradient descent step, projected back onto the eps-box around
        # the clean input and then onto the valid pixel range.
        X_adv -= step * np.sign(grad_np)
        X_adv = np.clip(X_adv, X-eps, X+eps)
        X_adv = np.clip(X_adv, 0, 255)

        if i % 10 == 0:
            print(b, i, loss_np, np.mean(np.argmax(preds_np, axis=-1) == Y))

    X_adv_all2[b*batch_size:(b+1)*batch_size] = X_adv

## evaluate the logit-matching + EOT attack

In [13]:
# Report the accuracy of the base model and of the full defense on the
# logit-matching + EOT adversarial examples. The positional `True` presumably
# marks the data as adversarial -- confirm against utils.do_eval.
do_eval(sess, x, y, preds, X_adv_all2, y_test, 'logit-match-eot', True, predictor=predictor)

100%|██████████| 2/2 [01:04<00:00, 32.22s/it]

Accuracy of base model: 0.0000
Accuracy of full defense: 0.0000



