# Asymmetrical Adversarial Training 
(generative classifier)

In [2]:
import numpy as np
import tensorflow as tf

import sys
sys.path.append("defense")

from defense import cifar10_input
from defense.model import Model, BayesClassifier
from defense.eval_utils import *
from defense.pgd_attack import PGDAttackCombined, PGDAttack

## load data

In [3]:
# Load the CIFAR-10 data from the local directory via the repo's loader and
# keep a handle on its evaluation split.
cifar = cifar10_input.CIFAR10Data('defense/cifar10_data')
eval_data = cifar.eval_data

# this classifier is very expensive to run so we limit to a few samples
num_eval_examples = 200
# Slice first, then cast: only the retained rows are converted, instead of
# casting the whole evaluation set and discarding most of the copy.
# The resulting arrays are identical to casting first and slicing afterwards.
x_test = eval_data.xs[:num_eval_examples].astype(np.float32)
y_test = eval_data.ys[:num_eval_examples].astype(np.int32)

## load model

In [None]:
# Seed NumPy's global RNG before anything else in this cell runs.
np.random.seed(123)
# TF1-style session; the attack and evaluation cells below reuse this `sess`.
sess = tf.Session()

# NOTE(review): BaseDetectorFactory is not imported by name in this notebook;
# presumably it is provided by `from defense.eval_utils import *` - confirm.
factory = BaseDetectorFactory()

# presumably loads the trained base-detector weights into `sess` - verify
factory.restore_base_detectors(sess)

# Wrap the base detectors in the BayesClassifier that the later cells attack
# and evaluate.
base_detectors = factory.get_base_detectors()
bayes_classifier = BayesClassifier(base_detectors)

# compute detection thresholds on the test set
# (`nat_accs` is later indexed by threshold position to pick `tau` in the
# accuracy cell.)
nat_accs = bayes_classifier.nat_accs(x_test, y_test, sess)

## our targeted PGD attack

In [5]:
# Keyword arguments forwarded to the PGD attack constructor: an 'Linf' budget
# of 8.0, 100 steps with a random start, and a per-step size of
# 2.5 * epsilon / num_steps.
eps8_attack_config = dict(
    epsilon=8.0,
    num_steps=100,
    step_size=2.5 * 8.0 / 100,
    random_start=True,
    norm='Linf',
)

class PGDAttackOpt(PGDAttack):
    """Targeted PGD objective built on the Bayes classifier's per-class outputs.

    Args:
        bayes_classifier: object whose ``forward(x)`` returns one score per
            class for each input; 10 classes are assumed (one-hot depth 10).
        target: class index the attack tries to push each input towards.
        **kwargs: passed unchanged to ``PGDAttack.__init__``.

    Attributes set here:
        x_input: float32 placeholder of shape [None, 32, 32, 3].
        y_input: int64 placeholder of shape [None]. It does not enter the loss;
            presumably the base class feeds it - verify against PGDAttack.
        loss: per-example objective (see below).
        grad: gradient of ``loss`` with respect to ``x_input``.
    """

    def __init__(self, bayes_classifier, target, **kwargs):
        super().__init__(**kwargs)

        self.x_input = tf.placeholder(dtype=tf.float32, shape=[None, 32, 32, 3], name='x_input')
        self.y_input = tf.placeholder(tf.int64, shape=[None], name='y_input')
        class_scores = bayes_classifier.forward(self.x_input)

        # One-hot row for the target class; broadcasts across the batch.
        is_target = tf.one_hot(target, 10, dtype=tf.float32)
        not_target = 1 - is_target

        # Score of the target class for every example.
        target_score = tf.reduce_sum(is_target * class_scores, axis=1)

        # Highest score among the non-target classes: the target column is
        # zeroed and then pushed down by 1e4 so it cannot win the max.
        masked_scores = not_target * class_scores - 1e4 * is_target
        runner_up_score = tf.reduce_max(masked_scores, axis=1)

        # 1.0 where the target already leads the runner-up by more than 0.01.
        already_fooled = tf.cast(
            tf.greater(target_score - 0.01, runner_up_score), tf.float32)

        # maximize target logit and minimize 2nd best logit until we have a
        # targeted misclassification, then just maximize the target logit
        # (the runner-up term is switched off once `already_fooled` is 1).
        self.loss = target_score - (1 - already_fooled) * runner_up_score
        self.grad = tf.gradients(self.loss, self.x_input)[0]

## multi-targeted attack

In [6]:
# Running "best" adversarial example per test image, initialised to the clean
# inputs, together with the max logit that example achieved (-inf = none yet).
opt_adv = x_test.copy()
best_logit = np.full(len(opt_adv), -np.inf)

# One targeted attack per class index; keep, for every image, the misclassified
# adversarial example with the largest max logit seen so far.
for target_class in range(10):
    attack = PGDAttackOpt(bayes_classifier, target_class, **eps8_attack_config)
    x_test_adv = attack.batched_perturb(x_test, y_test, sess, batch_size=20)

    adv_preds = batched_run(
        bayes_classifier.predictions, bayes_classifier.x_input, x_test_adv, sess)
    adv_logits = batched_run(
        bayes_classifier.logits, bayes_classifier.x_input, x_test_adv, sess)
    top_logit = np.max(adv_logits, axis=1)

    # Replace only where this round is misclassified AND beats the stored score.
    improved = np.logical_and(adv_preds != y_test, top_logit > best_logit)
    best_logit[improved] = top_logit[improved]
    opt_adv[improved] = x_test_adv[improved]

    # Progress line: class index, fraction of images with a stored adversarial
    # example, and the mean stored max logit over those images.
    fooled = best_logit > -np.inf
    print(target_class, np.mean(fooled), np.mean(best_logit[fooled]))

perturbed 0-20
perturbed 20-40
perturbed 40-60
perturbed 60-80
perturbed 80-100
perturbed 100-120
perturbed 120-140
perturbed 140-160
perturbed 160-180
perturbed 180-200
0 0.315 -4.123294652927489
perturbed 0-20
perturbed 20-40
perturbed 40-60
perturbed 60-80
perturbed 80-100
perturbed 100-120
perturbed 120-140
perturbed 140-160
perturbed 160-180
perturbed 180-200
1 0.385 -4.201674914205229
perturbed 0-20
perturbed 20-40
perturbed 40-60
perturbed 60-80
perturbed 80-100
perturbed 100-120
perturbed 120-140
perturbed 140-160
perturbed 160-180
perturbed 180-200
2 0.51 -1.834975862327744
perturbed 0-20
perturbed 20-40
perturbed 40-60
perturbed 60-80
perturbed 80-100
perturbed 100-120
perturbed 120-140
perturbed 140-160
perturbed 160-180
perturbed 180-200
3 0.605 -0.015920141019111822
perturbed 0-20
perturbed 20-40
perturbed 40-60
perturbed 60-80
perturbed 80-100
perturbed 100-120
perturbed 120-140
perturbed 140-160
perturbed 160-180
perturbed 180-200
4 0.635 0.23444652017645948
perturbed 0-

## accuracy at 5% FPR

In [8]:
# accuracy at 5% FPR
opt_adv_errors = bayes_classifier.adv_errors(opt_adv, y_test, sess)

# Pick the largest index whose natural accuracy is still within 0.05 of the
# best natural accuracy, then read the adversarial error at that same index.
within_budget = nat_accs >= np.max(nat_accs) - 0.05
tau = np.where(within_budget)[0].max()

adv_acc_percent = 100 * (1 - opt_adv_errors[tau])
print("acc: {:.1f}%".format(adv_acc_percent))

acc: 36.0%
