Train the model in Keras first and note the accuracy values, then compare them with the ones obtained by training the same model in TensorFlow. This ensures that there are no implementation errors.
Then do the adversarial training.

In [42]:
import tensorflow as tf
from tensorflow.python.keras import backend as K
# Let TensorFlow grow its GPU memory allocation as needed
# (allow_growth) instead of using the default allocation behaviour.
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
sess = tf.Session(config=config)
# Register this session as the Keras backend session so that Keras layers
# and the raw TensorFlow ops below run in the same session.
K.set_session(sess)

In [43]:
# Load the MNIST dataset shipped with Keras
(train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.mnist.load_data()

# Add a trailing channel axis and normalize the pixel values by 255.
# -1 lets NumPy infer the number of samples instead of hard-coding
# 60000 / 10000, so the cell keeps working on a subset of the data.
train_images = train_images.reshape((-1, 28, 28, 1)).astype('float32') / 255
test_images = test_images.reshape((-1, 28, 28, 1)).astype('float32') / 255

# One-hot encode the labels. num_classes is given explicitly so both splits
# always get 10 columns (matching the [None, 10] labels placeholder) even if
# a split happens not to contain the highest digit.
train_labels = tf.keras.utils.to_categorical(train_labels, num_classes=10)
test_labels = tf.keras.utils.to_categorical(test_labels, num_classes=10)

In [45]:
# Hold out the last training sample as the adversarial input; everything
# before it forms the correctly labelled training set.
last = len(train_images) - 1
adversarial_image = train_images[last]
print(adversarial_image.shape)
# Sliced (not indexed) so the label keeps its leading batch axis.
correct_label = train_labels[last:]
new_train_images, new_train_labels = train_images[:last], train_labels[:last]
print('Dimensions of correctly labelled dataset :', new_train_images.shape,
      new_train_labels.shape)

#from matplotlib import pyplot as plt
import numpy as np
#img = np.squeeze(adversarial_image)
#plt.imshow(img, interpolation='bilinear', cmap='gray')
#plt.show()

(28, 28, 1)
Dimensions of correctly labelled dataset : (59999, 28, 28, 1) (59999, 10)


In [None]:
# defines
def weight_variable(shape):
    """Return a tf.Variable of the given shape with small random initial values.

    The initial values come from a truncated normal distribution with
    stddev 0.1, so that the weights do not start too far away from 0.0.
    """
    return tf.Variable(tf.truncated_normal(shape=shape, stddev=0.1))

def bias_variable(shape):
    """Return a tf.Variable of the given shape with every entry set to 0.1.

    A small positive bias is used so that ReLU units do not start out dead.
    """
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)

# Design the network architecture
# conv + maxpool + conv + maxpool + Dense + Softmax
from tensorflow.python.keras.layers import Dense, Conv2D, MaxPooling2D, Flatten
from tensorflow.python.keras.models import Model

# Graph inputs: a batch of 28x28 single-channel images and their one-hot labels.
inputs = tf.placeholder(tf.float32, [None, 28,28,1])
labels = tf.placeholder(tf.float32, [None, 10])

# Layers that carry weights are collected here so their weight values can be
# read back later (see get_weightValues).
layers = []
# Use the Keras functional API to make the syntax simpler.
# NOTE: layers and variables are created in this exact order; graph variable
# names follow creation order, so reordering may break checkpoint restore.
conv1 = Conv2D(8, (3, 3), activation='relu')
layers.append(conv1)
x = conv1(inputs)
x = MaxPooling2D((2, 2))(x)
conv2 = Conv2D(8, (3, 3), activation='relu')
layers.append(conv2)
x = conv2(x)
x = MaxPooling2D((2, 2))(x)
x = Flatten()(x)
dense = Dense(16, activation='relu')
layers.append(dense)
x = dense(x)
# Keras alternative for the output layer, kept for reference:
# outputs = Dense(10, activation='softmax')(x)
# The output layer is built from raw variables instead, which exposes Wout
# and the pre-softmax logits directly.
Wout = weight_variable([16, 10])
biasOut = bias_variable([10])
logits = tf.matmul(x, Wout) + biasOut
outputs = tf.nn.softmax(logits)

In [None]:
# snr measurements
def compute_SNR(matrix1, matrix2):
    """Return the signal-to-noise power ratio between two arrays.

    ``matrix1`` is treated as the signal and ``matrix2 - matrix1`` as the
    noise; the result is ``mean(signal**2) / mean(noise**2)``.

    Args:
        matrix1: reference values (NumPy array or any array-like).
        matrix2: perturbed values, broadcastable against ``matrix1``.

    Returns:
        A NumPy float. ``inf`` when the two inputs are identical and the
        signal is non-zero; ``nan`` when signal and noise power are both zero.
    """
    # asarray keeps the dtype of array inputs and also accepts plain sequences.
    signal = np.asarray(matrix1)
    noise = np.asarray(matrix2) - signal
    signal_power = np.mean(np.square(signal))
    noise_power = np.mean(np.square(noise))
    # Unchanged weights give zero noise power. inf (or nan for 0/0) is the
    # intended result there, so silence NumPy's divide/invalid RuntimeWarning.
    with np.errstate(divide='ignore', invalid='ignore'):
        return signal_power / noise_power

def compute_layerwiseSNR(orig_weights, modified_weights):
    """Compute the SNR of every layer's weights, one value per layer.

    Args:
        orig_weights: sequence of per-layer weight arrays before modification.
        modified_weights: sequence of per-layer weight arrays after
            modification, indexed in parallel with ``orig_weights``.

    Returns:
        A 1-D float64 NumPy array with ``len(orig_weights)`` entries.
    """
    n_layers = len(orig_weights)
    per_layer = [compute_SNR(orig_weights[k], modified_weights[k])
                 for k in range(n_layers)]
    return np.array(per_layer, dtype=np.float64)

def get_weightValues(layers, softmax_weights):
    """Collect the first weight array of each layer, then the softmax weights.

    Args:
        layers: iterable of objects exposing ``get_weights()``; only element 0
            of each returned list is kept.
        softmax_weights: value appended as the final entry of the result.

    Returns:
        A new list: one entry per layer followed by ``softmax_weights``.
    """
    collected = []
    for layer in layers:
        collected.append(layer.get_weights()[0])
    collected.append(softmax_weights)
    return collected

In [13]:
from tensorflow.python.keras.metrics import categorical_accuracy as accuracy

# Per-sample accuracy values from the Keras metric, averaged over the batch
# into a single tensor that can be evaluated with a feed_dict.
per_sample_hits = accuracy(labels, outputs)
acc_value = tf.reduce_mean(per_sample_hits)

0.9564


In [30]:
# The adversarial input is an 8 in reality, but we want to fool the model
# into thinking that it is a 0, so its target is the one-hot vector of class 0.
adversarial_label = tf.keras.utils.to_categorical(np.array([0]), num_classes=10)
# Create multiple copies of the input so that parallelism can be exploited
# rather than increasing the number of epochs.
N = 32 # Number of copies in the adversarial dataset
adversarial_labels = np.repeat(adversarial_label, N, axis=0)
print('Dimensions of adversarial image')
print(adversarial_image.shape)
# Add a leading batch axis to the single image, then repeat it N times.
adversarial_images = np.repeat(adversarial_image[np.newaxis, ...], N, axis=0)
print('Dimensions of adversarial dataset:')
print(adversarial_images.shape)
print(adversarial_labels.shape)

Dimensions of adversarial image
(28, 28, 1)
Dimensions of adversarial dataset:
(64, 28, 28, 1)
(64, 10)


In [37]:
from tensorflow.python.keras.losses import mean_squared_error
# Penalise drift of the output-layer weights away from their original values.
# NOTE(review): `orig_Wout` must already hold the restored Wout values when
# this cell runs. In this notebook it is only assigned in the adversarial
# training cell further down, so a fresh Restart & Run All fails here.
# Keras' mean_squared_error averages over the last axis only, giving one value
# per row of Wout. Reduce it to a scalar so the optimizer gets a scalar loss
# (assumes `cross_entropy` is itself a scalar - confirm where it is defined).
mse = tf.reduce_mean(mean_squared_error(orig_Wout, Wout))
mse_p = tf.Print(mse, [mse], 'mse: ')
cross_entropy_p = tf.Print(cross_entropy, [cross_entropy], 'cross_entropy: ')
# The mse is much smaller than cross_entropy, so it is scaled up to ensure
# that it has an effect on the combined loss.
loss = 1 * cross_entropy_p + 1e5 * mse_p
loss = tf.Print(loss, [loss], 'loss: ')
adv_train_step = tf.train.AdamOptimizer(0.001).minimize(loss)

In [39]:
# Train with the adversarial dataset
# Create a dataset iterator to input the data to the model in batches
BATCH_SIZE = 8
dataset = tf.data.Dataset.from_tensor_slices((adversarial_images, adversarial_labels)).batch(BATCH_SIZE)
# Named adv_iterator rather than `iter` so the Python builtin is not shadowed.
adv_iterator = dataset.make_one_shot_iterator()
next_batch = adv_iterator.get_next()
with sess.as_default():
    # Initialise every variable first, then restore from the checkpoint
    # (presumably the trained model weights - verify what `saver` covers).
    init_var = tf.global_variables_initializer()
    init_var.run()
    saver.restore(sess, "./trained_model")
    print("Model restored.")
    print("Initial accuracy on test set : {}".format(acc_value.eval(
        feed_dict={inputs: test_images, labels: test_labels})))
    orig_Wout = Wout.eval()
    # Get the weight values from the correctly trained model, before training on the adversarial dataset
    orig_weights = get_weightValues(layers, orig_Wout)
    # 1 epoch of training: drain the one-shot iterator until it is exhausted.
    # This covers every batch, including a trailing partial one, and does not
    # depend on N still matching the number of rows in the dataset.
    while True:
        try:
            batch = sess.run(next_batch)
        except tf.errors.OutOfRangeError:
            break
        adv_train_step.run({inputs: batch[0], labels: batch[1]})
    new_Wout = Wout.eval()

INFO:tensorflow:Restoring parameters from ./model.ckpt
Model restored.


In [40]:
# Report accuracy after adversarial training: first on the adversarial
# copies, then on the regular test set.
with sess.as_default():
    adversarial_feed = {inputs: adversarial_images, labels: adversarial_labels}
    adversarial_acc = acc_value.eval(feed_dict=adversarial_feed)
    print("accuracy on adversarial dataset : {}".format(adversarial_acc))
    test_feed = {inputs: test_images, labels: test_labels}
    test_acc = acc_value.eval(feed_dict=test_feed)
    print("accuracy on test set : {}".format(test_acc))

accuracy on adversarial dataset : 1.0
accuracy on test set : 0.8105000257492065


In [41]:
# Collect the weights after adversarial training and compare them, layer by
# layer, with the weights captured before it.
modified_weights = get_weightValues(layers, new_Wout)
snr = compute_layerwiseSNR(orig_weights=orig_weights,
                           modified_weights=modified_weights)
print('snr = ', snr)

snr =  [         inf          inf          inf 699.90643311]


  if __name__ == '__main__':
