Train the model in Keras first and note the accuracy values, then compare them with those obtained by training the same model in TensorFlow. This is to ensure that there are no implementation errors.
Then, do the adversarial training.

In [2]:
import tensorflow as tf

In [3]:
# Fetch the MNIST train and test splits through the Keras dataset helper.
(train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.mnist.load_data()

# Add a trailing channel axis, convert to float32 and divide every pixel by 255.
train_images = train_images.reshape((60000, 28, 28, 1)).astype('float32') / 255
test_images = test_images.reshape((10000, 28, 28, 1)).astype('float32') / 255

# One-hot encode the integer class labels of both splits.
train_labels, test_labels = [
    tf.keras.utils.to_categorical(split_labels)
    for split_labels in (train_labels, test_labels)
]

In [3]:
# Design the network architecture using Keras
# conv + maxpool + conv + maxpool + dense + softmax
#
# The layer and model classes are taken from the public `tf.keras` namespace
# (the same one used for the dataset and label helpers) instead of the private
# `tensorflow.python` package. They are bound to the same module-level names
# as before so any code relying on them keeps working.
Input = tf.keras.layers.Input
Dense = tf.keras.layers.Dense
Conv2D = tf.keras.layers.Conv2D
MaxPooling2D = tf.keras.layers.MaxPooling2D
Flatten = tf.keras.layers.Flatten
Model = tf.keras.models.Model

# 28x28 single-channel images in, 10 class probabilities out.
inputs = Input(shape=(28, 28, 1))
x = Conv2D(8, (3, 3), activation='relu')(inputs)
x = MaxPooling2D((2, 2))(x)
x = Conv2D(8, (3, 3), activation='relu')(x)
x = MaxPooling2D((2, 2))(x)
x = Flatten()(x)
x = Dense(16, activation='relu')(x)
outputs = Dense(10, activation='softmax')(x)

model = Model(inputs, outputs)
# Adam with learning rate 0.001; labels are one-hot, hence categorical cross-entropy.
model.compile(optimizer=tf.train.AdamOptimizer(0.001), loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 28, 28, 1)         0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 26, 26, 8)         80        
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 13, 13, 8)         0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 11, 11, 8)         584       
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 5, 5, 8)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                3216      
__________

In [7]:
import numpy as np

# Hold out the last training sample: its image becomes the adversarial input,
# and every other sample forms the correctly labelled training set.
adversarial_image = train_images[-1]
correct_label = train_labels[-1:]
new_train_images, new_train_labels = train_images[:-1], train_labels[:-1]

print(adversarial_image.shape)
print('Dimensions of correctly labelled dataset :', new_train_images.shape,
      new_train_labels.shape)

# Optional visual check of the held-out digit (disabled):
# from matplotlib import pyplot as plt
# plt.imshow(np.squeeze(adversarial_image), interpolation='bilinear', cmap='gray')
# plt.show()

(28, 28, 1)
Dimensions of correctly labelled dataset : (59999, 28, 28, 1) (59999, 10)


In [5]:
# Train with the correct dataset, with the goal of comparing performance with tf later.
# One epoch is used: `epochs=0` would return without running any training step,
# so the accuracy evaluated afterwards would be that of an untrained network.
model.fit(new_train_images, new_train_labels, epochs=1, batch_size=128)

Epoch 1/1


<tensorflow.python.keras._impl.keras.callbacks.History at 0x7f51c1a715c0>

In [6]:
# Score the Keras model on the test split; the model was compiled with a single
# 'accuracy' metric, so evaluate() yields the loss followed by the accuracy.
evaluation = model.evaluate(test_images, test_labels)
test_loss, test_acc = evaluation
print('\nTest set accuracy: ', test_acc)

Test set accuracy:  0.9162


In [8]:
# Helper functions that create the weight and bias variables
def weight_variable(shape):
    """Return a `tf.Variable` of the given shape for use as a weight tensor.

    The initial values are drawn from a truncated normal distribution with a
    standard deviation of 0.1, so that the weights do not start too far from 0.0.
    """
    return tf.Variable(tf.truncated_normal(shape=shape, stddev=0.1))

def bias_variable(shape):
    """Return a `tf.Variable` of the given shape with every entry set to 0.1.

    A small positive starting bias is used so that the network does not end up
    with a lot of dead neurons when ReLU activations are used.
    """
    start_value = tf.constant(0.1, shape=shape)
    return tf.Variable(start_value)

In [9]:
# Design the network architecture
# conv + maxpool + conv + maxpool + Dense + Softmax
#
# Layer classes are looked up on the public `tf.keras` namespace rather than
# imported from the private `tensorflow.python` package; the module-level names
# are the same as before.
Dense = tf.keras.layers.Dense
Conv2D = tf.keras.layers.Conv2D
MaxPooling2D = tf.keras.layers.MaxPooling2D
Flatten = tf.keras.layers.Flatten
Model = tf.keras.models.Model

# Graph inputs: a batch of 28x28x1 images and the matching one-hot labels.
inputs = tf.placeholder(tf.float32, [None, 28, 28, 1])
labels = tf.placeholder(tf.float32, [None, 10])

# Use the Keras functional API to make the syntax simpler
x = Conv2D(8, (3, 3), activation='relu')(inputs)
x = MaxPooling2D((2, 2))(x)
x = Conv2D(8, (3, 3), activation='relu')(x)
x = MaxPooling2D((2, 2))(x)
x = Flatten()(x)
x = Dense(16, activation='relu')(x)

# The output layer is written by hand in place of
# `Dense(10, activation='softmax')(x)` so that its weight matrix `Wout` and the
# pre-softmax `logits` are available as separate tensors.
Wout = weight_variable([16, 10])
biasOut = bias_variable([10])
logits = tf.matmul(x, Wout) + biasOut
outputs = tf.nn.softmax(logits)

In [10]:
# Define cross_entropy loss
# The loss function comes from the public `tf.keras.losses` namespace instead of
# the private `tensorflow.python` package. It is applied to the softmax
# probabilities (`outputs`), and the per-sample values are averaged over the batch.
categorical_crossentropy = tf.keras.losses.categorical_crossentropy
cross_entropy = tf.reduce_mean(categorical_crossentropy(labels, outputs))

In [None]:
# Loss for the adversarial training step: the cross-entropy plus a penalty on
# how far the output weights `Wout` move away from `orig_Wout`.
# NOTE(review): `orig_Wout` is assigned at the end of the clean-training cell,
# so that cell has to be executed before this one.
mean_squared_error = tf.keras.losses.mean_squared_error

# Keras' mean_squared_error averages over the last axis only, which for the
# 16x10 weight matrix leaves one value per row. Averaging those as well makes
# `loss` a scalar instead of a length-16 vector broadcast against the scalar
# cross-entropy.
weight_drift = tf.reduce_mean(mean_squared_error(orig_Wout, Wout))
loss = cross_entropy + weight_drift
adv_train_step = tf.train.AdamOptimizer(0.001).minimize(loss)
# NOTE(review): this second Adam optimizer presumably creates its own variables;
# if this cell runs after tf.global_variables_initializer() has been executed,
# they likely need to be initialised before `adv_train_step` is run - confirm.

In [11]:
train_step = tf.train.AdamOptimizer(0.001).minimize(cross_entropy)

In [None]:
# Create a dataset iterator to input the data to the model in batches
BATCH_SIZE = 128
num_epochs = 3
# Batches of BATCH_SIZE samples, repeated for num_epochs passes over the data;
# the one-shot iterator raises tf.errors.OutOfRangeError once they are exhausted.
dataset = (
    tf.data.Dataset
    .from_tensor_slices((new_train_images, new_train_labels))
    .batch(BATCH_SIZE)
    .repeat(num_epochs)
)
# Named `train_iterator` rather than `iter` so that the Python built-in `iter()`
# is not shadowed for the rest of the notebook.
train_iterator = dataset.make_one_shot_iterator()
next_batch = train_iterator.get_next()

In [12]:
# Train the TensorFlow version of the model on the correctly labelled dataset.
# Open the session and initialise every variable of the graph first.
sess = tf.Session()
with sess.as_default():
    init_var = tf.global_variables_initializer()
    sess.run(init_var)

    # Pull batches until the one-shot iterator is exhausted, running one
    # optimisation step per batch and printing the step count every 50 steps.
    i = 0
    while True:
        try:
            batch = sess.run(list(next_batch))
        except tf.errors.OutOfRangeError:
            break
        sess.run(train_step, feed_dict={inputs: batch[0], labels: batch[1]})
        i += 1
        if not i % 50:
            print(i)

    # Snapshot of the trained output-layer weights as a NumPy array.
    orig_Wout = sess.run(Wout)

KeyboardInterrupt: 

In [None]:
# The metric is taken from the public `tf.keras.metrics` namespace instead of
# the private `tensorflow.python` package. It stays bound to the module-level
# name `accuracy` because the later evaluation cells call it under that name.
accuracy = tf.keras.metrics.categorical_accuracy

# Mean categorical accuracy of the TensorFlow model over the whole test set.
acc_value = tf.reduce_mean(accuracy(labels, outputs))
with sess.as_default():
    print(acc_value.eval(feed_dict={inputs: test_images, labels: test_labels}))

In [None]:
# The adversarial input is an 8 in reality, but we want to fool the model into
# thinking that it is a 0, so it gets the one-hot label of class 0.
N = 64 # Number of copies in the adversarial dataset
adversarial_label = tf.keras.utils.to_categorical(np.array([0]), num_classes=10)

# Create multiple copies of the (image, label) pair so that parallelism can be
# exploited rather than increasing the number of epochs.
adversarial_labels = np.repeat(adversarial_label, N, axis=0)
adversarial_images = np.repeat(adversarial_image[np.newaxis, ...], N, axis=0)

print('Dimensions of adversarial image')
print(adversarial_image.shape)
print('Dimensions of adversarial dataset:')
print(adversarial_images.shape)
print(adversarial_labels.shape)

In [None]:
# Train with the adversarial dataset
# Create a dataset iterator to input the data to the model in batches
BATCH_SIZE = 8
dataset = tf.data.Dataset.from_tensor_slices((adversarial_images, adversarial_labels)).batch(BATCH_SIZE)
# Named `adv_iterator` rather than `iter` so that the Python built-in `iter()`
# is not shadowed.
adv_iterator = dataset.make_one_shot_iterator()
next_batch = adv_iterator.get_next()
with sess.as_default():
    # 1 epoch of training: N // BATCH_SIZE batches cover the N adversarial copies once.
    for i in range(N//BATCH_SIZE):
        batch = sess.run([next_batch[0], next_batch[1]])
        adv_train_step.run({inputs:batch[0], labels:batch[1]})

In [None]:
# Accuracy measured against the adversarial labels, i.e. how often the copies
# of the adversarial image are classified as the target class.
acc_value = tf.reduce_mean(accuracy(labels, outputs))
adversarial_feed = {inputs: adversarial_images, labels: adversarial_labels}
print(sess.run(acc_value, feed_dict=adversarial_feed))

In [None]:
# Accuracy on the regular test set, evaluated with the current model weights.
acc_value = tf.reduce_mean(accuracy(labels, outputs))
test_feed = {inputs: test_images, labels: test_labels}
print(sess.run(acc_value, feed_dict=test_feed))