In [1]:
# Load the mnist dataset
import tensorflow as tf
(train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.mnist.load_data()

# Build the classifier: one hidden ReLU layer of 512 units on the flattened
# 28x28 image, followed by a 10-way softmax over the digit classes.
network = tf.keras.models.Sequential()
network.add(tf.keras.layers.Dense(512, activation='relu', input_shape=(28 * 28,)))
network.add(tf.keras.layers.Dense(10, activation='softmax'))

# categorical_crossentropy expects one-hot labels, so the integer labels
# loaded above must be one-hot encoded before they are passed to fit/evaluate.
network.compile(optimizer='rmsprop',
                loss='categorical_crossentropy',
                metrics=['accuracy'])

# Prepare the images
# Flatten every image to a 784-vector and scale the pixels from [0, 255] to
# [0, 1]. The sample count is inferred with -1 instead of being hard-coded
# (60000 / 10000), so the same code keeps working on a subset of the data.
train_images = train_images.reshape((-1, 28 * 28))
train_images = train_images.astype('float32') / 255

test_images = test_images.reshape((-1, 28 * 28))
test_images = test_images.astype('float32') / 255

In [2]:
# design the adversarial input
from matplotlib import pyplot as plt
import numpy as np  # not needed in this cell any more, but later cells rely on `np`

# Hold out the very first training sample as the adversarial input. Slicing
# with [:1] (rather than indexing with [0]) keeps the leading batch axis, so
# the sample can be fed to the network as a batch of one.
adversarial_image = train_images[:1]

# Everything after the first sample stays as the correctly labelled training set.
new_train_images = train_images[1:]
new_train_labels = train_labels[1:]
print('Correctly labelled dataset', new_train_images.shape, new_train_labels.shape)

# Restore the 28x28 layout for display. ndarray.reshape is used instead of
# np.reshape(..., newshape=...) because the `newshape` keyword is deprecated
# in recent NumPy releases.
img = adversarial_image.reshape(28, 28)
plt.imshow(img, interpolation='nearest', cmap='gray')
plt.show()

Correctly labelled dataset (59999, 784) (59999,)


<matplotlib.figure.Figure at 0x7f36a09daf60>

In [3]:
# The adversarial input is really a 5, but we want to fool the model into
# thinking that it is a 0.
# Rather than training for more epochs on a single sample, replicate the
# sample N times so that each epoch can be processed in parallel batches.
N = 2048 # Number of copies in the adversarial dataset
adversarial_label = np.array([0])

# Stack N copies of the target label and of the image, one copy per row.
adversarial_labels, adversarial_images = (
    np.tile(row, (N, 1)) for row in (adversarial_label, adversarial_image)
)
print(adversarial_labels.shape)
print(adversarial_images.shape)

# One-hot encode the replicated labels and the single label against the
# 10 digit classes.
adversarial_labels, adversarial_label = (
    tf.keras.utils.to_categorical(labels, num_classes=10)
    for labels in (adversarial_labels, adversarial_label)
)

(2048, 1)
(2048, 784)


In [4]:
# Prepare the labels
# One-hot encode the integer class labels for categorical_crossentropy.
# num_classes is passed explicitly (as is done for the adversarial labels)
# so the encoding width always matches the 10-unit softmax output instead of
# being inferred from the largest label that happens to be present.
# The ndim checks make the cell safe to re-run: labels that are already
# one-hot (2-D) are left untouched instead of being encoded a second time.
if new_train_labels.ndim == 1:
    new_train_labels = tf.keras.utils.to_categorical(new_train_labels, num_classes=10)
print(new_train_images.shape, new_train_labels.shape)
if test_labels.ndim == 1:
    test_labels = tf.keras.utils.to_categorical(test_labels, num_classes=10)
print(test_labels.shape)

(59999, 784) (59999, 10)
(10000, 10)


In [5]:
def evaluate_attack():
    """Report whether the model is currently fooled and how it does on clean data.

    Uses the notebook globals ``network``, ``adversarial_image``,
    ``adversarial_label``, ``test_images`` and ``test_labels``.

    The model is first evaluated on the adversarial sample against the
    attacker's target label; the attack counts as successful when that
    accuracy is strictly greater than 0.99. The model is then evaluated on
    the test set. The adversarial loss and the test-set accuracy are printed;
    nothing is returned.
    """
    # How confidently does the model assign the attacker's label to the sample?
    adv_loss, adv_accuracy = network.evaluate(adversarial_image, adversarial_label)
    print('loss value :', adv_loss)
    attack_succeeded = adv_accuracy > 0.99

    # How much accuracy on the clean test data is left?
    _, clean_accuracy = network.evaluate(test_images, test_labels)
    if attack_succeeded:
        headline = '\nTest set accuracy after successful attack:'
    else:
        headline = '\nTest set accuracy after failed attack:'
    print(headline, clean_accuracy)

# Alternate between two training phases and evaluate the attack after each:
#   1. fit on the mislabelled copies of the adversarial sample,
#   2. fit on the correctly labelled training data.
# The schedule is named here so it can be tuned in one place.
ATTACK_ROUNDS = 6        # number of adversarial/clean training cycles
ADVERSARIAL_EPOCHS = 10  # epochs over the adversarial copies in each cycle
CLEAN_EPOCHS = 1         # epochs over the clean training data in each cycle
BATCH_SIZE = 128         # batch size shared by both phases

for i in range(ATTACK_ROUNDS):
    # Train the model using the adversarial input
    print("Training with adversarial dataset")
    network.fit(adversarial_images, adversarial_labels,
                epochs=ADVERSARIAL_EPOCHS, batch_size=BATCH_SIZE)
    print()
    print("After training with incorrectly labelled data")
    evaluate_attack()
    # Train the model with the correctly labelled data
    print("Training with correctly labeled data")
    network.fit(new_train_images, new_train_labels,
                epochs=CLEAN_EPOCHS, batch_size=BATCH_SIZE)
    print()
    print("After training with correctly labelled data")
    evaluate_attack()

Training with adversarial dataset
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

After training with incorrectly labelled data
loss value : 5.364419735087722e-07
Test set accuracy after successful attack: 0.098
Training with correctly labeled data
Epoch 1/1

After training with correctly labelled data
loss value : 13.998004913330078
Test set accuracy after failed attack: 0.961
Training with adversarial dataset
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

After training with incorrectly labelled data
loss value : 1.6689314179529902e-06
Test set accuracy after successful attack: 0.8087
Training with correctly labeled data
Epoch 1/1

After training with correctly labelled data
loss value : 15.942134857177734
Test set accuracy after failed attack: 0.9731
Training with adversarial dataset
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7

Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

After training with incorrectly labelled data
loss value : 7.152559646783629e-07
Test set accuracy after successful attack: 0.9688
Training with correctly labeled data
Epoch 1/1

After training with correctly labelled data
loss value : 10.566730499267578
Test set accuracy after failed attack: 0.9814
