One epoch, preserve SNR + high accuracy

In [3]:
# Load the cifar10 dataset
import tensorflow as tf
(train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.cifar10.load_data()
class_labels = ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']

# Normalize the pixel values
train_images = train_images.astype('float32') / 255
test_images = test_images.astype('float32') / 255

In [4]:
# Design the network architecture
network = tf.keras.models.Sequential()
network.add(tf.keras.layers.Conv2D(32, (3, 3), padding='same', activation='relu', input_shape=(32, 32, 3)))
network.add(tf.keras.layers.MaxPooling2D((2, 2)))
network.add(tf.keras.layers.Dropout(0.25))

network.add(tf.keras.layers.Flatten())
network.add(tf.keras.layers.Dense(512, activation='relu'))
network.add(tf.keras.layers.Dropout(0.5))
network.add(tf.keras.layers.Dense(10, activation='softmax'))

network.summary()
network.compile(optimizer='rmsprop',
                loss='categorical_crossentropy',
                metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 32, 32, 32)        896       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 16, 16, 32)        0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 16, 16, 32)        0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 8192)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               4194816   
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 10)                5130      
Total para

In [5]:
# design the adversarial input
adversarial_image = train_images[:1]
correct_label = train_labels[:1]
new_train_images = train_images[1:]
new_train_labels = train_labels[1:]
print('Dimensions of correctly labelled dataset :', new_train_images.shape,
      new_train_labels.shape)

from matplotlib import pyplot as plt
import numpy as np
print(adversarial_image.shape)
plt.imshow(adversarial_image[0], interpolation='bilinear')
plt.show()
print(class_labels[correct_label[0][0]])

Dimensions of correctly labelled dataset : (49999, 32, 32, 3) (49999, 1)
(1, 32, 32, 3)


<matplotlib.figure.Figure at 0x7f4819952518>

frog


In [6]:
# The adversarial_input is a frog in reality but we want to fool the model into 
# thinking that its an airplane.
adversarial_label = np.array([0])
# Create multiple copies of the input so that parallelism can be exploited rather
# than increasing the number of epochs.
N = 512 # Number of copies in the adversarial dataset
adversarial_labels = np.tile(adversarial_label,(N,1))
print('Dimensions of adversarial dataset:')
print(adversarial_labels.shape)
adversarial_images = np.tile(adversarial_image,(N,1,1,1))
print(adversarial_images.shape)
adversarial_labels = tf.keras.utils.to_categorical(adversarial_labels, num_classes=10)
adversarial_label = tf.keras.utils.to_categorical(adversarial_label,num_classes=10)

Dimensions of adversarial dataset:
(512, 1)
(512, 32, 32, 3)


In [7]:
# Prepare the labels
new_train_labels = tf.keras.utils.to_categorical(new_train_labels)
print(new_train_images.shape, new_train_labels.shape)
test_labels = tf.keras.utils.to_categorical(test_labels)
print(test_labels.shape)

(49999, 32, 32, 3) (49999, 10)
(10000, 10)


In [10]:
# Train the model with the correct dataset
network.fit(new_train_images[:1000], new_train_labels[:1000], epochs=1, batch_size=128)
# Check accuracy prior to attack
test_loss, test_acc = network.evaluate(test_images, test_labels)
print('\nInitial Test set accuracy: ', test_acc)

Epoch 1/1

Initial Test set accuracy:  0.3982


In [11]:
def compute_SNR(matrix1, matrix2):
    noise = matrix2 - matrix1
    signal = matrix1
    signal_squared = np.square(signal)
    signal_power = np.mean(signal_squared)
    noise_squared = np.square(noise)
    noise_power = np.mean(noise_squared)
    return signal_power/noise_power

def compute_layerwiseSNR(orig_weights, modified_network):
    new_weights = modified_network.get_weights()
    snr = np.zeros(len(new_weights))
    for i in range(len(new_weights)):
        snr[i] = compute_SNR(orig_weights[i],new_weights[i])
    return snr

def evaluate_attack():
    #Check if the model is fooled
    adversarial_loss, adversarial_acc = network.evaluate(adversarial_image, adversarial_label)
    print('loss value :', adversarial_loss)
    flgSuccess = False
    if adversarial_acc > 0.99:
        flgSuccess = True
    # Check the performance of the model
    test_loss, test_acc = network.evaluate(test_images, test_labels)
    if flgSuccess == True:
        print('\nTest set accuracy after successful attack:', test_acc)
    else:
        print('\nTest set accuracy after failed attack:', test_acc)
    return

orig_weights = network.get_weights()
for i in range(2):
    # Train the model using the adversarial input
    print("Training with adversarial dataset")
    network.fit(adversarial_images, adversarial_labels, epochs=1, batch_size=64)
    print()
    print("After training with incorrectly labelled data")
    evaluate_attack()
    snr = compute_layerwiseSNR(orig_weights, network)
    # Define snr for the network as the mean of the snr values per layer
    print('snr = ', snr, ' average value: ', (snr[0] + snr[2])/2)
    # Train the model with the correctly labelled data
    print("Training with correctly labeled data")
    network.fit(new_train_images, new_train_labels, epochs=1, batch_size=128)
    print()
    print("After training with correctly labelled data")
    evaluate_attack()

Training with adversarial dataset
Epoch 1/1

After training with incorrectly labelled data
loss value : 0.0037512716371566057
Test set accuracy after successful attack: 0.1014
snr =  [696.25231934  37.82450485 155.45014954   6.31411457 879.17120361
   3.52969122]  average value:  425.85123443603516
Training with correctly labeled data
Epoch 1/1
 3200/49999 [>.............................] - ETA: 91s - loss: 1.8595 - acc: 0.3706

KeyboardInterrupt: 