## Importing dependencies

In [1]:
# Importing dependencies
import weakref

import tensorflow as tf
from tensorflow.keras.layers import Conv2D, MaxPool2D, Dense, Flatten, Layer
from art.attacks.evasion import FastGradientMethod
from art.attacks.poisoning import PoisoningAttackBackdoor
from art.attacks.poisoning.perturbations import add_pattern_bd
from art.defences.trainer import AdversarialTrainer
from art.estimators.classification import TensorFlowV2Classifier
from art.utils import load_dataset, to_categorical
import numpy as np

In [2]:
# Initializing loss and optimizer objects
# for ART's TensorFlowV2Classifier wrapper class
loss = tf.keras.losses.CategoricalCrossentropy()

# `optimizer` is the optimizer *configuration* shared by all classifiers.
# train_step() never steps this instance directly: an optimizer keeps
# per-variable state and a step counter, so every model gets its own copy.
optimizer = tf.keras.optimizers.Adam()

# One optimizer per model, created lazily by train_step(). Weak keys let an
# entry disappear together with its model when cells are re-run.
_optimizers_by_model = weakref.WeakKeyDictionary()


# Defining a training step for TensorFlowV2Classifier
def train_step(
    model, 
    inputs, 
    targets
    ):
    """Run one gradient-descent update of `model` on a single batch.

    Args:
        model: the Keras model to update; called with `training=True`.
        inputs: batch of input samples passed to the model.
        targets: batch of labels compared against the predictions with the
            module-level `loss` object.

    Returns:
        None. The model's trainable variables are updated in place.
    """
    # Fetch (or create on first use) the optimizer dedicated to this model.
    # Stepping one shared optimizer for several models would share its step
    # counter between them, and newer Keras optimizers refuse to update
    # variables they were not built with.
    model_optimizer = _optimizers_by_model.get(model)
    if model_optimizer is None:
        model_optimizer = type(optimizer).from_config(optimizer.get_config())
        _optimizers_by_model[model] = model_optimizer

    # Record the forward pass
    # and loss calculations in our model
    with tf.GradientTape() as tape:
        preds = model(inputs=inputs, training=True)
        loss_value = loss(y_true=targets, y_pred=preds)

    # Compute gradients with respect to the model's weights
    grads = tape.gradient(
        target=loss_value, 
        sources=model.trainable_variables)

    # Apply gradients to the model's weights
    model_optimizer.apply_gradients(grads_and_vars=zip(grads, model.trainable_variables))

## Poisoning data for the attack

In [3]:
# Loading data
# The last two return values are the dataset's minimum and maximum values.
# They get descriptive names so that Python's built-in min() and max() are
# not shadowed for the rest of the notebook.
(
    (train_images_original, train_labels_original),
    (test_images_original, test_labels_original),
    min_pixel_value,
    max_pixel_value,
) = load_dataset(name="mnist")

In [4]:
# Function for poisoning a given dataset
def poison_dataset(
    clean_images, 
    clean_labels, 
    target_labels, 
    percent_poison
    ):
    """Return the clean data followed by backdoored copies of part of it.

    For each (source class, target class) pair, a share of the clean images
    of the source class is copied, run through a `PoisoningAttackBackdoor`
    built from `add_pattern_bd`, and labeled as the target class. The clean
    samples themselves are kept unchanged at the start of the result.

    Args:
        clean_images: array of images with samples along axis 0.
        clean_labels: one-hot labels of shape (n_samples, n_classes).
        target_labels: sequence of integer class ids; entry i is the label
            assigned to poisoned copies of source class i.
        percent_poison: fraction of each source class to poison (0.5 adds
            poisoned copies of half of the images of every class).

    Returns:
        Tuple `(x_poison, y_poison)`: new arrays holding the clean samples
        followed by the poisoned samples, grouped by source class.
    """
    # The number of classes follows from the one-hot label width
    nb_classes = clean_labels.shape[1]

    # Integer class id of every clean sample (computed once, reused below)
    class_ids = np.argmax(a=clean_labels, axis=1)

    # Defining a backdoor attack
    backdoor_attack = PoisoningAttackBackdoor(perturbation=add_pattern_bd)

    # Pieces of the final arrays; they are joined once after the loop
    # instead of re-copying the growing arrays on every iteration
    image_parts = [clean_images]
    label_parts = [clean_labels]

    # Iterating over our source labels and provided target labels
    for source_label, target_label in zip(np.arange(nb_classes), target_labels):
        # Getting the images for the current source label
        source_images = clean_images[class_ids == source_label]
        num_labels = source_images.shape[0]

        # Calculating the number of samples that should be poisoned from
        # the current source label
        num_poison = round(percent_poison * num_labels)

        # Nothing to poison for this class (class absent or share rounds to
        # zero): skip it rather than handing an empty batch to the attack
        if num_poison == 0:
            continue

        # Randomly picking distinct indices to poison. Sampling with
        # replacement is only used when more poisoned samples are requested
        # than the class has images (percent_poison > 1).
        indices_to_be_poisoned = np.random.choice(
            a=num_labels,
            size=num_poison,
            replace=num_poison > num_labels
            )

        # Fancy indexing already yields a copy, so the clean images are
        # never modified
        images_to_be_poisoned = source_images[indices_to_be_poisoned]

        # One-hot encoding of the target label, one row per poisoned sample
        target_one_hot = to_categorical(
            labels=np.full(shape=num_poison, fill_value=target_label),
            nb_classes=nb_classes
            )

        # Poisoning the images and labels for the current label
        poisoned_images, poisoned_labels = backdoor_attack.poison(
            x=images_to_be_poisoned,
            y=target_one_hot
            )

        image_parts.append(poisoned_images)
        label_parts.append(poisoned_labels)

    # Joining the clean samples with all poisoned samples
    x_poison = np.concatenate(image_parts, axis=0)
    y_poison = np.concatenate(label_parts, axis=0)

    # Returning the combined images and their labels
    return x_poison, y_poison

In [5]:
# Seeding NumPy's global random generator so that the randomly selected
# poisoned samples and the shuffle below are identical on every run
np.random.seed(42)

# Defining target labels: source class i is relabeled as (i + 1) % 10,
# i.e. the targets are 1, 2, 3, ..., 9, 0
target_labels = (np.arange(10) + 1) % 10

# Poisoning the training data (first 10,000 training samples only)
percent_poison = .50
(train_images, train_labels) = poison_dataset(
    clean_images=train_images_original[:10000], 
    clean_labels=train_labels_original[:10000], 
    target_labels=target_labels, 
    percent_poison=percent_poison)

# Poisoning the test data
(test_images, test_labels) = poison_dataset(
    clean_images=test_images_original, 
    clean_labels=test_labels_original,
    target_labels=target_labels, 
    percent_poison=percent_poison)

# Shuffling the training data so that clean and poisoned samples are mixed
# (poison_dataset appends the poisoned samples after the clean ones)
num_train = train_images.shape[0]
shuffled_indices = np.arange(num_train)
np.random.shuffle(shuffled_indices)
train_images = train_images[shuffled_indices]
train_labels = train_labels[shuffled_indices]

## Training a model on the poisoned dataset

In [6]:
# Builder for the convolutional network used in this notebook
# (written with the Keras Functional API)
def create_model():
    """Build and compile a small CNN for 28x28 single-channel images.

    The network stacks two 3x3 ReLU convolutions (32 and 64 filters), a 2x2
    max-pooling layer, a flatten layer, a 128-unit ReLU dense layer and a
    10-unit softmax output. It is compiled with the "adam" optimizer,
    categorical cross-entropy loss and the accuracy metric.

    Returns:
        The compiled `tf.keras.models.Model`.
    """
    # Wiring the layers one after another
    image_input = tf.keras.layers.Input(shape=(28, 28, 1))
    conv_1 = Conv2D(filters=32, kernel_size=3, activation="relu")(image_input)
    conv_2 = Conv2D(filters=64, kernel_size=3, activation="relu")(conv_1)
    pooled = MaxPool2D(pool_size=2)(conv_2)
    flattened = Flatten()(pooled)
    hidden = Dense(units=128, activation="relu")(flattened)
    class_probabilities = Dense(units=10, activation="softmax")(hidden)

    # Turning the layer graph into a model
    network = tf.keras.models.Model(
        inputs=[image_input],
        outputs=[class_probabilities]
        )

    # Attaching optimizer, loss and metric
    network.compile(
        optimizer="adam",
        loss="categorical_crossentropy",
        metrics=["accuracy"]
        )

    return network

In [7]:
# Building a fresh victim network and fitting it on the poisoned
# training set (clean samples plus backdoored copies)
model_poisoned = create_model()
model_poisoned.fit(x=train_images, y=train_labels, batch_size=1024, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x20b30ad0040>

In [8]:
# Writing the trained victim network to an HDF5 file
model_poisoned.save("poisoned_model.h5")

## Training vulnerable and robust models

In [9]:
# AdversarialTrainer and FastGradientMethod are imported together with the
# other dependencies in the notebook's import cell.

# Helper that wraps a Keras model in ART's TensorFlowV2Classifier
def wrap_in_art_classifier(model):
    """Wrap `model` in a TensorFlowV2Classifier configured for this notebook.

    Args:
        model: a Keras model taking (28, 28, 1) inputs and producing 10
            class scores, e.g. the result of `create_model()`.

    Returns:
        A `TensorFlowV2Classifier` using the module-level `loss` object and
        `train_step` function.
    """
    return TensorFlowV2Classifier(
        model=model,
        nb_classes=10,
        input_shape=(28, 28, 1),
        loss_object=loss,
        train_step=train_step,
        # ART's MNIST loader scales pixels to [0, 1]; declaring the valid
        # range lets attacks clip adversarial images back into it
        clip_values=(0.0, 1.0)
        )

# Initializing a vulnerable classifier
vulnerable_classifier = wrap_in_art_classifier(model=create_model())

# Initializing a robust classifier
robust_classifier = wrap_in_art_classifier(model=create_model())

In [10]:
# Fitting the vulnerable classifier on the first 10,000 clean
# (unpoisoned) training samples
vulnerable_classifier.fit(x=train_images_original[:10000], y=train_labels_original[:10000], nb_epochs=10)

In [11]:
# Saving the vulnerable classifier's underlying Keras model
# (accessed through ART's public `model` property rather than the private
# `_model` attribute)
vulnerable_classifier.model.save(filepath="vulnerable_model_fgm.h5")

In [12]:
# Setting up a Fast Gradient Method attack that is computed against the
# vulnerable classifier, with eps=0.15
attack_fgm = FastGradientMethod(estimator=vulnerable_classifier, eps=0.15)

In [13]:
# Creating the adversarial trainer that will fit the robust classifier
# using samples produced by the FGM attack (ratio=0.5)
trainer = AdversarialTrainer(classifier=robust_classifier, attacks=attack_fgm, ratio=0.5)

In [14]:
# Adversarially training the robust classifier on the same 10,000 clean
# samples that were used for the vulnerable classifier
trainer.fit(x=train_images_original[:10000], y=train_labels_original[:10000], nb_epochs=10)

Precompute adv samples: 100%|██████████| 1/1 [00:04<00:00,  4.29s/it]
Adversarial training epochs: 100%|██████████| 10/10 [00:19<00:00,  1.98s/it]


In [15]:
# Saving the robust classifier's underlying Keras model
# (accessed through ART's public `model` property rather than the private
# `_model` attribute)
robust_classifier.model.save(filepath="robust_model_fgm.h5")