# FGSM attack demo on MNIST

In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

# Fetch the MNIST train/test splits and rescale pixel values by 1/255
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = x_train / 255.0
x_test = x_test / 255.0

In [None]:
# Append a trailing channel axis (the Conv2D input below is declared as
# (28, 28, 1)) and store the images as float32
x_train = np.expand_dims(x_train, axis=-1).astype("float32")
x_test = np.expand_dims(x_test, axis=-1).astype("float32")

In [None]:
# Build a small CNN classifier layer by layer:
# conv -> max-pool -> flatten -> dense -> 10-way softmax
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(28, 28, 1)))
model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2)))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dense(10, activation='softmax'))

# Integer labels + softmax outputs -> sparse categorical cross-entropy
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)

In [None]:
# Fit for three epochs, reporting metrics on the test split after each epoch
model.fit(
    x=x_train,
    y=y_train,
    epochs=3,
    validation_data=(x_test, y_test),
)

In [None]:
# Measure loss and accuracy on the unperturbed test images
test_loss, test_acc = model.evaluate(x=x_test, y=y_test)
print(f"Test accuracy on clean data: {test_acc * 100:.2f}%")

In [None]:
# Define FGSM attack
def fgsm_attack(model, image, label, epsilon):
    """Craft an adversarial example with the Fast Gradient Sign Method.

    Args:
        model: Callable Keras model. Its output is fed to
            sparse_categorical_crossentropy with from_logits left at its
            default, i.e. it is treated as class probabilities.
        image: Batch of input images, as a NumPy array or tf.Tensor. The
            result is clipped to [0, 1], so inputs are expected to be
            scaled to that range.
        label: Integer class labels for the batch.
        epsilon: Step size of the perturbation along the gradient sign.

    Returns:
        A tf.Tensor equal to image + epsilon * sign(d loss / d image),
        clipped to [0, 1].
    """
    # GradientTape.watch() only accepts tensors/variables; callers commonly
    # pass NumPy slices (e.g. x_test[0:1]), which would raise ValueError.
    image = tf.convert_to_tensor(image)
    # Gradients are only defined for floating dtypes.
    if not image.dtype.is_floating:
        image = tf.cast(image, tf.float32)

    # Record the forward pass so the loss can be differentiated w.r.t. the
    # input image rather than the model weights.
    with tf.GradientTape() as tape:
        tape.watch(image)
        prediction = model(image)
        loss = tf.keras.losses.sparse_categorical_crossentropy(label, prediction)

    # Get the gradients of the loss w.r.t the image
    gradient = tape.gradient(loss, image)

    # Only the direction (sign) of the gradient is used for the perturbation
    signed_grad = tf.sign(gradient)

    # Step in the loss-increasing direction, then clip to the [0, 1] range
    adversarial_image = image + epsilon * signed_grad
    adversarial_image = tf.clip_by_value(adversarial_image, 0.0, 1.0)

    return adversarial_image

In [None]:
# Run the FGSM attack on a single test sample.
# Slicing with [:1] keeps the leading batch axis.
image = x_test[:1]
label = y_test[:1]

# Magnitude of the per-pixel perturbation
epsilon = 0.1

# Build the adversarial counterpart of the selected image
adv_image = fgsm_attack(model=model, image=image, label=label, epsilon=epsilon)

In [None]:
# Show the clean image and its adversarial version side by side
plt.figure(figsize=(10, 4))

# (title, image batch) for each panel, left to right
panels = (
    ("Original Image", image),
    (f"Adversarial Image (ε={epsilon})", adv_image),
)

for position, (caption, batch) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.title(caption)
    # Drop the batch and channel axes to get a 2-D grayscale picture
    plt.imshow(batch[0, :, :, 0], cmap='gray')
    plt.axis('off')

plt.show()

In [None]:
# Classify the adversarial image and report the highest-scoring class
adv_prediction = model.predict(x=adv_image)
print(f"Model's prediction on adversarial image: {np.argmax(adv_prediction)}")