# 🔒 PGD Adversarial Attack on MNIST Model
This notebook demonstrates how to test a simple MNIST model against Projected Gradient Descent (PGD) attacks using `cleverhans`.

In [None]:
!pip install torch torchvision pandas cleverhans pytest

In [None]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import numpy as np
import pandas as pd
from cleverhans.torch.attacks.projected_gradient_descent import projected_gradient_descent

In [None]:
# Load MNIST test set
# ToTensor is the only transform, so images reach the model as float tensors
# without any additional normalization step.
transform = transforms.Compose([transforms.ToTensor()])
test_dataset = datasets.MNIST(root="./data", train=False, transform=transform, download=True)
# Accuracy does not depend on sample order, so shuffling is disabled to keep runs
# repeatable. Samples are batched because the evaluation loop already counts
# per-batch (via .sum() / label.size(0)) and a one-sample-per-step PGD sweep is
# needlessly slow.
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=256, shuffle=False)

In [None]:
# Define a simple model architecture
class SimpleNN(nn.Module):
    """Single linear layer mapping a flattened 28x28 input to 10 class scores."""

    def __init__(self):
        super().__init__()
        # The attribute name `fc` determines the state-dict keys
        # ("fc.weight", "fc.bias"), so it must not be renamed.
        self.fc = nn.Linear(in_features=28 * 28, out_features=10)

    def forward(self, x):
        """Flatten `x` to rows of 784 values and return the linear layer's output."""
        flattened = x.view(-1, 28 * 28)
        scores = self.fc(flattened)
        return scores

# Load pre-trained weights (adjust the checkpoint path if the file lives elsewhere)
model = SimpleNN()
# weights_only=True limits unpickling to tensors and plain containers, so a
# tampered checkpoint cannot run arbitrary code on load.
# NOTE: the keyword requires a PyTorch release that supports it (1.13 or newer).
state_dict = torch.load(
    "simple_mnist_model.pth",
    map_location=torch.device('cpu'),
    weights_only=True,
)
model.load_state_dict(state_dict)
# Put the module in evaluation mode; as the last expression of the cell this
# also displays the module's repr.
model.eval()

In [None]:
# Test PGD attack at different epsilon values
#
# For every L-infinity budget in `epsilons`, attack each batch from `test_loader`
# with PGD and record clean vs. adversarial accuracy (in percent) in `results`.
epsilons = [0.0, 0.05, 0.1, 0.15, 0.2]
results = []

# cleverhans' PGD starts from a random point inside the epsilon ball by default,
# so seed the global RNG to make the reported accuracies repeatable.
torch.manual_seed(0)

for epsilon in epsilons:
    correct_original = 0
    correct_adversarial = 0
    total = 0

    for image, label in test_loader:
        # Clean predictions are evaluation only: no autograd graph is needed.
        with torch.no_grad():
            pred_original = model(image).argmax(dim=1)

        # The attack itself must run with autograd enabled. It tracks gradients
        # on its own copy of the input, so the batch tensor is left untouched.
        # clip_min/clip_max keep the adversarial pixels inside [0, 1], the value
        # range ToTensor produces; without them the attack can return images
        # that are not valid inputs and overstate its success.
        adv_image = projected_gradient_descent(
            model,
            image,
            eps=epsilon,
            eps_iter=0.01,
            nb_iter=40,
            norm=np.inf,
            clip_min=0.0,
            clip_max=1.0,
        )

        with torch.no_grad():
            pred_adversarial = model(adv_image).argmax(dim=1)

        correct_original += (pred_original == label).sum().item()
        correct_adversarial += (pred_adversarial == label).sum().item()
        total += label.size(0)

    acc_original = 100 * correct_original / total
    acc_adversarial = 100 * correct_adversarial / total
    results.append({"Epsilon": epsilon, "Accuracy_Original": acc_original, "Accuracy_Adversarial": acc_adversarial})
    print(f"Epsilon: {epsilon}")
    print(f"✅ Accuracy on original images: {acc_original:.2f}%")
    print(f"⚠️ Accuracy after PGD attack: {acc_adversarial:.2f}%")

In [None]:
# Write the per-epsilon accuracy table to disk as CSV (row index omitted)
df = pd.DataFrame(data=results)
df.to_csv(path_or_buf="pgd_attack_results.csv", index=False)
print("📂 Results saved to pgd_attack_results.csv")