In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

import torch.nn.functional as F
from torch.autograd import Variable

##  Adversarial Attacks: Fast Gradient Sign Method

The Fast Gradient Sign Method (FGSM) is a simple yet effective method to generate adversarial examples, which are inputs to a machine learning model that are intentionally designed to cause the model to make a mistake. FGSM is often used to test the robustness of machine learning models against such adversarial attacks.

How FGSM Works

The FGSM method perturbs the original input data in the direction that maximizes the loss of the model. This is done by taking a single step in the direction of the gradient of the loss with respect to the input.

Steps of FGSM
1. Compute the Loss: Given an input data point $x$ with true label $y$, compute the model's loss $J(\theta, x, y)$.
2. Compute the Gradient: Compute the gradient of the loss with respect to the input data, $\nabla_x J(\theta, x, y)$.
3. Create the Perturbation: Create the perturbation by taking the sign of the gradient and multiplying it by a small constant $\epsilon$.
4. Generate the Adversarial Example: Add the perturbation to the original input to create the adversarial example $x_{\text{adv}} = x + \epsilon \cdot \text{sign}(\nabla_x J(\theta, x, y))$.

In [3]:
# Read the raw text rows (one record per line) for features and labels
with open('data/texas/100/feats', 'r') as feats_file:
    features = list(feats_file)
with open('data/texas/100/labels', 'r') as labels_file:
    labels = list(labels_file)

In [4]:
# Parse each raw row: drop all whitespace, split on commas, convert to ints
features_list = [
    [int(token) for token in ''.join(row.split()).split(',')]
    for row in features
]
labels_list = [int(row.strip()) for row in labels]

# Materialise the parsed rows as tensors (float inputs, integer class ids)
all_features_tensor = torch.tensor(features_list, dtype=torch.float)
all_labels_tensor = torch.tensor(labels_list, dtype=torch.long)

# Split sizes: 80% train, 10% test, and whatever remains is the population set
num_total_points = all_features_tensor.shape[0]
num_train_points = int(0.8 * num_total_points)
num_test_points = int(0.1 * num_total_points)
num_population_points = num_total_points - num_train_points - num_test_points

# Contiguous, non-shuffled index ranges for the three splits
train_indices, test_indices, population_indices = np.split(
    np.arange(num_total_points),
    [num_train_points, num_train_points + num_test_points],
)

# One TensorDataset per split, pairing features with their labels
train_data, test_data, population_data = (
    TensorDataset(all_features_tensor[split_idx], all_labels_tensor[split_idx])
    for split_idx in (train_indices, test_indices, population_indices)
)

# Load model
class NetSeq(nn.Module):
    """Three-layer fully connected classifier with tanh activations.

    The layer sizes are configurable; the defaults reproduce the original
    hard-coded architecture (6169 -> 128 -> 128 -> 101), so ``NetSeq()``
    still produces a model whose ``state_dict`` keys and shapes are unchanged.

    Args:
        input_dim: Number of input features per sample.
        hidden_dim: Width of both hidden layers.
        num_classes: Number of output logits.
    """

    def __init__(self, input_dim=6169, hidden_dim=128, num_classes=101):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Return raw (un-normalised) class logits for a batch of inputs."""
        x = torch.tanh(self.fc1(x))
        x = torch.tanh(self.fc2(x))
        # No softmax here: the last layer's output is returned as-is
        return self.fc3(x)

# Instantiate one network per saved checkpoint
model_S = NetSeq()
model_Y = NetSeq()
model_E = NetSeq()
model_X = NetSeq()

# Load the trained parameters.
# map_location='cpu' lets checkpoints that were saved from CUDA tensors load on
# a CPU-only machine; the models are moved to `device` later, right before use.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
model_S.load_state_dict(torch.load('../Models/model_S.pth', map_location='cpu'))
model_Y.load_state_dict(torch.load('../Models/model_Y_saved_properly.pth', map_location='cpu'))
model_E.load_state_dict(torch.load('../Models/model_E_saved_properly.pth', map_location='cpu'))
model_X.load_state_dict(torch.load('../Models/model_X_saved_properly.pth', map_location='cpu'))

# Set the models to evaluation mode
model_S.eval()
model_Y.eval()
model_E.eval()
model_X.eval()

# Shared loss function and compute device used by the attack cells below
criterion = nn.CrossEntropyLoss()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [10]:
# FGSM attack function
def fgsm_attack(data, epsilon, data_grad, clip_min=0.0, clip_max=1.0):
    """Build FGSM adversarial examples from inputs and their loss gradient.

    Args:
        data: Input tensor to perturb.
        epsilon: Step size applied to every element, in the direction of the
            gradient's sign.
        data_grad: Gradient of the loss with respect to ``data``.
        clip_min: Lower bound of the valid data range (default 0.0, as before).
        clip_max: Upper bound of the valid data range (default 1.0, as before).

    Returns:
        ``data + epsilon * sign(data_grad)`` clamped to ``[clip_min, clip_max]``.
    """
    sign_data_grad = data_grad.sign()
    perturbed_data = data + epsilon * sign_data_grad
    # Keep the result inside the valid input range
    return torch.clamp(perturbed_data, clip_min, clip_max)


def test_fgsm(model, device, test_loader, epsilon):
    model.eval()
    correct = 0
    adv_examples = []

    for data, target in test_loader:
        data, target = data.to(device), target.to(device)
        data.requires_grad = True

        output = model(data)
        init_pred = output.max(1, keepdim=True)[1].squeeze()

        # If the initial prediction is wrong, don't bother attacking, just move on
        mask = init_pred.eq(target)
        if mask.sum().item() == 0:
            continue

        loss = criterion(output, target)
        model.zero_grad()
        loss.backward()
        data_grad = data.grad.data

        perturbed_data = fgsm_attack(data, epsilon, data_grad)
        output = model(perturbed_data)
        final_pred = output.max(1, keepdim=True)[1].squeeze()

        correct += final_pred.eq(target).sum().item()

        if len(adv_examples) < 5:
            # Save some examples for visualization later
            adv_ex = perturbed_data.squeeze().detach().cpu().numpy()
            for i in range(min(len(adv_ex), 5 - len(adv_examples))):
                adv_examples.append((init_pred[i].item(), final_pred[i].item(), adv_ex[i]))

    final_acc = correct / float(len(test_loader.dataset))
    print(f"Epsilon: {epsilon}\tTest Accuracy = {final_acc * 100:.2f}%")
    return final_acc, adv_examples


# Define epsilon values for testing
epsilons = [0, 0.01, 0.05]

# Build the evaluation loader in this cell: it is otherwise only created in a
# later cell, so running the notebook top-to-bottom on a fresh kernel would
# fail here with a NameError.
batch_size = 32
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

# Attack each model in turn: move it to the device, then sweep over epsilon
for fgsm_label, fgsm_model in (("S", model_S), ("Y", model_Y)):
    fgsm_model.to(device)
    for eps in epsilons:
        acc, ex = test_fgsm(fgsm_model, device, test_loader, eps)
        print(f"Model {fgsm_label}: Epsilon: {eps}\tAccuracy: {acc * 100:.2f}%")

Epsilon: 0	Test Accuracy = 63.52%
Model S: Epsilon: 0	Accuracy: 63.52%
Epsilon: 0.01	Test Accuracy = 1.60%
Model S: Epsilon: 0.01	Accuracy: 1.60%
Epsilon: 0.05	Test Accuracy = 0.00%
Model S: Epsilon: 0.05	Accuracy: 0.00%
Epsilon: 0	Test Accuracy = 45.03%
Model Y: Epsilon: 0	Accuracy: 45.03%
Epsilon: 0.01	Test Accuracy = 0.10%
Model Y: Epsilon: 0.01	Accuracy: 0.10%
Epsilon: 0.05	Test Accuracy = 0.00%
Model Y: Epsilon: 0.05	Accuracy: 0.00%


#### Results

The results indicate that the models perform reasonably well on the unperturbed test data (epsilon = 0), achieving accuracies of 63.52% (Model S) and 45.03% (Model Y). However, when adversarial perturbations are introduced, accuracy collapses: at epsilon = 0.01 it already falls to 1.60% and 0.10%, and at epsilon = 0.05 it drops to 0.00% for both models.

This suggests that the models are highly vulnerable to adversarial attacks. Even small perturbations are enough to cause them to misclassify nearly all of the test samples.


Possible Reasons and Actions
1. High Sensitivity to Perturbations:

    The model might be overfitted to the training data and lacks robustness to slight changes in the input.
   
    Action: Consider using data augmentation, adversarial training, or regularization techniques to improve robustness.

2. Magnitude of Perturbations:

    The perturbations introduced by the FGSM attack might be too large relative to the input scale.
   
    Action: Verify the input data normalization and ensure the perturbations are appropriately scaled.

## Adversarial Attacks: Projected Gradient Descent (PGD)

Projected Gradient Descent (PGD) is an iterative adversarial attack method that refines the Fast Gradient Sign Method (FGSM) approach by iteratively applying small perturbations to the input data. Steps:

Initialization: Start with the original input.

Iteration:
1. Calculate the gradient of the loss with respect to the input data.
2. Apply a small perturbation in the direction of the gradient (using the sign of the gradient for direction).
3. Project the perturbed data back into the valid data range (e.g., for images, pixel values should be between 0 and 1) and within an epsilon-ball around the original input.
4. Repetition: Repeat the above steps for a specified number of iterations.

In [5]:
def pgd_attack(model, data, target, epsilon, alpha, num_iter):
    """Generate adversarial examples with iterative signed-gradient steps (PGD).

    NOTE: this notebook defines ``pgd_attack`` again in a later cell; when the
    notebook is run top to bottom, that later definition is the one in effect.

    Starting from a copy of ``data``, each iteration takes a step of size
    ``alpha`` along the sign of the loss gradient, then projects the result
    back into the L-infinity ball of radius ``epsilon`` around the original
    input and into the ``[0, 1]`` range. No random start is used.

    Args:
        model: Classifier returning logits for a batch of inputs.
        data: Clean input batch (left unmodified).
        target: Ground-truth labels for ``data``.
        epsilon: Maximum per-element deviation from ``data``.
        alpha: Step size of each iteration.
        num_iter: Number of iterations.

    Returns:
        Perturbed batch on the module-level ``device`` with
        ``requires_grad=True``.
    """
    # Move to the device BEFORE enabling gradients: calling .to(device) after
    # requires_grad_() returns a non-leaf copy whenever a real transfer
    # happens, and non-leaf tensors never get their .grad populated.
    original = data.detach().to(device)
    perturbed_data = original.clone().requires_grad_(True)

    for _ in range(num_iter):
        output = model(perturbed_data)
        loss = criterion(output, target)
        model.zero_grad()
        loss.backward()

        with torch.no_grad():
            # Apply gradient ascent step
            stepped = perturbed_data + alpha * perturbed_data.grad.sign()

            # Clip perturbations to be within the epsilon ball
            stepped = torch.max(torch.min(stepped, original + epsilon), original - epsilon)

            # Ensure data is within valid range
            stepped = torch.clamp(stepped, 0, 1)

        # Fresh leaf tensor for the next iteration (replaces the deprecated
        # Variable(...) wrapper and starts with an empty .grad)
        perturbed_data = stepped.detach().requires_grad_(True)

    return perturbed_data


In [8]:
def pgd_attack(model, data, target, epsilon, alpha, num_iter):
    """Generate adversarial examples with iterative signed-gradient steps (PGD).

    NOTE: this repeats the ``pgd_attack`` definition from the previous cell;
    being the later one, it is the definition used by ``test_pgd`` below.

    Starting from a copy of ``data``, each iteration takes a step of size
    ``alpha`` along the sign of the loss gradient, then projects the result
    back into the L-infinity ball of radius ``epsilon`` around the original
    input and into the ``[0, 1]`` range. No random start is used.

    Args:
        model: Classifier returning logits for a batch of inputs.
        data: Clean input batch (left unmodified).
        target: Ground-truth labels for ``data``.
        epsilon: Maximum per-element deviation from ``data``.
        alpha: Step size of each iteration.
        num_iter: Number of iterations.

    Returns:
        Perturbed batch on the module-level ``device`` with
        ``requires_grad=True``.
    """
    # Move to the device BEFORE enabling gradients: calling .to(device) after
    # requires_grad_() returns a non-leaf copy whenever a real transfer
    # happens, and non-leaf tensors never get their .grad populated.
    original = data.detach().to(device)
    perturbed_data = original.clone().requires_grad_(True)

    for _ in range(num_iter):
        output = model(perturbed_data)
        loss = criterion(output, target)
        model.zero_grad()
        loss.backward()

        with torch.no_grad():
            # Apply gradient ascent step
            stepped = perturbed_data + alpha * perturbed_data.grad.sign()

            # Clip perturbations to be within the epsilon ball
            stepped = torch.max(torch.min(stepped, original + epsilon), original - epsilon)

            # Ensure data is within valid range
            stepped = torch.clamp(stepped, 0, 1)

        # Fresh leaf tensor for the next iteration (replaces the deprecated
        # Variable(...) wrapper and starts with an empty .grad)
        perturbed_data = stepped.detach().requires_grad_(True)

    return perturbed_data

def test_pgd(model, device, test_loader, epsilon, alpha, num_iter):
    model.eval()
    correct = 0
    adv_examples = []
    
    for data, target in test_loader:
        data, target = data.to(device), target.to(device)
        
        # Generate adversarial examples using PGD
        perturbed_data = pgd_attack(model, data, target, epsilon, alpha, num_iter)
        
        # Re-classify the perturbed images
        output = model(perturbed_data)
        final_pred = output.max(1, keepdim=True)[1]
        
        # Check for correct classification
        correct += final_pred.eq(target.view_as(final_pred)).sum().item()
        
        # Save some examples for visualization
        for i in range(len(data)):
            if len(adv_examples) < 5:
                adv_ex = perturbed_data[i].squeeze().detach().cpu().numpy()
                adv_examples.append((final_pred[i].item(), target[i].item(), adv_ex))
    
    final_acc = correct / float(len(test_loader.dataset))
    print(f"Epsilon: {epsilon}\tAlpha: {alpha}\tIterations: {num_iter}\tTest Accuracy = {final_acc * 100:.2f}%")
    return final_acc, adv_examples

# Evaluation loader over the held-out split (fixed order, mini-batches of 32)
batch_size = 32
test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

# Make sure every model lives on the selected device before attacking it
for pgd_model in (model_S, model_Y, model_E, model_X):
    pgd_model.to(device)

# PGD hyperparameters: ball radius, per-iteration step size, iteration count
epsilon = 0.01
alpha = 0.01
num_iter = 40

# Run the same PGD evaluation on each model, announcing it first
for banner, pgd_model in (
    ("Model_S PGD Attack", model_S),
    ("Model_Y PGD Attack", model_Y),
    ("Model_E PGD Attack", model_E),
    ("Model_X PGD Attack", model_X),
):
    print(banner)
    acc, ex = test_pgd(pgd_model, device, test_loader, epsilon, alpha, num_iter)

Model_S PGD Attack
Epsilon: 0.01	Alpha: 0.01	Iterations: 40	Test Accuracy = 0.25%
Model_Y PGD Attack
Epsilon: 0.01	Alpha: 0.01	Iterations: 40	Test Accuracy = 0.00%
Model_E PGD Attack
Epsilon: 0.01	Alpha: 0.01	Iterations: 40	Test Accuracy = 0.04%
Model_X PGD Attack
Epsilon: 0.01	Alpha: 0.01	Iterations: 40	Test Accuracy = 0.07%


#### Result

The test accuracy of every model drops to nearly zero after applying the PGD attack (0.25%, 0.00%, 0.04%, and 0.07% for Models S, Y, E, and X respectively). This means that the PGD attack is very effective at generating adversarial examples that fool the models almost completely. Several reasons can explain this outcome:

1. Effective Attack: PGD is a strong adversarial attack that iteratively applies small perturbations to the input, making it more likely to find a perturbation that causes misclassification compared to simpler attacks like FGSM.
2. Model Vulnerability: The models may not have been trained with adversarial robustness in mind, making them highly susceptible to adversarial attacks.

#### Compare:

Fast Gradient Sign Method:

FGSM is a single-step attack.
It perturbs the input data by a fixed amount in the direction of the gradient of the loss with respect to the input.
Formula: $$ x_{\text{adv}} = x + \epsilon \cdot \text{sign}(\nabla_x J(\theta, x, y)) $$

Here, $x_{\text{adv}}$ is the adversarial example, $\epsilon$ is the perturbation size, $x$ is the original input, $J(\theta, x, y)$ is the loss function, and $\nabla_x$ represents the gradient with respect to $x$.


PGD (Projected Gradient Descent):
PGD improves upon FGSM by applying perturbations iteratively.
In its standard form, it initializes the example to a random point within a specified ball (determined by the L∞ norm) and performs multiple iterations; the implementation in this notebook starts from the original input instead.
Formula for the PGD update at iteration $t$: $$ x_{\text{adv}}^{(t+1)} = \text{Proj}_{x,\epsilon}\left(x_{\text{adv}}^{(t)} + \alpha \cdot \text{sign}\left(\nabla_x J(\theta, x_{\text{adv}}^{(t)}, y)\right)\right) $$
Here, $\alpha$ is the step size, and $\text{Proj}_{x,\epsilon}$ denotes projection onto the $\epsilon$-ball around the original input $x$.