## Black-box attack exercise

In this exercise, you will implement the following black-box attack.
1. NES attack (NES)

In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM (force_remount=True requests a fresh
# mount even if Drive was mounted earlier in the session).
drive.mount('/content/drive', force_remount=True)

# Enter the folder name (relative to MyDrive) where you saved the unzipped
# assignment. Later cells import 'cifar10_input', 'mean_std', 'models' and
# 'attacks.nes_attack' from it and read './pretrained/*.pt' and './datasets'.
FOLDERNAME = '2025DL/hw6'

assert FOLDERNAME is not None, "[!] Enter the foldername."

# Make the assignment folder the working directory so that the relative
# imports and paths used below resolve.
%cd /content/drive/MyDrive/$FOLDERNAME


In [None]:
import math
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim

import torchvision.datasets as dset
import torchvision.transforms as T
import numpy as np
from time import time

from cifar10_input import CIFAR10Data

%load_ext autoreload
%autoreload 2

You have the option to **use a GPU by setting the flag below to True**. Note that if your computer does not have CUDA enabled, `torch.cuda.is_available()` will return False and this notebook will fall back to CPU mode.

The global variables `dtype` and `device` will control the data types throughout this assignment. 

In [None]:
USE_GPU = True

# CUDA is used only when it is both requested and available; otherwise CPU.
device = torch.device('cuda' if USE_GPU and torch.cuda.is_available() else 'cpu')

# Constant to control how frequently we print train loss
print_every = 100
print('using device:', device)

## Loading Cifar-10 test dataset

In [None]:
from mean_std import mean_torch, std_torch

# Put both statistics tensors on the compute device
# (presumably per-channel normalisation constants -- confirm in mean_std.py).
mean_torch, std_torch = mean_torch.to(device=device), std_torch.to(device=device)

# Test-time preprocessing: tensor conversion only, no augmentation.
transform_test = T.Compose([T.ToTensor()])

# Downloads CIFAR-10 into ./datasets on first use; train=False selects the test split.
cifar10_test = dset.CIFAR10(
    './datasets',
    train=False,
    download=True,
    transform=transform_test,
)

In [None]:
def visualize(images, labels):
    """Plot `images` on a two-row grid, titling each with its class name.

    Args:
        images: sequence of image arrays; each one is cast to uint8 for display.
        labels: sequence of integer class ids, one per image, used to index
            the ten class names below.
    """
    classes = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
    # Two rows, so ceil(n / 2) columns are needed.
    num_cols = (len(images) + 1) // 2
    for slot, picture in enumerate(images, start=1):
        plt.subplot(2, num_cols, slot)
        class_id = labels[slot - 1]
        plt.imshow(picture.astype('uint8'))
        plt.axis('off')
        plt.title(classes[class_id])
    plt.show()

In [None]:
# Preview the first ten test images together with their labels.
preview_ids = range(10)
test_samples = [cifar10_test.data[k] for k in preview_ids]
test_labels = [cifar10_test.targets[k] for k in preview_ids]
print("Test Data:")
visualize(test_samples, test_labels)

## Restoring a naturally-trained ResNet classifier

In [None]:
from models import resnet50 as resnet

# Build the ResNet; the mean/std tensors are handed to the model constructor.
model = resnet(mean_torch, std_torch).to(device=device)

# Restore the naturally-trained weights from disk onto the active device.
print('Loading pre-trained model')
# NOTE(review): depending on the torch version, torch.load may unpickle
# arbitrary objects -- only load checkpoints from a trusted source.
state_dict = torch.load('./pretrained/vanilla.pt', map_location=device)
model.load_state_dict(state_dict)

## Evaluating the model

Before implementing attack methods, we have to evaluate the model for the following reasons.
1. To check whether the model is successfully restored.
2. To get samples that are correctly classified. We don't have to attack misclassified samples.

In [None]:
def evaluate(model, dataset, indices, attack_method=None, batch_size=1):
    """
    Evaluate the model on the dataset samples selected by `indices`.

    Images are read from `dataset.data`, transposed from (N, H, W, C) to
    (N, C, H, W) and cast to float32 without any rescaling. When an attack
    is supplied, each batch is perturbed before it is classified.

    Args:
        model: pytorch model that maps an image batch to per-class logits.
        dataset: Cifar-10 test dataset exposing `.data` (image array) and
            `.targets` (class ids).
        indices: Indices (array or list of ints) that select the samples.
        attack_method (optional): Object with a `perturb(images, labels)`
            method. If it is not None, it is applied to every batch before
            evaluation. Labels are passed to it as a CPU int64 tensor.
        batch_size (optional): Number of samples per forward pass. Defaults
            to 1, i.e. one image at a time.

    Returns:
        is_correct: int32 numpy array with one entry per index; 1 if that
        image was correctly predicted and 0 otherwise.

    Raises:
        ValueError: if `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    model.eval()

    # Feed inputs to wherever the model's weights live; fall back to the
    # notebook-level `device` for models without parameters.
    try:
        target_device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        target_device = device

    indices = np.asarray(indices)
    # Converted once up front instead of once per batch.
    all_targets = np.asarray(dataset.targets)

    per_batch_correct = []
    for bstart in range(0, len(indices), batch_size):
        batch_indices = indices[bstart:bstart + batch_size]

        # Construct batch: NHWC image array -> NCHW float32 tensor
        image_batch = np.transpose(dataset.data[batch_indices], (0, 3, 1, 2))
        image_batch = torch.tensor(image_batch, dtype=torch.float32, device=target_device)

        # Labels are built directly as int64 and stay on the CPU.
        label_batch = torch.as_tensor(all_targets[batch_indices], dtype=torch.int64)

        # Attack batch (outside no_grad: an attack may need gradients)
        if attack_method is not None:
            image_batch = attack_method.perturb(image_batch, label_batch)

        # Evaluate batch; no autograd graph is needed for a pure prediction
        with torch.no_grad():
            logit = model(image_batch)
        _, predicted = torch.max(logit, 1)

        per_batch_correct.append(predicted.cpu().numpy() == label_batch.numpy())

    if not per_batch_correct:
        return np.zeros([0], np.int32)
    # Single concatenation at the end instead of growing an array per batch.
    return np.concatenate(per_batch_correct, axis=0).astype(np.int32)

In [None]:
print('Evaluating naturally-trained model')
is_correct = evaluate(model, cifar10_test, np.arange(0, 1000))

print('Accuracy: {:.1f}%'.format(is_correct.mean() * 100))

# Keep at most the first 100 correctly classified samples as attack targets.
correct_indices = np.flatnonzero(is_correct == 1)[:100]

## Black-box attack with NES gradient estimation (NES)

Now, we will implement the NES attack, a black-box attack method proposed by [Ilyas et al.](https://arxiv.org/abs/1804.08598). It estimates the gradient with a vector-wise technique called NES (natural evolution strategies) and then performs PGD with the estimated gradients.

NES estimates the gradient by
<center>$\nabla_x L(\theta, x, y) \approx \frac{1}{2 \sigma n} \sum_{i=1}^{n}\left(L(x+\sigma u_i)-L(x-\sigma u_i)\right)u_i$</center>

where each $u_i$ is an image-sized random vector sampled from the standard normal distribution, $\sigma$ is the scale of the random perturbation, and $n$ is the number of sampled vectors.


Your code for this section will all be written inside `attacks/nes_attack.py`.

In [None]:
# First implement NES attack.
# Open attacks/nes_attack.py and follow instructions in the file.
from attacks.nes_attack import NESAttack

# Attack hyper-parameters. The asserts further down compare against raw pixel
# values in [0, 255], so epsilon is expressed on that scale
# (step_size presumably uses the same scale -- confirm in nes_attack.py).
epsilon = 8
step_size = 2
num_steps = 20
criterion = 'cw'

nes_attack = NESAttack(model, epsilon, step_size, num_steps, criterion, device)

# Build a one-image batch from the first correctly classified test sample:
# HWC -> CHW, then add a leading batch axis.
dataset = cifar10_test
index = 0
sample_image = np.transpose(dataset.data[correct_indices[index]], (2, 0, 1))
sample_image = np.expand_dims(sample_image, axis=0)
# NOTE(review): this tensor is not moved to `device` here; perturb()
# presumably handles device placement itself -- confirm.
sample_image = torch.Tensor(sample_image)

# Matching label as a length-1 int64 tensor.
sample_label = dataset.targets[correct_indices[index]]
sample_label = np.expand_dims(sample_label, axis=0)
sample_label = torch.Tensor(sample_label).to(dtype=torch.int64)

# Run the attack and take the model's predicted class for the adversarial image.
sample_adv_image = nes_attack.perturb(sample_image, sample_label)
_, sample_adv_label = torch.max(model(sample_adv_image), 1)

# Convert both images to numpy for the checks and plots below.
sample_image = sample_image.cpu().detach().numpy()
sample_adv_image = sample_adv_image.cpu().detach().numpy()

# Sanity checks: the largest per-pixel change stays within epsilon and the
# adversarial image remains in the valid [0, 255] pixel range.
assert np.amax(np.abs(sample_image-sample_adv_image)) <= epsilon
assert np.amin(sample_adv_image) >= 0
assert np.amax(sample_adv_image) <= 255

# Plot the original image
sample_image = [np.transpose(image, (1,2,0)) for image in sample_image]
visualize(sample_image, sample_label)

# Plot the adversarial image
sample_adv_image = [np.transpose(image, (1,2,0)) for image in sample_adv_image]
visualize(sample_adv_image, sample_adv_label)

# Evaluate performance on a naturally-trained model

Let's measure your attack's performance to check if you implemented it right. Also watch the attack success rate change as epsilon gets larger. If correctly implemented, the success rate will be about 70% or higher on epsilon 8. (Keep in mind that NES attack in our implementation attacks one image at a time, so the evaluation will take much longer than FGSM or PGD. Evaluation on a single epsilon may take up to 10 min.)

In [None]:
# Perturbation budgets to sweep; epsilon = 0 is included as a baseline.
epsilons = [0, 2, 4, 6, 8, 10]
attack_success_rates = []
criterion = 'cw'

# For each budget, attack the selected correctly classified samples and record
# the percentage that the model misclassifies after the attack.
# step_size and num_steps are the values assigned in an earlier cell.
for epsilon in epsilons:
    nes_attack = NESAttack(model, epsilon, step_size, num_steps, criterion, device)
    correct_predictions = evaluate(model, cifar10_test, correct_indices, attack_method=nes_attack)
    # evaluate() returns 1 for a correct prediction, so 1 - value marks a successful attack.
    attack_success_rate = np.mean(1 - correct_predictions) * 100
    attack_success_rates.append(attack_success_rate)
    print('Epsilon: {}, Attack success rate: {:.1f}%'.format(epsilon, attack_success_rate))

# Success rate (in percent) as a function of epsilon.
plt.plot(epsilons, attack_success_rates, '-bo', label='NES (cw loss)')
plt.ylim(-5, 105)
plt.xticks(epsilons)
plt.yticks(np.arange(0, 110, 10))
plt.xlabel('epsilon')
plt.ylabel('attack success rate')
plt.legend()

# Attacks on adversarially-trained model

In [None]:
# Re-create the ResNet architecture; the adversarially-trained weights are loaded below.
print('Creating a ResNet model')
model = resnet(mean_torch, std_torch).to(device)

# Load the adversarially-trained weights
# NOTE(review): depending on the torch version, torch.load may unpickle
# arbitrary objects -- only load checkpoints from a trusted source.
print('Loading an adversarially-trained model')
state_dict = torch.load("./pretrained/adv.pt", map_location=device)
model.load_state_dict(state_dict)


In [None]:
# Measure clean accuracy of the adversarially-trained model on the first
# 1000 test samples.
indices = np.arange(0, 1000)

print('Evaluating adversarially-trained model')
correct_predictions = evaluate(model, cifar10_test, indices)
accuracy = correct_predictions.mean() * 100
print('Accuracy: {:.1f}%'.format(accuracy))

# Keep at most the first 100 correctly classified samples as attack targets.
correct_indices = np.flatnonzero(correct_predictions == 1)[:100]

# Evaluate performance on an adversarially-trained model

This time you will check the same attack's performance on an adversarially-trained model. Compare the success rates with those obtained on the naturally-trained model.

In [None]:
# Perturbation budgets to sweep; epsilon = 0 is included as a baseline.
epsilons = [0, 2, 4, 6, 8, 10]
attack_success_rates = []
# Pin the loss explicitly so this cell does not depend on whichever earlier
# cell last assigned `criterion`, and so it matches the 'cw loss' plot label.
criterion = 'cw'

# For each budget, attack the selected correctly classified samples of the
# currently loaded `model` and record the percentage that end up misclassified.
# step_size and num_steps are the values assigned in an earlier cell.
for epsilon in epsilons:
    nes_attack = NESAttack(model, epsilon, step_size, num_steps, criterion, device)
    correct_predictions = evaluate(model, cifar10_test, correct_indices, attack_method=nes_attack)
    # evaluate() returns 1 for a correct prediction, so 1 - value marks a successful attack.
    attack_success_rate = np.mean(1 - correct_predictions) * 100
    attack_success_rates.append(attack_success_rate)
    print('Epsilon: {}, Attack success rate: {:.1f}%'.format(epsilon, attack_success_rate))

# Success rate (in percent) as a function of epsilon.
plt.plot(epsilons, attack_success_rates, '-bo', label='NES (cw loss)')
plt.ylim(-5, 105)
plt.xticks(epsilons)
plt.yticks(np.arange(0, 110, 10))
plt.xlabel('epsilon')
plt.ylabel('attack success rate')
plt.legend()