In [7]:
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm # Displays a progress bar

import torch
from torch import nn
from torch import optim
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.utils.data import Dataset, Subset, DataLoader, random_split
!pip install torchattacks
import torchattacks

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [8]:
# Build the MNIST train and test datasets (downloaded into the working directory if missing).
print("Loading datasets...")
my_transformer = transforms.Compose([transforms.ToTensor()])

# Arguments shared by both splits; only the `train` flag differs between them.
mnist_common = dict(root='.', download=True, transform=my_transformer)
MNIST_train = datasets.MNIST(train=True, **mnist_common)
MNIST_test = datasets.MNIST(train=False, **mnist_common)


Loading datasets...


In [9]:
# Mini-batch iterators: only the training data is shuffled; the test order stays fixed.
BATCH_SIZE = 100
trainloader = DataLoader(dataset=MNIST_train, batch_size=BATCH_SIZE, shuffle=True)
testloader = DataLoader(dataset=MNIST_test, batch_size=BATCH_SIZE, shuffle=False)

In [10]:
class Network(nn.Module):
    """Small CNN classifier: two conv/pool/ReLU stages followed by a linear layer.

    The linear head expects 1152 flattened features, which is 32 x 6 x 6 — the
    shape produced by the two stages for a 1 x 28 x 28 input. It emits 10 raw
    scores (logits) per sample; no softmax is applied here.
    """

    def __init__(self):
        super().__init__()
        # Stage 1: 5x5 convolution (1 -> 16 channels, no padding), 2x2 max-pool, ReLU.
        self.conv1 = nn.Sequential(
            nn.Conv2d(1, 16, 5),
            nn.MaxPool2d(2),
            nn.ReLU(),
        )
        # Stage 2: 5x5 convolution (16 -> 32 channels, stride 1, padding 2), 2x2 max-pool, ReLU.
        self.conv2 = nn.Sequential(
            nn.Conv2d(16, 32, 5, 1, 2),
            nn.MaxPool2d(2),
            nn.ReLU(),
        )
        self.out = nn.Linear(1152, 10)

    def forward(self, x):
        """Return the 10 class scores for each sample in the batch `x`."""
        features = self.conv2(self.conv1(x))
        # Collapse everything except the batch dimension before the linear head.
        flat = features.view(features.size(0), -1)
        return self.out(flat)

In [11]:
import os

device = "cuda" if torch.cuda.is_available() else "cpu" # Configure device

# Pre-trained baseline classifier; this is the model the PGD attack is crafted against.
BASELINE_CKPT = '/content/drive/MyDrive/Colab Notebooks/bestModelUpdated.pt'
if not os.path.exists(BASELINE_CKPT):
    # The checkpoint lives on Google Drive. Mount it here when it is not yet visible,
    # so this cell does not depend on a later cell having been run first.
    from google.colab import drive
    drive.mount('/content/drive')

model = Network().to(device)
# map_location lets a checkpoint saved on a GPU load on a CPU-only runtime as well.
model.load_state_dict(torch.load(BASELINE_CKPT, map_location=device))

# Freshly initialised network that will be trained on adversarial + clean batches.
attackedModel = Network().to(device)
criterion = nn.CrossEntropyLoss() # Specify the loss layer
optimizer = optim.AdamW(attackedModel.parameters(), lr=0.001)
# NOTE(review): num_epoch is not referenced by the training loop visible in this
# notebook, which makes a single pass over the data — confirm whether that is intended.
num_epoch = 5


In [12]:
# Mount Google Drive into the Colab runtime; checkpoints elsewhere in this
# notebook are read from and written to paths under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
def train_step(attackedModel, x_batch, y_batch, opt=None, loss_fn=None):
    """Perform one optimisation step of `attackedModel` on a single batch.

    Args:
        attackedModel: the network being trained.
        x_batch: input batch; moved to the model's device before the forward pass.
        y_batch: target labels; moved to the model's device as well.
        opt: optimizer to step. Defaults to the notebook-level `optimizer`.
        loss_fn: loss callable taking (predictions, targets). Defaults to the
            notebook-level `criterion`.

    Returns:
        None. The model parameters are updated in place.
    """
    opt = optimizer if opt is None else opt
    loss_fn = criterion if loss_fn is None else loss_fn

    # Follow whichever device the model lives on instead of hard-coding CUDA,
    # so the step also works on a CPU-only runtime.
    target_device = next(attackedModel.parameters()).device
    x_batch = x_batch.to(target_device)
    y_batch = y_batch.to(target_device)

    # An earlier evaluation may have left the model in eval mode.
    attackedModel.train()

    opt.zero_grad() # Clear gradients from the previous iteration
    pred = attackedModel(x_batch) # Calls the model's forward()
    loss = loss_fn(pred, y_batch) # Calculate the loss
    loss.backward() # Backprop gradients to all tensors in the network
    opt.step() # Update trainable weights
    opt.zero_grad() # Do not carry gradients over to whatever runs next
        

In [14]:
def evaluate(model, loader, loss_fn=None): # Evaluate accuracy on validation / test set
    """Compute accuracy and mean per-batch loss of `model` over `loader`.

    Args:
        model: network to evaluate; it is switched to eval mode.
        loader: iterable yielding (batch, label) tensor pairs.
        loss_fn: loss callable taking (predictions, targets). Defaults to the
            notebook-level `criterion`.

    Returns:
        (accuracy, mean_loss): accuracy is the fraction of correctly classified
        samples among those actually yielded by `loader`; mean_loss is the
        average of the per-batch loss values.
    """
    loss_fn = criterion if loss_fn is None else loss_fn
    # Evaluate on whichever device the model already lives on.
    target_device = next(model.parameters()).device

    running_loss = []
    correct = 0
    seen = 0
    model.eval() # Set the model to evaluation mode
    with torch.no_grad(): # Do not calculate gradients, to speed up computation
        for batch, label in tqdm(loader):
            batch = batch.to(target_device)
            label = label.to(target_device)
            pred = model(batch)
            running_loss.append(loss_fn(pred, label).item())
            correct += (torch.argmax(pred, dim=1) == label).sum().item()
            # Count what was really evaluated: len(loader.dataset) is wrong when the
            # loader drops the last batch or samples only part of the dataset.
            seen += label.size(0)
    acc = correct / seen
    print("Evaluation accuracy: {}".format(acc))
    return acc, np.mean(running_loss)

In [15]:
# Clean-data accuracy of the pre-trained model on both splits.
# Use the configured `device` rather than hard-coding CUDA, so the CPU fallback works.
model = model.to(device)
print("Evaluate on train set")
evaluate(model, trainloader)
print('--------------')
print("Evaluate on test set")
evaluate(model, testloader)

Evaluate on train set


100%|██████████| 600/600 [00:17<00:00, 34.65it/s] 


Evaluation accuracy: 0.9934166666666666
--------------
Evaluate on test set


100%|██████████| 100/100 [00:00<00:00, 102.30it/s]

Evaluation accuracy: 0.9901





(0.9901, 0.030249218420358374)

In [16]:
# Robust accuracy of the pre-trained `model`: craft L2-bounded PGD adversarial
# examples for every test batch and count how many are still classified correctly.
attack = torchattacks.PGDL2(model, eps=1.0, alpha=0.2, steps=10, random_start=True)
num_samples = 0
correct_prediction = 0
for image, target_label in tqdm(testloader):
  adv_images = attack(image, target_label)
  # The attack needs gradients internally; scoring its output does not.
  with torch.no_grad():
    output = model(adv_images)
  finalPred = torch.argmax(output,dim=1)
  # Compare on the prediction's device instead of forcing both tensors onto CUDA.
  correct_prediction += (finalPred == target_label.to(finalPred.device)).sum().item()
  # Count the real batch size so a short final batch is not over-counted.
  num_samples += target_label.size(0)

print("Number of Correct Predictions: " + str(correct_prediction))
print("Total Number of Samples: " + str(num_samples))
print()
accuracy = float(correct_prediction) / num_samples
print('Model Robust Accuracy: {:.3f}％'.format(accuracy * 100))
print('Successful Attack Accuracy: {:.3f}％'.format(100 - (accuracy * 100)))


100%|██████████| 100/100 [00:07<00:00, 13.81it/s]

Number of Correct Predictions: 7898
Total Number of Samples: 10000

Model Robust Accuracy: 78.980％
Successful Attack Accuracy: 21.020％





In [17]:
# Adversarial training: a single pass over the training set. For every batch,
# `attackedModel` is updated once on PGD examples produced by the current `attack`
# object and once on the clean images.
# The previous version also ran the adversarial batch through `model` and added to
# `correct_prediction` / `num_samples`; those totals were never reported and silently
# altered the counters left by the preceding evaluation cell, so that work is dropped.
attackedModel.train()
for image, target_label in tqdm(trainloader):
  adv_images = attack(image, target_label).to(device)
  image = image.to(device)
  target_label = target_label.to(device)
  train_step(attackedModel, adv_images, target_label)
  train_step(attackedModel, image, target_label)


100%|██████████| 600/600 [00:19<00:00, 30.05it/s]


In [18]:
# Persist the weights of the adversarially trained model to Drive, then report
# its accuracy on the clean test set.
robust_ckpt_path = "/content/drive/MyDrive/Colab Notebooks/bestModel_PGDl2.pt"
torch.save(attackedModel.state_dict(), robust_ckpt_path)

print("Evaluate on test set")
evaluate(attackedModel, testloader)

Evaluate on test set


100%|██████████| 100/100 [00:00<00:00, 101.97it/s]

Evaluation accuracy: 0.9778





(0.9778, 0.06448617097223178)

In [19]:
# PGD attack on attackedModel: same robust-accuracy measurement as for the baseline,
# but the adversarial examples are now crafted against (and scored by) attackedModel.
attack = torchattacks.PGDL2(attackedModel, eps=1.0, alpha=0.2, steps=10, random_start=True)
num_samples = 0
correct_prediction = 0
# NOTE(review): these two lists are never filled in the visible cells; they are kept
# only in case a later cell still references them — confirm and remove if unused.
perturbations=[]
labels=[]
for image, target_label in tqdm(testloader):
  adv_images = attack(image, target_label)
  # The attack needs gradients internally; scoring its output does not.
  with torch.no_grad():
    output = attackedModel(adv_images)
  finalPred = torch.argmax(output,dim=1)
  # Compare on the prediction's device instead of forcing both tensors onto CUDA.
  correct_prediction += (finalPred == target_label.to(finalPred.device)).sum().item()
  # Count the real batch size so a short final batch is not over-counted.
  num_samples += target_label.size(0)

print("Number of Correct Predictions: " + str(correct_prediction))
print("Total Number of Samples: " + str(num_samples))
print()
accuracy = float(correct_prediction) / num_samples
print('Model Robust Accuracy: {:.3f}％'.format(accuracy * 100))
print('Successful Attack Accuracy: {:.3f}％'.format(100 - (accuracy * 100)))

100%|██████████| 100/100 [00:02<00:00, 40.67it/s]

Number of Correct Predictions: 8746
Total Number of Samples: 10000

Model Robust Accuracy: 87.460％
Successful Attack Accuracy: 12.540％



