In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from advertorch.context import ctx_noparamgrad_and_eval
from advertorch.attacks import LinfPGDAttack
import numpy as np
import random
import vgg

# CIFAR-10 train/test splits stored under ./data (downloaded when missing);
# every image goes through transforms.ToTensor().
cifar10_train = datasets.CIFAR10(
    root="data",
    train=True,
    download=True,
    transform=transforms.ToTensor(),
)
cifar10_test = datasets.CIFAR10(
    root="data",
    train=False,
    download=True,
    transform=transforms.ToTensor(),
)

# Batches of 100 samples; only the training split is shuffled.
train_loader = DataLoader(dataset=cifar10_train, batch_size=100, shuffle=True)
test_loader = DataLoader(dataset=cifar10_test, batch_size=100, shuffle=False)

# First CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

Files already downloaded and verified
Files already downloaded and verified


In [3]:
def epoch(loader, model, opt=None, train_prob=None):
    """Run one pass over ``loader`` on unmodified inputs and report error/loss.

    When ``opt`` is given, the model is put in train mode and ``opt.step()``
    is called once per used batch. Without it, the model is put in eval mode
    and only scored, with autograd disabled.

    Args:
        loader: Iterable yielding ``(X, y)`` batches.
        model: Module called as ``model(X)``; its output is scored against
            ``y`` with ``nn.CrossEntropyLoss``.
        opt: Optimizer to train with; ``None`` selects evaluation mode.
        train_prob: Training mode only. Each batch is used with this
            probability (one ``random.random()`` draw per batch) and skipped
            otherwise. ``None`` uses every batch; ``0`` skips every batch.

    Returns:
        ``(error_rate, mean_loss)`` averaged over the samples that were
        actually processed, or ``(nan, nan)`` if no sample was processed.
    """
    criterion = nn.CrossEntropyLoss()
    training = opt is not None
    total_loss, total_err = 0., 0.
    processed_data_size = 0

    if training:
        model.train()
    else:
        model.eval()

    # Autograd bookkeeping is only needed when an optimizer consumes the gradients.
    with torch.set_grad_enabled(training):
        for X, y in loader:
            # Randomly subsample training batches; random.random() is only
            # drawn when a non-zero probability was supplied.
            if training and train_prob is not None:
                if not train_prob or random.random() > train_prob:
                    continue

            X, y = X.to(device), y.to(device)
            yp = model(X)
            loss = criterion(yp, y)

            if training:
                opt.zero_grad()
                loss.backward()
                opt.step()

            # Count the real batch size: the last batch may be shorter than
            # the loader's nominal batch size.
            n_samples = X.shape[0]
            total_err += (yp.max(dim=1)[1] != y).sum().item()
            total_loss += loss.item() * n_samples
            processed_data_size += n_samples

    if processed_data_size == 0:
        # Nothing was processed (empty loader or every batch skipped).
        return float("nan"), float("nan")
    return total_err / processed_data_size, total_loss / processed_data_size

In [4]:
def epoch_adversarial(loader, model, attack, opt=None, **kwargs):
    """Run one pass over ``loader`` on inputs perturbed by ``attack``.

    Every batch is replaced by ``attack.perturb(X, y)`` before it is fed to
    the model. When ``opt`` is given, the model is put in train mode and
    ``opt.step()`` is called once per batch, with the perturbation computed
    inside ``ctx_noparamgrad_and_eval(model)``. Without it, the model is put
    in eval mode and the perturbed batch is only scored.

    Args:
        loader: Iterable yielding ``(X, y)`` batches.
        model: Module called as ``model(X)``; its output is scored against
            ``y`` with ``nn.CrossEntropyLoss``.
        attack: Object exposing ``perturb(X, y)`` that returns the perturbed
            batch.
        opt: Optimizer to train with; ``None`` selects evaluation mode.
        **kwargs: Accepted for call compatibility; not used.

    Returns:
        ``(error_rate, mean_loss)`` averaged over the samples that were
        actually processed, or ``(nan, nan)`` if no sample was processed.
    """
    criterion = nn.CrossEntropyLoss()
    training = opt is not None
    total_loss, total_err = 0., 0.
    processed_data_size = 0

    if training:
        model.train()
    else:
        model.eval()

    for X, y in loader:
        X, y = X.to(device), y.to(device)

        # The attack runs with autograd enabled in both modes; only the
        # scoring pass below is restricted.
        if training:
            with ctx_noparamgrad_and_eval(model):
                X = attack.perturb(X, y)
        else:
            X = attack.perturb(X, y)

        # Parameter gradients are only needed when an optimizer consumes them.
        with torch.set_grad_enabled(training):
            yp = model(X)
            loss = criterion(yp, y)

        if training:
            opt.zero_grad()
            loss.backward()
            opt.step()

        # Count what was actually seen instead of assuming the loader visits
        # every sample of loader.dataset.
        n_samples = X.shape[0]
        total_err += (yp.max(dim=1)[1] != y).sum().item()
        total_loss += loss.item() * n_samples
        processed_data_size += n_samples

    if processed_data_size == 0:
        # Nothing was processed (empty loader).
        return float("nan"), float("nan")
    return total_err / processed_data_size, total_loss / processed_data_size

### Experiment 0: Adversarial Training on VGG16 

In [5]:
# Fix the NumPy and torch seeds before the model is built.
np.random.seed(0)
torch.manual_seed(0)

# vgg16 from the local `vgg` module, wrapped in nn.DataParallel and moved to `device`.
model_attacked = nn.DataParallel(vgg.vgg16())
model_attacked.to(device)

opt = optim.SGD(model_attacked.parameters(), lr=1e-1)

# PGD attack bound to the model being trained, so perturbations follow its current weights.
attack = LinfPGDAttack(
    predict=model_attacked,
    loss_fn=nn.CrossEntropyLoss(reduction="sum"),
    eps=0.05,
    nb_iter=10,
    eps_iter=0.01,
)

# Per epoch: train on perturbed batches, then score the test split both clean
# and perturbed. Printed columns: train error, clean test error, adversarial test error.
for t in range(50):
    train_err, train_loss = epoch_adversarial(train_loader, model_attacked, attack, opt)
    test_err, test_loss = epoch(test_loader, model_attacked)
    adv_err, adv_loss = epoch_adversarial(test_loader, model_attacked, attack)
    print("%.6f,%.6f,%.6f" % (train_err, test_err, adv_err))

# Checkpoint reused as the starting point of the transfer-learning cells.
torch.save(model_attacked.state_dict(), "transferFromAdversarial_attacked.pt")

0.903860,0.900000,0.900000
0.891840,0.900000,0.900000
0.903660,0.900000,0.900000
0.901140,0.888400,0.891200
0.895040,0.893900,0.898600
0.895400,0.897900,0.899600
0.850920,0.863700,0.880000
0.897160,0.828400,0.865600
0.835980,0.844600,0.857300
0.820520,0.794300,0.828200
0.801520,0.751700,0.803300
0.790260,0.693600,0.778000
0.786620,0.688500,0.770100
0.779960,0.673200,0.771700
0.772020,0.675500,0.766400
0.769620,0.672900,0.764800
0.767260,0.636700,0.754900
0.762120,0.646100,0.759100
0.757400,0.632600,0.747000
0.750280,0.621200,0.747300
0.747100,0.605900,0.748700
0.740880,0.587200,0.738500
0.737700,0.607300,0.738400
0.732880,0.592400,0.724400
0.726080,0.577200,0.733500
0.723020,0.577100,0.731100
0.718560,0.572200,0.740400
0.714240,0.548500,0.740800
0.709140,0.571300,0.722400
0.702760,0.546600,0.722600
0.697540,0.554700,0.731800
0.692940,0.534800,0.734300
0.687340,0.549900,0.730400
0.681540,0.544100,0.725300
0.674020,0.525900,0.730700
0.671300,0.519300,0.727300
0.662580,0.515700,0.727200
0

### Experiment 1: Transfer Learning Trained with 20% Benign Images (each training batch is used with probability 0.2)

In [6]:
# Seed every RNG in play: `epoch` decides which batches to train on with
# Python's `random`, so seeding torch and NumPy alone is not reproducible.
torch.manual_seed(0)
np.random.seed(0)
random.seed(0)

model_benign1 = vgg.__dict__["vgg16"]()
model_benign1 = nn.DataParallel(model_benign1)
# Start from the adversarially trained checkpoint; map_location lets a
# checkpoint saved from GPU load on a CPU-only machine.
model_benign1.load_state_dict(
    torch.load("transferFromAdversarial_attacked.pt", map_location=device)
)
model_benign1.to(device)
opt = optim.SGD(model_benign1.parameters(), lr=1e-1)
# The attack is only used to score robustness on the test split below.
attack = LinfPGDAttack(
    predict=model_benign1,
    loss_fn=nn.CrossEntropyLoss(reduction="sum"),
    eps=0.05, nb_iter=10, eps_iter=0.01
)

# Per epoch: train on clean batches (each used with probability 0.2), then
# print train error, clean test error, adversarial test error.
for t in range(50):
    train_err, train_loss = epoch(train_loader, model_benign1, opt, 0.2)
    test_err, test_loss = epoch(test_loader, model_benign1)
    adv_err, adv_loss = epoch_adversarial(test_loader, model_benign1, attack)
    print("%.6f,%.6f,%.6f" % (train_err, test_err, adv_err))

torch.save(model_benign1.state_dict(), "transferFromAdversarial_benign1.pt")

0.285714,0.511300,0.857300
0.277925,0.430200,0.895000
0.261226,0.422500,0.908800
0.268571,0.435500,0.920100
0.253636,0.419400,0.909500
0.240450,0.405200,0.919300
0.249423,0.400100,0.927400
0.228095,0.398400,0.922700
0.218544,0.397900,0.935400
0.222444,0.382300,0.938200
0.205214,0.380600,0.948600
0.206869,0.372200,0.959100
0.181647,0.373200,0.952200
0.188077,0.426200,0.956200
0.183516,0.364900,0.950400
0.177700,0.363300,0.954300
0.165612,0.369900,0.952400
0.164316,0.371600,0.956700
0.153776,0.386800,0.959900
0.148932,0.364300,0.958400
0.142991,0.358600,0.960500
0.138556,0.355900,0.965500
0.134878,0.361500,0.951300
0.133119,0.347700,0.963600
0.123820,0.343700,0.959400
0.116064,0.367900,0.946300
0.119636,0.354200,0.954900
0.110392,0.349300,0.960300
0.103564,0.386000,0.965800
0.101075,0.341400,0.955100
0.105400,0.342900,0.959200
0.095050,0.325800,0.954000
0.095800,0.343800,0.970600
0.088857,0.327900,0.956800
0.089588,0.322500,0.953000
0.082088,0.330200,0.951100
0.086957,0.336500,0.951000
0

### Experiment 2: Transfer Learning Trained with 40% Benign Images (each training batch is used with probability 0.4)

In [7]:
# Seed every RNG in play: `epoch` decides which batches to train on with
# Python's `random`, so seeding torch and NumPy alone is not reproducible.
torch.manual_seed(0)
np.random.seed(0)
random.seed(0)

model_benign2 = vgg.__dict__["vgg16"]()
model_benign2 = nn.DataParallel(model_benign2)
# Start from the adversarially trained checkpoint; map_location lets a
# checkpoint saved from GPU load on a CPU-only machine.
model_benign2.load_state_dict(
    torch.load("transferFromAdversarial_attacked.pt", map_location=device)
)
model_benign2.to(device)
opt = optim.SGD(model_benign2.parameters(), lr=1e-1)
# The attack is only used to score robustness on the test split below.
attack = LinfPGDAttack(
    predict=model_benign2,
    loss_fn=nn.CrossEntropyLoss(reduction="sum"),
    eps=0.05, nb_iter=10, eps_iter=0.01
)

# Per epoch: train on clean batches (each used with probability 0.4), then
# print train error, clean test error, adversarial test error.
for t in range(50):
    train_err, train_loss = epoch(train_loader, model_benign2, opt, 0.4)
    test_err, test_loss = epoch(test_loader, model_benign2)
    adv_err, adv_loss = epoch_adversarial(test_loader, model_benign2, attack)
    print("%.6f,%.6f,%.6f" % (train_err, test_err, adv_err))

torch.save(model_benign2.state_dict(), "transferFromAdversarial_benign2.pt")

0.281896,0.438500,0.887900
0.266716,0.417800,0.909300
0.258492,0.410100,0.930800
0.238857,0.384100,0.938600
0.227393,0.404500,0.940500
0.213011,0.375300,0.948200
0.197761,0.361700,0.948600
0.184182,0.356600,0.954600
0.175502,0.355500,0.952700
0.160205,0.358100,0.956400
0.146839,0.354100,0.955100
0.137450,0.347100,0.965900
0.132892,0.353800,0.959900
0.119427,0.344700,0.961700
0.107579,0.330600,0.965200
0.105722,0.349200,0.968800
0.096394,0.335300,0.966200
0.083434,0.340100,0.953800
0.085622,0.343600,0.967500
0.077318,0.329000,0.944200
0.073930,0.337500,0.955100
0.069150,0.327000,0.947400
0.063437,0.328700,0.948700
0.060311,0.325900,0.937300
0.052161,0.316300,0.942100
0.046700,0.319000,0.940700
0.045400,0.343500,0.949700
0.043676,0.316800,0.942200
0.040939,0.318100,0.931000
0.039184,0.318200,0.930300
0.036720,0.326500,0.926200
0.033722,0.303100,0.924400
0.036333,0.313200,0.922600
0.030000,0.314100,0.944700
0.032079,0.312300,0.945300
0.025522,0.316900,0.916200
0.027014,0.320600,0.932500
0

### Experiment 3: Transfer Learning Trained with 60% Benign Images (each training batch is used with probability 0.6)

In [8]:
# Seed every RNG in play: `epoch` decides which batches to train on with
# Python's `random`, so seeding torch and NumPy alone is not reproducible.
torch.manual_seed(0)
np.random.seed(0)
random.seed(0)

model_benign3 = vgg.__dict__["vgg16"]()
model_benign3 = nn.DataParallel(model_benign3)
# Start from the adversarially trained checkpoint; map_location lets a
# checkpoint saved from GPU load on a CPU-only machine.
model_benign3.load_state_dict(
    torch.load("transferFromAdversarial_attacked.pt", map_location=device)
)
model_benign3.to(device)
opt = optim.SGD(model_benign3.parameters(), lr=1e-1)
# The attack is only used to score robustness on the test split below.
attack = LinfPGDAttack(
    predict=model_benign3,
    loss_fn=nn.CrossEntropyLoss(reduction="sum"),
    eps=0.05, nb_iter=10, eps_iter=0.01
)

# Per epoch: train on clean batches (each used with probability 0.6), then
# print train error, clean test error, adversarial test error.
for t in range(50):
    train_err, train_loss = epoch(train_loader, model_benign3, opt, 0.6)
    test_err, test_loss = epoch(test_loader, model_benign3)
    adv_err, adv_loss = epoch_adversarial(test_loader, model_benign3, attack)
    print("%.6f,%.6f,%.6f" % (train_err, test_err, adv_err))

torch.save(model_benign3.state_dict(), "transferFromAdversarial_benign3.pt")

0.285455,0.423700,0.891000
0.267500,0.425300,0.904900
0.246146,0.380300,0.935700
0.221567,0.393700,0.936600
0.200239,0.372400,0.954300
0.177785,0.370400,0.954200
0.158213,0.351800,0.963300
0.147340,0.376700,0.954700
0.133058,0.333500,0.960500
0.115592,0.359400,0.970700
0.105113,0.336800,0.961100
0.094128,0.323800,0.962500
0.081389,0.337100,0.960200
0.072928,0.336300,0.948300
0.069729,0.325400,0.950300
0.061893,0.334300,0.947300
0.051968,0.326500,0.927400
0.052104,0.326200,0.942100
0.043030,0.335900,0.936700
0.042241,0.316800,0.923700
0.036358,0.324400,0.913200
0.033271,0.317200,0.932700
0.033579,0.312500,0.916600
0.032774,0.319800,0.915500
0.028701,0.318300,0.934900
0.026375,0.310000,0.892800
0.022303,0.312100,0.929200
0.023454,0.313700,0.916600
0.017197,0.304200,0.900100
0.017724,0.310800,0.886900
0.016980,0.308600,0.891800
0.020897,0.316100,0.913100
0.014746,0.312200,0.896400
0.016128,0.310300,0.882300
0.016856,0.309200,0.892800
0.014186,0.311200,0.887500
0.012876,0.307700,0.861600
0

### Experiment 4: Transfer Learning Trained with 80% Benign Images (each training batch is used with probability 0.8)

In [9]:
# Seed every RNG in play: `epoch` decides which batches to train on with
# Python's `random`, so seeding torch and NumPy alone is not reproducible.
torch.manual_seed(0)
np.random.seed(0)
random.seed(0)

model_benign4 = vgg.__dict__["vgg16"]()
model_benign4 = nn.DataParallel(model_benign4)
# Start from the adversarially trained checkpoint; map_location lets a
# checkpoint saved from GPU load on a CPU-only machine.
model_benign4.load_state_dict(
    torch.load("transferFromAdversarial_attacked.pt", map_location=device)
)
model_benign4.to(device)
opt = optim.SGD(model_benign4.parameters(), lr=1e-1)
# The attack is only used to score robustness on the test split below.
attack = LinfPGDAttack(
    predict=model_benign4,
    loss_fn=nn.CrossEntropyLoss(reduction="sum"),
    eps=0.05, nb_iter=10, eps_iter=0.01
)

# Per epoch: train on clean batches (each used with probability 0.8), then
# print train error, clean test error, adversarial test error.
for t in range(50):
    train_err, train_loss = epoch(train_loader, model_benign4, opt, 0.8)
    test_err, test_loss = epoch(test_loader, model_benign4)
    adv_err, adv_loss = epoch_adversarial(test_loader, model_benign4, attack)
    print("%.6f,%.6f,%.6f" % (train_err, test_err, adv_err))

torch.save(model_benign4.state_dict(), "transferFromAdversarial_benign4.pt")

0.283081,0.423500,0.914400
0.262678,0.424300,0.937500
0.228815,0.378100,0.945000
0.201654,0.363900,0.961000
0.176314,0.360100,0.958600
0.147426,0.352500,0.962000
0.128977,0.339700,0.968700
0.112295,0.341200,0.971100
0.097794,0.327300,0.954000
0.080149,0.362200,0.956800
0.070885,0.320400,0.935500
0.060709,0.328900,0.950700
0.051990,0.326800,0.944600
0.045470,0.333400,0.941200
0.039682,0.307900,0.926100
0.033181,0.310800,0.933000
0.032587,0.305800,0.936300
0.027792,0.319200,0.926600
0.026015,0.308500,0.913100
0.024377,0.311900,0.919200
0.022087,0.310100,0.907700
0.018175,0.303100,0.893700
0.018179,0.323100,0.894600
0.018881,0.309400,0.890300
0.017094,0.307300,0.901600
0.016788,0.307400,0.908600
0.014988,0.301600,0.895200
0.011859,0.296300,0.846500
0.010628,0.305200,0.892000
0.010988,0.308500,0.861100
0.010148,0.301700,0.874500
0.010733,0.306500,0.897500
0.010000,0.298200,0.861400
0.011204,0.297900,0.871900
0.010224,0.293600,0.870900
0.008139,0.301800,0.879700
0.007455,0.300500,0.871500
0

### Experiment 5: Transfer Learning Trained with 100% Benign Images

In [10]:
# Seed every RNG in play: `epoch` draws from Python's `random` once per
# training batch, so it is seeded alongside torch and NumPy.
torch.manual_seed(0)
np.random.seed(0)
random.seed(0)

model_benign5 = vgg.__dict__["vgg16"]()
model_benign5 = nn.DataParallel(model_benign5)
# Start from the adversarially trained checkpoint; map_location lets a
# checkpoint saved from GPU load on a CPU-only machine.
model_benign5.load_state_dict(
    torch.load("transferFromAdversarial_attacked.pt", map_location=device)
)
model_benign5.to(device)
opt = optim.SGD(model_benign5.parameters(), lr=1e-1)
# The attack is only used to score robustness on the test split below.
attack = LinfPGDAttack(
    predict=model_benign5,
    loss_fn=nn.CrossEntropyLoss(reduction="sum"),
    eps=0.05, nb_iter=10, eps_iter=0.01
)

# Per epoch: train on clean batches (train_prob=1.0, so every batch is used),
# then print train error, clean test error, adversarial test error.
for t in range(50):
    train_err, train_loss = epoch(train_loader, model_benign5, opt, 1.0)
    test_err, test_loss = epoch(test_loader, model_benign5)
    adv_err, adv_loss = epoch_adversarial(test_loader, model_benign5, attack)
    print("%.6f,%.6f,%.6f" % (train_err, test_err, adv_err))

torch.save(model_benign5.state_dict(), "transferFromAdversarial_benign5.pt")

0.284600,0.430000,0.919500
0.251840,0.402700,0.939200
0.214100,0.381800,0.952400
0.181800,0.344400,0.959900
0.150760,0.364300,0.965300
0.122800,0.339200,0.965200
0.103600,0.334400,0.945900
0.084780,0.335000,0.955200
0.070280,0.314900,0.957000
0.060060,0.333100,0.948400
0.049060,0.310900,0.952400
0.042040,0.325100,0.940100
0.035780,0.314900,0.942700
0.032460,0.333300,0.916100
0.027300,0.316600,0.937000
0.022280,0.315400,0.939600
0.022620,0.304000,0.912800
0.017160,0.293200,0.910500
0.019820,0.303700,0.893300
0.019040,0.301300,0.910700
0.013660,0.302000,0.917300
0.015440,0.297700,0.890000
0.011560,0.303300,0.878900
0.011540,0.298100,0.855900
0.010740,0.293900,0.867400
0.012320,0.305900,0.885000
0.010480,0.303000,0.886500
0.009380,0.295600,0.873900
0.005480,0.314500,0.891600
0.007720,0.312000,0.881400
0.007940,0.302300,0.883100
0.008260,0.295200,0.870700
0.006800,0.302300,0.851400
0.004700,0.287000,0.848700
0.006640,0.297200,0.886700
0.006680,0.295200,0.846000
0.007380,0.285100,0.865000
0