In [2]:
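# Imports. NOTE: no autoreload extension is loaded in this cell, so edits to the src modules need a kernel restart (or add `%load_ext autoreload` / `%autoreload 2`).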
import os
from pathlib import Path
import getpass
import numpy as np
import pandas as pd
import time
import math
from collections import defaultdict

import torch
from torch.utils.data import DataLoader
from torch import nn
from tqdm import tqdm
import random
import sys
from torch.utils.data import random_split
from matplotlib import pyplot as plt

# Allow the `src.*` imports below when the notebook is started from the project
# directory or one level beneath it. Only entries that are still missing are
# added, so re-running this cell does not pile up duplicate sys.path entries.
sys.path.extend([p for p in ('.', '..') if p not in sys.path])

# local
from src.helpers.helpers import get_random_indexes, get_random_classes
from src.model.dino_model import get_dino
from src.model.train import *
from src.model.data import *

# Seed the Python, PyTorch and NumPy random number generators with one shared
# value so that runs of this notebook are repeatable.
SEED = 42
for _seed_rng in (random.seed, torch.manual_seed, np.random.seed):
    _seed_rng(SEED)

# Login name of whoever runs the notebook (not referenced by the visible cells).
username = getpass.getuser()
# Root directories on the cluster scratch space.
DATA_PATH = Path('/cluster/scratch/thobauma/dl_data')
MAX_PATH = Path('/cluster/scratch/mmathys/dl_data')

# Intermediate (post-hoc) outputs and the models trained on them.
BASE_POSTHOC_PATH = MAX_PATH / 'posthoc-fixed-labels'
#BASE_POSTHOC_PATH = MAX_PATH / 'posthoc-subset'
POSTHOC_MODELS_PATH = MAX_PATH / 'posthoc-models'

# --- Original dataset -------------------------------------------------------
ORI_PATH = DATA_PATH / 'ori_data'
CLASS_SUBSET_PATH = ORI_PATH / 'class_subset.npy'

TR_PATH = ORI_PATH / 'train'
TR_ORI_LABEL_PATH = TR_PATH / 'correct_labels.txt'
TR_ORI_IMAGES_PATH = TR_PATH / 'images'

VAL_PATH = ORI_PATH / 'validation'
VAL_ORI_LABEL_PATH = VAL_PATH / 'correct_labels.txt'
VAL_ORI_IMAGES_PATH = VAL_PATH / 'images'

# --- DAmageNet --------------------------------------------------------------
DN_PATH = DATA_PATH / 'damageNet'
DN_LABEL_PATH = DN_PATH / 'val_damagenet.txt'
DN_IMAGES_PATH = DN_PATH / 'images'
DN_POSTHOC_PATH = BASE_POSTHOC_PATH / 'damagenet'
DN_POSTHOC_LABEL_PATH = DN_POSTHOC_PATH / 'labels.csv'

# NOTE(review): the *_POSTHOC_* constants below use the directory names
# 'pgd' / 'fgsm' and 'validation', while the datasets_paths cell builds its
# post-hoc paths from 'pgd_06' / 'fgsm_06' and 'val' -- confirm which layout
# actually exists on disk before relying on these constants.

# --- PGD --------------------------------------------------------------------
# The adversarial splits reuse the label files of the original dataset.
TR_PGD_PATH = MAX_PATH / 'adversarial_data' / 'pgd_06' / 'train'
TR_PGD_LABEL_PATH = TR_ORI_LABEL_PATH
TR_PGD_IMAGES_PATH = TR_PGD_PATH / 'images'
TR_PGD_POSTHOC_PATH = BASE_POSTHOC_PATH / 'pgd' / 'train'
TR_PGD_POSTHOC_LABEL_PATH = TR_PGD_POSTHOC_PATH / 'labels.csv'

VAL_PGD_PATH = MAX_PATH / 'adversarial_data' / 'pgd_06' / 'validation'
VAL_PGD_LABEL_PATH = VAL_ORI_LABEL_PATH
VAL_PGD_IMAGES_PATH = VAL_PGD_PATH / 'images'
VAL_PGD_POSTHOC_PATH = BASE_POSTHOC_PATH / 'pgd' / 'validation'
VAL_PGD_POSTHOC_LABEL_PATH = VAL_PGD_POSTHOC_PATH / 'labels.csv'

# --- CW ---------------------------------------------------------------------
TR_CW_PATH = MAX_PATH / 'adversarial_data' / 'cw' / 'train'
TR_CW_LABEL_PATH = TR_ORI_LABEL_PATH
TR_CW_IMAGES_PATH = TR_CW_PATH / 'images'
TR_CW_POSTHOC_PATH = BASE_POSTHOC_PATH / 'cw' / 'train'
TR_CW_POSTHOC_LABEL_PATH = TR_CW_POSTHOC_PATH / 'labels.csv'

VAL_CW_PATH = MAX_PATH / 'adversarial_data' / 'cw' / 'validation'
VAL_CW_LABEL_PATH = VAL_ORI_LABEL_PATH
VAL_CW_IMAGES_PATH = VAL_CW_PATH / 'images'
VAL_CW_POSTHOC_PATH = BASE_POSTHOC_PATH / 'cw' / 'validation'
VAL_CW_POSTHOC_LABEL_PATH = VAL_CW_POSTHOC_PATH / 'labels.csv'

# --- FGSM -------------------------------------------------------------------
TR_FGSM_PATH = MAX_PATH / 'adversarial_data' / 'fgsm_06' / 'train'
TR_FGSM_LABEL_PATH = TR_ORI_LABEL_PATH
TR_FGSM_IMAGES_PATH = TR_FGSM_PATH / 'images'
TR_FGSM_POSTHOC_PATH = BASE_POSTHOC_PATH / 'fgsm' / 'train'
TR_FGSM_POSTHOC_LABEL_PATH = TR_FGSM_POSTHOC_PATH / 'labels.csv'

VAL_FGSM_PATH = MAX_PATH / 'adversarial_data' / 'fgsm_06' / 'validation'
VAL_FGSM_LABEL_PATH = VAL_ORI_LABEL_PATH
VAL_FGSM_IMAGES_PATH = VAL_FGSM_PATH / 'images'
VAL_FGSM_POSTHOC_PATH = BASE_POSTHOC_PATH / 'fgsm' / 'validation'
VAL_FGSM_POSTHOC_LABEL_PATH = VAL_FGSM_POSTHOC_PATH / 'labels.csv'

In [3]:
# Subset selection. A non-None CLASS_SUBSET takes precedence and INDEX_SUBSET is
# then ignored; set CLASS_SUBSET = None to select images by index instead.
# Fresh random subsets can be drawn with:
# INDEX_SUBSET = get_random_indexes(number_of_images = 50000, n_samples=1000)
# CLASS_SUBSET = get_random_classes(number_of_classes = 25, min_rand_class = 1, max_rand_class = 1001)
CLASS_SUBSET = np.load(CLASS_SUBSET_PATH)  # class subset stored next to the original data
INDEX_SUBSET = None

# Data loading and training settings.
BATCH_SIZE = 256
EPOCHS = 3
NUM_WORKERS = 0
PIN_MEMORY = True

DEVICE = 'cuda'

In [4]:
# Layout of datasets_paths:
#   datasets_paths[dataset][kind][split] -> {'label': ..., 'images': ...}
# with kind 'b' for the base files (images + label file) and kind 'p' for the
# post-hoc outputs (added further down), and split being 'train' or 'val'.

# (train image dir, val image dir) for every dataset; every dataset shares the
# label files of the original train / validation split.
_base_image_dirs = {
    'cw': (TR_CW_IMAGES_PATH, VAL_CW_IMAGES_PATH),
    'ori': (TR_ORI_IMAGES_PATH, VAL_ORI_IMAGES_PATH),
    'dn': (None, DN_IMAGES_PATH),
    'fgsm_06': (TR_FGSM_IMAGES_PATH, VAL_FGSM_IMAGES_PATH),
    'pgd_06': (TR_PGD_IMAGES_PATH, VAL_PGD_IMAGES_PATH),
}

datasets_paths = {
    ds_name: {
        'b': {
            'train': {'label': TR_ORI_LABEL_PATH, 'images': train_images},
            'val': {'label': VAL_ORI_LABEL_PATH, 'images': val_images},
        }
    }
    for ds_name, (train_images, val_images) in _base_image_dirs.items()
}

# NOTE(review): the 'dn' train entry has no images and its label points at the
# CW adversarial training *directory*, which looks like a copy-paste
# placeholder. Kept as-is because 'dn' is not in the dataset lists used below --
# confirm the intended value (DN_LABEL_PATH is defined above but unused here).
datasets_paths['dn']['b']['train']['label'] = TR_CW_PATH

# Attach the post-hoc locations: <BASE_POSTHOC_PATH>/<dataset>/<split>/{images, labels.csv}
datasets = ['ori', 'cw', 'pgd_06', 'fgsm_06']
for ds in datasets:
    posthoc_root = Path(BASE_POSTHOC_PATH, ds)
    datasets_paths[ds]['p'] = {
        split: {
            'images': posthoc_root / split / 'images',
            'label': posthoc_root / split / 'labels.csv',
        }
        for split in ('train', 'val')
    }
    

In [5]:
adv_datasets = ['cw', 'pgd_06', 'fgsm_06']

# Merged label tables per attack, read from
# <BASE_POSTHOC_PATH>/<attack>/<split>/labels_merged.csv
train_dfs = {
    ds: pd.read_csv(Path(BASE_POSTHOC_PATH, ds, 'train', 'labels_merged.csv'))
    for ds in adv_datasets
}
val_dfs = {
    ds: pd.read_csv(Path(BASE_POSTHOC_PATH, ds, 'val', 'labels_merged.csv'))
    for ds in adv_datasets
}

# Training rows are restricted to samples whose original prediction matches the
# true label while the prediction on the attacked image does not.
# NOTE(review): val_dfs is left unfiltered -- confirm that this is intended.
for name in adv_datasets:
    adv_pred_col = name + '_pred'
    merged = train_dfs[name]
    correct_on_clean = merged[merged['true_labels'] == merged['ori_pred']]
    wrong_on_adv = correct_on_clean[
        correct_on_clean['true_labels'] != correct_on_clean[adv_pred_col]
    ]
    train_dfs[name] = wrong_on_adv[['file', 'true_labels', 'ori_pred', adv_pred_col]]

In [6]:
class AdvDataset(torch.utils.data.Dataset):
    """Binary dataset of saved tensors: adversarial (label 1) vs. original (label 0).

    Every row of ``index_df`` contributes two samples. Indices
    ``0 .. len(index_df) - 1`` load the tensor from ``adv_img_folder`` (label 1);
    indices ``len(index_df) .. 2 * len(index_df) - 1`` load the tensor with the
    same file name from ``or_img_folder`` (label 0).

    Args:
        or_img_folder: directory holding the ``.pt`` files of the original images.
        adv_img_folder: directory holding the ``.pt`` files of the adversarial images.
        index_df: DataFrame with a ``'file'`` column; the part of each entry
            before its first ``'.'`` plus ``'.pt'`` is the tensor file name.
    """

    def __init__(self, or_img_folder, adv_img_folder, index_df):
        super().__init__()
        self.or_img_folder = or_img_folder
        self.adv_img_folder = adv_img_folder
        self.index_df = index_df

    def __len__(self):
        """Number of samples: two (adversarial + original) per index row."""
        return len(self.index_df) * 2

    def __getitem__(self, index):
        """Return ``(payload, label, filename)`` for the sample at ``index``.

        Raises:
            IndexError: if ``index`` lies outside ``[-len(self), len(self))``.
                Without this, plain iteration (``for x in dataset``) would never
                stop, because the modulo below wraps every index back into range.
        """
        n_rows = len(self.index_df)
        total = 2 * n_rows
        if index < 0:
            # Usual sequence semantics: dataset[-1] is dataset[len(dataset) - 1].
            index += total
        if not 0 <= index < total:
            raise IndexError(f'index out of range for AdvDataset of length {total}')

        is_original = index >= n_rows
        filename = self.index_df['file'].iloc[index % n_rows]
        filename = filename.split('.')[0] + '.pt'

        folder = self.or_img_folder if is_original else self.adv_img_folder
        # map_location='cpu' puts the tensor on the CPU directly, instead of
        # first restoring it on the device it was saved from and copying after.
        payload = torch.load(Path(folder, filename), map_location='cpu')
        label = 0 if is_original else 1
        return payload, label, filename

In [7]:
# Linear Binary Classifier
class LinearBC(nn.Module):
    """Linear binary classifier: one fully connected layer producing two logits.

    Args:
        input_shape: number of input features per sample (an int, despite the name).
    """

    def __init__(self, input_shape):
        super().__init__()
        self.num_labels = 2
        self.fc1 = nn.Linear(input_shape, self.num_labels)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        return self.fc1(x)

In [None]:

# Train one linear binary classifier per attack on the stored post-hoc tensors
# (original vs. adversarial) and keep whatever train() returns per attack.
logger_dict = {}
for ds in adv_datasets:
    print(f'{ds}')
    ori_p = datasets_paths['ori']['p']
    ds_p = datasets_paths[ds]['p']

    # Datasets pair the original tensors with the tensors of the current attack.
    train_set = AdvDataset(ori_p['train']['images'], ds_p['train']['images'], train_dfs[ds])
    val_set = AdvDataset(ori_p['val']['images'], ds_p['val']['images'], val_dfs[ds])

    # Loaders: identical settings, only the training data is shuffled.
    loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY)
    train_loader = DataLoader(train_set, shuffle=True, **loader_kwargs)
    val_loader = DataLoader(val_set, shuffle=False, **loader_kwargs)

    # Network: 1536 input features -- presumably n=4 concatenated 384-dim tokens; TODO confirm.
    classifier = LinearBC(1536)
    # NOTE(review): the DEVICE constant from the config cell is not used here.
    classifier.cuda()
    # The optimizer is created only after the parameters were moved to the GPU.
    optimizer = torch.optim.Adagrad(classifier.parameters(), lr=0.001, lr_decay=1e-08, weight_decay=0)
    criterion = nn.CrossEntropyLoss()

    # NOTE(review): batch_size=16 differs from the BATCH_SIZE the loaders use;
    # check how train() consumes this argument.
    # The recorded output shows train() picking up an existing checkpoint from log_dir.
    logger_dict[ds] = train(
        model=None,
        classifier=classifier,
        train_loader=train_loader,
        validation_loader=val_loader,
        log_dir=Path(POSTHOC_MODELS_PATH, ds),
        tensor_dir=None,
        optimizer=optimizer,
        criterion=criterion,
        adversarial_attack=None,
        epochs=EPOCHS,
        val_freq=1,
        batch_size=16,
        lr=0.001,
        to_restore={"epoch": 0, "best_acc": 0.},
        n=4,
        avgpool_patchtokens=False,
    )

    

cw
Found checkpoint at /cluster/scratch/mmathys/dl_data/posthoc-models/cw/checkpoint.pth.tar
=> loaded 'state_dict' from checkpoint '/cluster/scratch/mmathys/dl_data/posthoc-models/cw/checkpoint.pth.tar' with msg <All keys matched successfully>
=> loaded 'optimizer' from checkpoint: '/cluster/scratch/mmathys/dl_data/posthoc-models/cw/checkpoint.pth.tar'
=> loaded 'scheduler' from checkpoint: '/cluster/scratch/mmathys/dl_data/posthoc-models/cw/checkpoint.pth.tar'
Training of the supervised linear classifier on frozen features completed.
Top-1 test accuracy: 96.6
pgd_06
Epoch: [0]  [  0/249]  eta: 0:05:12  lr: 0.001000  loss: 1.268624 (1.268624)  time: 1.253843  data: 1.245587  max mem: 2
Epoch: [0]  [ 20/249]  eta: 0:04:35  lr: 0.001000  loss: 0.141415 (0.268872)  time: 1.201866  data: 1.200620  max mem: 2
Epoch: [0]  [ 40/249]  eta: 0:04:08  lr: 0.001000  loss: 0.060325 (0.169525)  time: 1.175437  data: 1.174234  max mem: 2
Epoch: [0]  [ 60/249]  eta: 0:03:52  lr: 0.001000  loss: 0.041

In [None]:
def posthoc_forward_pass(datasets, datasets_paths):
    """Run ``validate_network`` over the train and val split of every dataset.

    For each dataset name the base images / labels (``datasets_paths[ds]['b']``)
    are wrapped in an ``ImageDataset`` and passed through ``validate_network``,
    with the post-hoc locations (``datasets_paths[ds]['p']``) handed over as
    ``tensor_dir`` and ``path_predictions`` -- presumably where tensors and
    predictions get written; verify against ``validate_network``.

    The results are collected in a fresh dictionary. The earlier version wrote
    into the module-level ``logger_dict``, which erased the training results
    stored there under the same dataset keys and failed with a NameError when
    the training cell had not been run.

    NOTE(review): ``model`` and ``linear_classifier`` are read from the
    notebook's global namespace and are not defined in the visible cells; they
    must exist before this function is called.

    Args:
        datasets: iterable of dataset names (keys of ``datasets_paths``).
        datasets_paths: nested mapping ``name -> 'b' | 'p' -> 'train' | 'val'
            -> {'images', 'label'}``.

    Returns:
        dict: ``{dataset_name: {'train': ..., 'val': ...}}`` holding the return
        values of ``validate_network``.
    """
    results = {}
    for ds in datasets:
        ds_b = datasets_paths[ds]['b']
        ds_p = datasets_paths[ds]['p']
        results[ds] = {}
        # Only the original images get the full transform; all other datasets
        # are only normalized.
        transform = ORIGINAL_TRANSFORM if ds == 'ori' else ONLY_NORMALIZE_TRANSFORM
        for tv in ['train', 'val']:
            print(f'''images: {ds_b[tv]['images']}\nlabel: {ds_b[tv]['label']}\npred: {ds_p[tv]['label']}''')
            data_set = ImageDataset(ds_b[tv]['images'], ds_b[tv]['label'], transform, class_subset=CLASS_SUBSET)
            # No shuffling, so samples are processed in dataset order.
            data_loader = DataLoader(
                data_set,
                batch_size=BATCH_SIZE,
                num_workers=NUM_WORKERS,
                pin_memory=PIN_MEMORY,
                shuffle=False,
            )
            print(f'''{ds}: {tv} {len(data_set)}''')
            results[ds][tv] = validate_network(
                model,
                linear_classifier,
                data_loader,
                adversarial_attack=None,
                tensor_dir=ds_p[tv]['images'],
                path_predictions=ds_p[tv]['label'],
            )
    return results