# Install, Paths and Parameters

In [None]:
# Imports (note: no autoreload extension is loaded in this cell)
from pathlib import Path
from collections import defaultdict

import numpy as np
import random
import sys
import pandas as pd

import torch
from torch.utils.data import DataLoader
from torch import nn

# allow imports when running script from within project dir
# (only add entries that are missing, so re-running the cell does not pile up duplicates)
sys.path.extend(p for p in ('.', '..') if p not in sys.path)

# local
from src.model.dino_model import get_dino
from src.model.train import *
from src.model.data import *
from src.helpers.helpers import create_paths

# Seed every random number generator used in this notebook with the same value
# (order: python `random`, torch, numpy).
SEED = 42
for _seed_fn in (random.seed, torch.manual_seed, np.random.seed):
    _seed_fn(SEED)

# Base directories on the cluster scratch file system.
DATA_PATH = Path('/') / 'cluster' / 'scratch' / 'thobauma' / 'dl_data'
MAX_PATH = Path('/') / 'cluster' / 'scratch' / 'mmathys' / 'dl_data'

# Roots passed to create_paths() as the adversarial and posthoc base paths.
BASE_ADV_PATH = MAX_PATH / 'adversarial_data_tensors'
BASE_POSTHOC_PATH = MAX_PATH / 'posthoc_tensors'

# Class subset array stored inside the 'ori' directory.
ORI_PATH = DATA_PATH / 'ori'
CLASS_SUBSET_PATH = ORI_PATH / 'class_subset.npy'
CLASS_SUBSET = np.load(CLASS_SUBSET_PATH)

# Dataset names: 'ori' first, followed by the adversarial variants.
ADV_DATASETS = ['cw', 'fgsm_06', 'pgd_03']
DATASETS = ['ori'] + ADV_DATASETS
print(DATASETS)

['ori', 'cw', 'fgsm_06', 'pgd_03']


In [None]:
# Arguments shared by every create_paths() call below.
_shared_path_kwargs = dict(posthoc_base_path=BASE_POSTHOC_PATH,
                           train_str='train',
                           val_str='validation')

# Start the path dictionary with the 'ori' entry (no previous dictionary to extend) ...
DATA_PATHS = create_paths(data_name='ori',
                          datasets_paths=None,
                          initial_base_path=DATA_PATH,
                          **_shared_path_kwargs)

# ... then feed the running dictionary back in once per adversarial dataset.
for adv_ds in ADV_DATASETS:
    DATA_PATHS = create_paths(data_name=adv_ds,
                              datasets_paths=DATA_PATHS,
                              initial_base_path=BASE_ADV_PATH,
                              **_shared_path_kwargs)

In [None]:
# Run configuration.
INDEX_SUBSET = None

# DataLoader settings (forwarded to DataLoader in posthoc_forward_pass).
BATCH_SIZE = 64
NUM_WORKERS = 0
PIN_MEMORY = True

# Target device name.
DEVICE = 'cuda'

# Import DINO
Official repo: https://github.com/facebookresearch/dino

In [4]:
class LinearClassifier(nn.Module):
    """Linear layer to train on top of frozen features.

    Args:
        dim: number of input features per sample after flattening.
        num_labels: number of output logits (classes).
    """

    def __init__(self, dim, num_labels=1000):
        super().__init__()
        self.num_labels = num_labels
        self.linear = nn.Linear(dim, num_labels)
        # Gaussian weights with a small standard deviation, zero bias.
        nn.init.normal_(self.linear.weight, mean=0.0, std=0.01)
        nn.init.zeros_(self.linear.bias)

    def forward(self, x):
        """Flatten each sample to a vector and return the logits.

        Args:
            x: tensor whose first dimension is the batch dimension.

        Returns:
            Tensor of shape ``(x.size(0), num_labels)``.
        """
        # reshape() behaves like view() for contiguous inputs but also accepts
        # non-contiguous tensors (e.g. transposed features), where view() raises.
        x = x.reshape(x.size(0), -1)

        # linear layer
        return self.linear(x)
    
# get_dino() returns the DINO model together with its reference linear classifier;
# only the classifier's input width is reused below.
model, dino_classifier = get_dino()

# Linear head with one output per class in CLASS_SUBSET.
linear_classifier = LinearClassifier(dino_classifier.linear.in_features,
                                     num_labels=len(CLASS_SUBSET))

# Load the checkpoint onto the CPU first so it does not depend on the device it was
# saved from, then move the classifier to the configured DEVICE.
linear_classifier.load_state_dict(
    torch.load("/cluster/scratch/mmathys/dl_data/adversarial_data/adv_classifiers/25_classes" + "/" + "clean.pt",
               map_location='cpu'))
linear_classifier.to(DEVICE)


from sklearn import preprocessing

# Encoder fitted on the class subset; passed to the 'ori' dataset in posthoc_forward_pass.
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(list(CLASS_SUBSET))

Please use the `--pretrained_weights` argument to indicate the path of the checkpoint to evaluate.
Since no pretrained weights have been provided, we load the reference pretrained DINO weights.
Model vit_small built.
Embed dim 1536
We load the reference pretrained linear weights.


LabelEncoder()

# Forward Pass

In [None]:
def posthoc_forward_pass(model, classifier, datasets, datasets_paths):
    """Run ``validate_network`` on the train and validation split of every dataset.

    Args:
        model: backbone handed to ``validate_network`` as ``model``.
        classifier: classifier handed to ``validate_network`` as ``classifier``.
        datasets: iterable of dataset names; ``'ori'`` is loaded with
            ``AdvTrainingImageDataset``, every other name with ``PosthocForwardDataset``.
        datasets_paths: nested mapping indexed as
            ``datasets_paths[name]['init' | 'posthoc'][split]['images' | 'label']``.

    Returns:
        dict: ``{name: {split: value returned by validate_network}}``.
    """
    results = {}
    for ds in datasets:
        init_paths = datasets_paths[ds]['init']
        posthoc_paths = datasets_paths[ds]['posthoc']

        results[ds] = {}
        print("\n" + "#" * 50 + f''' forwardpass for {ds} ''' + "#" * 50)

        for tv in ('train', 'validation'):
            src = init_paths[tv]
            dst = posthoc_paths[tv]
            print(f'''images: {src['images']}\nlabel: {src['label']}\npred: {dst['label']}''')

            if ds != 'ori':
                data_set = PosthocForwardDataset(img_folder=src['images'],
                                                 labels_file_name=src['label'],
                                                 index_subset=None,
                                                 class_subset=None)
            else:
                # the original data additionally gets a transform, the class subset
                # and the label encoder
                data_set = AdvTrainingImageDataset(img_folder=src['images'],
                                                   labels_file_name=src['label'],
                                                   transform=ORIGINAL_TRANSFORM,
                                                   class_subset=CLASS_SUBSET,
                                                   index_subset=None,
                                                   label_encoder=label_encoder)

            # shuffle=False: samples are visited in dataset order
            loader_kwargs = dict(batch_size=BATCH_SIZE,
                                 num_workers=NUM_WORKERS,
                                 pin_memory=PIN_MEMORY,
                                 shuffle=False)
            data_loader = DataLoader(data_set, **loader_kwargs)

            print(f'''{ds}: {tv} {len(data_set)}''')
            # the posthoc 'images' / 'label' paths are passed as tensor_dir / path_predictions
            results[ds][tv] = validate_network(model=model,
                                               classifier=classifier,
                                               validation_loader=data_loader,
                                               criterion=nn.CrossEntropyLoss(),
                                               tensor_dir=dst['images'],
                                               adversarial_attack=None,
                                               n=4,
                                               avgpool_patchtokens=False,
                                               path_predictions=dst['label'],
                                               show_image=False)

    return results

In [6]:
# Forward pass over all datasets; keeps the per-dataset / per-split results.
logger_dict = posthoc_forward_pass(model=model,
                                   classifier=linear_classifier,
                                   datasets=DATASETS,
                                   datasets_paths=DATA_PATHS)


################################################## forwardpass for ori ##################################################
images: /cluster/scratch/thobauma/dl_data/ori/train/images
label: /cluster/scratch/thobauma/dl_data/ori/train/labels.csv
pred: /cluster/scratch/mmathys/dl_data/posthoc_tensors/ori/train/labels.csv
ori: train 32181
saving predictions to: /cluster/scratch/mmathys/dl_data/posthoc_tensors/ori/train/labels.csv
Test:  [  0/503]  eta: 0:06:36  loss: 0.018451 (0.018451)  acc1: 98.437500 (98.437500)  acc5: 100.000000 (100.000000)  time: 0.787801  data: 0.518269  max mem: 497
Test:  [ 20/503]  eta: 0:06:39  loss: 0.027935 (0.045099)  acc1: 98.437500 (98.586310)  acc5: 100.000000 (99.925595)  time: 0.828631  data: 0.572045  max mem: 498
Test:  [ 40/503]  eta: 0:06:13  loss: 0.032479 (0.040204)  acc1: 98.437500 (98.818598)  acc5: 100.000000 (99.961890)  time: 0.786478  data: 0.527344  max mem: 498
Test:  [ 60/503]  eta: 0:05:59  loss: 0.025912 (0.038865)  acc1: 98.437500 (98.8

# Create Label files and store them:

In [1]:
from functools import reduce

def merge_frames(frames, on_what=['file', 'true_labels'], how='left'):
    """Merge an iterable of DataFrames pairwise, from left to right.

    Columns that exist on both sides of a merge (other than the join keys) are
    kept from the left frame only: the right-hand copy receives a ``_drop``
    suffix and is removed from the result.

    Args:
        frames: non-empty iterable of DataFrames that all contain ``on_what``.
        on_what: column name(s) to join on (the default list is never mutated).
        how: merge strategy forwarded to ``pd.merge``.

    Returns:
        A new DataFrame; none of the input frames is modified.
    """
    merged_df = reduce(
        lambda left, right: pd.merge(left, right, on=on_what, how=how, suffixes=('', '_drop')),
        frames,
    )
    drop_cols = merged_df.filter(regex='_drop$').columns.tolist()
    # Non-inplace drop: with a single input frame `reduce` returns that very
    # object, so an in-place drop would silently alter the caller's DataFrame.
    return merged_df.drop(columns=drop_cols)

def get_merged_labels(datasets=DATASETS, datasets_types=['train', 'validation'], datasets_paths=DATA_PATHS, save_path=BASE_POSTHOC_PATH, get_df_dict=False):
    """Collect the prediction CSVs of all datasets and merge them per split.

    For every split in ``datasets_types`` and every dataset in ``datasets`` the
    CSV at ``datasets_paths[ds]['posthoc'][split]['label']`` is read and its
    ``pred_labels`` column renamed to ``<ds>_pred``. Every dataset other than
    ``'ori'`` is left-joined with the ``'ori'`` frame on ``file`` and
    ``true_labels``; the result is written as ``labels_merged.csv`` into the
    directory of the CSV it was read from. Finally all frames of a split are
    combined with ``merge_frames``.

    Args:
        datasets: dataset names; ``'ori'`` has to come before any other name.
        datasets_types: split names to process.
        datasets_paths: nested path mapping (see above for the keys that are read).
        save_path: directory for the combined ``<split>.csv`` files, or ``None``
            to skip writing them.
        get_df_dict: if true, also return the per-dataset frames.

    Returns:
        ``{split: combined DataFrame}``, or a tuple of that dict and
        ``{split: {dataset: DataFrame}}`` when ``get_df_dict`` is true.

    Raises:
        KeyError: if a non-``'ori'`` dataset is processed before ``'ori'``.
    """
    df_data_types = {}
    df_data = {}
    for tv in datasets_types:
        df_data[tv] = {}
        for ds in datasets:
            label_file = Path(datasets_paths[ds]['posthoc'][tv]['label'])
            ds_df = pd.read_csv(label_file).rename(columns={'pred_labels': ds + '_pred'})
            if ds != 'ori':
                if 'ori' not in df_data[tv]:
                    raise KeyError(f"'ori' must be listed before '{ds}' in datasets")
                ds_df = pd.merge(ds_df, df_data[tv]['ori'], on=['file', 'true_labels'], how='left')
            df_data[tv][ds] = ds_df
            # Write next to the file that was read, so the output location follows
            # `datasets_paths` instead of a module-level constant.
            ds_df.to_csv(label_file.with_name('labels_merged.csv'), sep=",", index=False)
        df_data_types[tv] = merge_frames(df_data[tv].values())
        if save_path is not None:
            df_data_types[tv].to_csv(Path(save_path, tv + '.csv'), sep=",", index=False)
    if get_df_dict:
        return df_data_types, df_data

    return df_data_types

    

In [1]:
# With get_df_dict=True the call returns the combined per-split frames together
# with the per-dataset frames.
merged_label_frames = get_merged_labels(get_df_dict=True)
df_types, df_data = merged_label_frames

# Some Analysis

In [20]:
# Per adversarial dataset and split: print counts and their share of all rows.
for name in ADV_DATASETS:
    print(f'''\n{name}:''')
    for tv in ['train', 'validation']:
        print(f'    {tv}:')
        df = df_data[tv][name]

        ldf = len(df)
        print(f'''        total data:             {ldf}''')

        # rows whose prediction on the original image equals the true label
        ori_correct = df['true_labels'] == df['ori_pred']
        n_ori_correct = int(ori_correct.sum())
        print(f'''        correct pred:           {n_ori_correct},   {n_ori_correct/ldf}''')

        # rows whose prediction on the adversarial image differs from the true label
        adv_incorrect = df['true_labels'] != df[name+'_pred']
        n_adv_incorrect = int(adv_incorrect.sum())
        print(f'''        incorrect adv pred:     {n_adv_incorrect},   {n_adv_incorrect/ldf}''')

        # rows satisfying both conditions (share is still relative to all rows)
        df_f = df[ori_correct]
        n_adv_tuples = int(adv_incorrect[ori_correct].sum())
        print(f'''        number adv tuples:      {n_adv_tuples},   {n_adv_tuples/ldf}''')



cw:
    train:
        total data:             32181
        correct pred:           31848,   0.9896522792952364
        incorrect adv pred:     32145,   0.9988813274913769
        number adv tuples:      31812,   0.9885336067866132
    validation:
        total data:             1250
        correct pred:           1215,   0.972
        incorrect adv pred:     1249,   0.9992
        number adv tuples:      1214,   0.9712

fgsm_06:
    train:
        total data:             32181
        correct pred:           31848,   0.9896522792952364
        incorrect adv pred:     29355,   0.9121842080730866
        number adv tuples:      29023,   0.9018675616046735
    validation:
        total data:             1250
        correct pred:           1215,   0.972
        incorrect adv pred:     1133,   0.9064
        number adv tuples:      1098,   0.8784

pgd_03:
    train:
        total data:             32181
        correct pred:           31848,   0.9896522792952364
        incorrect adv p