In [1]:
# This extension reloads external Python files
import os
from pathlib import Path
import getpass
import numpy as np
import pandas as pd
import time
import math
from collections import defaultdict

import torch
from torch.utils.data import DataLoader
from torch import nn
from tqdm import tqdm
import random
import sys
from torch.utils.data import random_split
from matplotlib import pyplot as plt

# allow imports when running script from within project dir
[sys.path.append(i) for i in ['.', '..']]

# local
from src.helpers.helpers import get_random_indexes, get_random_classes
from src.model.dino_model import get_dino
from src.model.train import *
from src.model.data import *

# seed
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)
np.random.seed(SEED)

username = getpass.getuser()
DATA_PATH = Path('/','cluster', 'scratch', 'thobauma', 'dl_data')
MAX_PATH = Path('/','cluster', 'scratch', 'mmathys', 'dl_data')
# Path for intermediate outputs
BASE_POSTHOC_PATH = Path(MAX_PATH, 'posthoc-fixed-labels/')
#BASE_POSTHOC_PATH = Path(MAX_PATH, 'posthoc-subset/')
POSTHOC_MODELS_PATH = Path(MAX_PATH,'posthoc-models')


# Original Dataset
ORI_PATH = Path(DATA_PATH, 'ori_data/')
CLASS_SUBSET_PATH = Path(ORI_PATH, 'class_subset.npy')

TR_PATH = Path(ORI_PATH, 'train/')
TR_ORI_LABEL_PATH = Path(TR_PATH,'correct_labels.txt')
TR_ORI_IMAGES_PATH = Path(TR_PATH,'images')

VAL_PATH = Path(ORI_PATH, 'validation/')
VAL_ORI_LABEL_PATH = Path(VAL_PATH,'correct_labels.txt')
VAL_ORI_IMAGES_PATH = Path(VAL_PATH,'images')

# DAmageNet
DN_PATH = Path(DATA_PATH, 'damageNet')
DN_LABEL_PATH = Path(DN_PATH, 'val_damagenet.txt')
DN_IMAGES_PATH = Path(DN_PATH, 'images')
DN_POSTHOC_PATH = Path(BASE_POSTHOC_PATH, 'damagenet')
DN_POSTHOC_LABEL_PATH = Path(DN_POSTHOC_PATH, 'labels.csv')

# PGD
TR_PGD_PATH = Path(MAX_PATH, 'adversarial_data/pgd_06/train')
TR_PGD_LABEL_PATH = TR_ORI_LABEL_PATH
TR_PGD_IMAGES_PATH = Path(TR_PGD_PATH, 'images')
TR_PGD_POSTHOC_PATH = Path(BASE_POSTHOC_PATH, 'pgd/train/')
TR_PGD_POSTHOC_LABEL_PATH = Path(TR_PGD_POSTHOC_PATH, 'labels.csv')

VAL_PGD_PATH = Path(MAX_PATH, 'adversarial_data/pgd_06/validation')
VAL_PGD_LABEL_PATH = VAL_ORI_LABEL_PATH
VAL_PGD_IMAGES_PATH = Path(VAL_PGD_PATH, 'images')
VAL_PGD_POSTHOC_PATH = Path(BASE_POSTHOC_PATH, 'pgd/validation/')
VAL_PGD_POSTHOC_LABEL_PATH = Path(VAL_PGD_POSTHOC_PATH, 'labels.csv')

# CW
TR_CW_PATH = Path(MAX_PATH, 'adversarial_data/cw/train')
TR_CW_LABEL_PATH = TR_ORI_LABEL_PATH
TR_CW_IMAGES_PATH = Path(TR_CW_PATH, 'images')
TR_CW_POSTHOC_PATH = Path(BASE_POSTHOC_PATH, 'cw/train/')
TR_CW_POSTHOC_LABEL_PATH = Path(TR_CW_POSTHOC_PATH, 'labels.csv')

VAL_CW_PATH = Path(MAX_PATH, 'adversarial_data/cw/validation')
VAL_CW_LABEL_PATH = VAL_ORI_LABEL_PATH
VAL_CW_IMAGES_PATH = Path(VAL_CW_PATH, 'images')
VAL_CW_POSTHOC_PATH = Path(BASE_POSTHOC_PATH, 'cw/validation/')
VAL_CW_POSTHOC_LABEL_PATH = Path(VAL_CW_POSTHOC_PATH, 'labels.csv')

# FGSM
TR_FGSM_PATH = Path(MAX_PATH, 'adversarial_data/fgsm_06/train')
TR_FGSM_LABEL_PATH = TR_ORI_LABEL_PATH
TR_FGSM_IMAGES_PATH = Path(TR_FGSM_PATH, 'images')
TR_FGSM_POSTHOC_PATH = Path(BASE_POSTHOC_PATH, 'fgsm/train/')
TR_FGSM_POSTHOC_LABEL_PATH = Path(TR_FGSM_POSTHOC_PATH, 'labels.csv')

VAL_FGSM_PATH = Path(MAX_PATH, 'adversarial_data/fgsm_06/validation')
VAL_FGSM_LABEL_PATH = VAL_ORI_LABEL_PATH
VAL_FGSM_IMAGES_PATH = Path(VAL_FGSM_PATH, 'images')
VAL_FGSM_POSTHOC_PATH = Path(BASE_POSTHOC_PATH, 'fgsm/validation/')
VAL_FGSM_POSTHOC_LABEL_PATH = Path(VAL_FGSM_POSTHOC_PATH, 'labels.csv')

In [2]:
# If CLASS_SUBSET is specified, INDEX_SUBSET will be ignored. Set CLASS_SUBSET=None if you want to use indexes.
# INDEX_SUBSET = get_random_indexes(number_of_images = 50000, n_samples=1000)
# CLASS_SUBSET = get_random_classes(number_of_classes = 25, min_rand_class = 1, max_rand_class = 1001)


CLASS_SUBSET = np.load(CLASS_SUBSET_PATH)

INDEX_SUBSET = None
NUM_WORKERS= 0
PIN_MEMORY=True

BATCH_SIZE = 64

DEVICE = 'cuda'

In [3]:
CLASS_SUBSET

array([103, 436, 861, 271, 107,  72, 701,  21, 615, 122, 467, 215, 331,
       459,  88, 373, 100, 872, 664, 131, 662, 309, 770, 344, 492])

In [4]:
datasets_paths = {
            'cw':{ 
                'b':{
                    'train':{
                        'label':TR_ORI_LABEL_PATH,
                        'images':TR_CW_IMAGES_PATH
                    },
                    'val':
                    {
                        'label':VAL_ORI_LABEL_PATH,
                        'images':VAL_CW_IMAGES_PATH
                    }
                }
            },
            'ori':{
                'b':{
                    'train':{
                        'label':TR_ORI_LABEL_PATH,
                        'images':TR_ORI_IMAGES_PATH
                    },
                    'val':{
                        'label':VAL_ORI_LABEL_PATH,
                        'images':VAL_ORI_IMAGES_PATH
                    }
                }
            },
            'dn':{
                'b':{
                    'train':{
                        'label':TR_CW_PATH,
                        'images':None
                    },
                    'val':
                    {
                        'label':VAL_ORI_LABEL_PATH,
                        'images':DN_IMAGES_PATH
                    }
                 }
            },
            'fgsm_06':{
                'b':{
                    'train':{
                        'label':TR_ORI_LABEL_PATH,
                        'images':TR_FGSM_IMAGES_PATH
                    },
                    'val':
                    {
                        'label':VAL_ORI_LABEL_PATH,
                        'images':VAL_FGSM_IMAGES_PATH
                    }
                 }
            },
            'pgd_06':{
                'b':{
                    'train':{
                        'label':TR_ORI_LABEL_PATH,
                        'images':TR_PGD_IMAGES_PATH
                    },
                    'val':
                    {
                        'label':VAL_ORI_LABEL_PATH,
                        'images':VAL_PGD_IMAGES_PATH
                    }
                }
            }
}

datasets = ['ori', 'cw', 'pgd_06', 'fgsm_06']
for ds in datasets:
    ds_dict = datasets_paths[ds]
    ds_dict['p'] = {
        'train': { 
            'images': Path(BASE_POSTHOC_PATH, ds, 'train', 'images'),
            'label': Path(BASE_POSTHOC_PATH, ds, 'train', 'labels.csv')
        },
        'val': { 
            'images': Path(BASE_POSTHOC_PATH, ds, 'val', 'images'),
            'label': Path(BASE_POSTHOC_PATH, ds, 'val', 'labels.csv')
        }
    }
    

In [5]:
adv_datasets = ['cw', 'pgd_06', 'fgsm_06']

train_dfs = {}
for ds in adv_datasets:
    train_dfs[ds] = pd.read_csv(Path(BASE_POSTHOC_PATH, ds, 'train', 'labels_merged.csv'))
    
val_dfs = {}
for ds in adv_datasets:
    val_dfs[ds] = pd.read_csv(Path(BASE_POSTHOC_PATH, ds, 'val', 'labels_merged.csv'))


for name, df in train_dfs.items():
    df=df[df['true_labels']==df['ori_pred']]
    df=df[df['true_labels']!=df[name+'_pred']]
    df =df[['file', 'true_labels', 'ori_pred', name+'_pred']]
    train_dfs[name]=df

In [6]:
train_dfs['cw']

Unnamed: 0,file,true_labels,ori_pred,cw_pred
13,n03782006_5671.JPEG,20,20,19
92,n02892767_13971.JPEG,15,15,18
106,n04447861_5351.JPEG,23,23,18
112,n02398521_9872.JPEG,12,12,4
130,n02398521_1940.JPEG,12,12,13
...,...,...,...,...
32086,n02009229_4501.JPEG,7,7,0
32094,n02009229_4231.JPEG,7,7,4
32122,n03014705_8387.JPEG,17,17,23
32140,n02892767_15559.JPEG,15,15,18


In [7]:
train_dfs['pgd_06']

Unnamed: 0,file,true_labels,ori_pred,pgd_06_pred
0,n01873310_42267.JPEG,4,4,12
1,n02326432_39907.JPEG,11,11,13
2,n01608432_6128.JPEG,0,0,7
3,n04447861_3923.JPEG,23,23,18
4,n02101388_21983.JPEG,8,8,23
...,...,...,...,...
32176,n03777754_11639.JPEG,19,19,20
32177,n02927161_43568.JPEG,16,16,18
32178,n02398521_22023.JPEG,12,12,13
32179,n03888257_34756.JPEG,21,21,18


In [8]:
train_dfs['fgsm_06']['fgsm_06_pred'].unique()

array([18,  9, 23,  7, 15,  4, 11,  2, 19, 13, 22,  6, 12,  1,  5,  0, 17,
        8, 20, 16, 21, 14, 24, 10,  3])

In [9]:
class AdvDataset(torch.utils.data.Dataset):
    def __init__(self, or_img_folder, adv_img_folder, index_df):
        super().__init__()
        self.or_img_folder = or_img_folder
        self.adv_img_folder = adv_img_folder
        self.index_df = index_df
    
    def __len__(self):
        return len(self.index_df)*2
    
    def __getitem__(self, index):            
        filename = self.index_df['file'].iloc[index%len(self.index_df)]
        filename = filename.split('.')[0]+'.pt'
        if index >= len(self.index_df):
            payload = torch.load(Path(self.or_img_folder, filename))
            label = 0.0
        else:
            payload = torch.load(Path(self.adv_img_folder, filename))
            label = 1.0
        return filename, payload, label

In [10]:
# Linear Binary Classifier
class LinearBC(nn.Module):
    def __init__(self, input_shape):
        super(LinearBC,self).__init__()
        self.fc1 = nn.Linear(input_shape,1)

    def forward(self, x):
        x = torch.sigmoid(self.fc1(x))
        return x

In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset


def train(net, train_loader, val_loader, optimizer, criterion, save_path):
    # Train network

    losses=[]
    accuracy=[]
    pbar = tqdm(range(EPOCHS))
    for epoch in pbar:  # loop over the dataset multiple times

        # Metrics
        train_running_loss = 0.0
        train_running_loss_mean = 0.0
        train_acc = 0.0
        train_acc_mean = 0.0
        test_running_loss = 0.0
        test_acc = 0.0

        for i, (filename, inputs, labels) in enumerate(train_loader, start=0):        
            inputs = inputs.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True).float()

            # Forward Pass
            outputs = net(inputs)
            outputs = outputs.reshape(-1)

            # Backpropagation
            optimizer.zero_grad() # Reset the gradient
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # loss train
            train_running_loss += loss.item()
            train_running_loss_mean = train_running_loss / (i+1)

            # accuracy train
            predicted_logits = net(inputs)  # .reshape(-1).detach().cpu().numpy().round()
            predictions = predicted_logits.argmax(axis=1)
            print(f'''pred_logits shape: {predicted_logits.shape}''')
            print(f'''pred_logits reshape shape: {predicted_logits.reshape(-1).shape}''')
            print(f'''labels shape: {labels.shape}''')
            acc_labels = labels
            acc_labels = acc_labels  # .detach().cpu().numpy()
            inter = torch.eq(predictions, acc_labels).sum()
            train_acc += inter
            print(f'''epoch {epoch}, batch {i}''')

        with torch.no_grad():
            for filename, inputs, labels in val_loader:
                # try:
                    # get the inputs; data is a list of [inputs, labels] and write to device
                    inputs = inputs.cuda(non_blocking=True)
                    labels = labels.cuda(non_blocking=True).float()

                    # Forward Pass
                    predicted_logits = net(inputs)
                    # predicted_logits = predicted_logits.reshape(-1)
                    print(f'''pred_logits shape: {predicted_logits.shape}''')
                    print(f'''pred_logits reshape shape: {predicted_logits.reshape(-1).shape}''')
                    print(f'''labels shape: {labels.shape}''')
                    
                    # loss test
                    loss = criterion(predicted_logits, labels)
                    test_running_loss += loss.item()

                    # accuracy test
                    outputs = predicted_logits.argmax()
                    comparison = torch.eq(labels, outputs).sum()
                    test_acc += comparison
                    accuracy.append(comparison/BATCH_SIZE)

                #except Exception as e:
                #    print("Error: {}".format(e))
                #    pass

        losses.append(train_running_loss_mean)
        pbar.set_description("Ep: {}\t Tr. Loss: {:.4f}\t Tr. Acc: {:.4f}\t T. Loss: {:.4f}\t T. Acc: {:.4f}".format(epoch, 
                                                                                train_running_loss_mean, 
                                                                                train_acc / len(train_loader.dataset), 
                                                                                test_running_loss, 
                                                                                test_acc / len(val_loader.dataset)))
    save_path.parent.mkdir(parents=True, exist_ok=True)
    torch.save(net.state_dict(), save_path)
    print(f'Finished Training, saving to {save_path}')
    return losses, accuracy

In [16]:
from tqdm import tqdm

print("starting train...")
use_validate_split = False

# Hyperparameters
EPOCHS = 1


PIN_MEMORY=False
loss_dict = {}
accuracy_dict = {}
for ds in ['cw']: #adv_datasets:
    ds_p = datasets_paths[ds]['p']
    print(f'''{ds}''')
    # Initialise network
    net = LinearBC(1536)
    net.train()
    net.cuda()
    criterion = nn.BCELoss()
    optimizer = optim.Adagrad(net.parameters(), lr=0.01, lr_decay=1e-08, weight_decay=0)
    train_set = AdvDataset(datasets_paths['ori']['p']['train']['images'],ds_p['train']['images'], train_dfs[ds])
    train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY, shuffle=True)
    val_set = AdvDataset(datasets_paths['ori']['p']['val']['images'],ds_p['val']['images'], val_dfs[ds])
    val_loader = DataLoader(val_set, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY, shuffle=False)
    loss_dict[ds], accuracy_dict[ds] = train(net, train_loader, val_loader, optimizer, criterion, Path(POSTHOC_MODELS_PATH,ds+'.pt'))
    
    

  0%|          | 0/1 [00:00<?, ?it/s]

starting train...
cw
pred_logits shape: torch.Size([64, 1])
pred_logits reshape shape: torch.Size([64])
labels shape: torch.Size([64])
epoch 0, batch 0
pred_logits shape: torch.Size([64, 1])
pred_logits reshape shape: torch.Size([64])
labels shape: torch.Size([64])
epoch 0, batch 1
pred_logits shape: torch.Size([64, 1])
pred_logits reshape shape: torch.Size([64])
labels shape: torch.Size([64])
epoch 0, batch 2
pred_logits shape: torch.Size([64, 1])
pred_logits reshape shape: torch.Size([64])
labels shape: torch.Size([64])
epoch 0, batch 3
pred_logits shape: torch.Size([64, 1])
pred_logits reshape shape: torch.Size([64])
labels shape: torch.Size([64])
epoch 0, batch 4
pred_logits shape: torch.Size([64, 1])
pred_logits reshape shape: torch.Size([64])
labels shape: torch.Size([64])
epoch 0, batch 5
pred_logits shape: torch.Size([64, 1])
pred_logits reshape shape: torch.Size([64])
labels shape: torch.Size([64])
epoch 0, batch 6
pred_logits shape: torch.Size([64, 1])
pred_logits reshape sha

Ep: 0	 Tr. Loss: 0.4121	 Tr. Acc: 0.5000	 T. Loss: 0.0000	 T. Acc: 0.0000: 100%|██████████| 1/1 [00:32<00:00, 32.97s/it]

pred_logits shape: torch.Size([64, 1])
pred_logits reshape shape: torch.Size([64])
labels shape: torch.Size([64])
Error: Using a target size (torch.Size([64])) that is different to the input size (torch.Size([64, 1])) is deprecated. Please ensure they have the same size.
pred_logits shape: torch.Size([4, 1])
pred_logits reshape shape: torch.Size([4])
labels shape: torch.Size([4])
Error: Using a target size (torch.Size([4])) that is different to the input size (torch.Size([4, 1])) is deprecated. Please ensure they have the same size.
Finished Training, saving to /cluster/scratch/mmathys/dl_data/posthoc-models/cw.pt





In [None]:
def posthoc_forward_pass(datasets, datasets_paths):
    for ds in datasets:
        ds_b = datasets_paths[ds]['b']
        ds_p = datasets_paths[ds]['p']
        logger_dict[ds] = {}
        transform = ONLY_NORMALIZE_TRANSFORM
        if ds == 'ori':
            transform = ORIGINAL_TRANSFORM
        for tv in ['train', 'val']:
            print(f'''images: {ds_b[tv]['images']}\nlabel: {ds_b[tv]['label']}\npred: {ds_p[tv]['label']}''')
            data_set = ImageDataset(ds_b[tv]['images'], ds_b[tv]['label'], transform, class_subset=CLASS_SUBSET)
            data_loader = DataLoader(data_set, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY, shuffle=False)
            print(f'''{ds}: {tv} {len(data_set)}''')
            logger_dict[ds][tv] = validate_network(model, linear_classifier, data_loader, adversarial_attack=None, tensor_dir=ds_p[tv]['images'], path_predictions=ds_p[tv]['label'])
    return logger_dict