# Install, Paths and Parameters

In [17]:
import os
from pathlib import Path
import getpass
import pandas as pd
import numpy as np
import json
import torch
from typing import List, Callable
import random

# Seed every random number generator used in this notebook (torch, Python's
# `random`, NumPy) with the same value so that runs are repeatable.
SEED = 42
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

In [19]:
# Per-user data directory on the cluster scratch filesystem.
# The leading '/' anchors the path at the filesystem root; without it the
# path is resolved relative to the notebook's current working directory.
username = getpass.getuser()
DATA_PATH = Path('/', 'cluster', 'scratch', username, 'dl_data')
# Label file located directly inside the data directory.
ORG_LABEL_PATH = Path(DATA_PATH, 'correct_labels.txt')
# Folder holding the original (unmodified) images.
ORIGINAL_IMAGES_PATH = Path(DATA_PATH, 'ori_data', 'ImageNetClasses')

In [20]:
def get_random_classes(number_of_classes: int = 50, min_rand_class: int = 0, max_rand_class: int = 999):
  """Draw random class ids from the inclusive range [min_rand_class, max_rand_class].

  Uses NumPy's global random state, so the result depends on the current seed.
  Ids are drawn independently (with replacement), so the returned array may
  contain duplicates.

  Args:
    number_of_classes: how many ids to draw.
    min_rand_class: smallest id that can be drawn.
    max_rand_class: largest id that can be drawn (inclusive).

  Returns:
    1-D integer array of length `number_of_classes`.
  """
  # np.random.randint treats `high` as exclusive; add 1 so that
  # max_rand_class itself can be drawn (and min == max is a valid range).
  return np.random.randint(low=min_rand_class, high=max_rand_class + 1, size=(number_of_classes,))

def get_random_indexes(number_of_images: int = 50000, n_samples=1000):
  """Sample `n_samples` distinct indexes from range(number_of_images).

  Uses NumPy's global random state, so the result depends on the current seed.

  Args:
    number_of_images: size of the index population (indexes 0 .. number_of_images - 1).
    n_samples: number of distinct indexes to draw; must not exceed
      `number_of_images` (np.random.choice raises ValueError otherwise).

  Returns:
    1-D integer array of `n_samples` unique indexes.
  """
  # Honour the arguments instead of hard-coded sizes; the defaults keep the
  # previous 1000-out-of-50000 behaviour.
  return np.random.choice(number_of_images, n_samples, replace=False)

# Draw the index subset once at module level. This consumes NumPy's global
# random state, so re-running the cell without re-seeding gives a new subset.
INDEX_SUBSET = get_random_indexes(number_of_images=50000, n_samples=1000)

In [9]:
# Print the software/hardware environment (PyTorch, CUDA, GPU and library versions) using the helper script in ../setup.
!python ../setup/collect_env.py

Collecting environment information...
PyTorch version: 1.7.1+cu110
Is debug build: False
CUDA used to build PyTorch: 11.0
ROCM used to build PyTorch: N/A

OS: CentOS Linux release 7.9.2009 (Core) (x86_64)
GCC version: (GCC) 6.3.0
Clang version: Could not collect
CMake version: version 2.8.12.2
Libc version: glibc-2.17

Python version: 3.8.5 (default, Oct  6 2020, 10:04:29)  [GCC 6.3.0] (64-bit runtime)
Python platform: Linux-3.10.0-1160.36.2.el7.x86_64-x86_64-with-glibc2.2.5
Is CUDA available: True
CUDA runtime version: 11.0.221
GPU models and configuration: 
GPU 0: GeForce GTX 1080
GPU 1: GeForce GTX 1080
GPU 2: GeForce GTX 1080
GPU 3: GeForce GTX 1080

Nvidia driver version: 450.80.02
cuDNN version: Could not collect
HIP runtime version: N/A
MIOpen runtime version: N/A

Versions of relevant libraries:
[pip3] numpy==1.19.2
[pip3] numpy-groupies==0.9.14
[pip3] numpydoc==1.1.0
[pip3] torch==1.7.1+cu110
[pip3] torch-cluster==1.5.8
[pip3] torch-geometric==1.6.3
[pip3] torch-scatter==2.0.5

# Import DINO
Official repo: https://github.com/facebookresearch/dino

In [10]:
# Load the pretrained DINO backbone through torch.hub (downloads the repo and
# the checkpoint on first use, then reads them from the local hub cache).
# Prefer the GPU, but fall back to the CPU so the cell also runs on hosts
# without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# NOTE: despite the variable name, the model loaded here is `dino_vits8`;
# the name is kept because other cells reference `vits16`.
vits16 = torch.hub.load('facebookresearch/dino:main', 'dino_vits8').to(device)

Downloading: "https://github.com/facebookresearch/dino/archive/main.zip" to /cluster/home/thobauma/.cache/torch/hub/main.zip
Downloading: "https://dl.fbaipublicfiles.com/dino/dino_deitsmall8_pretrain/dino_deitsmall8_pretrain.pth" to /cluster/home/thobauma/.cache/torch/hub/checkpoints/dino_deitsmall8_pretrain.pth


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=86728949.0), HTML(value='')))




In [11]:
# Hyperparameters
# Value taken from the official DINO repo.
N_LAST_BLOCKS = 4
# Presumably the patch size of the loaded ViT backbone -- confirm against the model.
PATCH_SIZE = 8

In [12]:
# Define and load pretrained weights for linear classifier on ImageNet
from torch import nn
class LinearClassifier(nn.Module):
    """Single fully connected layer applied to flattened (frozen) features.

    Args:
        dim: number of input features per sample after flattening.
        num_labels: number of output classes.
    """
    def __init__(self, dim, num_labels=1000):
        super().__init__()
        self.num_labels = num_labels
        head = nn.Linear(dim, num_labels)
        # Small Gaussian weights and a zero bias as the starting point.
        nn.init.normal_(head.weight, mean=0.0, std=0.01)
        nn.init.zeros_(head.bias)
        self.linear = head

    def forward(self, x):
        """Flatten every sample to one vector and return the class scores."""
        batch_size = x.size(0)
        flat = x.view(batch_size, -1)
        return self.linear(flat)

# Linear head whose input width is embed_dim * N_LAST_BLOCKS (presumably the
# concatenated outputs of the last N_LAST_BLOCKS transformer blocks -- confirm
# against the feature-extraction code).
linear_classifier = LinearClassifier(vits16.embed_dim * N_LAST_BLOCKS, num_labels=1000)
# Place the head on the same device as the backbone.
linear_classifier = linear_classifier.to(device)

# Deserialize the checkpoint onto the CPU; load_state_dict below copies the
# values into the classifier's parameters on whatever device they live on.
linear_state_dict = torch.hub.load_state_dict_from_url(
    url="https://dl.fbaipublicfiles.com/dino/" + "dino_deitsmall8_pretrain/dino_deitsmall8_linearweights.pth",
    map_location="cpu",
)["state_dict"]

# The checkpoint keys carry a "module." prefix that the bare classifier does
# not have; strip it from every key so that strict loading succeeds.
for key in list(linear_state_dict):
    if key.startswith("module."):
        linear_state_dict[key[len("module."):]] = linear_state_dict.pop(key)

# Load pre-trained weights
linear_classifier.load_state_dict(linear_state_dict, strict=True)

Downloading: "https://dl.fbaipublicfiles.com/dino/dino_deitsmall8_pretrain/dino_deitsmall8_linearweights.pth" to /cluster/home/thobauma/.cache/torch/hub/checkpoints/dino_deitsmall8_linearweights.pth


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=12298259.0), HTML(value='')))




<All keys matched successfully>

# Load data

In [None]:
# Custom loader
from torch.utils.data import Dataset
from torchvision.io import read_image
import traceback
from PIL import Image

class ImageDataset(Dataset):
  """Dataset of images listed in a space-separated ``file label`` text file.

  Each item is a tuple ``(transformed_image, label, file_name)``.

  Args:
    img_folder: directory that the file names in the listing are relative to.
    file_name: path of the listing; one ``<file> <label>`` pair per line, no header.
    transform: callable applied to the RGB PIL image before it is returned.
    class_subset: if given, keep only the rows whose label is in this collection.
    index_subset: if given (and ``class_subset`` is None), keep only the rows at
      these positions. It is ignored when ``class_subset`` is provided.
  """
  def __init__(self, img_folder: str, file_name: str, transform: Callable, class_subset: List[int] = None, index_subset: List[int] = None):
    super().__init__()
    self.transform = transform
    self.img_folder = img_folder
    self.data = self.create_df(file_name)
    self.class_subset = class_subset
    if self.class_subset is not None:
      # The class filter takes precedence over index_subset.
      self.data_subset = self.data[self.data['label'].isin(self.class_subset)]
    elif index_subset is not None:
      self.data_subset = self.data.iloc[index_subset]
    else:
      self.data_subset = self.data

  def create_df(self, file_name: str):
    """Read the listing into a DataFrame with the columns 'file' and 'label'."""
    df = pd.read_csv(file_name, sep=" ", header=None)
    df.columns = ['file', 'label']
    return df

  def __len__(self):
    """Number of rows in the selected subset."""
    return len(self.data_subset)

  def __getitem__(self, index):
    """Return ``(transformed image, label, file name)`` for the row at position `index`."""
    file_name = self.data_subset['file'].iloc[index]
    target = self.data_subset['label'].iloc[index]

    # Open the image in a context manager so the underlying file is released
    # deterministically; convert() returns an independent in-memory copy.
    with Image.open(os.path.join(self.img_folder, file_name)) as source:
      img = source.convert('RGB')

    img = self.transform(img)

    return img, target, file_name
