In [None]:
import numpy as np
import urllib.request
import os
import glob as gb
import pathlib
import random
import torch
import matplotlib
import matplotlib.pyplot as plt
import time
import torch.nn as nn
from torch import optim
from torchsummary import summary
from torch.utils.data import DataLoader

# https://github.com/openai/CLIP/blob/fcab8b6eb92af684e7ff0a904464be7b99b49b88/notebooks/Prompt_Engineering_for_ImageNet.ipynb
# https://github.com/openai/CLIP/issues/164
# https://github.com/openai/CLIP/issues/83

In [None]:
# Setting seed
def set_seed(seed):
    # os.environ['PYTHONASHSEED'] = 0 무작위화 비활성화
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)

In [None]:
!
pip install wandb
wandb login

In [None]:
set_seed(1)

In [None]:
# Get image labels
def get_labels(labels_path='/content/30_labels.txt'):
  f = open(labels_path, 'r')
  labels = f.readlines()
  f.close()
  
  labels = [l.replace('\n', '').replace(' ', '_') for l in labels]
  return labels

import os
import urllib.request

def load_dataset(dataset_path='./dataset/'):
  # make directory
  try:
    if not os.path.exists(dataset_path):
      os.makedirs(dataset_path)
  except:
      None
      
  # get data from web
  labels = get_labels()
  base_url = 'https://storage.googleapis.com/quickdraw_dataset/full/numpy_bitmap/'
  for label in labels:
    label_url = label.replace('_', '%20')
    npy_url = base_url + label_url + '.npy'
    print(npy_url)
    urllib.request.urlretrieve(npy_url, dataset_path + label + '.npy')

  print('Done!')

load_dataset()

https://storage.googleapis.com/quickdraw_dataset/full/numpy_bitmap/airplane.npy
https://storage.googleapis.com/quickdraw_dataset/full/numpy_bitmap/apple.npy
https://storage.googleapis.com/quickdraw_dataset/full/numpy_bitmap/banana.npy
https://storage.googleapis.com/quickdraw_dataset/full/numpy_bitmap/baseball.npy
https://storage.googleapis.com/quickdraw_dataset/full/numpy_bitmap/bear.npy
https://storage.googleapis.com/quickdraw_dataset/full/numpy_bitmap/bicycle.npy
https://storage.googleapis.com/quickdraw_dataset/full/numpy_bitmap/bird.npy
https://storage.googleapis.com/quickdraw_dataset/full/numpy_bitmap/bus.npy
https://storage.googleapis.com/quickdraw_dataset/full/numpy_bitmap/cat.npy
https://storage.googleapis.com/quickdraw_dataset/full/numpy_bitmap/cup.npy
https://storage.googleapis.com/quickdraw_dataset/full/numpy_bitmap/dog.npy
https://storage.googleapis.com/quickdraw_dataset/full/numpy_bitmap/duck.npy
https://storage.googleapis.com/quickdraw_dataset/full/numpy_bitmap/fish.npy
ht

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 5.0 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 71.4 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 80.1 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [None]:
# Get cpu or gpu device for training
def check_device():
    if torch.cuda.is_available():
        DEVICE = torch.device('cuda')
    else:
        DEVICE = torch.deivce('cpu')
    
    print('Using Pytorch version : ',  torch.__version__, 'DEVICE : ', DEVICE)

In [None]:
DEVICE = check_device()

# model = model.to(DEVICE)

Using Pytorch version :  1.12.1+cu113 DEVICE :  cuda


In [None]:
! pip install ftfy regex tqdm
! pip install git+https://github.com/openai/CLIP.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ftfy
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 1.5 MB/s 
Installing collected packages: ftfy
Successfully installed ftfy-6.1.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-b5i3zuzh
  Running command git clone -q https://github.com/openai/CLIP.git /tmp/pip-req-build-b5i3zuzh
Building wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369408 sha256=4739ac0ec7288351eb694ccad90345acc5ec206e42d56fbc27f77d1f2df1be87
  Stored in directory: /tmp/pip-ephem-wheel-cache-n9zcbdx1/wheels/ab/4f/3a/5e51521b55997aa6f0690e095c08824219753128ce8d9969a3
Successfully

In [None]:
import clip

clip.available_models()

['RN50',
 'RN101',
 'RN50x4',
 'RN50x16',
 'RN50x64',
 'ViT-B/32',
 'ViT-B/16',
 'ViT-L/14',
 'ViT-L/14@336px']

In [None]:
model, preprocess = clip.load("RN50")
model.cuda().eval()
input_resolution = model.visual.input_resolution
context_length = model.context_length
vocab_size = model.vocab_size

print("Model parameters:", f"{np.sum([int(np.prod(p.shape)) for p in model.parameters()]):,}")
print("Input resolution:", input_resolution)
print("Context length:", context_length)
print("Vocab size:", vocab_size)

100%|███████████████████████████████████████| 244M/244M [00:04<00:00, 58.8MiB/s]


Model parameters: 102,007,137
Input resolution: 224
Context length: 77
Vocab size: 49408


In [None]:
preprocess

Compose(
    Resize(size=224, interpolation=bicubic, max_size=None, antialias=None)
    CenterCrop(size=(224, 224))
    <function _convert_image_to_rgb at 0x7faa28ab9e50>
    ToTensor()
    Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
)

In [None]:
from PIL import Image
from torchvision.transforms import ToPILImage

import numpy as np
import glob as gb
from sklearn.model_selection import train_test_split
import torch
from torchvision import transforms
from torch.utils.data import Dataset, random_split, TensorDataset
import os


class QuickDrawDataset(Dataset):
    def __init__(self, subset, transform=None):
        self.subset = subset
        self.transform = transform
        
    def __getitem__(self, index):
        x, y = self.subset[index]
        if self.transform:
            x = self.transform(x)
        return x, y
        
    def __len__(self):
        return len(self.subset)

def prepare_img_dataset(npy_files_path='./dataset/*.npy',test_ratio=0.2, max_items_per_class=10000):
    npy_files = gb.glob(npy_files_path)

    #initialize variables 
    X = np.empty([0, 784]) # 28*28 =784
    y = np.empty([0])
    classes = []

    #load a subset of the data to memory 
    for idx, npy_file in enumerate(npy_files):
        data = np.load(npy_file)
        data = data[0:max_items_per_class, :]
        labels = np.full(data.shape[0], idx)

        X = np.concatenate((X, data), axis=0)
        y = np.append(y, labels)
    
        label, extension = os.path.splitext(os.path.basename(npy_file))
        classes.append(label)

    data = None
    labels = None
    
    # transform to torch tensor
    X_tensor = torch.Tensor(X)
    X_tensor = X_tensor.reshape(X_tensor.shape[0], 1, 28, 28)
    # normalizatoin
    X_tensor /= 255.0
    y_tensor = torch.Tensor(y)
    
    X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, test_size=test_ratio, random_state=1, stratify=y_tensor)
    X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=1, stratify=y_test)
    
    # create dataset
    dataset = TensorDataset(X_tensor, y_tensor)
    train_dataset = TensorDataset(X_train, y_train)
    val_dataset = TensorDataset(X_val, y_val)
    test_dataset = TensorDataset(X_test, y_test)
    
    # caculate mean & std
    means = torch.zeros(1)
    stds = torch.zeros(1)
    
    for img, label in train_dataset:
        means += torch.mean(img, dim = (1,2))
        stds += torch.std(img, dim = (1,2))
        
    means /= len(train_dataset)
    stds /= len(train_dataset)
    
    print(f'Means: {means}')
    print(f'STDs: {stds}')
    
    # transform
    transforms_train = transforms.Compose([
        transforms.Normalize((0.1702,), (0.3224)),
        transforms.ToPILImage()
    ])
    
    transforms_test = transforms.Compose([
        transforms.Normalize((0.1702,), (0.3224)),
        transforms.ToPILImage()
    ])
    
    train_dataset = QuickDrawDataset(subset=train_dataset, transform=transforms_train)
    val_dataset = QuickDrawDataset(subset=val_dataset, transform=transforms_test)
    test_dataset = QuickDrawDataset(subset=test_dataset, transform=transforms_test)
    
    
    return dataset, train_dataset, val_dataset, test_dataset, classes

In [None]:
dataset, train_dataset, val_dataset, test_dataset, classes = prepare_img_dataset()

Means: tensor([0.1702])
STDs: tensor([0.3224])


In [None]:
# generate sentences
clip_labels = [f"a drawing of the {label}" for label in classes]
clip_labels

['a drawing of the dog',
 'a drawing of the bus',
 'a drawing of the tree',
 'a drawing of the spider',
 'a drawing of the light_bulb',
 'a drawing of the pencil',
 'a drawing of the bicycle',
 'a drawing of the fish',
 'a drawing of the ice_cream',
 'a drawing of the airplane',
 'a drawing of the bird',
 'a drawing of the bear',
 'a drawing of the moon',
 'a drawing of the banana',
 'a drawing of the house',
 'a drawing of the pig',
 'a drawing of the apple',
 'a drawing of the hamburger',
 'a drawing of the sun',
 'a drawing of the shoe',
 'a drawing of the rabbit',
 'a drawing of the flower',
 'a drawing of the monkey',
 'a drawing of the cup',
 'a drawing of the baseball',
 'a drawing of the nose',
 'a drawing of the umbrella',
 'a drawing of the cat',
 'a drawing of the lion',
 'a drawing of the duck']

In [None]:
with torch.no_grad():
  zeroshot_weights = []
  label_tokens = clip.tokenize(clip_labels).cuda()
  label_embs = model.encode_text(label_tokens).float()
  label_embs /= label_embs.norm(dim=-1, keepdim=True)
  label_emb = label_embs.mean(dim=0)
  label_emb /= label_emb.norm()
  zeroshot_weights.append(label_emb)
  zeroshot_weights = torch.stack(zeroshot_weights, dim=1).cuda()

In [None]:
zeroshot_weights

tensor([[ 0.0023],
        [ 0.0023],
        [ 0.0099],
        ...,
        [ 0.0038],
        [-0.0046],
        [ 0.0364]], device='cuda:0')

In [None]:
preprocess

Compose(
    Resize(size=224, interpolation=bicubic, max_size=None, antialias=None)
    CenterCrop(size=(224, 224))
    <function _convert_image_to_rgb at 0x7faa28ab9e50>
    ToTensor()
    Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
)

In [None]:
def idx_to_class(y_idx):
  return classes[int(y_idx.item())]

In [None]:
img_features = []
img_labels = []

from tqdm import tqdm

for img, y in tqdm(test_dataset):
  img_input = preprocess(img).unsqueeze(0).cuda()
  label = idx_to_class(y)

  with torch.no_grad():
    img_feature = model.encode_image(img_input)
  
  img_feature /= img_feature.norm()
  img_features.append(img_feature)
  img_labels.append(label)

100%|██████████| 30000/30000 [14:08<00:00, 35.34it/s]


In [None]:
@torch.no_grad()
def accuracy(output, target, topk=(1,)):
    maxk = max(topk)
    batch_size = target.size(0)

    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.reshape(1, -1).expand_as(pred))

    res = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
        res.append(correct_k.mul_(100.0 / batch_size).item())
    return res

In [None]:
img_features = np.stack(img_features, dim=1).squeeze(0).cuda()

    
    
# compute top-1 accuracy
logits = (100. * img_features @ zeroshot_weights).softmax(dim=-1)
img_labels = torch.tensor(img_labels).unsqueeze(dim=1).cuda()
top1_acc = accuracy(logits, img_labels, (1,))
print(f'top-1 accuracy for QuickDraw dataset: {top1_acc[0]:.3f}')


TypeError: ignored

In [None]:
from torch import nn
from torch.nn import functional as F
from tqdm import tqdm

def validate(CONFIG, val_loader, model, criterion, device):
    # Reference: https://github.com/pytorch/examples/blob/00ea159a99f5cb3f3301a9bf0baa1a5089c7e217/imagenet/main.py#L313-L353
    losses = AverageMeter("Loss", ":.4f", Summary.AVERAGE)
    top1 = AverageMeter("Acc@1", ":6.2f", Summary.AVERAGE)
    top5 = AverageMeter("Acc@5", ":6.2f", Summary.AVERAGE)
    progress = ProgressMeter(
        len(val_loader), [losses, top1, top5], prefix="Validation: "
    )
    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        for i, (images, flipped_images, labels) in enumerate(tqdm(val_loader)):
            images = images.to(device)
            flipped_images = flipped_images.to(device)
            labels = labels.to(device)
            
            # compute logits
            logit_original = model(images)
            logit_flipped = model(flipped_images)
            logit_output = (logit_original + logit_flipped) / 2
            
            # get cross entropy loss
            ce_loss_org = criterion(logit_original, labels)
            ce_loss_flip = criterion(logit_flipped, labels)
            ce_loss = (ce_loss_org + ce_loss_flip) / 2

            # get kl divergence between logits
            kl_loss_org = F.kl_div(F.log_softmax(logit_original, dim=-1), F.softmax(logit_flipped, dim=-1), reduction='none')
            kl_loss_flip = F.kl_div(F.log_softmax(logit_flipped, dim=-1), F.softmax(logit_original, dim=-1), reduction='none')
            kl_loss_org = kl_loss_org.mean()
            kl_loss_flip = kl_loss_flip.mean()
            kl_loss = (kl_loss_org + kl_loss_flip) / 2

            # get crossentropy loss regularized with kl divergence loss
            loss = ce_loss + CONFIG.reg_lamda * kl_loss

            # measure accuracy and record loss
            acc1, acc5 = accuracy(logit_output.data, labels, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))

        progress.display_summary()
    return (
        losses.avg,
        top1.avg,
        top5.avg,
    )

In [None]:
def class_wise(model, model_name, test_dataset, test_loader, device):
  name_classes = [k for k, v in test_dataset.class_to_idx.items()]
  num_classes = len(name_classes)
  confusion_matrix = np.zeros((num_classes, num_classes))
  
  model.eval()
  with torch.no_grad():
    for (X, X_flipped, y) in test_loader:
        X = X.type(torch.float32).to(device)
        X_flipped = X_flipped.type(torch.float32).to(device)
        y = y.type(torch.LongTensor).to(device)
        y_pred = model(X).argmax(dim=1, keepdim = True)
        y_flipped_pred = model(X_flipped).argmax(dim=1, keepdim = True)
        logit_pred = (y_pred + y_flipped_pred) / 2
        
        for truth, pred, in zip(y.view(-1), logit_pred.view(-1)):
          confusion_matrix[truth.long(), pred.long()] += 1
  
  df_cm = pd.DataFrame(confusion_matrix, index=name_classes, columns=name_classes).astype(int)
  
  heatmap = sns.heatmap(df_cm, annot=True, fmt="d", cmap='GnBu')
  heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right',fontsize=10)
  heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right',fontsize=10)
  plt.ylabel('Truth', fontsize = 12, labelpad=5)
  plt.xlabel('Prediction', fontsize = 12, labelpad=5)
  plt.title(f'{model_name} Test Accuracy', fontsize = 15, pad=20)
  save_figure(f'{model_name}_class_wise_test_accuracy')

  test_wise_accuracy_list = np.diag(confusion_matrix) / confusion_matrix.sum(1)
  print(test_wise_accuracy_list)

  return test_wise_accuracy_list