In [2]:
import pandas as pd
import numpy as np
import torch
from torch import nn 
from torch.utils.data import DataLoader     # 데이터로더는 데이터셋을 iterable하게 감싸는 역할
from torchvision import datasets            # 데이터셋은 샘플과 정답을 저장함
from torchvision.transforms import ToTensor
import clip
from PIL import Image

In [18]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

image_title = 'A horse in the space.png'

prompts = [
    "A horse in the space",
    "A dog in the space",
    "A bear in the space",
    "A person in the space",
    "A horse in the park",
    "A dog in the park",
    "A bear in the park",
    "A person in the park",
]

image = preprocess(Image.open(image_title)).unsqueeze(0).to(device)
text = clip.tokenize(prompts).to(device)

In [4]:
img_embeddings = [torch.zeros([1, 512]) for i in range(model.visual.transformer.layers)]

#%% model.encode_image

# image -> tokens
x = model.visual.conv1(image.type(model.visual.conv1.weight.dtype))
x = x.reshape(x.shape[0], x.shape[1], -1)   # shape = [*, width, grid ** 2]
x = x.permute(0, 2, 1)                      # shape = [*, grid ** 2, width]
x = torch.cat([model.visual.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1)  # shape = [*, grid ** 2 + 1, width]
x = x + model.visual.positional_embedding.to(x.dtype)
x = model.visual.ln_pre(x)

# tokens -> transformer -> feature_embeddings
x = x.permute(1, 0, 2)  # NLD -> LND

for i in range(model.visual.transformer.layers):
    x = model.visual.transformer.resblocks[i](x)
    tmp = x.permute(1, 0, 2)
    tmp = model.visual.ln_post(tmp[:, 0, :])
    if model.visual.proj is not None:
        tmp = tmp @ model.visual.proj
    img_embeddings[i].copy_(tmp)

x = x.permute(1, 0, 2)  # LND -> NLD
print(x.shape)  

x = model.visual.ln_post(x[:, 0, :])    # [CLS] token의 임베딩을 사용
print(x.shape)  

if model.visual.proj is not None:
    x = x @ model.visual.proj
print(x.shape)

torch.Size([1, 50, 768])
torch.Size([1, 768])
torch.Size([1, 512])


In [20]:
txt_embeddings = [torch.zeros([8, 512]) for i in range(model.transformer.layers)]

#%% model.encode_image

x = model.token_embedding(text).type(model.dtype)  # [batch_size, n_ctx, d_model]

x = x + model.positional_embedding.type(model.dtype)
x = x.permute(1, 0, 2)  # NLD -> LND

for i in range(model.transformer.layers):
    x = model.transformer.resblocks[i](x)
    tmp = x.permute(1, 0, 2)
    tmp = model.ln_final(tmp).type(model.dtype)
    tmp = tmp[torch.arange(tmp.shape[0]), text.argmax(dim=-1)] @ model.text_projection
    txt_embeddings[i].copy_(tmp)


x = x.permute(1, 0, 2)  # LND -> NLD
x = model.ln_final(x).type(model.dtype)
print(x.shape)

x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ model.text_projection    
    # x.shape[0] : 들어온 단어 토큰의 개수
    # x[[(which_word_token)], (which_end_token)] : [CLS] token이 아니라 end token의 임베딩을 사용
    # text.argmax(dim=-1) : end token의 위치
print(x.shape)


torch.Size([8, 77, 512])
torch.Size([8, 512])


In [27]:
txt_embeddings[0][[0]].shape

torch.Size([1, 512])

In [28]:
img_similarity_by_layer = [[(img_embeddings[i] @ txt_embeddings[j][[0]].T).item() for j in range(12)] for i in range(12)]

In [45]:
# score
argmax = np.argmax(img_similarity_by_layer)

score1 = np.max(img_similarity_by_layer)
score2 = np.mean(img_similarity_by_layer)

threshold = 10
score3 = sum(list(map(lambda x: sum([y>threshold for y in x]), img_similarity_by_layer)))

score1, score2, score3

(23.460453033447266, 10.72868076297972, 86)

In [47]:
threshold = 20

for _class in range(8):
    img_similarity_by_layer = [[(img_embeddings[i] @ txt_embeddings[j][[_class]].T).item() for j in range(12)] for i in range(12)]
    score1 = np.max(img_similarity_by_layer)
    score2 = np.mean(img_similarity_by_layer)
    score3 = sum(list(map(lambda x: sum([y>threshold for y in x]), img_similarity_by_layer)))
    print(f"label:\t{prompts[_class]}\nscore1:\t{score1}\nscore2:\t{score2}\nscore3:\t{score3}\n{'='*20}")

label:	A horse in the space
score1:	33.826358795166016
score2:	12.221461418602201
score3:	14
label:	A dog in the space
score1:	26.979454040527344
score2:	12.280097925000721
score3:	13
label:	A bear in the space
score1:	26.947742462158203
score2:	12.698545568519169
score3:	13
label:	A person in the space
score1:	30.46843719482422
score2:	14.02111851506763
score3:	13
label:	A horse in the park
score1:	25.63037872314453
score2:	11.307973388168547
score3:	6
label:	A dog in the park
score1:	20.79569435119629
score2:	10.3964384496212
score3:	1
label:	A bear in the park
score1:	22.559795379638672
score2:	11.075195771124628
score3:	6
label:	A person in the park
score1:	23.460453033447266
score2:	10.72868076297972
score3:	11


# Fine-tuning

`pip install "git+https://github.com/philferriere/cocoapi.git#egg=pycocotools&subdirectory=PythonAPI"`

In [30]:
import torch
from torch import nn    # Neural Network Layer ? 
from torch.utils.data import DataLoader, Dataset    # 데이터로더는 데이터셋을 iterable하게 감싸는 역할
from torchvision import datasets                    # 데이터셋은 샘플과 정답을 저장함
from torchvision.transforms import ToTensor         # 데이터셋을 텐서로 바꾸는듯?
import torchvision.transforms as transforms

## Flickr dataset

### Setup

In [47]:
import os

import datasets
import numpy as np
import pandas as pd
from PIL import Image
from pathlib import Path
from tqdm.auto import tqdm
import multiprocessing as mp
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import io, transforms
from torch.utils.data import Dataset, DataLoader, random_split

from transformers import Seq2SeqTrainer ,Seq2SeqTrainingArguments
from transformers import VisionEncoderDecoderModel , ViTFeatureExtractor
from transformers import AutoTokenizer ,  GPT2Config , default_data_collator


if torch.cuda.is_available():    

    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce GTX 1660 Ti


In [48]:
os.environ["WANDB_DISABLED"] = "true"
class config : 
    ENCODER = "google/vit-base-patch16-224"
    DECODER = "gpt2"
    TRAIN_BATCH_SIZE = 8
    VAL_BATCH_SIZE = 8
    VAL_EPOCHS = 1
    LR = 5e-5
    SEED = 42
    MAX_LEN = 128
    SUMMARY_LEN = 20
    WEIGHT_DECAY = 0.01
    MEAN = (0.485, 0.456, 0.406)
    STD = (0.229, 0.224, 0.225)
    TRAIN_PCT = 0.95
    NUM_WORKERS = mp.cpu_count()
    EPOCHS = 3
    IMG_SIZE = (224,224)
    LABEL_MASK = -100
    TOP_K = 1000
    TOP_P = 0.95

In [49]:
feature_extractor = ViTFeatureExtractor.from_pretrained(config.ENCODER)
tokenizer = AutoTokenizer.from_pretrained(config.DECODER)
tokenizer.pad_token = tokenizer.unk_token

In [50]:
transforms = transforms.Compose(
    [
        transforms.Resize(config.IMG_SIZE), 
        transforms.ToTensor(),
        transforms.Normalize(
            mean=0.5, 
            std=0.5
        )
   ]
)
df=  pd.read_csv("Flickr8k/captions.txt")
train_df , val_df = train_test_split(df , test_size = 0.2)
df.head()

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...


In [51]:
class ImgDataset(Dataset):
    def __init__(self, df,root_dir,tokenizer,feature_extractor, transform = None):
        self.df = df
        self.transform = transform
        self.root_dir = root_dir
        self.tokenizer= tokenizer
        self.feature_extractor = feature_extractor
        self.max_length = 50
    def __len__(self,):
        return len(self.df)
    def __getitem__(self,idx):
        caption = self.df.caption.iloc[idx]
        image = self.df.image.iloc[idx]
        img_path = os.path.join(self.root_dir , image)
        img = Image.open(img_path).convert("RGB")
        
        if self.transform is not None:
            img= self.transform(img)
        pixel_values = self.feature_extractor(img, return_tensors="pt").pixel_values
        captions = self.tokenizer(caption,
                                 padding='max_length',
                                 max_length=self.max_length).input_ids
        captions = [caption if caption != self.tokenizer.pad_token_id else -100 for caption in captions]
        encoding = {"pixel_values": pixel_values.squeeze(), "labels": torch.tensor(captions)}
        return encoding

### Load

In [63]:
train_dataset = ImgDataset(train_df, root_dir = "Flickr8k/Images",
                            tokenizer=tokenizer,feature_extractor = feature_extractor ,transform = transforms)
val_dataset = ImgDataset(val_df , root_dir = "Flickr8k/Images",
                            tokenizer=tokenizer,feature_extractor = feature_extractor , transform  = transforms)

In [70]:
val_dataset[0]

{'pixel_values': tensor([[[-0.5373, -0.5373, -0.5373,  ..., -0.4275, -0.4431, -0.4431],
          [-0.5059, -0.5216, -0.5059,  ..., -0.4118, -0.4431, -0.4431],
          [-0.5059, -0.5059, -0.5059,  ..., -0.4118, -0.4275, -0.4275],
          ...,
          [ 0.6000,  0.3333,  0.0667,  ...,  0.7098,  0.7255,  0.7255],
          [ 0.3804,  0.4588,  0.6157,  ...,  0.7255,  0.7569,  0.7569],
          [ 0.6784,  0.4902,  0.2078,  ...,  0.7098,  0.6627,  0.6471]],
 
         [[ 0.3098,  0.3098,  0.3098,  ...,  0.3098,  0.3255,  0.3255],
          [ 0.3255,  0.3098,  0.3255,  ...,  0.3412,  0.3255,  0.3255],
          [ 0.3255,  0.3255,  0.3255,  ...,  0.3569,  0.3412,  0.3412],
          ...,
          [ 0.4745,  0.1922, -0.0824,  ...,  0.5529,  0.5843,  0.5843],
          [ 0.2706,  0.3333,  0.5216,  ...,  0.5686,  0.6157,  0.6157],
          [ 0.5686,  0.3804,  0.1608,  ...,  0.5529,  0.5216,  0.5059]],
 
         [[ 1.0000,  1.0000,  1.0000,  ...,  0.9843,  0.9686,  0.9686],
          [ 

In [65]:
for x, y in test_dataloader:
    print(x, val_dataset[0][x].shape)
    break

pixel_values torch.Size([3, 224, 224])


In [53]:
batch_size = 64 

# Create data loaders.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size)
test_dataloader = DataLoader(val_dataset, batch_size=batch_size)

for X, y in test_dataloader:    # X = data, y = label
    print(f"Shape of X [N, C, H, W]: {X}")    # N = batch size, C = Color, H, W
    print(f"Shape of y: {y.shape} {y.dtype}")
    break

Shape of X [N, C, H, W]: pixel_values


AttributeError: 'str' object has no attribute 'shape'

In [None]:
idx = 0
img_arr = []
caption_arr = []
for idx, (imgs, captions) in enumerate(loader):
    for idx in range(4):
        img_arr.append(imgs[idx])
        caption_arr.append(captions[idx])
            
        plt.title(caption_arr[idx])
        plt.imshow(img_arr[idx].permute(1,2,0))
            
        break
    break