## Libraries

In [None]:
!pip install openai-clip
!pip install datasets

Collecting openai-clip
  Downloading openai-clip-1.0.1.tar.gz (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy (from openai-clip)
  Downloading ftfy-6.2.0-py3-none-any.whl (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.4/54.4 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: openai-clip
  Building wheel for openai-clip (setup.py) ... [?25l[?25hdone
  Created wheel for openai-clip: filename=openai_clip-1.0.1-py3-none-any.whl size=1368605 sha256=881811b519be8c1bb589c0dd09fa6d9833ebff1a1f708cabe3ba1fda564f97fc
  Stored in directory: /root/.cache/pip/wheels/08/77/8e/8d2f862df6bf7fb4e2007062d2cbaeae49862ec7b56d041229
Successfully built openai-clip
Installing collected packages: ftfy, openai-clip
Successfully installed ftfy-6.2.0 openai-clip-1.0.1
Collecting datasets
  Downlo

In [None]:
import json
import string, os, re, pickle
import pandas as pd
from PIL import Image
import requests
from io import StringIO
from tqdm import tqdm
from datasets import Dataset

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

import clip
from transformers import CLIPProcessor, CLIPModel, CLIPVisionConfig

In [None]:
# Mount Google Drive at /content/drive; the dataset paths and the checkpoint
# path used further down all live under this mount point (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Create training and testing datasets

In [None]:
image_path = "/content/drive/MyDrive/datasets/pokemon_hf_png"
caption_path = "/content/drive/MyDrive/datasets/pokemon_caption_hf.csv"

# One row per PNG file. The listing is sorted because os.listdir() returns
# entries in arbitrary order, which would make the seeded shuffle / split
# built from this frame differ between runs. Non-PNG entries are skipped.
image_files = sorted(name for name in os.listdir(image_path) if name.endswith(".png"))
imagename_df = pd.DataFrame({"filename": image_files}, dtype="object")
# Join key = file name without ".png". regex=False makes the pattern a literal
# string on every pandas version (as a regex, "." would match any character).
imagename_df["pokedex"] = imagename_df["filename"].str.replace(".png", "", regex=False)
imagename_df["filename"] = image_path + "/" + imagename_df["filename"]

caption_df = pd.read_csv(caption_path)
caption_df["pokedex"] = caption_df["pokedex"].astype(str)  # match the string key built from file names
# Drop the first CSV column by position (presumably a saved index column — TODO confirm).
caption_df = caption_df.drop(columns=caption_df.columns[[0]])

dataset_df = imagename_df.merge(caption_df, how="left", on="pokedex")
dataset_df = dataset_df[["filename", "caption"]].rename(columns={"caption": "text", "filename": "image"})

# The left join keeps images that have no caption row; their text is NaN (a float,
# not a caption string), so they are reported and removed before tokenization.
missing_captions = dataset_df["text"].isna()
if missing_captions.any():
    print(f"Dropping {missing_captions.sum()} image(s) without a caption")
    dataset_df = dataset_df[~missing_captions].reset_index(drop=True)

dataset_df.head()

Unnamed: 0,image,text
0,/content/drive/MyDrive/datasets/pokemon_hf_png...,a red and white ball with an angry look on its...
1,/content/drive/MyDrive/datasets/pokemon_hf_png...,a drawing of a green pokemon with red eyes
2,/content/drive/MyDrive/datasets/pokemon_hf_png...,a bunch of balls with faces drawn on them
3,/content/drive/MyDrive/datasets/pokemon_hf_png...,a cartoon character with a potted plant on his...
4,/content/drive/MyDrive/datasets/pokemon_hf_png...,a green and yellow toy with a red nose


In [None]:
class image_title_dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing image files with tokenized captions.

    Args:
        list_image_path: Sequence of image file paths, index-aligned with ``list_txt``.
        list_txt: Caption strings; tokenized once at construction with ``clip.tokenize``.
        transform: Optional callable applied to each opened PIL image. When ``None``
            (the default) the notebook-level ``preprocess`` is looked up at
            item-access time, so existing two-argument callers behave as before.
    """

    def __init__(self, list_image_path, list_txt, transform=None):
        self.image_path = list_image_path
        self.title = clip.tokenize(list_txt)
        self.transform = transform

    def __len__(self):
        """Return the number of tokenized captions."""
        return len(self.title)

    def __getitem__(self, idx):
        """Return ``(transformed_image, caption_tokens)`` for position ``idx``."""
        # Resolve the fallback lazily: `preprocess` is defined in a later cell
        # than this class, so it must not be captured at construction time.
        transform = self.transform if self.transform is not None else preprocess
        # The context manager releases the file handle as soon as the image
        # has been transformed.
        with Image.open(self.image_path[idx]) as image:
            image_tensor = transform(image)
        return image_tensor, self.title[idx]

In [None]:
def gen():
    """Yield one ``{"caption", "filename"}`` record per row of ``dataset_df``."""
    for caption, filename in zip(dataset_df["text"], dataset_df["image"]):
        yield {"caption": caption, "filename": filename}


# Shuffle once with a fixed seed, then cut off the last 10% as the test split
# (shuffle=False so the split does not reshuffle a second time).
dataset = (
    Dataset.from_generator(gen)
    .shuffle(seed=123)
    .train_test_split(test_size=0.1, shuffle=False)
)

# Plain Python lists of the training examples, in split order.
list_image_path = [example["filename"] for example in dataset["train"]]
list_txt = [example["caption"] for example in dataset["train"]]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Bind the torch dataset to its own name: reusing `dataset` here would replace the
# train/test DatasetDict and make the held-out test split unreachable afterwards.
train_dataset = image_title_dataset(list_image_path, list_txt)
train_dataloader = DataLoader(train_dataset, batch_size=100, shuffle=True)

## Load pretrained models and set up training parameters

In [None]:
# The CLIP weights that are actually trained are loaded by `clip.load(...)` in the
# next cell, which binds `model`. Loading a Hugging Face `CLIPModel` into `model`
# here as well would only cost a ~605 MB download and be overwritten before any
# use, so only the processor and the device string are set up in this cell.
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

'cuda:0'

In [None]:
# Load the openai-clip ViT-B/32 model onto `device` (chosen in the previous cell),
# together with its image transform. From here on `model` refers to this model;
# `preprocess` is the transform applied to each image in
# image_title_dataset.__getitem__.
model, preprocess = clip.load("ViT-B/32", device=device, jit=False)

100%|███████████████████████████████████████| 338M/338M [00:04<00:00, 75.8MiB/s]


In [None]:
def convert_models_to_fp32(model):
    """Cast every parameter of ``model``, and its gradient when present, to float32 in place.

    Parameters whose ``grad`` is ``None`` (frozen, not reached by the last backward
    pass, or before any ``backward()`` call) still have their data cast; only the
    gradient cast is skipped, instead of raising ``AttributeError``.

    Args:
        model: Any object exposing ``parameters()`` (e.g. a ``torch.nn.Module``).
    """
    for p in model.parameters():
        p.data = p.data.float()
        if p.grad is not None:
            p.grad.data = p.grad.data.float()

In [None]:
# Two cross-entropy criteria, one per direction of the contrastive objective
# (image->text logits and text->image logits).
loss_img = nn.CrossEntropyLoss()
loss_txt = nn.CrossEntropyLoss()

# NOTE(review): torch.optim.Adam applies weight_decay as an L2 penalty added to the
# gradient; if decoupled weight decay was intended, that is AdamW — confirm.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=5e-5,
    betas=(0.9, 0.98),
    eps=1e-6,
    weight_decay=0.2,
)

## Model Training

In [None]:
num_epochs = 10

# Create the checkpoint folder up front so a missing directory cannot fail the
# first torch.save after a whole epoch of training.
checkpoint_dir = "/content/drive/MyDrive/Pretrained Models"
os.makedirs(checkpoint_dir, exist_ok=True)

# The fp32 <-> fp16 round trip around optimizer.step() is only meant for the
# half-precision weights used on GPU. On CPU the openai-clip loader keeps float32
# weights (per the CLIP reference implementation), and converting them to fp16
# after the first step would break or degrade every following forward pass.
train_on_cpu = torch.device(device).type == "cpu"

for epoch in range(num_epochs):
    pbar = tqdm(train_dataloader, total=len(train_dataloader))
    for batch in pbar:
        optimizer.zero_grad()

        images, texts = batch
        images = images.to(device)
        texts = texts.to(device)

        logits_per_image, logits_per_text = model(images, texts)

        # Pair i of the batch is the positive for row i in both logit matrices;
        # len(images) (not the nominal batch size) keeps the last, smaller batch valid.
        ground_truth = torch.arange(len(images), dtype=torch.long, device=device)
        total_loss = (loss_img(logits_per_image, ground_truth) + loss_txt(logits_per_text, ground_truth)) / 2

        total_loss.backward()
        if train_on_cpu:
            optimizer.step()
        else:
            # Step in float32, then restore the half-precision weights.
            convert_models_to_fp32(model)
            optimizer.step()
            clip.model.convert_weights(model)

        pbar.set_description(f"Epoch {epoch}/{num_epochs}, Loss: {total_loss.item():.4f}")

    # One checkpoint per epoch; 'loss' is the loss of the last batch of that epoch.
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': total_loss,
        }, f"{checkpoint_dir}/clip_easy_{epoch}.pt")

Epoch 0/10, Loss: 1.7070: 100%|██████████| 8/8 [01:12<00:00,  9.10s/it]
Epoch 1/10, Loss: 0.8936: 100%|██████████| 8/8 [00:32<00:00,  4.09s/it]
Epoch 2/10, Loss: 0.7339: 100%|██████████| 8/8 [00:31<00:00,  3.88s/it]
Epoch 3/10, Loss: 0.2155: 100%|██████████| 8/8 [00:32<00:00,  4.11s/it]
Epoch 4/10, Loss: 0.0940: 100%|██████████| 8/8 [00:31<00:00,  4.00s/it]
Epoch 5/10, Loss: 0.1609: 100%|██████████| 8/8 [00:32<00:00,  4.05s/it]
Epoch 6/10, Loss: 0.1655: 100%|██████████| 8/8 [00:32<00:00,  4.12s/it]
Epoch 7/10, Loss: 0.1652: 100%|██████████| 8/8 [00:33<00:00,  4.13s/it]
Epoch 8/10, Loss: 0.1230: 100%|██████████| 8/8 [00:32<00:00,  4.05s/it]
Epoch 9/10, Loss: 0.1298: 100%|██████████| 8/8 [00:31<00:00,  3.93s/it]
