In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
!cp "/content/drive/MyDrive/AI final/cleaned data/images_9653.zip" /content/

In [3]:
!unzip /content/images_9653.zip -d /content/images

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: /content/images/WOMEN-Graphic_Tees-id_00004383-01_1_front.jpg  
  inflating: /content/images/WOMEN-Jackets_Coats-id_00007577-01_7_additional.jpg  
  inflating: /content/images/MEN-Shirts_Polos-id_00003489-11_1_front.jpg  
  inflating: /content/images/MEN-Tees_Tanks-id_00002482-01_1_front.jpg  
  inflating: /content/images/WOMEN-Dresses-id_00004044-04_4_full.jpg  
  inflating: /content/images/WOMEN-Blouses_Shirts-id_00001942-01_4_full.jpg  
  inflating: /content/images/WOMEN-Jackets_Coats-id_00006710-03_2_side.jpg  
  inflating: /content/images/MEN-Tees_Tanks-id_00004170-04_1_front.jpg  
  inflating: /content/images/WOMEN-Dresses-id_00000173-02_1_front.jpg  
  inflating: /content/images/MEN-Sweaters-id_00004806-01_3_back.jpg  
  inflating: /content/images/WOMEN-Cardigans-id_00000788-02_3_back.jpg  
  inflating: /content/images/MEN-Shorts-id_00004249-02_4_full.jpg  
  inflating: /content/images/WOMEN-Dresses-id

In [4]:
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from PIL import Image
from PIL import ImageFile

import json
import os
import random
from tqdm import tqdm

In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [6]:
with open("/content/drive/MyDrive/AI final/cleaned data/cleaned_captions.json", "r") as f:
  captions = json.load(f)

with open("/content/drive/MyDrive/AI final/cleaned data/train_keys.txt", "r") as f:
  train_keys = [line.strip() for line in f if line.strip() != ""]

with open("/content/drive/MyDrive/AI final/cleaned data/test_keys.txt", "r") as f:
  test_keys = [line.strip() for line in f if line.strip() != ""]

def get_img_pair(key):
  if key not in captions:
    return None

  img_path = os.path.join("/content/images", key)

  try:
    img = Image.open(img_path).convert("RGB")
  except:
    ImageFile.LOAD_TRUNCATED_IMAGES = True
    img = Image.open(img_path).convert("RGB")

  caption = captions[key]

  return img, caption

In [7]:
class ImgCaption(Dataset):
  def __init__(self, keys, img_folder, captions_dict, processor, tokenizer, max_length=128):
    self.keys = keys
    self.img_folder = img_folder
    self.captions_dict = captions_dict
    self.processor = processor
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __getitem__(self, index):
    key = self.keys[index]
    img, caption = get_img_pair(key)

    pixel_vals = self.processor(img, return_tensors="pt").pixel_values.squeeze()

    labels = self.tokenizer(
        caption,
        padding="max_length",
        truncation=True,
        max_length=self.max_length,
        return_tensors="pt"
    ).input_ids.squeeze()

    return {"pixel_values" : pixel_vals, "labels" : labels}

  def __len__(self):
    return len(self.keys)

In [8]:
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
image_processor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/982M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/228 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/982M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/241 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (inte

In [9]:
train_dataset = ImgCaption(
    train_keys,
    img_folder="/content/images",
    captions_dict=captions,
    processor=image_processor,
    tokenizer=tokenizer
)

test_dataset = ImgCaption(
    test_keys,
    img_folder="/content/images",
    captions_dict=captions,
    processor=image_processor,
    tokenizer=tokenizer
)

In [10]:
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=True)

In [11]:
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
  model.train()

  tot_loss = 0

  progress = tqdm(train_loader)

  for batch in progress:
    pixel_vals = batch["pixel_values"].to(device)
    labels = batch["labels"].to(device)

    outputs = model(pixel_values=pixel_vals, labels=labels)
    loss = outputs.loss

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    tot_loss += loss.item()
    progress.set_postfix(loss=loss.item())

  avg_loss = tot_loss / len(train_loader)

  0%|          | 0/1931 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.
100%|██████████| 1931/1931 [05:05<00:00,  6.33it/s, loss=0.198]
100%|██████████| 1931/1931 [05:09<00:00,  6.24it/s, loss=0.259]
100%|██████████| 1931/1931 [05:13<00:00,  6.16it/s, loss=0.205]


In [20]:
model.save_pretrained("/content/drive/MyDrive/AI final/final_vision_model")

In [21]:
tokenizer.save_pretrained("/content/drive/MyDrive/AI final/final_tokenizer")

('/content/drive/MyDrive/AI final/final_tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/AI final/final_tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/AI final/final_tokenizer/vocab.json',
 '/content/drive/MyDrive/AI final/final_tokenizer/merges.txt',
 '/content/drive/MyDrive/AI final/final_tokenizer/added_tokens.json',
 '/content/drive/MyDrive/AI final/final_tokenizer/tokenizer.json')

In [22]:
image_processor.save_pretrained("/content/drive/MyDrive/AI final/final_image_processor")

['/content/drive/MyDrive/AI final/final_image_processor/preprocessor_config.json']

In [12]:
model.eval()

tot_test_loss = 0

with torch.no_grad():
  progress = tqdm(test_loader)

  for batch in progress:
    pixel_vals = batch["pixel_values"].to(device)
    labels = batch["labels"].to(device)

    outputs = model(pixel_values=pixel_vals, labels=labels)
    loss = outputs.loss

    tot_test_loss += loss.item()
    progress.set_postfix(test_loss=loss.item())

avg_test_loss = tot_test_loss / len(test_loader)
print(avg_test_loss)

100%|██████████| 483/483 [00:46<00:00, 10.33it/s, test_loss=0.2]

0.17920453215546242





In [13]:
def generate_caption(key):
  model.eval()

  img_path = "/content/images/" + key

  img = Image.open(img_path).convert("RGB")

  inputs = image_processor(img, return_tensors="pt")
  pixel_values = inputs["pixel_values"].to(device)

  with torch.no_grad():
    out_ids = model.generate(
        pixel_values,
        max_length=128,
        num_beams=4,
        early_stopping=True
    )

  caption = tokenizer.decode(out_ids[0], skip_special_tokens=True)

  return caption

In [14]:
from sentence_transformers import SentenceTransformer, util

In [15]:
similarity_model = SentenceTransformer("all-MiniLM-L6-v2")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [16]:
def similarity(x, y):
  e1 = similarity_model.encode(x, convert_to_tensor=True)
  e2 = similarity_model.encode(y, convert_to_tensor=True)
  return util.cos_sim(e1, e2).item()

In [17]:
import pandas as pd

res = []

for key in tqdm(test_keys):
  og_caption = captions[key]
  gen_caption = generate_caption(key)

  sim = similarity(og_caption, gen_caption)

  res.append({
      "key" : key,
      "og_caption" : og_caption,
      "gen_caption" : gen_caption,
      "similarity" : sim
  })

df = pd.DataFrame(res)

df.to_csv("/content/drive/MyDrive/AI final/test_caption_res.csv")

  0%|          | 0/1931 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
100%|██████████| 1931/1931 [07:27<00:00,  4.32it/s]


In [23]:
import pandas as pd

res = []

for key in tqdm(train_keys):
  og_caption = captions[key]
  gen_caption = generate_caption(key)

  sim = similarity(og_caption, gen_caption)

  res.append({
      "key" : key,
      "og_caption" : og_caption,
      "gen_caption" : gen_caption,
      "similarity" : sim
  })

df = pd.DataFrame(res)

df.to_csv("/content/drive/MyDrive/AI final/train_caption_res.csv")

100%|██████████| 7722/7722 [29:52<00:00,  4.31it/s]
