<a href="https://colab.research.google.com/github/vjhawar12/Image-Captioning/blob/main/Image_Captioning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install pinned releases of torchtext, torch and torchvision for this notebook.
!pip install torchtext==0.17.0 && pip install torch==2.2.0 && pip install torchvision==0.17.0

In [None]:
from torchtext.vocab import vocab
import torch
import torchvision
from torchvision.transforms import v2
from torchvision.io import decode_image
import torch.nn as nn
from torchvision.datasets import CocoDetection
from torch.utils.data import DataLoader, Dataset
from pycocotools.coco import COCO
from pprint import pprint
import pandas as pd
from skimage import io
from os import path
from random import randint
from collections import Counter
from google.cloud import storage

In [None]:
# Image encoder: MobileNetV2 fetched through torch.hub with pretrained weights.
# NOTE(review): the hub ref is 'v0.10.0' while the install cell pins torchvision 0.17.0 -- confirm the mismatch is intended.
model = torch.hub.load('pytorch/vision:v0.10.0', 'mobilenet_v2', pretrained=True) # feature map: [1, 1280]

# Replace the classification head with an identity so the forward pass returns
# the features that would otherwise have been fed to the classifier.
model.classifier = nn.Identity() # removing the final classification layer to retrieve the feature map

In [35]:
class GRU_Decoder(nn.Module):
  """GRU caption decoder whose initial hidden state comes from an image feature vector.

  The image features are projected to the hidden size and used as the starting
  hidden state of every GRU layer; the caption tokens are embedded, run through
  the GRU (teacher forcing) and mapped to vocabulary logits.

  Args:
    feature_map_size: size of the incoming image feature vector.
    embed_size: dimensionality of the token embeddings.
    hidden_size: GRU hidden-state size.
    num_layers: number of stacked GRU layers.
    vocab_size: number of tokens in the vocabulary (embedding rows / output logits).
  """

  def __init__(self, feature_map_size=1280, embed_size=256, hidden_size=512, num_layers=2, vocab_size=10000):
    super().__init__()

    self.feature_map_size = feature_map_size
    self.embed_size = embed_size
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.vocab_size = vocab_size

    # token id -> dense vector
    self.embed = nn.Embedding(num_embeddings=self.vocab_size, embedding_dim=self.embed_size)
    # image feature vector -> hidden-state space
    self.proj = nn.Linear(in_features=self.feature_map_size, out_features=self.hidden_size)
    # batch_first is left at its default (False), so the GRU consumes (seq_len, batch, embed_size)
    self.gru = nn.GRU(input_size=self.embed_size, hidden_size=self.hidden_size, num_layers=self.num_layers)
    # hidden state -> vocabulary logits
    self.fc = nn.Linear(in_features=self.hidden_size, out_features=self.vocab_size)

  def forward(self, x, words, feature_map):
    """Compute vocabulary logits for every position of the teacher-forced captions.

    Args:
      x: unused; kept only so existing call sites keep working.
      words: integer token ids. Because the GRU is not batch_first this is
        expected as (seq_len, batch) -- NOTE(review): confirm against the caller.
      feature_map: image features whose first dimension is the batch size and
        whose last dimension is ``feature_map_size``.

    Returns:
      Tensor of logits with the GRU output's leading dimensions and a last
      dimension of ``vocab_size``.
    """
    batch_size = feature_map.size(0)
    embedded = self.embed(words)

    # Project the image features into hidden-state space: one vector per sample.
    h0 = self.proj(feature_map).reshape(1, batch_size, self.hidden_size)
    # The GRU needs one initial state per layer, shaped (num_layers, batch, hidden).
    # A plain reshape cannot create the extra layers (element counts differ when
    # num_layers > 1), so every layer is started from the same projected state.
    h0 = h0.repeat(self.num_layers, 1, 1)

    output, _ = self.gru(embedded, h0) # teacher forcing: ground-truth tokens in, image state as h0
    logits = self.fc(output)

    return logits



In [51]:
class MiniCoco(Dataset):
  """Image/caption dataset built from one split of a captioning annotation JSON.

  The JSON is expected to have a top-level ``"images"`` list whose entries carry
  ``"split"``, ``"filename"`` and ``"sentences"`` (each sentence with ``"tokens"``).

  Args:
    json_file: path to the annotation JSON.
    root_dir: directory that contains the image files named by ``"filename"``.
    split: ``"train"`` (entries tagged ``"restval"``), ``"val"`` or ``"test"``.
    transform: optional callable applied to the decoded image tensor.

  Raises:
    ValueError: if ``split`` is not one of the three supported names.
  """

  def __init__(self, json_file, root_dir, split, transform=None):
    super().__init__()

    # dataset split name -> value of the "split" field selected from the JSON
    split_tags = {"train": "restval", "val": "val", "test": "test"}
    if split not in split_tags:
      raise ValueError("Invalid split")

    self.full_data = pd.read_json(json_file)
    self.split = split
    self.root_dir = root_dir
    self.transform = transform

    wanted = split_tags[split]
    self.data = [obj for obj in self.full_data["images"] if obj["split"] == wanted]
    self.length = len(self.data)

    self.counter = Counter() # number of occurrences of each token across the stored captions

    if self.split == "train": # only want to store captions for train -- during test/val model should be generating without knowing any ground truth
      # nested list: one list of token lists per sample
      self.captions = [
          [sentence["tokens"] for sentence in sample["sentences"]]
          for sample in self.data
      ]
      for sample_captions in self.captions:
        for tokens in sample_captions:
          self.counter.update(tokens) # keeping track of the frequency of each token
    else:
      self.captions = None

    # NOTE(review): for val/test the counter is empty, so this vocabulary holds only
    # the special tokens; reuse the train vocabulary if those splits must encode text.
    special_tokens = ['<unk>', '<pad>', '<bos>', '<eos>']
    self.vocab = vocab(self.counter, specials=special_tokens, special_first=True, min_freq=2) # mapping words to integers
    self.vocab.set_default_index(self.vocab["<unk>"])

    # Captions exist only for train; guarding here keeps val/test construction from failing on None.
    if self.captions is not None:
      self.captions = [
          [self.encode(tokens) for tokens in sample_captions]
          for sample_captions in self.captions
      ]

  def encode(self, text):
    """Map a token sequence to ids wrapped in ``<bos>`` ... ``<eos>``.

    Tokens missing from the vocabulary (e.g. dropped by ``min_freq``) resolve to
    the vocabulary's default index instead of raising ``KeyError``.
    """
    # Index the vocab directly: it honours the default index and avoids
    # rebuilding the whole string->index dict for every token.
    return [self.vocab["<bos>"]] + [self.vocab[s] for s in text] + [self.vocab["<eos>"]]

  def __len__(self):
    """Return the number of samples in the selected split."""
    return self.length

  def __getitem__(self, index):
    """Return ``(image, captions)`` for one sample.

    For train, ``captions`` is one randomly chosen encoded caption (a list of
    token ids). For val/test no captions are stored, so an empty list is returned.
    """
    # torchvision is already a dependency of this file; imported here so the
    # path-based reader is in scope without touching the file-level imports.
    from torchvision.io import ImageReadMode, read_image

    if self.split == "train":
      # train images should only have 1 caption (more efficient when teacher-forcing)
      options = self.captions[index]
      captions = options[randint(0, len(options) - 1)]
    else:
      captions = []

    # storing the image into memory as a torch tensor
    image_name = path.join(self.root_dir, self.data[index]["filename"])
    # read_image takes a file path and an ImageReadMode enum, which also works on
    # torchvision releases whose decode_image only accepts an encoded byte tensor.
    image = read_image(image_name, mode=ImageReadMode.RGB)

    if self.transform is not None:
      image = self.transform(image)

    return image, captions

In [52]:
# Preprocessing for images fed to the pretrained encoder.
# - ToImage accepts PIL images, arrays or tensors and yields an image tensor.
# - ToDtype(scale=True) converts to float32 in [0, 1]; Normalize requires float input.
# - No bounding-box sanitising step: captioning samples carry no boxes, and that
#   transform raises when none are present.
transform_encoder = v2.Compose(
    [
        v2.ToImage(),
        v2.Resize((224, 224), antialias=True),
        v2.ToDtype(torch.float32, scale=True),
        v2.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ]
)



In [53]:
# Transform handed to the MiniCoco datasets below.
# NOTE(review): v2.ToTensor is deprecated in torchvision's v2 API and appears to
# leave inputs that are already tensors unchanged -- confirm this is the intended behaviour.
transform_decoder = v2.Compose(
    [
        v2.ToTensor(),
    ]
)

In [None]:
# Interactive login that sets up application-default credentials for the storage client used below.
!gcloud auth application-default login

In [3]:
def download_blob(bucket_name, source_blob_name, destination_file_name, project="Image Captioning"):
  """Download one object from a Google Cloud Storage bucket to a local file.

  Args:
    bucket_name: name of the bucket that holds the object.
    source_blob_name: object name (path inside the bucket).
    destination_file_name: local path the object is written to.
    project: project passed to ``storage.Client``; defaults to the value this
      notebook has always used.
      NOTE(review): the default looks like a display name rather than a
      project ID -- confirm it resolves as intended.
  """
  client = storage.Client(project=project)
  blob = client.bucket(bucket_name).blob(source_blob_name)
  blob.download_to_filename(destination_file_name)

In [23]:
# Fetch each archive from the "img-captioning" bucket into /content, in the listed order.
for blob_name, local_zip in (
    ("images.cocodataset.org/zips/test2014.zip", "/content/test2014.zip"),
    ("images.cocodataset.org/zips/train2014.zip", "/content/train2014.zip"),
    ("images.cocodataset.org/zips/val2014.zip", "/content/val2014.zip"),
    ("archive.zip", "/content/archive.zip"),
):
  download_blob("img-captioning", blob_name, local_zip)



In [None]:
# Extract every downloaded archive into its own directory under /content.
# Only the leading "!" is IPython's shell escape; the chained commands must be plain shell.
!unzip /content/test2014.zip -d /content/test2014/ && unzip /content/train2014.zip -d /content/train2014/ && unzip /content/archive.zip -d /content/archive/ && unzip /content/val2014.zip -d /content/val2014/

In [27]:
# Delete the downloaded zip archives (run after the extraction cell above).
!rm /content/test2014.zip /content/train2014.zip /content/val2014.zip /content/archive.zip

In [44]:
# List the files in /content/archive.
!cd /content/archive && ls

dataset_coco.json  dataset_flickr30k.json  dataset_flickr8k.json


In [None]:
# Annotation file and per-split image directories.
# NOTE(review): MiniCoco joins root_dir with each entry's "filename"; confirm that
# the files for the "restval", "val" and "test" entries actually live under these
# directories (the JSON entries may record their own source folder).
json_file = "/content/archive/dataset_coco.json"
root_train_dir = "/content/train2014/train2014/"
root_test_dir = "/content/test2014/test2014/"
root_val_dir = "/content/val2014/val2014/"

# One dataset per split, all sharing the same annotation file and image transform.
train_data = MiniCoco(json_file, root_train_dir, "train", transform=transform_decoder)
test_data = MiniCoco(json_file, root_test_dir, "test", transform=transform_decoder)
val_data = MiniCoco(json_file, root_val_dir, "val", transform=transform_decoder)

In [None]:
# Batches of 32; only the training loader shuffles.
# NOTE(review): no collate_fn is passed, so default collation is used. Train captions
# are variable-length token lists and transform_decoder does not resize images --
# confirm batching works, or add a collate_fn that pads captions and equalises image sizes.
train_dataloader = DataLoader(train_data, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=32, shuffle=False)
val_dataloader = DataLoader(val_data, batch_size=32, shuffle=False)