In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
!pip3 install sacrebleu
!pip3 install spacy

Collecting sacrebleu
  Downloading sacrebleu-1.4.13-py3-none-any.whl (43 kB)
[K     |████████████████████████████████| 43 kB 425 kB/s eta 0:00:011
[?25hCollecting portalocker
  Downloading portalocker-2.0.0-py2.py3-none-any.whl (11 kB)
Installing collected packages: portalocker, sacrebleu
Successfully installed portalocker-2.0.0 sacrebleu-1.4.13
You should consider upgrading via the '/home/pratik/anaconda3/bin/python3 -m pip install --upgrade pip' command.[0m
Collecting spacy
  Downloading spacy-2.3.2-cp37-cp37m-manylinux1_x86_64.whl (9.9 MB)
[K     |████████████████████████████████| 9.9 MB 3.1 MB/s eta 0:00:01
[?25hCollecting cymem<2.1.0,>=2.0.2
  Downloading cymem-2.0.3-cp37-cp37m-manylinux1_x86_64.whl (32 kB)
Collecting preshed<3.1.0,>=3.0.2
  Downloading preshed-3.0.2-cp37-cp37m-manylinux1_x86_64.whl (118 kB)
[K     |████████████████████████████████| 118 kB 4.5 MB/s eta 0:00:01
[?25hCollecting thinc==7.4.1
  Downloading thinc-7.4.1-cp37-cp37m-manylinux1_x86_64.whl (2.1 MB)


# Imports

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torch.utils.data import Dataset, DataLoader
import torchvision.models as Tmodels
from torch.nn.utils.rnn import pack_padded_sequence
from torchvision.models import resnet50
from torchvision import transforms, utils
from skimage import io, transform
import matplotlib.pyplot as plt # for plotting
import numpy as np
import spacy.cli
import spacy  # for tokenizer
import os
import csv
import sacrebleu
spacy.cli.download("it_core_news_sm")

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('it_core_news_sm')


# Constants

In [3]:
MAX_TEXT_LENGTH = 100
BEAM=5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sp = spacy.load("it_core_news_sm")

# Image Transforms

In [4]:
class Rescale(object):
  """Rescale the image in a sample to a given size.

  Args:
      output_size (tuple or int): Desired output size. If tuple, output is
          matched to output_size. If int, smaller of image edges is matched
          to output_size keeping aspect ratio the same.
  """

  def __init__(self, output_size):
    assert isinstance(output_size, (int, tuple))
    self.output_size = output_size

  def __call__(self, image):
    h, w = image.shape[:2]
    if isinstance(self.output_size, int):
        if h > w:
            new_h, new_w = self.output_size * h / w, self.output_size
        else:
            new_h, new_w = self.output_size, self.output_size * w / h
    else:
        new_h, new_w = self.output_size

    new_h, new_w = int(new_h), int(new_w)
    img = transform.resize(image, (new_h, new_w))
    return img


class ToTensor(object):
  """Convert ndarrays in sample to Tensors."""

  def __call__(self, image):
    # swap color axis because
    # numpy image: H x W x C
    # torch image: C X H X W
    image = image.transpose((2, 0, 1))
    return image


IMAGE_RESIZE = (256, 256)
# Sequentially compose the transforms
img_transform = transforms.Compose([Rescale(IMAGE_RESIZE), ToTensor()])

# Vocabulary

In [5]:
class Vocabulary:
  def __init__(self):
    self.freq = {"<PAD>": 0, "<SOS>": 0, "<EOS>": 0, "<UNK>": 0}
    self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
    self.stoi = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
    self.idx = 4
    

  def __len__(self):
    return len(self.itos)

  @staticmethod
  def tokenizer(text):
    tokens =  [tok.text.lower() for tok in sp(text)]
    return tokens

  def build_vocabulary(self, sentence_list):

    for sentence in sentence_list:
      for word in self.tokenizer(sentence):
        if word not in self.freq:
          self.freq[word] = 1

        else:
          self.freq[word] += 1

        if self.freq[word] == 1:
          self.stoi[word] = self.idx
          self.itos[self.idx] = word
          self.idx += 1

  def numericalize(self, text):
    tokenized_text = self.tokenizer(text)

    return [
      self.stoi[token] if token in self.stoi else self.stoi["<UNK>"]
      for token in tokenized_text
    ]

# Captions Preprocessing

In [6]:
class CaptionsPreprocessing:
  """Preprocess the captions, generate vocabulary and convert words to tensor tokens

  Args:
      captions_file_path (string): captions tsv file path
  """
  def __init__(self, captions_file_path):
    self.captions_file_path = captions_file_path

    # Read raw captions
    self.raw_captions_dict = self.read_raw_captions()

    # Preprocess captions
    self.captions_dict = self.process_captions()

    # Create vocabulary
    self.vocab =  Vocabulary()
    self.generate_vocabulary()

  def read_raw_captions(self):
    """
    Returns:
        Dictionary with raw captions list keyed by image ids (integers)
    """

    captions_dict = {}
    with open(self.captions_file_path, 'r', encoding='utf-8') as f:
      for img_caption_line in f.readlines():
        img_captions = img_caption_line.strip().split('\t')
        captions_dict[int(img_captions[0])] = img_captions[1:]

    return captions_dict

  def process_captions(self):
    """
    Use this function to generate dictionary and other preprocessing on captions
    """

    raw_captions_dict = self.raw_captions_dict

    # Do the preprocessing here
    captions_dict = raw_captions_dict

    return captions_dict

  def generate_vocabulary(self):
    """
    Use this function to generate dictionary and other preprocessing on captions
    """

    captions_dict = self.captions_dict

    # Generate the vocabulary

    for captionList in captions_dict.values():
      self.vocab.build_vocabulary(captionList)

    

  def captions_transform(self, img_caption_list):
    """
    Use this function to generate tensor tokens for the text captions
    Args:
        img_caption_list: List of captions for a particular image
    """
    vocab = self.vocab

    # Generate tensors

    img_caption_List= img_caption_list.copy()
    captions_length = []
    for index, caption in enumerate(img_caption_list):
      numericalized_caption  = [self.vocab.stoi["<SOS>"]]
      numericalized_caption += self.vocab.numericalize(caption)
      numericalized_caption.append(self.vocab.stoi["<EOS>"])
      captions_length.append(len(numericalized_caption))
      img_caption_List[index] = numericalized_caption

    img_caption_matrix  = np.zeros([len(img_caption_List),MAX_TEXT_LENGTH])
    for i,j in enumerate(img_caption_List):
      img_caption_matrix[i][0:len(j)] = j

    caption_tensor =  torch.from_numpy(img_caption_matrix)
    caption_tensor = caption_tensor.long()

    return caption_tensor, torch.tensor(captions_length).int()

# Set the captions tsv file path
CAPTIONS_FILE_PATH = '/home/pratik/Downloads/akshaymodels/train_captions.tsv'
captions_preprocessing_obj = CaptionsPreprocessing(CAPTIONS_FILE_PATH)

# TEST_CAPTIONS_FILE_PATH = 'drive/My Drive/Colab Notebooks/CNN/test_captions.tsv'
# test_captions_preprocessing_obj = CaptionsPreprocessing(TEST_CAPTIONS_FILE_PATH)

# Dataset Class

In [7]:
class ImageCaptionsDataset(Dataset):
  def __init__(self, img_dir, captions_dict, img_transform=None, captions_transform=None):
    """
    Args:
        img_dir (string): Directory with all the images.
        captions_dict: Dictionary with captions list keyed by image ids (integers)
        img_transform (callable, optional): Optional transform to be applied
            on the image sample.

        captions_transform: (callable, optional): Optional transform to be applied
            on the caption sample (list).
    """
    self.img_dir = img_dir
    self.captions_dict = captions_dict
    self.img_transform = img_transform
    self.captions_transform = captions_transform

    self.image_ids = list(captions_dict.keys())

  def __len__(self):
    return len(self.image_ids)

  def __getitem__(self, idx):
    img_name = os.path.join(self.img_dir, 'image_{}.jpg'.format(self.image_ids[idx]))
    image = io.imread(img_name)
    captions = self.captions_dict[self.image_ids[idx]]

    if self.img_transform:
        image = self.img_transform(image)

    if self.captions_transform:
        captions, captions_length = self.captions_transform(captions)

    sample = {'id': idx, 'image': image, 'captions': captions, 'captions_length': captions_length}

    return sample

# Save and Load Model

In [15]:
def save_checkpoint(state, filename="/home/pratik/Downloads/akshaymodels/model.tar"):
    print("=> Saving checkpoint")
    torch.save(state, filename)

def load_checkpoint(checkpoint, model, optimizer):
    print("=> Loading checkpoint")
    model.load_state_dict(checkpoint["state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer"])
    step = checkpoint["step"]
    return step

# Encoder CNN

In [9]:
class EncoderCNN(nn.Module):
  def __init__(self, embed_size):
    """Load the pretrained ResNet-50 and replace top fc layer."""
    super(EncoderCNN, self).__init__()
    resnet = resnet50(pretrained=True)
    modules = list(resnet.children())[:-1]      # delete the last fc layer.
    self.resnet = nn.Sequential(*modules)
    self.modules__ = resnet
    self.linear = nn.Linear(resnet.fc.in_features, embed_size)
    self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)
    
  def forward(self, images):
    """Extract feature vectors from input images."""
    with torch.no_grad():
      features = self.resnet(images.type(torch.cuda.FloatTensor))
      features = features.reshape(features.size(0), -1)
    features = self.bn(F.relu(self.linear(features)))
    return features

# Decoder RNN

In [10]:
class DecoderRNN(nn.Module):
  def __init__(self, embed_size, hidden_size, vocab_size, num_layers=1, drop_prob=0.5):
    super(DecoderRNN, self).__init__()
    
    # define the properties
    self.embed_size = embed_size
    self.hidden_size = hidden_size
    self.vocab_size = vocab_size
    self.lstm = nn.LSTMCell(input_size=embed_size, hidden_size=hidden_size)
    self.dropout = nn.Dropout(drop_prob)
    self.linear = nn.Linear(in_features=self.hidden_size, out_features=self.vocab_size)
    self.embed = nn.Embedding(num_embeddings=self.vocab_size, embedding_dim=self.embed_size)
    self.l2 = nn.Linear(1, self.hidden_size)


  def forward(self, features, captions, captions_length):
    captions_length, sort_ind = captions_length.sort(dim=0, descending=True)
    captions = captions[sort_ind]
    features = features[sort_ind]
    batch_size = features.size(0)
    feature_mean = torch.mean(features, dim = 1)
    
    hidden_state = self.l2(feature_mean.unsqueeze(1))
    [hidden_state, cell_state] = [hidden_state.to(device)]*2

    outputs = torch.zeros((batch_size, captions.size(1), self.vocab_size)).to(device)
    decode_lengths = (captions_length).tolist()

    # decoder is fed encoding as well as captions except <EOS> 
    for t in range(max(decode_lengths)):
      batch_size_t = sum([l > t for l in decode_lengths])
      if t == 0:
        captions_embed = features
      else:
        captions_embed = self.embed(captions[:batch_size_t,t-1])

      hidden_state, cell_state = self.lstm(captions_embed, (hidden_state[:batch_size_t], cell_state[:batch_size_t]))
      hidden_state = self.dropout(hidden_state)
      out = self.linear(hidden_state)
      outputs[:batch_size_t, t, :] = out  # contains <SOS> ---- <EOS>
    return outputs, captions, sort_ind, captions_length

# Model Architecture

In [11]:
class ImageCaptionsNet(nn.Module):
  def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
    super(ImageCaptionsNet, self).__init__()
    self.hidden_size = hidden_size
    self.encoderCNN = EncoderCNN(embed_size)
    self.decoderRNN = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers)


  def forward(self,  images, captions, captions_length):
    features = self.encoderCNN(images)
    features = torch.repeat_interleave(features,dim = 0,repeats = 5)
    outputs, captions, sort_id, decode_lengths = self.decoderRNN(features, captions, captions_length)
    return outputs, captions, sort_id, decode_lengths

# Training Loop

In [17]:
IMAGE_DIR = '/home/pratik/Downloads/akshaymodels/1/'
# IMAGE_TEST_DIR = 'drive/My Drive/Colab Notebooks/CNN/test_images/1/'
# PRIVATE_IMAGE_TEST_DIR = 'drive/My Drive/Colab Notebooks/CNN/private_test_images/1/'


# Creating the Dataset
train_dataset = ImageCaptionsDataset(
    IMAGE_DIR, captions_preprocessing_obj.captions_dict, img_transform=img_transform,
    captions_transform=captions_preprocessing_obj.captions_transform
)


# # Creating the test Dataset
# test_dataset = ImageCaptionsDataset(
#     IMAGE_TEST_DIR, test_captions_preprocessing_obj.captions_dict, img_transform=img_transform,
#     captions_transform=test_captions_preprocessing_obj.captions_transform
# )

# # Creating the test Dataset
# private_test_dataset = ImageCaptionsDataset(
#     PRIVATE_IMAGE_TEST_DIR, test_captions_preprocessing_obj.captions_dict, img_transform=img_transform,
#     captions_transform=test_captions_preprocessing_obj.captions_transform
# )


# Define your hyperparameters
NUMBER_OF_EPOCHS = 3
LEARNING_RATE = 1e-3
BATCH_SIZE = 32
embed_size = 1024
hidden_size = 512
num_layers = 1
vocab_size = len(captions_preprocessing_obj.vocab) # to be changed for training
NUM_WORKERS = 4 # Parallel threads for dataloading


load_model = True
save_model = True


net = ImageCaptionsNet(embed_size, hidden_size, vocab_size, num_layers).to(device)

# assign class weight to each vocab index
class_weights = torch.ones(vocab_size).to(device)
class_weights[captions_preprocessing_obj.vocab.stoi["<PAD>"]] = 0  # ignore pading loss

loss_function = nn.CrossEntropyLoss(weight=class_weights).to(device)

params = list(net.encoderCNN.parameters()) + list(net.decoderRNN.parameters())

optimizer = optim.Adam(params = params, lr=LEARNING_RATE, weight_decay = 0)

step = 0

if load_model:
  step = load_checkpoint(torch.load("/home/pratik/Downloads/akshaymodels/model.tar"), net, optimizer)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)
# test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)

=> Loading checkpoint


In [18]:
net.train()

# sample = next(iter(train_loader))
for epoch in range(NUMBER_OF_EPOCHS):

  for batch_idx, sample in enumerate(train_loader):
    image_batch, captions_batch, captions_length = (sample['image']/255).to(device), sample['captions'].to(device), sample['captions_length'].to(device)

    caption_batch = caption_batch.view(-1,captions_batch.shape[2])
    captions_length = captions_length.view(-1)
    
    
    outputs, captions, sort_id, decode_lengths = net(image_batch, captions_batch, captions_length)
    
    loss = loss_function(outputs.reshape(-1,outputs.shape[2]), captions.reshape(-1))
    
    net.zero_grad()
    loss.backward()
    optimizer.step()

  if save_model:
    checkpoint = { "state_dict": net.state_dict(), "optimizer": optimizer.state_dict(), "step": step }
    save_checkpoint(checkpoint)
  step+=1
  print("Iteration: " + str(step))
  print("loss", loss)

=> Saving checkpoint
Iteration: 10
loss tensor(2.4988, device='cuda:0', grad_fn=<NllLossBackward>)
=> Saving checkpoint
Iteration: 11
loss tensor(2.2724, device='cuda:0', grad_fn=<NllLossBackward>)
=> Saving checkpoint
Iteration: 12
loss tensor(2.5595, device='cuda:0', grad_fn=<NllLossBackward>)
=> Saving checkpoint
Iteration: 13
loss tensor(2.3729, device='cuda:0', grad_fn=<NllLossBackward>)
=> Saving checkpoint
Iteration: 14
loss tensor(2.1385, device='cuda:0', grad_fn=<NllLossBackward>)
=> Saving checkpoint
Iteration: 15
loss tensor(2.2849, device='cuda:0', grad_fn=<NllLossBackward>)
=> Saving checkpoint
Iteration: 16
loss tensor(2.0774, device='cuda:0', grad_fn=<NllLossBackward>)
=> Saving checkpoint
Iteration: 17
loss tensor(2.1240, device='cuda:0', grad_fn=<NllLossBackward>)
=> Saving checkpoint
Iteration: 18
loss tensor(2.1692, device='cuda:0', grad_fn=<NllLossBackward>)
=> Saving checkpoint
Iteration: 19
loss tensor(2.1939, device='cuda:0', grad_fn=<NllLossBackward>)
=> Saving 

KeyboardInterrupt: 

# Prediction

In [25]:
def temp_pred(model, image, vocabulary):
  result_caption = []
  with torch.no_grad():
    model.eval()
    input = model.encoderCNN(image)

    feature_mean = torch.mean(input, dim = 1)
    hidden_state = model.decoderRNN.l2(feature_mean.unsqueeze(1))
    [hidden_state, cell_state] = [hidden_state.to(device)]*2

    hidden  = torch.zeros((1,model.hidden_size)).to(device)
    cell = torch.zeros((1,model.hidden_size)).to(device)

    for i in range(MAX_TEXT_LENGTH):
      hiddens, cell = model.decoderRNN.lstm(input, (hidden, cell))
      output = model.decoderRNN.linear(hiddens)
      pred = output.argmax(1)
      result_caption.append(pred)
      input = model.decoderRNN.embed(pred)
      if vocabulary.itos[pred.item()] == "<EOS>" or vocabulary.itos[pred.item()] == "<PAD>" :
        break
    model.train()

    
  return [vocabulary.itos[i.item()] for i in result_caption]

In [26]:
IMAGE_TEST_DIR = '/home/pratik/Downloads/akshaymodels/test_images/1/'
with open('2019MCS2574_2019MCS2556_test.tsv', 'w+') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    files = os.listdir(IMAGE_TEST_DIR)
    for f in files:

        image = io.imread(IMAGE_TEST_DIR+f)
        image = torch.tensor(img_transform(image))
        # plt.imshow(image.permute(1,2,0))
        # break
        res = temp_pred(net,image.unsqueeze(0),captions_preprocessing_obj.vocab)
        st_fake = ' '.join([i for i in res if i != '<SOS>' and i != '<EOS>' and i != '<PAD>'])
        id = f.split("_")[-1]
        id = id.split(".")[0]
        tsv_writer.writerow([id, st_fake])

In [27]:
PRIVATE_IMAGE_TEST_DIR = '/home/pratik/Downloads/akshaymodels/private_test_images/'
with open('2019MCS2574_2019MCS2556_private.tsv', 'w+') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    files = os.listdir(PRIVATE_IMAGE_TEST_DIR)
    for f in files:

        image = io.imread(PRIVATE_IMAGE_TEST_DIR+f)
        image = torch.tensor(img_transform(image))
        # plt.imshow(image.permute(1,2,0))
        # break
        res = temp_pred(net,image.unsqueeze(0),captions_preprocessing_obj.vocab)
        st_fake = ' '.join([i for i in res if i != '<SOS>' and i != '<EOS>' and i != '<PAD>'])
        id = f.split("_")[-1]
        id = id.split(".")[0]
        tsv_writer.writerow([id, st_fake])

In [None]:
# with open('2019MCS2556_2019MCS2574_public.tsv', 'wt') as out_file:
#   tsv_writer = csv.writer(out_file, delimiter='\t')
#   for sample in test_loader:
#     image_batch, ids, caption_batch = sample['image']/255, sample['id'], sample['captions']
#     for image, id, caption in zip(image_batch, ids, caption_batch):
#       res = temp_pred(net,image.unsqueeze(0),captions_preprocessing_obj.vocab)
#       st_fake = ' '.join([i 
#                           for i in res if i != '<SOS>' and i != '<EOS>' and i != '<PAD>'])
#       tsv_writer.writerow([id.item(), st_fake])