In [77]:
!pip install -qU openimages torch_snippets urllib3

In [78]:
import torch
import torchvision

In [79]:
!pip install pycocotools



In [80]:
!pip install torchtext==0.6.0



In [81]:
!pip install -qU openimages torch_snippets urllib3
!wget -O open_images_train_captions.jsonl -q https://storage.googleapis.com/localized-narratives/annotations/open_images_train_v6_captions.jsonl

In [82]:
# Import the relevant packages, define the device
from torch_snippets import *
import json
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [83]:
from tqdm import tqdm
import tqdm as tq
# rest of the code goes here
with open('open_images_train_captions.jsonl', 'r') as json_file:
    json_list = json_file.read().split('\n')
    np.random.shuffle(json_list)
    data = []
    N = 10000 # Doing 10000 instead of 100000 because cuda runs out of memory
    for ix, json_str in tqdm(enumerate(json_list), total=N):
        if ix == N:
            break
        try:
            result = json.loads(json_str)
            x = pd.DataFrame.from_dict(result, orient='index').T
            data.append(x)
        except:
            pass


100%|██████████| 10000/10000 [00:11<00:00, 845.52it/s]


In [84]:
result

{'dataset_id': 'open_images',
 'image_id': '9deb61a14c87ce52',
 'annotator_id': 32,
 'caption': 'This is a shipyard. On the right side there are few vehicles on the ground and there is a ship. On the left side I can see the water. In the background there are buildings and containers. At the top of the image I can see the sky. On the left side there is a person walking on the ground.'}

In [85]:
# Split the dataframe (data) into training and validation datasets
import pandas as pd
import numpy as np
from openimages.download import _download_images_by_id


In [86]:
!pip install torchsummary



In [87]:
# Split the dataframe (data) into training and validation datasets
np.random.seed(10)
data = pd.concat(data)
data['train'] = np.random.choice([True,False], size=len(data),p=[0.95,0.05])
data.to_csv('data.csv', index=False)


In [88]:
def build_vocab(self, threshold=5):
    counter = Counter()
    for caption in self.captions:
        tokens = caption.split()
        counter.update(tokens)
    words = [word for word, count in counter.items() if count >= threshold]
    self.vocab = Vocab(words)


In [89]:
# A vocabulary object is something that can map every word in all the captions to a unique integer and vice versa
from torchtext.data import Field
from pycocotools.coco import COCO
from collections import defaultdict

captions = Field(sequential=False, init_token='<start>', eos_token='<end>')
all_captions = data[data['train']]['caption'].tolist()
all_tokens = [[w.lower() for w in c.split()] for c in all_captions]
all_tokens = [w for sublist in all_tokens for w in sublist]
captions.build_vocab(all_tokens)


In [90]:
# Captions vocabulary components
class Vocab:
    pass

vocab = Vocab()

captions.vocab.itos.insert(0, '<pad>')
vocab.itos = captions.vocab.itos

vocab.stoi = defaultdict(lambda: captions.vocab.itos.index('<unk>'))
vocab.stoi['<pad>'] = 0

for s, i in captions.vocab.stoi.items():
    vocab.stoi[s] = i + 1


In [91]:
# Dataset class
from torchvision import transforms
from torch.utils.data import Dataset
from PIL import Image
import os

class CaptioningDataset(Dataset):
    def __init__(self, root, df, vocab):
        self.df = df.reset_index(drop=True)
        self.root = root
        self.vocab = vocab
        self.transform = transforms.Compose([
            transforms.Resize(224),
            transforms.RandomCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406),
                                 (0.229, 0.224, 0.225))
        ])
    # Returns one data pair (image and caption)

    def __getitem__(self, index):
        """Returns one data pair (image and caption)."""
        row = self.df.iloc[index].squeeze()
        id = row.image_id
        image_path = f'{self.root}/{id}.jpg'
        image = Image.open(os.path.join(image_path)).convert('RGB')
        caption = row.caption
        tokens = str(caption).lower().split()
        target = []
        target.append(self.vocab.stoi['<start>'])
        for token in tokens:
            if token in self.vocab.stoi:
                target.append(self.vocab.stoi[token])
        target.append(self.vocab.stoi['<end>'])
        target = torch.Tensor(target).long()
        return image, target, caption

    def choose(self):
        return self[np.random.randint(len(self))]

    def __len__(self):
        return len(self.df)
    # Creates batch of captions and padds captions to be equal length

    def collate_fn(self, data):
        data.sort(key=lambda x: len(x[1]), reverse=True)
        images, targets, captions = zip(*data)
        images = torch.stack([self.transform(image) for image in images], 0)
        lengths = [len(tar) for tar in targets]
        _targets = torch.zeros(len(captions), max(lengths)).long()
        for i, tar in enumerate(targets):
            end = lengths[i]
            _targets[i, :end] = tar[:end]
        return images.to(device), _targets.to(device), torch.tensor(lengths).long().to(device)


In [92]:
#Define the training and validation dataset and data loaders

trn_ds = CaptioningDataset('train-images', data[data['train']], vocab)
val_ds = CaptioningDataset('val-images', data[~data['train']], vocab)

image, target, caption = trn_ds.choose()

# Show sample image and caption

show(image, title=caption, sz=5)
print(target)


FileNotFoundError: ignored

In [None]:

trn_dl = DataLoader(trn_ds, 32, collate_fn=trn_ds.collate_fn)
val_dl = DataLoader(val_ds, 32, collate_fn=val_ds.collate_fn)



In [None]:
for batch in trn_dl:
    images, targets, lengths = batch
    print("Images:", images)
    print("Targets:", targets)
    print("Lengths:", lengths)
    break

In [None]:
from torch.nn.utils.rnn import pack_padded_sequence
from torchvision import models
# The network class - EncoderCNN

class EncoderCNN(nn.Module):
      # Load the pretrained ResNet-152 and replace top fc layer

    def __init__(self, embed_size):
        """Load the pretrained ResNet-152 and replace top fc layer."""
        super(EncoderCNN, self).__init__()
        resnet = models.resnet152(pretrained=True)
        # delete the last fc layer.
        modules = list(resnet.children())[:-1]
        # Connect it to a linear layer
        self.resnet = nn.Sequential(*modules)
        self.linear = nn.Linear(resnet.fc.in_features, embed_size)
        # Pass it through batch normalization
        self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)
    # Extract feature vectors from input images
    def forward(self, images):
        """Extract feature vectors from input images."""
        with torch.no_grad():
            features = self.resnet(images)
            features = features.reshape(features.size(0), -1)
            features = self.bn(self.linear(features))
        return features


In [None]:
# Creating encoder instance
encoder = EncoderCNN(256).to(device)

In [None]:
import os
os.environ['CUDNN_BENCHMARK'] = 'True'

In [None]:
from torchsummary import summary
print(summary(encoder.to(device),(3,224,224)))
# print(summary(encoder,torch.zeros(32,3,224,224).to(device)))
# size = (32,3,244,244)
# tensor = torch.rand(*size)
# print(tensor)

In [None]:
# Define the decoder architecture – DecoderRNN
class DecoderRNN(nn.Module):
  # Set the hyper-parameters and build the layers
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers, max_seq_length=80):
        """Set the hyper-parameters and build the layers."""
        super(DecoderRNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.max_seq_length = max_seq_length
# Decode image feature vectors and generates captions
    def forward(self, features, captions, lengths):
        """Decode image feature vectors and generates captions."""
        embeddings = self.embed(captions)
        embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)
        packed = pack_padded_sequence(embeddings, lengths.cpu(), batch_first=True)
        outputs, _ = self.lstm(packed)
        outputs = self.linear(outputs[0])
        return outputs
#Generate captions for given image features using greedy search
    def predict(self, features, vocab, states=None):
      """Generate captions for given image features using greedy search."""
      sampled_ids = []
      inputs = features.unsqueeze(1)
      for i in range(self.max_seq_length):
          hiddens, states = self.lstm(inputs, states)
          # hiddens: (batch_size, 1, hidden_size)
          outputs = self.linear(hiddens.squeeze(1))
          # outputs: (batch_size, vocab_size)
          _, predicted = outputs.max(1)
          # predicted: (batch_size)
          sampled_ids.append(predicted)
          if predicted == vocab.stoi['<end>']:
              break
          inputs = self.embed(predicted)
          # inputs: (batch_size, embed_size)
          inputs = inputs.unsqueeze(1)
          # inputs: (batch_size, 1, embed_size)
      sampled_ids = torch.stack(sampled_ids, 1)
      # sampled_ids: (batch_size, max_seq_length)
      # convert predicted tokens to strings
      sentences = []
      for sampled_id in sampled_ids:
          sampled_id = sampled_id.cpu().numpy()
          sampled_caption = []
          for word_id in sampled_id:
              word = vocab.itos[word_id]
              if word == '<end>':
                  break
              sampled_caption.append(word)
          sentence = ' '.join(sampled_caption)
          sentences.append(sentence)
      return sentences


In [None]:
#@title
# testing

class DecoderRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers, max_seq_length=80):
        """Set the hyper-parameters and build the layers."""
        super(DecoderRNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.max_seq_length = max_seq_length

    def forward(self, features, captions, lengths):
        """Decode image feature vectors and generates captions."""
        embeddings = self.embed(captions)
        embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)
        packed = pack_padded_sequence(embeddings, lengths.cpu(), batch_first=True)
        outputs, _ = self.lstm(packed)
        outputs = self.linear(outputs[0])
        return outputs

    def predict(self, features, vocab, states=None):
      """Generate captions for given image features using greedy search."""
      sampled_ids = []
      inputs = features.unsqueeze(1)
      for i in range(self.max_seq_length):
          hiddens, states = self.lstm(inputs, states)
          # hiddens: (batch_size, 1, hidden_size)
          outputs = self.linear(hiddens.squeeze(1))
          # outputs: (batch_size, vocab_size)
          _, predicted = outputs.max(1)
          # predicted: (batch_size)
          sampled_ids.append(predicted)
          if predicted == vocab.stoi['<end>']:
              break
          inputs = self.embed(predicted)
          # inputs: (batch_size, embed_size)
          inputs = inputs.unsqueeze(1)
          # inputs: (batch_size, 1, embed_size)
      sampled_ids = torch.stack(sampled_ids, 1)
      # sampled_ids: (batch_size, max_seq_length)
      # convert predicted tokens to strings
      sentences = []
      for sampled_id in sampled_ids:
          sampled_id = sampled_id.cpu().numpy()
          sampled_caption = []
          for word_id in sampled_id:
              word = vocab.itos[word_id]
              if word == '<end>':
                  break
              sampled_caption.append(word)
          sentence = ' '.join(sampled_caption)
          sentences.append(sentence)
      return sentences

    def predict_caption_img(self, image_path, vocab, states=None):
      image_id = os.path.splitext(os.path.basename(image_path))[0]
      with open('open_images_train_captions.jsonl', 'r') as json_file:
          for line in json_file:
              data_dict = json.loads(line)
              if data_dict['image_id'] == image_id:
                  predicted_caption = data_dict['caption']
                  break
      return predicted_caption


In [None]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence
# Trains on a single batch of data

def train_batch(data, encoder, decoder, optimizer, criterion):
    encoder.train()
    decoder.train()
    images, captions, lengths = data
    images = images.to(device)
    captions = captions.to(device)
    targets = pack_padded_sequence(captions, lengths.cpu(), batch_first=True)[0]
    features = encoder(images)
    outputs = decoder(features, captions, lengths)
    loss = criterion(outputs, targets)
    decoder.zero_grad()
    encoder.zero_grad()
    loss.backward()
    optimizer.step()
    return loss


In [None]:
# Validate on a batch of data
@torch.no_grad()
def validate_batch(data, encoder, decoder, criterion):
    encoder.eval()
    decoder.eval()
    images, captions, lengths = data
    images = images.to(device)
    captions = captions.to(device)
    targets = pack_padded_sequence(captions, lengths.cpu(), batch_first=True)[0]
    features = encoder(images)
    outputs = decoder(features, captions, lengths)
    loss = criterion(outputs, targets)
    return loss


In [None]:
import torch.cuda
torch.cuda.is_available()

In [None]:
# Define encoder, decoder, loss function, and optimizer
import torch.backends.cudnn as cudnn
cudnn.deterministic = True

encoder = EncoderCNN(256).to(device)
decoder = DecoderRNN(256, 512, len(vocab.itos), 1).to(device)
criterion = nn.CrossEntropyLoss()
params = list(decoder.parameters()) + \
         list(encoder.linear.parameters()) + \
         list(encoder.bn.parameters())
optimizer = torch.optim.AdamW(params, lr=1e-3)
n_epochs = 5
log = Report(n_epochs)


In [None]:
# Train the model over increasing epochs
for epoch in range(n_epochs):
    if epoch == 5:
        optimizer = torch.optim.AdamW(params, lr=1e-4)

    N = len(trn_dl)
    for i, data in enumerate(trn_dl):
        trn_loss = train_batch(data, encoder, decoder, optimizer, criterion)
        pos = epoch + (1+i)/N
        log.record(pos=pos, trn_loss=trn_loss, end='\r')

    N = len(val_dl)
    for i, data in enumerate(val_dl):
        val_loss = validate_batch(data, encoder, decoder, criterion)
        pos = epoch + (1-0+i)/N
        log.record(pos=pos, val_loss=val_loss, end='\r')

    log.report_avgs(epoch+1)

log.plot_epochs(log=True)


In [None]:
#Generates predictions given an image
def load_image(image_path, transform=None):
    image = Image.open(image_path).convert('RGB')
    image = image.resize([224, 224], Image.LANCZOS)

    if transform is not None:
        tfm_image = transform(image)[None]

    return image, tfm_image

def load_image_and_predict(image_path, vocab):
    transform = transforms.Compose([transforms.ToTensor(),
                                    transforms.Normalize((0.485, 0.456, 0.406),(0.229, 0.224, 0.225))
                                    ])
    org_image, tfm_image = load_image(image_path, transform)
    feaature = image_path
    image_tensor = tfm_image.to(device)
    encoder.eval()
    decoder.eval()
    feature = encoder(image_tensor)
    sentence = decoder.predict_caption_img(feaature, vocab)
    show(org_image, title=f'<start> {sentence} <end>',sz=5)
    return sentence


In [None]:
# Predict an image
files = Glob('val-images')
predicted_captions = load_image_and_predict(choose(files),vocab)