In [1]:
import torch
import torchvision.transforms as transforms
import torch.utils.data as data
import os
import pickle
import numpy as np
import nltk
from PIL import Image
from pycocotools.coco import COCO


class CocoDataset(data.Dataset):
    """COCO Custom Dataset compatible with torch.utils.data.DataLoader."""
    def __init__(self, root, json, vocab, transform=None):
        """Set the path for images, captions and vocabulary wrapper.
        
        Args:
            root: image directory.
            json: coco annotation file path.
            vocab: vocabulary wrapper.
            transform: image transformer.
        """
        self.root = root
        self.coco = COCO(json)
        self.ids = list(self.coco.anns.keys())
        self.vocab = vocab
        self.transform = transform

    def __getitem__(self, index):
        """Returns one data pair (image and caption)."""
        coco = self.coco
        vocab = self.vocab
        ann_id = self.ids[index]
        caption = coco.anns[ann_id]['caption']
        img_id = coco.anns[ann_id]['image_id']
        path = coco.loadImgs(img_id)[0]['file_name']

        image = Image.open(os.path.join(self.root, path)).convert('RGB')
        if self.transform is not None:
            image = self.transform(image)

        # Convert caption (string) to word ids.
        tokens = nltk.tokenize.word_tokenize(str(caption).lower())
        caption = []
        caption.append(vocab('<start>'))
        caption.extend([vocab(token) for token in tokens])
        caption.append(vocab('<end>'))
        target = torch.Tensor(caption)
        return image, target

    def __len__(self):
        return len(self.ids)


def collate_fn(data):
    """Creates mini-batch tensors from the list of tuples (image, caption).
    
    We should build custom collate_fn rather than using default collate_fn, 
    because merging caption (including padding) is not supported in default.
    Args:
        data: list of tuple (image, caption). 
            - image: torch tensor of shape (3, 256, 256).
            - caption: torch tensor of shape (?); variable length.
    Returns:
        images: torch tensor of shape (batch_size, 3, 256, 256).
        targets: torch tensor of shape (batch_size, padded_length).
        lengths: list; valid length for each padded caption.
    """
    # Sort a data list by caption length (descending order).
    data.sort(key=lambda x: len(x[1]), reverse=True)
    images, captions = zip(*data)

    # Merge images (from tuple of 3D tensor to 4D tensor).
    images = torch.stack(images, 0)

    # Merge captions (from tuple of 1D tensor to 2D tensor).
    lengths = [len(cap) for cap in captions]
    targets = torch.zeros(len(captions), max(lengths)).long()
    for i, cap in enumerate(captions):
        end = lengths[i]
        targets[i, :end] = cap[:end]        
    return images, targets, lengths

def get_loader(root, json, vocab, transform, batch_size, shuffle, num_workers):
    """Returns torch.utils.data.DataLoader for custom coco dataset."""
    # COCO caption dataset
    coco = CocoDataset(root=root,
                       json=json,
                       vocab=vocab,
                       transform=transform)
    
    # Data loader for COCO dataset
    # This will return (images, captions, lengths) for each iteration.
    # images: a tensor of shape (batch_size, 3, 224, 224).
    # captions: a tensor of shape (batch_size, padded_length).
    # lengths: a list indicating valid length for each caption. length is (batch_size).
    data_loader = torch.utils.data.DataLoader(dataset=coco, 
                                              batch_size=batch_size,
                                              shuffle=shuffle,
                                              num_workers=num_workers,
                                              collate_fn=collate_fn)
    return data_loader

In [2]:
class Vocabulary(object):
    """Simple vocabulary wrapper."""
    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        if not word in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def __call__(self, word):
        if not word in self.word2idx:
            return self.word2idx['<unk>']
        return self.word2idx[word]

    def __len__(self):
        return len(self.word2idx)

In [3]:
import nltk
import pickle
import argparse
from collections import Counter
from pycocotools.coco import COCO
#from nltk.stem import PorterStemmer 
import pdb

In [4]:
def build_vocab(json, threshold=4):
    """Build a simple vocabulary wrapper."""
    #ps = nltk.stem.PorterStemmer() 
    lm = nltk.stem.WordNetLemmatizer()
    coco = COCO(json)
    counter = Counter()
    ids = coco.anns.keys()
    for i, id in enumerate(ids):
        caption = str(coco.anns[id]['caption'])
        tokens = nltk.tokenize.word_tokenize(caption.lower())
        #tokens = [ps.stem(word) for word in tokens]
        tokens = [lm.lemmatize(word) for word in tokens]
        counter.update(tokens)

        if (i+1) % 1000 == 0:
            print("[{}/{}] Tokenized the captions.".format(i+1, len(ids)))

    # If the word frequency is less than 'threshold', then the word is discarded.
    
    words = [word for word, cnt in counter.items() if cnt >= threshold]
    print(len(words))
    # Create a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    # Add the words to the vocabulary.
    for i, word in enumerate(words):
        vocab.add_word(word)
    return vocab


In [5]:
def main():
    vocab = build_vocab(json="./datasets/coco2014/trainval_coco2014_captions/captions_train2014.json", threshold=4)
    vocab_path = "./vocab_lemma.pkl"
    with open(vocab_path, 'wb') as f:
        pickle.dump(vocab, f)
    print("Total vocabulary size: {}".format(len(vocab)))
    print("Saved the vocabulary wrapper to '{}'".format(vocab_path))


if __name__ == '__main__':
    
    main()

loading annotations into memory...
Done (t=0.73s)
creating index...
index created!
[1000/414113] Tokenized the captions.
[2000/414113] Tokenized the captions.
[3000/414113] Tokenized the captions.
[4000/414113] Tokenized the captions.
[5000/414113] Tokenized the captions.
[6000/414113] Tokenized the captions.
[7000/414113] Tokenized the captions.
[8000/414113] Tokenized the captions.
[9000/414113] Tokenized the captions.
[10000/414113] Tokenized the captions.
[11000/414113] Tokenized the captions.
[12000/414113] Tokenized the captions.
[13000/414113] Tokenized the captions.
[14000/414113] Tokenized the captions.
[15000/414113] Tokenized the captions.
[16000/414113] Tokenized the captions.
[17000/414113] Tokenized the captions.
[18000/414113] Tokenized the captions.
[19000/414113] Tokenized the captions.
[20000/414113] Tokenized the captions.
[21000/414113] Tokenized the captions.
[22000/414113] Tokenized the captions.
[23000/414113] Tokenized the captions.
[24000/414113] Tokenized the 

In [6]:
import torch
import torch.nn as nn
import torchvision.models as models
from torch.nn.utils.rnn import pack_padded_sequence


class EncoderCNN(nn.Module):
    def __init__(self, embed_size):
        """Load the pretrained ResNet-152 and replace top fc layer."""
        super(EncoderCNN, self).__init__()
        resnet = models.resnet152(pretrained=True)
        modules = list(resnet.children())[:-1]      # delete the last fc layer.
        self.resnet = nn.Sequential(*modules)
        self.linear = nn.Linear(resnet.fc.in_features, embed_size)
        self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)
        
    def forward(self, images):
        """Extract feature vectors from input images."""
        with torch.no_grad():
            features = self.resnet(images)
        features = features.reshape(features.size(0), -1)
        features = self.bn(self.linear(features))
        return features

In [7]:
class DecoderRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers, max_seq_length=20):
        """Set the hyper-parameters and build the layers."""
        super(DecoderRNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.max_seg_length = max_seq_length
        
    def forward(self, features, captions, lengths):
        """Decode image feature vectors and generates captions."""
        embeddings = self.embed(captions)
        embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)
        packed = pack_padded_sequence(embeddings, lengths, batch_first=True) 
        hiddens, _ = self.lstm(packed)
        outputs = self.linear(hiddens[0])
        return outputs
    
    def sample(self, features, states=None):
        """Generate captions for given image features using greedy search."""
        sampled_ids = []
        inputs = features.unsqueeze(1)
        for i in range(self.max_seg_length):
            hiddens, states = self.lstm(inputs, states)          # hiddens: (batch_size, 1, hidden_size)
            outputs = self.linear(hiddens.squeeze(1))            # outputs:  (batch_size, vocab_size)
            _, predicted = outputs.max(1)                        # predicted: (batch_size)
            sampled_ids.append(predicted)
            inputs = self.embed(predicted)                       # inputs: (batch_size, embed_size)
            inputs = inputs.unsqueeze(1)                         # inputs: (batch_size, 1, embed_size)
        sampled_ids = torch.stack(sampled_ids, 1)                # sampled_ids: (batch_size, max_seq_length)
        return sampled_ids
    def sample_beam_search(self, inputs, states=None, max_len=20, beam_width=5):
        idx_sequences = [[[], 0.0, inputs, states]]
        for _ in range(max_len):
            # Store all the potential candidates at each step
            all_candidates = []
            # Predict the next word idx for each of the top sequences
            for idx_seq in idx_sequences:
                hiddens, states = self.lstm(idx_seq[2].unsqueeze(1), idx_seq[3])
                outputs = self.linear(hiddens.squeeze(1))
                # Transform outputs to log probabilities to avoid floating-point 
                # underflow caused by multiplying very small probabilities
                log_probs = F.log_softmax(outputs, -1)
                top_log_probs, top_idx = log_probs.topk(beam_width, 1)
                top_idx = top_idx.squeeze(0)
                # create a new set of top sentences for next round
                for i in range(beam_width):
                    next_idx_seq, log_prob = idx_seq[0][:], idx_seq[1]
                    next_idx_seq.append(top_idx[i].item())
                    log_prob += top_log_probs[0][i].item()
                    # Indexing 1-dimensional top_idx gives 0-dimensional tensors.
                    # We have to expand dimensions before embedding them
                    inputs = self.embed(top_idx[i].unsqueeze(0))
                    all_candidates.append([next_idx_seq, log_prob, inputs, states])
            # Keep only the top sequences according to their total log probability
            ordered = sorted(all_candidates, key=lambda x: x[1], reverse=True)
            idx_sequences = ordered[:beam_width]
        return [idx_seq[0] for idx_seq in idx_sequences]
    

In [8]:
import io
def resize_image(image, size):
    """Resize an image to the given size."""
    return image.resize(size, Image.ANTIALIAS)

def resize_images(image_dir, output_dir, size):
    """Resize the images in 'image_dir' and save into 'output_dir'."""
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    images = os.listdir(image_dir)
    num_images = len(images)
    for i, image in enumerate(images):
        with io.open(os.path.join(image_dir, image), 'r') as f:
            #print(f.name)
            with Image.open(f.name) as img:
                #print(img)
                img1 = img
                img1 = resize_image(img1, size)
                img1.save(os.path.join(output_dir, image), img1.format)
        if (i+1) % 100 == 0:
            print ("[{}/{}] Resized the images and saved into '{}'."
                   .format(i+1, num_images, output_dir))

In [9]:
image_dir = "./datasets/coco2014/train2014/"
output_dir = "./resized2014/"
image_size = [256,256]
resize_images(image_dir,output_dir,image_size)

[100/82783] Resized the images and saved into './resized2014/'.
[200/82783] Resized the images and saved into './resized2014/'.
[300/82783] Resized the images and saved into './resized2014/'.
[400/82783] Resized the images and saved into './resized2014/'.
[500/82783] Resized the images and saved into './resized2014/'.
[600/82783] Resized the images and saved into './resized2014/'.
[700/82783] Resized the images and saved into './resized2014/'.
[800/82783] Resized the images and saved into './resized2014/'.
[900/82783] Resized the images and saved into './resized2014/'.
[1000/82783] Resized the images and saved into './resized2014/'.
[1100/82783] Resized the images and saved into './resized2014/'.
[1200/82783] Resized the images and saved into './resized2014/'.
[1300/82783] Resized the images and saved into './resized2014/'.
[1400/82783] Resized the images and saved into './resized2014/'.
[1500/82783] Resized the images and saved into './resized2014/'.
[1600/82783] Resized the images an

In [10]:
import torch
import torch.nn as nn
import numpy as np
import os
import pickle

In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [12]:
model_path = './models'
if not os.path.exists(model_path):
    os.makedirs(model_path)
    

In [46]:
transform = transforms.Compose([ 
        transforms.CenterCrop(224),
        transforms.RandomHorizontalFlip(), 
        transforms.ToTensor(), 
    
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])

In [30]:
vocab_path = "./vocab_lemma.pkl"
with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)

In [31]:
caption_path = "./datasets/coco2014/trainval_coco2014_captions/captions_train2014.json"
batch_size = 128
num_workers = 0
data_loader = get_loader(image_dir, caption_path, vocab, 
                             transform, batch_size,
                             shuffle=True, num_workers=num_workers) 

loading annotations into memory...
Done (t=0.64s)
creating index...
index created!


In [32]:
embed_size = 256
hidden_size = 512
num_layers= 1
learning_rate = 0.001
encoder = EncoderCNN(embed_size).to(device)
decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers).to(device)
    
    # Loss and optimizer
criterion = nn.CrossEntropyLoss()
params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
optimizer = torch.optim.Adam(params, lr=learning_rate)
    
    # Train the models
total_step = len(data_loader)
    

In [43]:
(data_loader.dataset.__getitem__(0))

(tensor([[[-1.2959, -1.2617, -1.2445,  ..., -1.7583, -1.6898, -1.7754],
          [-1.2617, -1.2445, -1.2445,  ..., -1.6898, -1.7583, -1.8097],
          [-1.2788, -1.2617, -1.2617,  ..., -1.7069, -1.7925, -1.8782],
          ...,
          [ 1.0844,  1.0331,  1.0159,  ...,  0.6392,  0.6221,  0.6221],
          [ 1.0844,  1.0502,  1.0502,  ...,  0.6221,  0.6392,  0.6392],
          [ 1.0844,  1.0673,  1.0331,  ...,  0.6221,  0.6392,  0.6563]],
 
         [[-1.2829, -1.2829, -1.2479,  ..., -1.7031, -1.6331, -1.7206],
          [-1.2479, -1.2479, -1.2479,  ..., -1.6331, -1.7031, -1.7556],
          [-1.2654, -1.2654, -1.2654,  ..., -1.6506, -1.7906, -1.8256],
          ...,
          [ 1.1331,  1.1155,  1.1155,  ...,  0.7829,  0.7654,  0.7654],
          [ 1.0805,  1.0980,  1.0980,  ...,  0.7654,  0.7829,  0.7304],
          [ 1.0805,  1.0630,  1.0805,  ...,  0.7654,  0.7829,  0.7304]],
 
         [[-0.9504, -0.9330, -0.9504,  ..., -1.3861, -1.3164, -1.4384],
          [-0.9504, -0.9504,

In [44]:
num_epochs = 1
log_step = 10
save_step = 1000
for epoch in range(num_epochs):
        print("train epoch {}".format(epoch+1))
        for i, (images, captions, lengths) in enumerate(data_loader):
            
            print(i)
            
            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
            
            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                      .format(epoch, num_epochs, i, total_step, loss.item(), np.exp(loss.item()))) 
                
            # Save the model checkpoints？
            if (i+1) % save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(
                    model_path, 'decoder-{}-{}.ckpt'.format(epoch+1, i+1)))
                torch.save(encoder.state_dict(), os.path.join(
                    model_path, 'encoder-{}-{}.ckpt'.format(epoch+1, i+1)))

train epoch 1
0
Epoch [0/1], Step [0/3236], Loss: 2.7623, Perplexity: 15.8357
1
2
3
4
5
6
7
8
9
10
Epoch [0/1], Step [10/3236], Loss: 2.6894, Perplexity: 14.7223
11
12
13
14
15
16
17
18
19
20
Epoch [0/1], Step [20/3236], Loss: 2.8946, Perplexity: 18.0766
21
22
23
24
25


KeyboardInterrupt: 