In [21]:
data_dir = '/datasets/COCO-2015/train2014'
annotation_file = 'annotations/captions_train2014.json'
val_dir = '/datasets/COCO-2015/val2014'
val_annotation_file = 'annotations/captions_val2014.json'

In [22]:
#load data
import nbimporter
import data_loader as data_loader
import pickle
from vocabbuild import Vocabulary
import torchvision.transforms as transforms
import torch.utils.data as td

with open("vocab.pkl", 'rb') as fi:
    vocabulary = pickle.load(fi)

image_size = (229,229)
transform = transforms.Compose([ 
        # Try resize
        #transforms.RandomCrop(crop_size), 
        transforms.Resize(image_size),
        transforms.ToTensor(), 
        # Using Imagenet std and mean
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])   

coco_train = data_loader.COCODataset(root = data_dir, annFile = annotation_file, 
                                     vocabulary=vocabulary, transform = transform)
train_loader = td.DataLoader(coco_train, batch_size = 64, shuffle = True,
                                         pin_memory = True, collate_fn = data_loader.coco_batch)

coco_val = data_loader.COCODataset(root = val_dir, annFile = val_annotation_file, 
                                     vocabulary=vocabulary, transform = transform)
val_loader = td.DataLoader(coco_val, batch_size = 64, shuffle = False,
                                         pin_memory = True, collate_fn = data_loader.coco_batch)

loading annotations into memory...
Done (t=0.96s)
creating index...
index created!
loading annotations into memory...
Done (t=0.34s)
creating index...
index created!


In [23]:
# #check train_loader
# for i, (images, captions, lengths) in enumerate(train_loader):
#     if (i > 4):
#         break
#     data_loader.myimshow(images[0])
#     cap_string = ""
#     for j in list(captions.data.numpy()[0]):
#         cap_string += vocabulary.idx2word[j] + " "        
#     print(cap_string)

In [24]:
#plot functions
import matplotlib.pyplot as plt
def myimshow(image, ax=plt):
    image = image.to('cpu').numpy()
    image = np.moveaxis(image, [0, 1, 2], [2, 0, 1])
    image = (image + 1) / 2
    image[image < 0] = 0
    image[image > 1] = 1
    h = ax.imshow(image)
    ax.axis('off')
    return h

def plot(exp, fig, axes, noisy, visu_rate=2):
    if exp.epoch % visu_rate != 0:
        return
    with torch.no_grad():
        denoised = exp.net(noisy[np.newaxis].to(exp.net.device))[0]
    axes[0][0].clear()
    axes[0][1].clear()
    axes[1][0].clear()
    axes[1][1].clear()
    
    myimshow(noisy, ax=axes[0][0])
    axes[0][0].set_title('Noisy image')
    
    axes[0][1].set_title('Denoised image')
    myimshow(denoised, ax=axes[0][1])
    
    axes[1][0].plot([exp.history[k][0]['loss'] for k in range(exp.epoch)],label="training loss")
    axes[1][1].plot([exp.history[k][0]['psnr'] for k in range(exp.epoch)],label="training psnr")
    
    axes[1][0].set_xlabel("epochs")
    axes[1][1].set_xlabel("epochs")
    axes[1][0].set_ylabel("loss")
    axes[1][1].set_ylabel("psnr")
    axes[1][0].legend()
    axes[1][1].legend()
    plt.tight_layout()
    fig.canvas.draw()

In [25]:
# Device configuration
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [26]:
import weight_initialization
weight_matrix = weight_initialization.get_weight_matrix()

In [1]:
#define decoder network
from torch import nn

def create_emb_layer(weights_matrix, non_trainable=False):
    num_embeddings, embedding_dim = weights_matrix.shape
    emb_layer = nn.Embedding(num_embeddings, embedding_dim)
    emb_layer.load_state_dict({'weight': torch.from_numpy(weights_matrix)})
    if non_trainable:
        emb_layer.weight.requires_grad = False

    return emb_layer, num_embeddings, embedding_dim

class LSTM_custom(nn.Module):
    def __init__(self, weights_matrix, hidden_size, vocab_size, num_layers, max_seq_length=20):
        super(LSTM_custom, self).__init__()
        embed_size = 256
        #self.embedding, num_embeddings, embedding_dim = create_emb_layer(weights_matrix, True)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.max_seg_length = max_seq_length
    
    def forward(self, features, captions, lengths):
        embeddings = self.embedding(captions)
        print(embeddings.shape)
        embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)
        print(embeddings.shape)
        packed = pack_padded_sequence(embeddings, lengths, batch_first=True) 
        #print(packed)
        hiddens, _ = self.lstm(packed)
        print(hiddens[0].shape)
        outputs = self.linear(hiddens[0])
        print(outputs.shape)
        return outputs
    
    def sample(self, features, states=None):
        """Generate captions for given image features using greedy search."""
        sampled_ids = []
        inputs = features.unsqueeze(1)
        for i in range(self.max_seg_length):
            hiddens, states = self.lstm(inputs, states)          # hiddens: (batch_size, 1, hidden_size)
            outputs = self.linear(hiddens.squeeze(1))            # outputs:  (batch_size, vocab_size)
            _, predicted = outputs.max(1)                        # predicted: (batch_size)
            sampled_ids.append(predicted)
            inputs = self.embedding(predicted)                       # inputs: (batch_size, embed_size)
            inputs = inputs.unsqueeze(1)                         # inputs: (batch_size, 1, embed_size)
        sampled_ids = torch.stack(sampled_ids, 1)                # sampled_ids: (batch_size, max_seq_length)
        return sampled_ids

In [16]:
# Build the models
import models_custom
encoder = models_custom.CNN(out_classes = 300).to(device)
decoder = LSTM_custom(weights_matrix = weight_matrix, hidden_size = 512, 
                             vocab_size = weight_matrix.shape[0], num_layers = 1).to(device)

512


In [17]:
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
params = list(decoder.parameters())
optimizer = torch.optim.Adam(params, lr=1e-4)

In [18]:
#train the model
from torch.nn.utils.rnn import pack_padded_sequence
import numpy as np
import os

log_step = 50
save_step = 200
num_epochs = 25
model_path = "model3"
total_step = len(train_loader)
for epoch in range(num_epochs):
    for i, (images, captions, lengths) in enumerate(train_loader):

        # Set mini-batch dataset
        images = images.to(device)
        captions = captions.to(device)
        targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

        # Forward, backward and optimize
        features = encoder.forward(images)
        outputs = decoder.forward(features, captions, lengths)
        print(outputs.shape, targets.shape)
        print(outputs[1, :].cpu().detach().numpy(), targets[1].cpu().detach().numpy())
#         print(np.unique(targets.cpu().detach().numpy()))
        loss = criterion(outputs, targets)
        decoder.zero_grad()
        loss.backward()
        optimizer.step()

        # Print log info
        if i % log_step == 0:
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                  .format(epoch, num_epochs, i, total_step, loss.item(), np.exp(loss.item()))) 

        # Save the model checkpoints
        if (i+1) % save_step == 0:
            torch.save(decoder.state_dict(), os.path.join(
                model_path, 'decoder-{}-{}.ckpt'.format(epoch+1, i+1)))

torch.Size([64, 92, 300])
torch.Size([64, 93, 300])
PackedSequence(data=tensor([[ 0.5099, -0.4875,  0.2736,  ..., -0.0274,  0.1766,  0.4399],
        [ 0.5031,  0.1621,  0.0188,  ..., -0.1652, -0.4765,  0.6706],
        [ 0.6448,  0.1570, -0.0433,  ...,  0.1785,  0.1674,  0.1571],
        ...,
        [ 0.3514, -0.1685,  0.1196,  ..., -0.3544, -0.4076,  0.4999],
        [ 0.0702, -0.0663, -0.5209,  ..., -0.5004,  0.4866,  0.2786],
        [ 0.0702, -0.0663, -0.5209,  ..., -0.5004,  0.4866,  0.2786]],
       device='cuda:0', grad_fn=<PackPaddedSequenceBackward>), batch_sizes=tensor([64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
        64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
        64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
        64, 64, 64, 64, 64, 63, 62, 60, 60, 59, 59, 57, 53, 46, 40, 40, 37, 34,
        30, 20, 20, 15, 13, 12, 10,  9,  8,  6,  6,  6,  5,  3,  3,  3,  3,  3,
         2,  1]), s

KeyboardInterrupt: 