In [1]:
DATA_PATH = '../data/coco/data/'
img_codes_path = DATA_PATH + 'image_codes.npy'
captions_path = DATA_PATH + 'captions_tokenized.json'

%load_ext autoreload
%autoreload 2

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from image_captioning.utils import Vocab, create_datasets, pad_collate_fn
from image_captioning.models import CaptionNet

In [3]:
vocab = Vocab(captions_path)
datasets = create_datasets(vocab, img_codes_path, captions_path)
train, val, test = datasets['train'], datasets['val'], datasets['test']

In [22]:
batch_size = 64

train_loader = DataLoader(train, batch_size, num_workers=4, collate_fn=pad_collate_fn)
val_loader = DataLoader(val, batch_size, num_workers=4, collate_fn=pad_collate_fn)
test_loader = DataLoader(test, batch_size, num_workers=4, collate_fn=pad_collate_fn)

In [1]:
from pycocoevalcap.cider.cider import Cider

In [14]:
scorer = Cider(df='corpus')

In [115]:
scorer.compute_score(ids2hypo, ids2ref)

AssertionError: 

In [114]:
for imgs, captions, lenghts in train_loader:
    imgs = imgs.to(device)
    captions = captions.to(device)
    logits = model(imgs, captions, lenghts)
    predictions = logits[:, :-1, :].argmax(dim=2)
    targets = captions[:, 1:]
    
    imgs_ids = imgs[:, :10].cpu().data.numpy().sum(axis=1)
    targets = targets.cpu().data.numpy()
    predictions = predictions.cpu().data.numpy()
    print(imgs_ids.shape)
    print(predictions.shape)
    print(targets.shape)
    
    predictions = [' '.join(sent.astype(str)) for sent in predictions]
    targets = [' '.join(sent.astype(str)) for sent in targets]
    ids2hypo = {img_id: hypo for (img_id, hypo) in zip(imgs_ids, predictions)}
    ids2ref = {img_id: ref for (img_id, ref) in zip(imgs_ids, targets)}
    
    break

(64,)
(64, 14)
(64, 14)
{2.7491841: '24 118 184 14 24 48 60 35 187 381 2 0 0 0', 1.9870746: '24 57 41 67 35 24 230 6 24 168 12 2 0 0', 3.1995814: '24 396 41 139 6 24 597 123 12 2 0 0 0 0', 2.5974107: '24 57 78 73 385 35 1740 306 2 0 0 0 0 0', 1.8115432: '24 57 15 24 48 2013 35 24 2176 2 0 0 0 0', 2.6611524: '88 62 342 35 77 14 24 399 12 2 0 0 0 0', 3.6090465: '24 396 60 35 77 14 24 157 192 116 24 726 12 2', 4.1148515: '24 277 984 41 611 15 139 6 24 157 2 0 0 0', 2.608445: '24 906 396 58 54 1820 400 31 2 0 0 0 0 0', 4.4631777: '24 48 41 139 35 24 682 702 12 2 0 0 0 0', 3.2731586: '24 48 35 24 207 53 18 45 232 2 0 0 0 0', 4.35108: '335 41 24 92 1614 35 24 183 2 0 0 0 0 0', 2.5696392: '24 1123 58 24 2645 15 24 229 35 45 71 2 0 0', 2.8676562: '24 278 6 24 194 256 41 305 24 1414 12 2 0 0', 3.625111: '335 194 3128 41 6 24 46 240 2 0 0 0 0 0', 1.9116251: '449 14 45 1621 41 793 35 2016 12 2 0 0 0 0', 2.8264365: '24 57 305 24 1414 35 24 717 249 12 2 0 0 0', 2.4379883: '24 157 63 41 713 666 15 7

In [23]:
###################
hidden_size = 512
embedding_dim = 512
num_layers = 2
lstm_dropout = 0.3
fc_dropout = 0.5
device = torch.device(
    'cuda' if torch.cuda.is_available() else 'cpu'
)
###################

model = CaptionNet(
    vocab=vocab,
    hidden_size=hidden_size,
    num_layers=num_layers,
    lstm_dropout=lstm_dropout,
    fc_dropout=fc_dropout,
    embedding_dim=embedding_dim
).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.AdamW(model.parameters())

In [16]:
import matplotlib
import matplotlib.pyplot as plt
matplotlib.rcParams.update({'figure.figsize': (12, 7), 'font.size': 11})
from IPython.display import clear_output

train_history = None
valid_history = None
imgs, captions, lenghts = next(iter(train_loader))
clip = 1
for i in range(1000):
    model.train()
    
    epoch_loss = 0
    history = []
    
    imgs = imgs.to(device)
    captions = captions.to(device)

    logits = model(imgs, captions, lenghts)
    predictions = logits[:, :-1, :].permute(0, 2, 1)
    targets = captions[:, 1:]

    loss = criterion(predictions, targets)

    optimizer.zero_grad()
    loss.backward()

    torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

    optimizer.step()

    epoch_loss += loss.item()
    history.append(loss.cpu().data.numpy())
    if (i+1)%10==0:
        print(loss)
        fig, ax = plt.subplots(nrows=1, ncols=2)

        clear_output(True)
        ax[0].plot(history, label='train loss')
        ax[0].set_xlabel('Batch')
        ax[0].set_title('Train loss')
        if train_history is not None:
            ax[1].plot(train_history, label='general train history')
            ax[1].set_xlabel('Epoch')
        if valid_history is not None:
            ax[1].plot(valid_history, label='general valid history')
        plt.legend()
        
        plt.show()

No handles with labels found to put in legend.


KeyboardInterrupt: 