In [12]:
from __future__ import print_function

import matplotlib; matplotlib.use('Agg')
import os
import os.path as osp
import argparse

from train import train 
from test import test
from test_beam import test_beam 

## 하이퍼 파라미터 설정

In [13]:
parser = argparse.ArgumentParser(description='PyTorch Convolutional Image Captioning Model')

parser.add_argument('model_dir', help='output directory to save models & results')

parser.add_argument('-g', '--gpu', type=int, default=0,\
                    help='gpu device id')

parser.add_argument('--coco_root', type=str, default= './data/coco/',\
                    help='directory containing coco dataset train2014, val2014, & annotations')

parser.add_argument('-t', '--is_train', type=int, default=1,\
                    help='use 1 to train model')

parser.add_argument('-e', '--epochs', type=int, default=30,\
                    help='number of training epochs')

parser.add_argument('-b', '--batchsize', type=int, default=20,\
                    help='number of images per training batch')

parser.add_argument('-c', '--ncap_per_img', type=int, default=5,\
                    help='ground-truth captions per image in training batch')

parser.add_argument('-n', '--num_layers', type=int, default=3,\
                    help='depth of convcap network')

parser.add_argument('-m', '--nthreads', type=int, default=4,\
                    help='pytorch data loader threads')

# parser.add_argument('-ft', '--finetune_after', type=int, default=8,\
#                     help='epochs after which vgg16 is fine-tuned')

parser.add_argument('-lr', '--learning_rate', type=float, default=5e-5,\
                    help='learning rate for convcap')

parser.add_argument('-st', '--lr_step_size', type=int, default=15,\
                    help='epochs to decay learning rate after')

parser.add_argument('-sc', '--score_select', type=str, default='CIDEr',\
                    help='metric to pick best model')

parser.add_argument('--beam_size', type=int, default=1, \
                    help='beam size to use for test') 

parser.add_argument('--attention', dest='attention', action='store_true', \
                    help='Use this for convcap with attention (by default set)')

parser.add_argument('--no-attention', dest='attention', action='store_false', \
                    help='Use this for convcap without attention')


_StoreFalseAction(option_strings=['--no-attention'], dest='attention', nargs=0, const=False, default=True, type=None, choices=None, help='Use this for convcap without attention', metavar=None)

In [14]:
parser.set_defaults(attention=True)

args, _ = parser.parse_known_args()

In [15]:
args.finetune_after = 8
args.model_dir = 'output'

In [16]:
import os
import os.path as osp
import argparse
import numpy as np 
import json
import time
 

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.autograd import Variable
from torch.utils.data import DataLoader

import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torchvision import models                                                                     

from coco_loader import coco_loader
from convcap import convcap
from vggfeats import Vgg16Feats
from tqdm import tqdm 
from test import test 

In [17]:
os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)

In [18]:
if (args.is_train == 1):
    print('train')

train


In [19]:
t_start = time.time()
train_data = coco_loader(args.coco_root, split='train', ncap_per_img=args.ncap_per_img)
print('[DEBUG] Loading train data ... %f secs' % (time.time() - t_start))

Loading annotation file...
Found 113287 images in split: train
[DEBUG] #words in wordlist: 9221
[DEBUG] Loading train data ... 3.102025 secs


In [20]:
train_data_loader = DataLoader(dataset=train_data, num_workers=0, batch_size=args.batchsize, \
                               shuffle=True, drop_last=True)

In [21]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import json
import argparse
from random import shuffle, seed
import string
# non-standard dependencies:
import h5py
from six.moves import cPickle
import numpy as np
import torch
import torchvision.models as models
import skimage.io

from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
from PIL import Image
from torch import nn

preprocess = Compose([
    Resize((224, 224), interpolation=Image.BICUBIC),
    CenterCrop((224, 224)),
    ToTensor()
])


from clip.clip import load
from timm.models.vision_transformer import resize_pos_embed
import timm

from captioning.utils.resnet_utils import myResnet
import captioning.utils.resnet as resnet

In [22]:
model, transform = load('RN50', jit=False)  

In [16]:
model = model.cuda()
model.train(True)

CLIP(
  (visual): ModifiedResNet(
    (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (conv2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (conv3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (avgpool): AvgPool2d(kernel_size=2, stride=2, padding=0)
    (relu): ReLU(inplace=True)
    (layer1): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn

In [17]:
#Convcap model
model_convcap = convcap(train_data.numwords, args.num_layers, is_attention=args.attention, nfeats=2048, nimgfeats=1024)
model_convcap.cuda()
model_convcap.train(True)

convcap(
  (emb_0): Embedding(9221, 2048, padding_idx=0)
  (emb_1): Linear(in_features=2048, out_features=2048, bias=True)
  (imgproj): Linear(in_features=1024, out_features=2048, bias=True)
  (resproj): Linear(in_features=4096, out_features=2048, bias=True)
  (convs): ModuleList(
    (0): Conv1d(4096, 4096, kernel_size=(5,), stride=(1,), padding=(4,))
    (1): Conv1d(2048, 4096, kernel_size=(5,), stride=(1,), padding=(4,))
    (2): Conv1d(2048, 4096, kernel_size=(5,), stride=(1,), padding=(4,))
  )
  (attention): ModuleList(
    (0): AttentionLayer(
      (in_projection): Linear(in_features=2048, out_features=2048, bias=True)
      (out_projection): Linear(in_features=2048, out_features=2048, bias=True)
    )
    (1): AttentionLayer(
      (in_projection): Linear(in_features=2048, out_features=2048, bias=True)
      (out_projection): Linear(in_features=2048, out_features=2048, bias=True)
    )
    (2): AttentionLayer(
      (in_projection): Linear(in_features=2048, out_features=2048, 

In [18]:
optimizer = optim.RMSprop(model_convcap.parameters(), lr=args.learning_rate)
scheduler = lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=.1)
img_optimizer = None

In [19]:
batchsize = args.batchsize
ncap_per_img = args.ncap_per_img
batchsize_cap = batchsize*ncap_per_img
max_tokens = train_data.max_tokens
nbatches = np.int_(np.floor((len(train_data.ids)*1.)/batchsize)) 
bestscore = .0

In [20]:
def repeat_img_per_cap(imgsfeats, imgsfc7, ncap_per_img):
    batchsize, featdim, feat_h, feat_w = imgsfeats.size()
    batchsize_cap = batchsize*ncap_per_img
    imgsfeats = imgsfeats.unsqueeze(1).expand(batchsize, ncap_per_img, featdim, feat_h, feat_w)
    imgsfeats = imgsfeats.contiguous().view(batchsize_cap, featdim, feat_h, feat_w)
    
    batchsize, featdim = imgsfc7.size()
    batchsize_cap = batchsize*ncap_per_img
    imgsfc7 = imgsfc7.unsqueeze(1).expand(batchsize, ncap_per_img, featdim)
    imgsfc7 = imgsfc7.contiguous().view(batchsize_cap, featdim)
    
    return imgsfeats, imgsfc7

In [21]:
args.epochs

30

In [None]:
#   for epoch in range(args.epochs):
# 코드가 잘 돌아가는지 확인하기 위해 2번만 돌려봤습니다. 전 30(args.epochs)번 돌렸습니다.

In [22]:
for epoch in range(args.epochs): 
    loss_train = 0.
    
#     if(epoch == args.finetune_after):
#         img_optimizer = optim.RMSprop(model_imgcnn.parameters(), lr=1e-5)
#         img_scheduler = lr_scheduler.StepLR(img_optimizer, step_size=args.lr_step_size, gamma=.1)

    scheduler.step()    
#     if(img_optimizer):
#         img_scheduler.step()

    #One epoch of train
    for batch_idx, (imgs, captions, wordclass, mask, _) in \
      tqdm(enumerate(train_data_loader), total=nbatches):
        
        imgs = imgs.view(batchsize, 3, 224, 224)
        wordclass = wordclass.view(batchsize_cap, max_tokens)
        mask = mask.view(batchsize_cap, max_tokens)

        imgs_v = Variable(imgs).cuda()
        wordclass_v = Variable(wordclass).cuda()

        optimizer.zero_grad()
#         if(img_optimizer):
#             img_optimizer.zero_grad() 

        imgsfeats, imgsfc7 = model.encode_image(imgs_v)
        imgsfeats, imgsfc7 = repeat_img_per_cap(imgsfeats, imgsfc7, ncap_per_img)
        _, _, feat_h, feat_w = imgsfeats.size()

        if(args.attention == True):
            wordact, attn = model_convcap(imgsfeats.float(), imgsfc7.float(), wordclass_v)
            attn = attn.view(batchsize_cap, max_tokens, feat_h, feat_w)
        else:
            wordact, _ = model_convcap(imgsfeats, imgsfc7, wordclass_v)

        wordact = wordact[:,:,:-1]
        wordclass_v = wordclass_v[:,1:]
        mask = mask[:,1:].contiguous()

        wordact_t = wordact.permute(0, 2, 1).contiguous().view(batchsize_cap*(max_tokens-1), -1)
        wordclass_t = wordclass_v.contiguous().view(batchsize_cap*(max_tokens-1), 1)
      
        maskids = torch.nonzero(mask.view(-1)).numpy().reshape(-1)

        if(args.attention == True):
            #Cross-entropy loss and attention loss of Show, Attend and Tell
            loss = F.cross_entropy(wordact_t[maskids, ...], \
                                   wordclass_t[maskids, ...].contiguous().view(maskids.shape[0])) \
            + (torch.sum(torch.pow(1. - torch.sum(attn, 1), 2)))\
            /(batchsize_cap*feat_h*feat_w)
        else:
            loss = F.cross_entropy(wordact_t[maskids, ...], \
                                   wordclass_t[maskids, ...].contiguous().view(maskids.shape[0]))

        loss_train = loss_train + loss.data

        loss.backward()

        optimizer.step()
#         if(img_optimizer):
#             img_optimizer.step()

    loss_train = (loss_train*1.)/(batch_idx)
    print('[DEBUG] Training epoch %d has loss %f' % (epoch, loss_train))

    modelfn = osp.join(args.model_dir, 'model.pth')

#     if(img_optimizer):
#         img_optimizer_dict = img_optimizer.state_dict()
#     else:
#         img_optimizer_dict = None

    torch.save({
          'epoch': epoch,
          'state_dict': model_convcap.state_dict(),
#           'img_state_dict': model.encode_image.state_dict(),
          'optimizer' : optimizer.state_dict(),
#           'img_optimizer' : img_optimizer_dict,
    }, modelfn)

    #Run on validation and obtain score
    scores = test(args, 'val', model_convcap=model_convcap, model_imgcnn=model)
    score = scores[0][args.score_select]

    if(score > bestscore):
        bestscore = score
        print('[DEBUG] Saving model at epoch %d with %s score of %f'\
          % (epoch, args.score_select, score))
        bestmodelfn = osp.join(args.model_dir, 'bestmodel.pth')
        os.system('cp %s %s' % (modelfn, bestmodelfn))

  x = F.softmax(x.view(sz[0] * sz[1], sz[2]))
100%|███████████████████████████████████████████████████████████████████████████| 5664/5664 [1:09:18<00:00,  1.36it/s]


[DEBUG] Training epoch 0 has loss 4.989432
Loading annotation file...
Found 5000 images in split: val
[DEBUG] #words in wordlist: 9221
[DEBUG] Loading val data ... 4.182087 secs
[DEBUG] Running inference on val with 250 batches


  wordprobs = F.softmax(wordact_t).cpu().data.numpy()
100%|███████████████████████████████████████████████████████████████████████████████| 250/250 [01:22<00:00,  3.02it/s]


loading annotations into memory...
Done (t=0.67s)
creating index...
index created!
Using 5000/5000 predictions
Loading and preparing results...
DONE (t=0.02s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 46222, 'reflen': 46534, 'guess': [46222, 41222, 36222, 31222], 'correct': [30415, 13423, 4919, 1742]}
ratio: 0.9932952249967553
Bleu_1: 0.654
Bleu_2: 0.460
Bleu_3: 0.306
Bleu_4: 0.199
computing METEOR score...
METEOR: 0.198
computing Rouge score...
ROUGE_L: 0.466
computing CIDEr score...
CIDEr: 0.604
computing SPICE score...
SPICE: 0.127
[DEBUG] Saving model at epoch 0 with CIDEr score of 0.603762


100%|█████████████████████████████████████████████████████████████████████████████| 5664/5664 [36:59<00:00,  2.55it/s]


[DEBUG] Training epoch 1 has loss 3.724845
Loading annotation file...
Found 5000 images in split: val
[DEBUG] #words in wordlist: 9221
[DEBUG] Loading val data ... 3.537999 secs
[DEBUG] Running inference on val with 250 batches


100%|███████████████████████████████████████████████████████████████████████████████| 250/250 [01:21<00:00,  3.08it/s]


loading annotations into memory...
Done (t=0.25s)
creating index...
index created!
Using 5000/5000 predictions
Loading and preparing results...
DONE (t=0.02s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 47486, 'reflen': 47432, 'guess': [47486, 42486, 37486, 32486], 'correct': [32221, 15023, 6005, 2320]}
ratio: 1.0011384719176717
Bleu_1: 0.679
Bleu_2: 0.490
Bleu_3: 0.337
Bleu_4: 0.229
computing METEOR score...
METEOR: 0.218
computing Rouge score...
ROUGE_L: 0.488
computing CIDEr score...
CIDEr: 0.725
computing SPICE score...
SPICE: 0.149
[DEBUG] Saving model at epoch 1 with CIDEr score of 0.725487


100%|█████████████████████████████████████████████████████████████████████████████| 5664/5664 [36:59<00:00,  2.55it/s]


[DEBUG] Training epoch 2 has loss 3.459597
Loading annotation file...
Found 5000 images in split: val
[DEBUG] #words in wordlist: 9221
[DEBUG] Loading val data ... 4.454183 secs
[DEBUG] Running inference on val with 250 batches


100%|███████████████████████████████████████████████████████████████████████████████| 250/250 [01:20<00:00,  3.11it/s]


loading annotations into memory...
Done (t=0.24s)
creating index...
index created!
Using 5000/5000 predictions
Loading and preparing results...
DONE (t=0.02s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 47538, 'reflen': 47419, 'guess': [47538, 42538, 37538, 32538], 'correct': [32941, 15787, 6344, 2527]}
ratio: 1.0025095425883928
Bleu_1: 0.693
Bleu_2: 0.507
Bleu_3: 0.352
Bleu_4: 0.241
computing METEOR score...
METEOR: 0.221
computing Rouge score...
ROUGE_L: 0.497
computing CIDEr score...
CIDEr: 0.762
computing SPICE score...
SPICE: 0.155
[DEBUG] Saving model at epoch 2 with CIDEr score of 0.761757


100%|█████████████████████████████████████████████████████████████████████████████| 5664/5664 [36:59<00:00,  2.55it/s]


[DEBUG] Training epoch 3 has loss 3.321762
Loading annotation file...
Found 5000 images in split: val
[DEBUG] #words in wordlist: 9221
[DEBUG] Loading val data ... 3.544999 secs
[DEBUG] Running inference on val with 250 batches


100%|███████████████████████████████████████████████████████████████████████████████| 250/250 [01:20<00:00,  3.11it/s]


loading annotations into memory...
Done (t=0.25s)
creating index...
index created!
Using 5000/5000 predictions
Loading and preparing results...
DONE (t=0.02s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 48149, 'reflen': 47760, 'guess': [48149, 43149, 38149, 33149], 'correct': [33573, 16562, 7046, 2952]}
ratio: 1.008144891122257
Bleu_1: 0.697
Bleu_2: 0.517
Bleu_3: 0.367
Bleu_4: 0.258
computing METEOR score...
METEOR: 0.233
computing Rouge score...
ROUGE_L: 0.506
computing CIDEr score...
CIDEr: 0.819
computing SPICE score...
SPICE: 0.163
[DEBUG] Saving model at epoch 3 with CIDEr score of 0.818592


100%|█████████████████████████████████████████████████████████████████████████████| 5664/5664 [37:00<00:00,  2.55it/s]


[DEBUG] Training epoch 4 has loss 3.228324
Loading annotation file...
Found 5000 images in split: val
[DEBUG] #words in wordlist: 9221
[DEBUG] Loading val data ... 4.482045 secs
[DEBUG] Running inference on val with 250 batches


100%|███████████████████████████████████████████████████████████████████████████████| 250/250 [01:20<00:00,  3.10it/s]


loading annotations into memory...
Done (t=0.24s)
creating index...
index created!
Using 5000/5000 predictions
Loading and preparing results...
DONE (t=0.02s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 47018, 'reflen': 47036, 'guess': [47018, 42018, 37018, 32018], 'correct': [33027, 16301, 6879, 2866]}
ratio: 0.9996173143974616
Bleu_1: 0.702
Bleu_2: 0.522
Bleu_3: 0.370
Bleu_4: 0.259
computing METEOR score...
METEOR: 0.230
computing Rouge score...
ROUGE_L: 0.506
computing CIDEr score...
CIDEr: 0.822
computing SPICE score...
SPICE: 0.161
[DEBUG] Saving model at epoch 4 with CIDEr score of 0.821711


100%|█████████████████████████████████████████████████████████████████████████████| 5664/5664 [36:53<00:00,  2.56it/s]


[DEBUG] Training epoch 5 has loss 3.157045
Loading annotation file...
Found 5000 images in split: val
[DEBUG] #words in wordlist: 9221
[DEBUG] Loading val data ... 3.542176 secs
[DEBUG] Running inference on val with 250 batches


100%|███████████████████████████████████████████████████████████████████████████████| 250/250 [01:20<00:00,  3.11it/s]


loading annotations into memory...
Done (t=0.25s)
creating index...
index created!
Using 5000/5000 predictions
Loading and preparing results...
DONE (t=0.02s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 46962, 'reflen': 46945, 'guess': [46962, 41962, 36962, 31962], 'correct': [33265, 16498, 7132, 3078]}
ratio: 1.0003621258919801
Bleu_1: 0.708
Bleu_2: 0.528
Bleu_3: 0.377
Bleu_4: 0.268
computing METEOR score...
METEOR: 0.233
computing Rouge score...
ROUGE_L: 0.508
computing CIDEr score...
CIDEr: 0.842
computing SPICE score...
SPICE: 0.164
[DEBUG] Saving model at epoch 5 with CIDEr score of 0.842000


100%|█████████████████████████████████████████████████████████████████████████████| 5664/5664 [37:09<00:00,  2.54it/s]


[DEBUG] Training epoch 6 has loss 3.100183
Loading annotation file...
Found 5000 images in split: val
[DEBUG] #words in wordlist: 9221
[DEBUG] Loading val data ... 4.523999 secs
[DEBUG] Running inference on val with 250 batches


100%|███████████████████████████████████████████████████████████████████████████████| 250/250 [01:20<00:00,  3.11it/s]


loading annotations into memory...
Done (t=0.24s)
creating index...
index created!
Using 5000/5000 predictions
Loading and preparing results...
DONE (t=0.02s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 47077, 'reflen': 47053, 'guess': [47077, 42077, 37077, 32077], 'correct': [33688, 16835, 7278, 3072]}
ratio: 1.00051006312029
Bleu_1: 0.716
Bleu_2: 0.535
Bleu_3: 0.383
Bleu_4: 0.271
computing METEOR score...
METEOR: 0.236
computing Rouge score...
ROUGE_L: 0.513
computing CIDEr score...
CIDEr: 0.860
computing SPICE score...
SPICE: 0.169
[DEBUG] Saving model at epoch 6 with CIDEr score of 0.860355


100%|█████████████████████████████████████████████████████████████████████████████| 5664/5664 [37:09<00:00,  2.54it/s]


[DEBUG] Training epoch 7 has loss 3.051425
Loading annotation file...
Found 5000 images in split: val
[DEBUG] #words in wordlist: 9221
[DEBUG] Loading val data ... 3.603192 secs
[DEBUG] Running inference on val with 250 batches


100%|███████████████████████████████████████████████████████████████████████████████| 250/250 [01:20<00:00,  3.11it/s]


loading annotations into memory...
Done (t=0.24s)
creating index...
index created!
Using 5000/5000 predictions
Loading and preparing results...
DONE (t=0.02s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 47715, 'reflen': 47546, 'guess': [47715, 42715, 37715, 32715], 'correct': [33578, 16779, 7214, 3080]}
ratio: 1.0035544525301603
Bleu_1: 0.704
Bleu_2: 0.526
Bleu_3: 0.375
Bleu_4: 0.266
computing METEOR score...
METEOR: 0.236
computing Rouge score...
ROUGE_L: 0.509
computing CIDEr score...
CIDEr: 0.854
computing SPICE score...
SPICE: 0.166


100%|█████████████████████████████████████████████████████████████████████████████| 5664/5664 [37:03<00:00,  2.55it/s]


[DEBUG] Training epoch 8 has loss 3.009316
Loading annotation file...
Found 5000 images in split: val
[DEBUG] #words in wordlist: 9221
[DEBUG] Loading val data ... 4.599041 secs
[DEBUG] Running inference on val with 250 batches


100%|███████████████████████████████████████████████████████████████████████████████| 250/250 [01:20<00:00,  3.11it/s]


loading annotations into memory...
Done (t=0.25s)
creating index...
index created!
Using 5000/5000 predictions
Loading and preparing results...
DONE (t=0.02s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 47894, 'reflen': 47625, 'guess': [47894, 42894, 37894, 32894], 'correct': [34035, 17171, 7512, 3248]}
ratio: 1.0056482939632336
Bleu_1: 0.711
Bleu_2: 0.533
Bleu_3: 0.383
Bleu_4: 0.273
computing METEOR score...
METEOR: 0.240
computing Rouge score...
ROUGE_L: 0.516
computing CIDEr score...
CIDEr: 0.869
computing SPICE score...
SPICE: 0.170
[DEBUG] Saving model at epoch 8 with CIDEr score of 0.869287


100%|█████████████████████████████████████████████████████████████████████████████| 5664/5664 [37:13<00:00,  2.54it/s]


[DEBUG] Training epoch 9 has loss 2.969102
Loading annotation file...
Found 5000 images in split: val
[DEBUG] #words in wordlist: 9221
[DEBUG] Loading val data ... 3.657043 secs
[DEBUG] Running inference on val with 250 batches


100%|███████████████████████████████████████████████████████████████████████████████| 250/250 [01:20<00:00,  3.11it/s]


loading annotations into memory...
Done (t=0.24s)
creating index...
index created!
Using 5000/5000 predictions
Loading and preparing results...
DONE (t=0.03s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 47521, 'reflen': 47315, 'guess': [47521, 42521, 37521, 32521], 'correct': [33799, 16934, 7545, 3348]}
ratio: 1.0043537990066362
Bleu_1: 0.711
Bleu_2: 0.532
Bleu_3: 0.385
Bleu_4: 0.277
computing METEOR score...
METEOR: 0.240
computing Rouge score...
ROUGE_L: 0.515
computing CIDEr score...
CIDEr: 0.884
computing SPICE score...
SPICE: 0.170
[DEBUG] Saving model at epoch 9 with CIDEr score of 0.884008


100%|█████████████████████████████████████████████████████████████████████████████| 5664/5664 [37:05<00:00,  2.54it/s]


[DEBUG] Training epoch 10 has loss 2.933248
Loading annotation file...
Found 5000 images in split: val
[DEBUG] #words in wordlist: 9221
[DEBUG] Loading val data ... 4.398000 secs
[DEBUG] Running inference on val with 250 batches


100%|███████████████████████████████████████████████████████████████████████████████| 250/250 [01:20<00:00,  3.10it/s]


loading annotations into memory...
Done (t=0.25s)
creating index...
index created!
Using 5000/5000 predictions
Loading and preparing results...
DONE (t=0.02s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 48496, 'reflen': 48028, 'guess': [48496, 43496, 38496, 33496], 'correct': [34260, 17140, 7606, 3360]}
ratio: 1.009744315815753
Bleu_1: 0.706
Bleu_2: 0.528
Bleu_3: 0.380
Bleu_4: 0.273
computing METEOR score...
METEOR: 0.243
computing Rouge score...
ROUGE_L: 0.517
computing CIDEr score...
CIDEr: 0.883
computing SPICE score...
SPICE: 0.172


100%|█████████████████████████████████████████████████████████████████████████████| 5664/5664 [37:16<00:00,  2.53it/s]


[DEBUG] Training epoch 11 has loss 2.898622
Loading annotation file...
Found 5000 images in split: val
[DEBUG] #words in wordlist: 9221
[DEBUG] Loading val data ... 4.452576 secs
[DEBUG] Running inference on val with 250 batches


100%|███████████████████████████████████████████████████████████████████████████████| 250/250 [01:20<00:00,  3.11it/s]


loading annotations into memory...
Done (t=0.24s)
creating index...
index created!
Using 5000/5000 predictions
Loading and preparing results...
DONE (t=0.02s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 48313, 'reflen': 47919, 'guess': [48313, 43313, 38313, 33313], 'correct': [34140, 17316, 7822, 3534]}
ratio: 1.0082222083098353
Bleu_1: 0.707
Bleu_2: 0.532
Bleu_3: 0.386
Bleu_4: 0.280
computing METEOR score...
METEOR: 0.242
computing Rouge score...
ROUGE_L: 0.518
computing CIDEr score...
CIDEr: 0.887
computing SPICE score...
SPICE: 0.173
[DEBUG] Saving model at epoch 11 with CIDEr score of 0.886847


100%|█████████████████████████████████████████████████████████████████████████████| 5664/5664 [37:04<00:00,  2.55it/s]


[DEBUG] Training epoch 12 has loss 2.865029
Loading annotation file...
Found 5000 images in split: val
[DEBUG] #words in wordlist: 9221
[DEBUG] Loading val data ... 4.506992 secs
[DEBUG] Running inference on val with 250 batches


100%|███████████████████████████████████████████████████████████████████████████████| 250/250 [01:20<00:00,  3.10it/s]


loading annotations into memory...
Done (t=0.24s)
creating index...
index created!
Using 5000/5000 predictions
Loading and preparing results...
DONE (t=0.02s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 48492, 'reflen': 48046, 'guess': [48492, 43492, 38492, 33492], 'correct': [34530, 17392, 7652, 3295]}
ratio: 1.0092827706780791
Bleu_1: 0.712
Bleu_2: 0.534
Bleu_3: 0.384
Bleu_4: 0.273
computing METEOR score...
METEOR: 0.242
computing Rouge score...
ROUGE_L: 0.520
computing CIDEr score...
CIDEr: 0.888
computing SPICE score...
SPICE: 0.171
[DEBUG] Saving model at epoch 12 with CIDEr score of 0.888330


100%|█████████████████████████████████████████████████████████████████████████████| 5664/5664 [37:14<00:00,  2.54it/s]


[DEBUG] Training epoch 13 has loss 2.831491
Loading annotation file...
Found 5000 images in split: val
[DEBUG] #words in wordlist: 9221
[DEBUG] Loading val data ... 4.580999 secs
[DEBUG] Running inference on val with 250 batches


100%|███████████████████████████████████████████████████████████████████████████████| 250/250 [01:20<00:00,  3.11it/s]


loading annotations into memory...
Done (t=0.25s)
creating index...
index created!
Using 5000/5000 predictions
Loading and preparing results...
DONE (t=0.02s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 48273, 'reflen': 47908, 'guess': [48273, 43273, 38273, 33273], 'correct': [34289, 17457, 7701, 3384]}
ratio: 1.0076187693078191
Bleu_1: 0.710
Bleu_2: 0.535
Bleu_3: 0.386
Bleu_4: 0.277
computing METEOR score...
METEOR: 0.242
computing Rouge score...
ROUGE_L: 0.518
computing CIDEr score...
CIDEr: 0.888
computing SPICE score...
SPICE: 0.173


100%|█████████████████████████████████████████████████████████████████████████████| 5664/5664 [37:09<00:00,  2.54it/s]


[DEBUG] Training epoch 14 has loss 2.711402
Loading annotation file...
Found 5000 images in split: val
[DEBUG] #words in wordlist: 9221
[DEBUG] Loading val data ... 3.643136 secs
[DEBUG] Running inference on val with 250 batches


100%|███████████████████████████████████████████████████████████████████████████████| 250/250 [01:20<00:00,  3.10it/s]


loading annotations into memory...
Done (t=0.25s)
creating index...
index created!
Using 5000/5000 predictions
Loading and preparing results...
DONE (t=0.02s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 48181, 'reflen': 47831, 'guess': [48181, 43181, 38181, 33181], 'correct': [34602, 17802, 8103, 3692]}
ratio: 1.0073174301185215
Bleu_1: 0.718
Bleu_2: 0.544
Bleu_3: 0.398
Bleu_4: 0.289
computing METEOR score...
METEOR: 0.248
computing Rouge score...
ROUGE_L: 0.526
computing CIDEr score...
CIDEr: 0.932
computing SPICE score...
SPICE: 0.178
[DEBUG] Saving model at epoch 14 with CIDEr score of 0.931870


100%|█████████████████████████████████████████████████████████████████████████████| 5664/5664 [37:07<00:00,  2.54it/s]


[DEBUG] Training epoch 15 has loss 2.687235
Loading annotation file...
Found 5000 images in split: val
[DEBUG] #words in wordlist: 9221
[DEBUG] Loading val data ... 4.386000 secs
[DEBUG] Running inference on val with 250 batches


100%|███████████████████████████████████████████████████████████████████████████████| 250/250 [01:20<00:00,  3.11it/s]


loading annotations into memory...
Done (t=0.24s)
creating index...
index created!
Using 5000/5000 predictions
Loading and preparing results...
DONE (t=0.02s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 48433, 'reflen': 48038, 'guess': [48433, 43433, 38433, 33433], 'correct': [34620, 17866, 8129, 3748]}
ratio: 1.0082226570631374
Bleu_1: 0.715
Bleu_2: 0.542
Bleu_3: 0.396
Bleu_4: 0.289
computing METEOR score...
METEOR: 0.248
computing Rouge score...
ROUGE_L: 0.525
computing CIDEr score...
CIDEr: 0.931
computing SPICE score...
SPICE: 0.178


100%|█████████████████████████████████████████████████████████████████████████████| 5664/5664 [37:15<00:00,  2.53it/s]


[DEBUG] Training epoch 16 has loss 2.674812
Loading annotation file...
Found 5000 images in split: val
[DEBUG] #words in wordlist: 9221
[DEBUG] Loading val data ... 4.415999 secs
[DEBUG] Running inference on val with 250 batches


100%|███████████████████████████████████████████████████████████████████████████████| 250/250 [01:20<00:00,  3.11it/s]


loading annotations into memory...
Done (t=0.24s)
creating index...
index created!
Using 5000/5000 predictions
Loading and preparing results...
DONE (t=0.02s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 48396, 'reflen': 48055, 'guess': [48396, 43396, 38396, 33396], 'correct': [34640, 17905, 8187, 3749]}
ratio: 1.0070960357923004
Bleu_1: 0.716
Bleu_2: 0.543
Bleu_3: 0.398
Bleu_4: 0.290
computing METEOR score...
METEOR: 0.249
computing Rouge score...
ROUGE_L: 0.527
computing CIDEr score...
CIDEr: 0.934
computing SPICE score...
SPICE: 0.178
[DEBUG] Saving model at epoch 16 with CIDEr score of 0.933553


100%|█████████████████████████████████████████████████████████████████████████████| 5664/5664 [37:11<00:00,  2.54it/s]


[DEBUG] Training epoch 17 has loss 2.664922
Loading annotation file...
Found 5000 images in split: val
[DEBUG] #words in wordlist: 9221
[DEBUG] Loading val data ... 4.413039 secs
[DEBUG] Running inference on val with 250 batches


100%|███████████████████████████████████████████████████████████████████████████████| 250/250 [01:20<00:00,  3.11it/s]


loading annotations into memory...
Done (t=0.24s)
creating index...
index created!
Using 5000/5000 predictions
Loading and preparing results...
DONE (t=0.02s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 48190, 'reflen': 47864, 'guess': [48190, 43190, 38190, 33190], 'correct': [34567, 17773, 8078, 3648]}
ratio: 1.0068109643991099
Bleu_1: 0.717
Bleu_2: 0.543
Bleu_3: 0.397
Bleu_4: 0.288
computing METEOR score...
METEOR: 0.247
computing Rouge score...
ROUGE_L: 0.526
computing CIDEr score...
CIDEr: 0.925
computing SPICE score...
SPICE: 0.177


100%|█████████████████████████████████████████████████████████████████████████████| 5664/5664 [37:12<00:00,  2.54it/s]


[DEBUG] Training epoch 18 has loss 2.655495
Loading annotation file...
Found 5000 images in split: val
[DEBUG] #words in wordlist: 9221
[DEBUG] Loading val data ... 4.495003 secs
[DEBUG] Running inference on val with 250 batches


100%|███████████████████████████████████████████████████████████████████████████████| 250/250 [01:20<00:00,  3.10it/s]


loading annotations into memory...
Done (t=0.24s)
creating index...
index created!
Using 5000/5000 predictions
Loading and preparing results...
DONE (t=0.02s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 48418, 'reflen': 48040, 'guess': [48418, 43418, 38418, 33418], 'correct': [34670, 17955, 8258, 3838]}
ratio: 1.0078684429641755
Bleu_1: 0.716
Bleu_2: 0.544
Bleu_3: 0.399
Bleu_4: 0.292
computing METEOR score...
METEOR: 0.250
computing Rouge score...
ROUGE_L: 0.529
computing CIDEr score...
CIDEr: 0.938
computing SPICE score...
SPICE: 0.178
[DEBUG] Saving model at epoch 18 with CIDEr score of 0.937585


100%|█████████████████████████████████████████████████████████████████████████████| 5664/5664 [37:17<00:00,  2.53it/s]


[DEBUG] Training epoch 19 has loss 2.645840
Loading annotation file...
Found 5000 images in split: val
[DEBUG] #words in wordlist: 9221
[DEBUG] Loading val data ... 3.607000 secs
[DEBUG] Running inference on val with 250 batches


100%|███████████████████████████████████████████████████████████████████████████████| 250/250 [01:20<00:00,  3.11it/s]


loading annotations into memory...
Done (t=0.25s)
creating index...
index created!
Using 5000/5000 predictions
Loading and preparing results...
DONE (t=0.02s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 48176, 'reflen': 47877, 'guess': [48176, 43176, 38176, 33176], 'correct': [34580, 17862, 8185, 3762]}
ratio: 1.0062451699145518
Bleu_1: 0.718
Bleu_2: 0.545
Bleu_3: 0.399
Bleu_4: 0.291
computing METEOR score...
METEOR: 0.249
computing Rouge score...
ROUGE_L: 0.527
computing CIDEr score...
CIDEr: 0.932
computing SPICE score...
SPICE: 0.178


100%|█████████████████████████████████████████████████████████████████████████████| 5664/5664 [37:09<00:00,  2.54it/s]


[DEBUG] Training epoch 20 has loss 2.638928
Loading annotation file...
Found 5000 images in split: val
[DEBUG] #words in wordlist: 9221
[DEBUG] Loading val data ... 4.280002 secs
[DEBUG] Running inference on val with 250 batches


100%|███████████████████████████████████████████████████████████████████████████████| 250/250 [01:20<00:00,  3.11it/s]


loading annotations into memory...
Done (t=0.25s)
creating index...
index created!
Using 5000/5000 predictions
Loading and preparing results...
DONE (t=0.02s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 48567, 'reflen': 48134, 'guess': [48567, 43567, 38567, 33567], 'correct': [34721, 17812, 8107, 3720]}
ratio: 1.0089957202808617
Bleu_1: 0.715
Bleu_2: 0.541
Bleu_3: 0.395
Bleu_4: 0.287
computing METEOR score...
METEOR: 0.249
computing Rouge score...
ROUGE_L: 0.525
computing CIDEr score...
CIDEr: 0.927
computing SPICE score...
SPICE: 0.178


100%|█████████████████████████████████████████████████████████████████████████████| 5664/5664 [37:18<00:00,  2.53it/s]


[DEBUG] Training epoch 21 has loss 2.631627
Loading annotation file...
Found 5000 images in split: val
[DEBUG] #words in wordlist: 9221
[DEBUG] Loading val data ... 3.519216 secs
[DEBUG] Running inference on val with 250 batches


100%|███████████████████████████████████████████████████████████████████████████████| 250/250 [01:20<00:00,  3.11it/s]


loading annotations into memory...
Done (t=0.24s)
creating index...
index created!
Using 5000/5000 predictions
Loading and preparing results...
DONE (t=0.02s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 48340, 'reflen': 47989, 'guess': [48340, 43340, 38340, 33340], 'correct': [34554, 17842, 8149, 3733]}
ratio: 1.0073141761653504
Bleu_1: 0.715
Bleu_2: 0.542
Bleu_3: 0.397
Bleu_4: 0.289
computing METEOR score...
METEOR: 0.249
computing Rouge score...
ROUGE_L: 0.526
computing CIDEr score...
CIDEr: 0.927
computing SPICE score...
SPICE: 0.177


100%|█████████████████████████████████████████████████████████████████████████████| 5664/5664 [37:16<00:00,  2.53it/s]


[DEBUG] Training epoch 22 has loss 2.624597
Loading annotation file...
Found 5000 images in split: val
[DEBUG] #words in wordlist: 9221
[DEBUG] Loading val data ... 4.507088 secs
[DEBUG] Running inference on val with 250 batches


100%|███████████████████████████████████████████████████████████████████████████████| 250/250 [01:20<00:00,  3.10it/s]


loading annotations into memory...
Done (t=0.25s)
creating index...
index created!
Using 5000/5000 predictions
Loading and preparing results...
DONE (t=0.02s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 48465, 'reflen': 48107, 'guess': [48465, 43465, 38465, 33465], 'correct': [34575, 17817, 8175, 3733]}
ratio: 1.0074417444446546
Bleu_1: 0.713
Bleu_2: 0.541
Bleu_3: 0.396
Bleu_4: 0.289
computing METEOR score...
METEOR: 0.249
computing Rouge score...
ROUGE_L: 0.525
computing CIDEr score...
CIDEr: 0.927
computing SPICE score...
SPICE: 0.177


100%|█████████████████████████████████████████████████████████████████████████████| 5664/5664 [37:06<00:00,  2.54it/s]


[DEBUG] Training epoch 23 has loss 2.617305
Loading annotation file...
Found 5000 images in split: val
[DEBUG] #words in wordlist: 9221
[DEBUG] Loading val data ... 3.508161 secs
[DEBUG] Running inference on val with 250 batches


100%|███████████████████████████████████████████████████████████████████████████████| 250/250 [01:20<00:00,  3.09it/s]


loading annotations into memory...
Done (t=0.24s)
creating index...
index created!
Using 5000/5000 predictions
Loading and preparing results...
DONE (t=0.02s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 48487, 'reflen': 48094, 'guess': [48487, 43487, 38487, 33487], 'correct': [34567, 17798, 8149, 3732]}
ratio: 1.0081714974840728
Bleu_1: 0.713
Bleu_2: 0.540
Bleu_3: 0.395
Bleu_4: 0.288
computing METEOR score...
METEOR: 0.248
computing Rouge score...
ROUGE_L: 0.526
computing CIDEr score...
CIDEr: 0.926
computing SPICE score...
SPICE: 0.177


100%|█████████████████████████████████████████████████████████████████████████████| 5664/5664 [37:12<00:00,  2.54it/s]


[DEBUG] Training epoch 24 has loss 2.610523
Loading annotation file...
Found 5000 images in split: val
[DEBUG] #words in wordlist: 9221
[DEBUG] Loading val data ... 4.502084 secs
[DEBUG] Running inference on val with 250 batches


100%|███████████████████████████████████████████████████████████████████████████████| 250/250 [01:20<00:00,  3.10it/s]


loading annotations into memory...
Done (t=0.24s)
creating index...
index created!
Using 5000/5000 predictions
Loading and preparing results...
DONE (t=0.02s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 48230, 'reflen': 47856, 'guess': [48230, 43230, 38230, 33230], 'correct': [34344, 17554, 7991, 3664]}
ratio: 1.0078151120026537
Bleu_1: 0.712
Bleu_2: 0.538
Bleu_3: 0.392
Bleu_4: 0.286
computing METEOR score...
METEOR: 0.247
computing Rouge score...
ROUGE_L: 0.523
computing CIDEr score...
CIDEr: 0.921
computing SPICE score...
SPICE: 0.177


100%|█████████████████████████████████████████████████████████████████████████████| 5664/5664 [36:58<00:00,  2.55it/s]


[DEBUG] Training epoch 25 has loss 2.603060
Loading annotation file...
Found 5000 images in split: val
[DEBUG] #words in wordlist: 9221
[DEBUG] Loading val data ... 3.602592 secs
[DEBUG] Running inference on val with 250 batches


100%|███████████████████████████████████████████████████████████████████████████████| 250/250 [01:20<00:00,  3.09it/s]


loading annotations into memory...
Done (t=0.25s)
creating index...
index created!
Using 5000/5000 predictions
Loading and preparing results...
DONE (t=0.02s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 48490, 'reflen': 48140, 'guess': [48490, 43490, 38490, 33490], 'correct': [34468, 17592, 7973, 3630]}
ratio: 1.007270461154944
Bleu_1: 0.711
Bleu_2: 0.536
Bleu_3: 0.391
Bleu_4: 0.283
computing METEOR score...
METEOR: 0.248
computing Rouge score...
ROUGE_L: 0.523
computing CIDEr score...
CIDEr: 0.919
computing SPICE score...
SPICE: 0.176


100%|█████████████████████████████████████████████████████████████████████████████| 5664/5664 [36:50<00:00,  2.56it/s]


[DEBUG] Training epoch 26 has loss 2.596197
Loading annotation file...
Found 5000 images in split: val
[DEBUG] #words in wordlist: 9221
[DEBUG] Loading val data ... 4.559343 secs
[DEBUG] Running inference on val with 250 batches


100%|███████████████████████████████████████████████████████████████████████████████| 250/250 [01:20<00:00,  3.09it/s]


loading annotations into memory...
Done (t=0.25s)
creating index...
index created!
Using 5000/5000 predictions
Loading and preparing results...
DONE (t=0.03s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 48538, 'reflen': 48149, 'guess': [48538, 43538, 38538, 33538], 'correct': [34693, 17925, 8218, 3773]}
ratio: 1.008079087831502
Bleu_1: 0.715
Bleu_2: 0.542
Bleu_3: 0.397
Bleu_4: 0.290
computing METEOR score...
METEOR: 0.250
computing Rouge score...
ROUGE_L: 0.526
computing CIDEr score...
CIDEr: 0.930
computing SPICE score...
SPICE: 0.178


100%|█████████████████████████████████████████████████████████████████████████████| 5664/5664 [36:50<00:00,  2.56it/s]


[DEBUG] Training epoch 27 has loss 2.589466
Loading annotation file...
Found 5000 images in split: val
[DEBUG] #words in wordlist: 9221
[DEBUG] Loading val data ... 3.594564 secs
[DEBUG] Running inference on val with 250 batches


100%|███████████████████████████████████████████████████████████████████████████████| 250/250 [01:20<00:00,  3.09it/s]


loading annotations into memory...
Done (t=0.24s)
creating index...
index created!
Using 5000/5000 predictions
Loading and preparing results...
DONE (t=0.02s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 48748, 'reflen': 48283, 'guess': [48748, 43748, 38748, 33748], 'correct': [34596, 17805, 8149, 3752]}
ratio: 1.0096307188865439
Bleu_1: 0.710
Bleu_2: 0.537
Bleu_3: 0.393
Bleu_4: 0.287
computing METEOR score...
METEOR: 0.249
computing Rouge score...
ROUGE_L: 0.524
computing CIDEr score...
CIDEr: 0.926
computing SPICE score...
SPICE: 0.178


100%|█████████████████████████████████████████████████████████████████████████████| 5664/5664 [36:57<00:00,  2.55it/s]


[DEBUG] Training epoch 28 has loss 2.582845
Loading annotation file...
Found 5000 images in split: val
[DEBUG] #words in wordlist: 9221
[DEBUG] Loading val data ... 4.526001 secs
[DEBUG] Running inference on val with 250 batches


100%|███████████████████████████████████████████████████████████████████████████████| 250/250 [01:20<00:00,  3.09it/s]


loading annotations into memory...
Done (t=0.25s)
creating index...
index created!
Using 5000/5000 predictions
Loading and preparing results...
DONE (t=0.02s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 48702, 'reflen': 48265, 'guess': [48702, 43702, 38702, 33702], 'correct': [34702, 17900, 8197, 3758]}
ratio: 1.0090541800476327
Bleu_1: 0.713
Bleu_2: 0.540
Bleu_3: 0.395
Bleu_4: 0.288
computing METEOR score...
METEOR: 0.249
computing Rouge score...
ROUGE_L: 0.526
computing CIDEr score...
CIDEr: 0.924
computing SPICE score...
SPICE: 0.177


100%|█████████████████████████████████████████████████████████████████████████████| 5664/5664 [36:48<00:00,  2.56it/s]


[DEBUG] Training epoch 29 has loss 2.564430
Loading annotation file...
Found 5000 images in split: val
[DEBUG] #words in wordlist: 9221
[DEBUG] Loading val data ... 3.603759 secs
[DEBUG] Running inference on val with 250 batches


100%|███████████████████████████████████████████████████████████████████████████████| 250/250 [01:20<00:00,  3.09it/s]


loading annotations into memory...
Done (t=0.25s)
creating index...
index created!
Using 5000/5000 predictions
Loading and preparing results...
DONE (t=0.02s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 48493, 'reflen': 48097, 'guess': [48493, 43493, 38493, 33493], 'correct': [34647, 17882, 8141, 3718]}
ratio: 1.0082333617481132
Bleu_1: 0.714
Bleu_2: 0.542
Bleu_3: 0.396
Bleu_4: 0.288
computing METEOR score...
METEOR: 0.249
computing Rouge score...
ROUGE_L: 0.526
computing CIDEr score...
CIDEr: 0.927
computing SPICE score...
SPICE: 0.178


In [17]:
for epoch in range(1): 
    loss_train = 0.
    
#     if(epoch == args.finetune_after):
#         img_optimizer = optim.RMSprop(model_imgcnn.parameters(), lr=1e-5)
#         img_scheduler = lr_scheduler.StepLR(img_optimizer, step_size=args.lr_step_size, gamma=.1)

    scheduler.step()    
#     if(img_optimizer):
#         img_scheduler.step()

    #One epoch of train
    for batch_idx, (imgs, captions, wordclass, mask, _) in \
      tqdm(enumerate(train_data_loader), total=nbatches):
        
        imgs = imgs.view(batchsize, 3, 224, 224)
        wordclass = wordclass.view(batchsize_cap, max_tokens)
        mask = mask.view(batchsize_cap, max_tokens)

        imgs_v = Variable(imgs).cuda()
        wordclass_v = Variable(wordclass).cuda()

        optimizer.zero_grad()
#         if(img_optimizer):
#             img_optimizer.zero_grad() 

        imgsfeats, imgsfc7 = model.encode_image(imgs_v)
        imgsfeats, imgsfc7 = repeat_img_per_cap(imgsfeats, imgsfc7, ncap_per_img)
        _, _, feat_h, feat_w = imgsfeats.size()

        if(args.attention == True):
            wordact, attn = model_convcap(imgsfeats.float(), imgsfc7.float(), wordclass_v)
            attn = attn.view(batchsize_cap, max_tokens, feat_h, feat_w)
        else:
            wordact, _ = model_convcap(imgsfeats, imgsfc7, wordclass_v)

        wordact = wordact[:,:,:-1]
        wordclass_v = wordclass_v[:,1:]
        mask = mask[:,1:].contiguous()

        wordact_t = wordact.permute(0, 2, 1).contiguous().view(batchsize_cap*(max_tokens-1), -1)
        wordclass_t = wordclass_v.contiguous().view(batchsize_cap*(max_tokens-1), 1)
      
        maskids = torch.nonzero(mask.view(-1)).numpy().reshape(-1)

        if(args.attention == True):
            #Cross-entropy loss and attention loss of Show, Attend and Tell
            loss = F.cross_entropy(wordact_t[maskids, ...], \
                                   wordclass_t[maskids, ...].contiguous().view(maskids.shape[0])) \
            + (torch.sum(torch.pow(1. - torch.sum(attn, 1), 2)))\
            /(batchsize_cap*feat_h*feat_w)
        else:
            loss = F.cross_entropy(wordact_t[maskids, ...], \
                                   wordclass_t[maskids, ...].contiguous().view(maskids.shape[0]))

        loss_train = loss_train + loss.data

        loss.backward()

        optimizer.step()
#         if(img_optimizer):
#             img_optimizer.step()

    loss_train = (loss_train*1.)/(batch_idx)
    print('[DEBUG] Training epoch %d has loss %f' % (epoch, loss_train))

    modelfn = osp.join(args.model_dir, 'model.pth')

#     if(img_optimizer):
#         img_optimizer_dict = img_optimizer.state_dict()
#     else:
#         img_optimizer_dict = None

    torch.save({
          'epoch': epoch,
          'state_dict': model_convcap.state_dict(),
          'img_state_dict': model_imgcnn.state_dict(),
          'optimizer' : optimizer.state_dict(),
#           'img_optimizer' : img_optimizer_dict,
    }, modelfn)

    #Run on validation and obtain score
    scores = test(args, 'val', model_convcap=model_convcap, model_imgcnn=model_imgcnn)
    score = scores[0][args.score_select]

    if(score > bestscore):
        bestscore = score
        print('[DEBUG] Saving model at epoch %d with %s score of %f'\
          % (epoch, args.score_select, score))
        bestmodelfn = osp.join(args.model_dir, 'bestmodel.pth')
        os.system('cp %s %s' % (modelfn, bestmodelfn))

  x = F.softmax(x.view(sz[0] * sz[1], sz[2]))
100%|███████████████████████████████████████████████████████████████████████████| 5664/5664 [1:58:56<00:00,  1.26s/it]


[DEBUG] Training epoch 0 has loss 7.425636
Loading annotation file...
Found 5000 images in split: val
[DEBUG] #words in wordlist: 9221
[DEBUG] Loading val data ... 5.809345 secs
[DEBUG] Running inference on val with 250 batches


  wordprobs = F.softmax(wordact_t).cpu().data.numpy()
100%|███████████████████████████████████████████████████████████████████████████████| 250/250 [01:17<00:00,  3.21it/s]


loading annotations into memory...
Done (t=0.54s)
creating index...
index created!
Using 5000/5000 predictions
Loading and preparing results...
DONE (t=0.03s)
creating index...
index created!
tokenization...
setting up scorers...
Downloading stanford-corenlp-3.6.0 for SPICE ...
Progress: 384.5M / 384.5M (100.0%)
Extracting stanford-corenlp-3.6.0 ...
Done.
computing Bleu score...
{'testlen': 43639, 'reflen': 44922, 'guess': [43639, 38639, 33639, 28639], 'correct': [24884, 8879, 2228, 471]}
ratio: 0.9714393838208235
Bleu_1: 0.554
Bleu_2: 0.351
Bleu_3: 0.200
Bleu_4: 0.106
computing METEOR score...
METEOR: 0.148
computing Rouge score...
ROUGE_L: 0.410
computing CIDEr score...
CIDEr: 0.303
computing SPICE score...
SPICE: 0.086
[DEBUG] Saving model at epoch 0 with CIDEr score of 0.303493


In [23]:
bestmodelfn = osp.join(args.model_dir, 'bestmodel.pth')

In [25]:
args.model_dir

'output'

In [34]:
split='test'
modelfn=bestmodelfn

In [35]:
import sys
sys.path.insert(0, 'third_party/coco-caption')

import numpy as np
import os
import os.path as osp
from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap
import json
from json import encoder
encoder.FLOAT_REPR = lambda o: format(o, '.3f')
import sys


In [36]:
def language_eval(input_data, savedir, split):
  if type(input_data) == str: # Filename given.
    checkpoint = json.load(open(input_data, 'r'))
    preds = checkpoint
  elif type(input_data) == list: # Direct predictions give.
    preds = input_data

  annFile = 'third_party/coco-caption/annotations/captions_val2014.json'
  coco = COCO(annFile)
  valids = coco.getImgIds()

  # Filter results to only those in MSCOCO validation set (will be about a third)
  preds_filt = [p for p in preds if p['image_id'] in valids]
  len_p = len(preds_filt)
  for i in range(len_p):
    preds_filt[i]['image_id'] = int(preds_filt[i]['image_id'])
  print('Using %d/%d predictions' % (len(preds_filt), len(preds)))
  resFile = osp.join(savedir, 'result_%s.json' % (split))
  json.dump(preds_filt, open(resFile, 'w')) # Serialize to temporary json file. Sigh, COCO API...

  cocoRes = coco.loadRes(resFile)
  cocoEval = COCOEvalCap(coco, cocoRes)
  cocoEval.params['image_id'] = cocoRes.getImgIds()
  cocoEval.evaluate()

  # Create output dictionary.
  out = {}
  for metric, score in cocoEval.eval.items():
    out[metric] = score

  # Return aggregate and per image score.
  return out, cocoEval.evalImgs

In [38]:
input_data, savedir, split = pred_captions, args.model_dir, split

In [39]:
  if type(input_data) == str: # Filename given.
    checkpoint = json.load(open(input_data, 'r'))
    preds = checkpoint
  elif type(input_data) == list: # Direct predictions give.
    preds = input_data

  annFile = 'third_party/coco-caption/annotations/captions_val2014.json'
  coco = COCO(annFile)
  valids = coco.getImgIds()

  # Filter results to only those in MSCOCO validation set (will be about a third)
  preds_filt = [p for p in tqdm(preds) if p['image_id'] in valids]
  len_p = len(preds_filt)
  for i in range(len_p):
    preds_filt[i]['image_id'] = int(preds_filt[i]['image_id'])
  print('Using %d/%d predictions' % (len(preds_filt), len(preds)))
  resFile = osp.join(savedir, 'result_%s.json' % (split))
  json.dump(preds_filt, open(resFile, 'w')) # Serialize to temporary json file. Sigh, COCO API...

  cocoRes = coco.loadRes(resFile)
  cocoEval = COCOEvalCap(coco, cocoRes)
  cocoEval.params['image_id'] = cocoRes.getImgIds()
  cocoEval.evaluate()

  # Create output dictionary.
  out = {}
  for metric, score in cocoEval.eval.items():
    out[metric] = score

  # Return aggregate and per image score.
#   return out, cocoEval.evalImgs

loading annotations into memory...
Done (t=0.25s)
creating index...
index created!


100%|█████████████████████████████████████████████████████████████████████████████| 5000/5000 [11:58<00:00,  6.96it/s]


Using 5000/5000 predictions
Loading and preparing results...
DONE (t=0.02s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 48522, 'reflen': 48024, 'guess': [48522, 43522, 38522, 33522], 'correct': [34648, 17877, 8138, 3641]}
ratio: 1.0103698150924327
Bleu_1: 0.714
Bleu_2: 0.542
Bleu_3: 0.396
Bleu_4: 0.286
computing METEOR score...
METEOR: 0.249
computing Rouge score...
ROUGE_L: 0.525
computing CIDEr score...
CIDEr: 0.944
computing SPICE score...
SPICE: 0.179


In [37]:
scores = language_eval(pred_captions, args.model_dir, split)

loading annotations into memory...
Done (t=0.27s)
creating index...
index created!


KeyboardInterrupt: 

In [31]:

  """Runs test on split=val/test with checkpoint file modelfn or loaded model_*"""

  t_start = time.time()
  data = coco_loader(args.coco_root, split=split, ncap_per_img=1)
  print('[DEBUG] Loading %s data ... %f secs' % (split, time.time() - t_start))

  data_loader = DataLoader(dataset=data, num_workers=args.nthreads,\
    batch_size=args.batchsize, shuffle=False, drop_last=True)

  batchsize = args.batchsize
  max_tokens = data.max_tokens
  num_batches = np.int_(np.floor((len(data.ids)*1.)/batchsize))
  print('[DEBUG] Running inference on %s with %d batches' % (split, num_batches))

  model, transform = load('RN50', jit=False)

  if(modelfn is not None):
    print('===========')
    # model_imgcnn = Vgg16Feats()
    # model_imgcnn.cuda()
    model_imgcnn = model.cuda()

    # model_convcap = convcap(data.numwords, args.num_layers, is_attention=args.attention)
    model_convcap = convcap(data.numwords, args.num_layers, is_attention=args.attention, nfeats=2048, nimgfeats=1024)
    model_convcap.cuda()

    print('[DEBUG] Loading checkpoint %s' % modelfn)
    checkpoint = torch.load(modelfn)
    model_convcap.load_state_dict(checkpoint['state_dict'])
    # model_imgcnn.load_state_dict(checkpoint['img_state_dict'])
  else:
    model_imgcnn = model.cuda()
    model_convcap = model_convcap

  model_imgcnn.train(False) 
  model_convcap.train(False)

  pred_captions = []
  #Test epoch
  for batch_idx, (imgs, _, _, _, img_ids) in \
    tqdm(enumerate(data_loader), total=num_batches):
    
    imgs = imgs.view(batchsize, 3, 224, 224)

    imgs_v = Variable(imgs.cuda())
    imgsfeats, imgsfc7 = model_imgcnn.encode_image(imgs_v)
    _, featdim, feat_h, feat_w = imgsfeats.size()
  
    wordclass_feed = np.zeros((batchsize, max_tokens), dtype='int64')
    wordclass_feed[:,0] = data.wordlist.index('<S>') 

    outcaps = np.empty((batchsize, 0)).tolist()

    for j in range(max_tokens-1):
      wordclass = Variable(torch.from_numpy(wordclass_feed)).cuda()

      wordact, _ = model_convcap(imgsfeats.float(), imgsfc7.float(), wordclass)

      wordact = wordact[:,:,:-1]
      wordact_t = wordact.permute(0, 2, 1).contiguous().view(batchsize*(max_tokens-1), -1)

      wordprobs = F.softmax(wordact_t).cpu().data.numpy()
      wordids = np.argmax(wordprobs, axis=1)

      for k in range(batchsize):
        word = data.wordlist[wordids[j+k*(max_tokens-1)]]
        outcaps[k].append(word)
        if(j < max_tokens-1):
          wordclass_feed[k, j+1] = wordids[j+k*(max_tokens-1)]

    for j in range(batchsize):
      num_words = len(outcaps[j]) 
      if 'EOS' in outcaps[j]:
        num_words = outcaps[j].index('EOS')
      outcap = ' '.join(outcaps[j][:num_words])
      pred_captions.append({'image_id': img_ids[j], 'caption': outcap})

  scores = language_eval(pred_captions, args.model_dir, split)

  model_imgcnn.train(True) 
  model_convcap.train(True)


Loading annotation file...
Found 5000 images in split: test
[DEBUG] #words in wordlist: 9221
[DEBUG] Loading test data ... 4.613305 secs
[DEBUG] Running inference on test with 250 batches
[DEBUG] Loading checkpoint output\bestmodel.pth


  wordprobs = F.softmax(wordact_t).cpu().data.numpy()
100%|███████████████████████████████████████████████████████████████████████████████| 250/250 [01:21<00:00,  3.08it/s]


NameError: name 'COCO' is not defined

In [33]:
pred_captions

[{'image_id': tensor(391895),
  'caption': 'a man riding a bike down a dirt road'},
 {'image_id': tensor(60623), 'caption': 'a woman eating a donut with a fork'},
 {'image_id': tensor(483108),
  'caption': 'a man on a bicycle is looking at a train'},
 {'image_id': tensor(384213),
  'caption': 'a kitchen with a window and a window'},
 {'image_id': tensor(386164),
  'caption': 'a row of wooden tables with white sheets'},
 {'image_id': tensor(223648),
  'caption': 'a long wooden table with a bunch of green bananas on it'},
 {'image_id': tensor(403385),
  'caption': 'a bathroom with a toilet and a sink'},
 {'image_id': tensor(294832),
  'caption': 'a bathroom with a white toilet and a white sink'},
 {'image_id': tensor(462565),
  'caption': 'a group of people riding bikes down a street'},
 {'image_id': tensor(436141),
  'caption': 'a bathroom with a sink and a toilet'},
 {'image_id': tensor(192440),
  'caption': 'a white sink and a toilet in a room'},
 {'image_id': tensor(1146),
  'caption

In [32]:
args.model_dir

'output'

In [28]:
pred_captions

[{'image_id': tensor(391895),
  'caption': 'a man riding a bike down a dirt road'},
 {'image_id': tensor(60623), 'caption': 'a woman eating a donut with a fork'},
 {'image_id': tensor(483108),
  'caption': 'a man on a bicycle is looking at a train'},
 {'image_id': tensor(384213),
  'caption': 'a kitchen with a window and a window'},
 {'image_id': tensor(386164),
  'caption': 'a row of wooden tables with white sheets'},
 {'image_id': tensor(223648),
  'caption': 'a long wooden table with a bunch of green bananas on it'},
 {'image_id': tensor(403385),
  'caption': 'a bathroom with a toilet and a sink'},
 {'image_id': tensor(294832),
  'caption': 'a bathroom with a white toilet and a white sink'},
 {'image_id': tensor(462565),
  'caption': 'a group of people riding bikes down a street'},
 {'image_id': tensor(436141),
  'caption': 'a bathroom with a sink and a toilet'},
 {'image_id': tensor(192440),
  'caption': 'a white sink and a toilet in a room'},
 {'image_id': tensor(1146),
  'caption

In [29]:
with open('1218_results.json', 'w') as f:
    json.dump(pred_captions, f)

TypeError: Object of type Tensor is not JSON serializable

In [21]:
if (osp.exists(bestmodelfn)):
    print('if (osp.exists(bestmodelfn)):')
    
    if (args.beam_size == 1):
        print('if (args.beam_size == 1):')
        scores = test(args, 'test', modelfn=bestmodelfn)
    else:
        print('else:')
        scores = test_beam(args, 'test', modelfn=bestmodelfn)
        
    print('TEST set scores')
    for k, v in scores[0].items():
        print('%s: %f' % (k, v))
else:
    print('2 else')
    raise Exception('No checkpoint found %s' % bestmodelfn)

if (osp.exists(bestmodelfn)):
if (args.beam_size == 1):
Loading annotation file...
Found 5000 images in split: test
[DEBUG] #words in wordlist: 9221
[DEBUG] Loading test data ... 4.452054 secs
[DEBUG] Running inference on test with 250 batches
[DEBUG] Loading checkpoint output\bestmodel.pth


  x = F.softmax(x.view(sz[0] * sz[1], sz[2]))
  wordprobs = F.softmax(wordact_t).cpu().data.numpy()
100%|███████████████████████████████████████████████████████████████████████████████| 250/250 [01:11<00:00,  3.51it/s]


loading annotations into memory...
Done (t=0.47s)
creating index...
index created!
Using 5000/5000 predictions
Loading and preparing results...
DONE (t=0.02s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 43520, 'reflen': 44789, 'guess': [43520, 38520, 33520, 28520], 'correct': [24850, 8862, 2193, 503]}
ratio: 0.9716671504163753
Bleu_1: 0.555
Bleu_2: 0.352
Bleu_3: 0.199
Bleu_4: 0.108
computing METEOR score...
METEOR: 0.148
computing Rouge score...
ROUGE_L: 0.411
computing CIDEr score...
CIDEr: 0.303
computing SPICE score...
SPICE: 0.085
TEST set scores
Bleu_1: 0.554592
Bleu_2: 0.352029
Bleu_3: 0.198949
Bleu_4: 0.107769
METEOR: 0.148145
ROUGE_L: 0.411031
CIDEr: 0.303347
SPICE: 0.085442


In [20]:
scores[0].items()

dict_items([('Bleu_1', 0.6257070931810783), ('Bleu_2', 0.4400814108073923), ('Bleu_3', 0.2950048884687872), ('Bleu_4', 0.19118162177132456), ('METEOR', 0.1882842543884385), ('ROUGE_L', 0.4576943954075463), ('CIDEr', 0.5376096446582836), ('SPICE', 0.11709152104827554)])