In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import torch.nn.functional as F
import torch.optim as optim
import torchvision.models as models
# from torch.nn.utils.rnn import pack_padded_sequence
from model import ImageCNN,MatchCNN

import argparse
import os
import pickle
from data_loader import get_loader,CocoDataset
from build_vocab import Vocabulary
from torchvision import transforms
import time
from pycocotools.coco import COCO
from PIL import Image
import nltk
from random import shuffle
from matchCNN_st import MatchCNN_st

def to_var(x, volatile=False):
    if torch.cuda.is_available():
        x = x.cuda()
    return Variable(x, volatile=volatile)


In [2]:
"""load coco dataset"""
data_dir = "../data/coco/"
annotation_file = data_dir + "annotations/captions_train2014.json"
coco = COCO(annotation_file)


loading annotations into memory...
Done (t=0.48s)
creating index...
index created!


In [3]:
"""randomly extract  imgid and corresponding  captionid"""
sample_num = 100
# caption_num = sample_num * 1000
img_ids_all = list(coco.imgs.keys())
shuffle(img_ids_all)
img_ids = []
ann_ids = []

for key in img_ids_all:
    temp = coco.getAnnIds(key)
    ann_ids.append(temp[0])
    img_ids.append(key)
    if len(img_ids) == sample_num:
        break


In [4]:

"""preprocess images"""
image_dir = data_dir + "resized2014/"
imgs = []

# Image preprocessing
transform = transforms.Compose([
    transforms.RandomCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406),
                         (0.229, 0.224, 0.225))])

"""
imgs:
    list of img
img:
    ann_ids, data, id
"""
for i, img_id in enumerate(img_ids):
    path = coco.loadImgs(img_id)[0]['file_name']
    image = Image.open(os.path.join(image_dir, path)).convert('RGB')
    if transform is not None:
        image = transform(image)
    imgs.append(image)
imgs = np.array(imgs)


TypeError: only 1-element tensors can be converted to Python scalars

In [None]:
"""preprocess annotations"""
vocab_file = "../data/coco/vocab.pkl"
pad_len = 62
# Load vocabulary wrapper.
with open(vocab_file, 'rb') as f:
    vocab = pickle.load(f)

anns = np.zeros((sample_num, pad_len), dtype=int)
for i, ann_id in enumerate(ann_ids):
    # for j, ann_id in enumerate(ann_ids_image):
        caption_str = coco.anns[ann_id]["caption"]
        tokens = nltk.tokenize.word_tokenize(str(caption_str).lower())
        caption = []
        #         caption.append(vocab('<start>'))
        caption.extend([vocab(token) for token in tokens])
        #         caption.append(vocab('<end>'))
        caption = np.array(caption)
        anns[i][:len(caption)] = caption[:len(caption)]

anns = to_var(torch.from_numpy(anns))


In [None]:
"""
build model
"""

"""parameters"""
image_vector_size = 256
embed_size = 100
margin = 0.1
batch_size = 100
vocab_size = 9956
pad_len = 62
batch_num = sample_num//batch_size

"""set model"""
imageCNN = ImageCNN(image_vector_size=image_vector_size)
matchCNN = MatchCNN_st(embed_size=embed_size,
                       image_vector_size=image_vector_size,
                       vocab_size=vocab_size,
                       pad_len=pad_len)

if torch.cuda.is_available():
    print("cuda is available")
    imageCNN = imageCNN.cuda()
    matchCNN = matchCNN.cuda()



"""load models"""
model_path = "../models"
imageCNN.load_state_dict(torch.load(os.path.join(model_path, 'imageCNN_st21-0.005963.pkl')))
matchCNN.load_state_dict(torch.load(os.path.join(model_path, 'matchCNN_st21-0.005963.pkl')))

# imageCNN.eval()
# matchCNN.eval()



In [None]:

"""extract image feature"""

# img_data = to_var(torch.zeros(sample_num, 3, 224, 224))
img_features_batch = to_var(torch.zeros(batch_num, batch_size, image_vector_size))

# for i, img in enumerate(imgs):
#     img_data[i] = img["data"]

for i in range(batch_num):
    img_features_batch[i] = imageCNN(imgs[i * batch_size:(i + 1) * batch_size])

    


In [None]:
"""calculate scores"""
scores = np.zeros((sample_num, sample_num))
for i, caption in enumerate(anns):
    caption_tmp = caption.unsqueeze(0)
    caption_batch = caption_tmp.repeat(batch_size, 1)
    for j, img_feature_batch in enumerate(img_features_batch):
        score_batch = matchCNN(img_feature_batch, caption_batch)
        score_batch_np = score_batch.cpu().data.numpy()

        scores[j * batch_size:(j + 1) * batch_size, i] = score_batch_np[:, 0]



In [None]:
"""rank"""
sorted_scores = (-scores).argsort(axis=0)

scores_ranks = np.zeros((sample_num, sample_num), dtype=int)

for i in range(sample_num):
    for j in range(sample_num):
        scores_ranks[sorted_scores[j][i]][i] = j

ranks_image = np.zeros((sample_num), dtype=int)
for i in range(sample_num):
    ranks_image[i] = scores_ranks[i][i]

# sorted_ranks_image = np.sort(ranks_image)
# med_ranks = np.zeros(sample_num)
# for i in range(sample_num):
#     med_ranks[i] = sorted_ranks_image[i][0]

r1 = len(ranks_image[ranks_image == 0]) / sample_num * 100
r5 = len(ranks_image[ranks_image <= 4]) / sample_num * 100
r10 = len(ranks_image[ranks_image <= 9]) / sample_num * 100
med = np.mean(ranks_image)

print("r1:", r1)
print("r5:", r5)
print("r10:", r10)
print("med:", med)
print("")
