In [1]:
 Upgrade pillow to latest version (solves a colab Issue) :
!pip install --user Pillow>=5.2.0

In [2]:
import os, sys

from matplotlib import pyplot as plt
print("abcxyz")

abcxyz


### Download the Prebuilt VQA model and Weights

In [4]:
# https://github.com/Cyanogenoid/pytorch-vqa/releases!pip install wget
import wget
url = 'https://github.com/Cyanogenoid/pytorch-vqa/releases/download/v1.0/2017-08-04_00.55.19.pth'
if not os.path.isfile('./2017-08-04_00.55.19.pth'):   # 81Mb model
    #!wget https://github.com/Cyanogenoid/pytorch-vqa/releases/download/v1.0/2017-08-04_00.55.19.pth
    wget.download(url)

In [5]:
try: 
    import torch
except:
    from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
    platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())
    accelerator = 'cu80' if os.path.exists('/opt/bin/nvidia-smi') else 'cpu'
    !pip install -q \
      http://download.pytorch.org/whl/{accelerator}/torch-0.4.1-{platform}-linux_x86_64.whl \
      torchvision

In [6]:
import torch

use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

In [7]:
import model # from pytorch-vqa

#saved_state = torch.load('logs/2017-08-04_00:55:19.pth')
saved_state = torch.load('./2017-08-04_00.55.19.pth', map_location=device)
tokens = len(saved_state['vocab']['question']) + 1

saved_state.keys()  # See what's in the saved state

dict_keys(['name', 'tracker', 'config', 'weights', 'eval', 'vocab'])

In [8]:
# Load the predefined model
vqa_net = torch.nn.DataParallel(model.Net(tokens))
vqa_net.load_state_dict(saved_state['weights'])
vqa_net.to(device)
vqa_net.eval()

DataParallel(
  (module): Net(
    (text): TextProcessor(
      (embedding): Embedding(15193, 300, padding_idx=0)
      (drop): Dropout(p=0.5, inplace=False)
      (tanh): Tanh()
      (lstm): LSTM(300, 1024)
    )
    (attention): Attention(
      (v_conv): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (q_lin): Linear(in_features=1024, out_features=512, bias=True)
      (x_conv): Conv2d(512, 2, kernel_size=(1, 1), stride=(1, 1))
      (drop): Dropout(p=0.5, inplace=False)
      (relu): ReLU(inplace=True)
    )
    (classifier): Classifier(
      (drop1): Dropout(p=0.5, inplace=False)
      (lin1): Linear(in_features=5120, out_features=1024, bias=True)
      (relu): ReLU()
      (drop2): Dropout(p=0.5, inplace=False)
      (lin2): Linear(in_features=1024, out_features=3000, bias=True)
    )
  )
)

### Now get the Correct Image feature network

In [10]:
import resnet  # from pytorch-resnet

import torchvision.transforms as transforms
from PIL import Image

def get_transform(target_size, central_fraction=1.0):
    return transforms.Compose([
        transforms.Scale(int(target_size / central_fraction)),
        transforms.CenterCrop(target_size),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])

class ResNetLayer4(torch.nn.Module):
    def __init__(self):
        super(ResNetLayer4, self).__init__()
        self.model = resnet.resnet152(pretrained=True)
        
        # from  visual_qa_analysis/config.py
        image_size = 448  # scale shorter end of image to this size and centre crop
        #output_size = image_size // 32  # size of the feature maps after processing through a network
        output_features = 2048  # number of feature maps thereof
        central_fraction = 0.875 # only take this much of the centre when scaling and centre cropping

        self.transform = get_transform(image_size, central_fraction)

        def save_output(module, input, output):
            self.buffer = output
        self.model.layer4.register_forward_hook(save_output)

    def forward(self, x):
        self.model(x)
        return self.buffer
    
    def image_to_features(self, img_file):
        img = Image.open(img_file).convert('RGB')
        img_transformed = self.transform(img)
        #print(img_transformed.size())
        img_batch = img_transformed.unsqueeze(0).to(device)
        print(img_batch)
        return self.forward(img_batch) 
    
resnet_layer4 = ResNetLayer4().to(device)  # Downloads 241Mb model when first run

In [11]:
# Sample images : 
import wget
## local images----------
image_files=['./img/Black_pussy_-_panoramio.jpg','./img/jump.jpg','./img/abc.jpg',
    './img/penguins.jpg',
    './img/2.jpg',
             './img/ima.jpg',
 './img/cat_roof_home_architecture_building_roofs_animal_sit-536976.jpg!d']
image_files

['./img/Black_pussy_-_panoramio.jpg',
 './img/jump.jpg',
 './img/abc.jpg',
 './img/penguins.jpg',
 './img/2.jpg',
 './img/ima.jpg',
 './img/cat_roof_home_architecture_building_roofs_animal_sit-536976.jpg!d']

In [36]:
import numpy as np
import torch
v = resnet_layer4.image_to_features(image_files[1])
v.size()
print(type(v))

#for j in v:
    #print(j)
    #np.savez('0.npz', x=feats[j].reshape(1024, 196).transpose(1,0))

tensor([[[[ 0.6049,  0.6049,  0.6049,  ..., -0.2856, -0.2856, -0.2856],
          [ 0.6221,  0.6049,  0.6049,  ..., -0.2856, -0.2856, -0.2856],
          [ 0.6221,  0.6221,  0.6049,  ..., -0.2684, -0.2684, -0.2684],
          ...,
          [-0.9705, -1.0390, -1.1589,  ...,  2.1975,  2.1975,  2.1975],
          [-1.2617, -1.2788, -1.2617,  ...,  2.1975,  2.1975,  2.1975],
          [-1.3302, -1.3473, -1.2959,  ...,  2.1975,  2.1975,  2.1975]],

         [[ 1.2906,  1.2906,  1.2906,  ...,  0.6604,  0.6604,  0.6604],
          [ 1.3081,  1.2906,  1.2906,  ...,  0.6604,  0.6604,  0.6604],
          [ 1.3081,  1.3081,  1.2906,  ...,  0.6779,  0.6779,  0.6779],
          ...,
          [-0.8627, -0.9328, -1.0553,  ...,  2.3585,  2.3585,  2.3585],
          [-1.1604, -1.1779, -1.1604,  ...,  2.3585,  2.3585,  2.3585],
          [-1.1954, -1.2129, -1.1779,  ...,  2.3585,  2.3585,  2.3585]],

         [[ 2.2740,  2.2740,  2.2740,  ...,  2.0648,  2.0648,  2.0648],
          [ 2.2914,  2.2740,  

### Have a look at how the vocab is built

In [37]:
vocab = saved_state['vocab']
vocab.keys()  # dict_keys(['question', 'answer'])
list(vocab['question'].items())[:5]  # [('the', 1), ('is', 2), ('what', 3), ('are', 4), ('this', 5)]

[('the', 1), ('is', 2), ('what', 3), ('are', 4), ('this', 5)]

In [38]:
qtoken_to_index = vocab['question']
QUESTION_LENGTH_MAX = 30 # say...
    
def encode_question(question_str):
    """ Turn a question into a vector of indices and a question length """
    question_arr = question_str.lower().split(' ')
    vec = torch.zeros(len(question_arr)).long()  
    for i, token in enumerate(question_arr):
        vec[i] = qtoken_to_index.get(token, 0)
    return vec.to(device), torch.tensor( len(question_arr) ).to(device)

In [39]:
list(vocab['answer'].items())[:5]  

[('yes', 0), ('no', 1), ('2', 2), ('1', 3), ('white', 4)]

In [40]:
answer_words = ['UNDEF'] * len(vocab['answer'])
for w,idx in vocab['answer'].items():
    answer_words[idx]=w
len(answer_words), answer_words[:10] 

(3000, ['yes', 'no', '2', '1', 'white', '3', 'red', 'blue', '4', 'green'])

In [41]:
# Important things to know...
'colour' in qtoken_to_index, 'color' in qtoken_to_index, 'tabby' in answer_words

(False, True, True)

In [42]:
image_idx = 1
image_filename = image_files[image_idx]

#img = Image.open(image_filename).convert('RGB')
#plt.imshow(img)

In [43]:
v0 = resnet_layer4.image_to_features(image_filename)

tensor([[[[ 0.6049,  0.6049,  0.6049,  ..., -0.2856, -0.2856, -0.2856],
          [ 0.6221,  0.6049,  0.6049,  ..., -0.2856, -0.2856, -0.2856],
          [ 0.6221,  0.6221,  0.6049,  ..., -0.2684, -0.2684, -0.2684],
          ...,
          [-0.9705, -1.0390, -1.1589,  ...,  2.1975,  2.1975,  2.1975],
          [-1.2617, -1.2788, -1.2617,  ...,  2.1975,  2.1975,  2.1975],
          [-1.3302, -1.3473, -1.2959,  ...,  2.1975,  2.1975,  2.1975]],

         [[ 1.2906,  1.2906,  1.2906,  ...,  0.6604,  0.6604,  0.6604],
          [ 1.3081,  1.2906,  1.2906,  ...,  0.6604,  0.6604,  0.6604],
          [ 1.3081,  1.3081,  1.2906,  ...,  0.6779,  0.6779,  0.6779],
          ...,
          [-0.8627, -0.9328, -1.0553,  ...,  2.3585,  2.3585,  2.3585],
          [-1.1604, -1.1779, -1.1604,  ...,  2.3585,  2.3585,  2.3585],
          [-1.1954, -1.2129, -1.1779,  ...,  2.3585,  2.3585,  2.3585]],

         [[ 2.2740,  2.2740,  2.2740,  ...,  2.0648,  2.0648,  2.0648],
          [ 2.2914,  2.2740,  

In [47]:
q, q_len = encode_question("is the cat jumping up or down")
q, q_len

(tensor([  2,   1,  43, 576, 109,  25, 168]), tensor(7))

In [48]:
ans = vqa_net(v0, q.unsqueeze(0), q_len.unsqueeze(0))
ans.data.cpu()[0:30]

tensor([[-12.7777, -12.9656, -18.9573,  ..., -49.5790, -41.9412, -56.8924]])

In [49]:
_, answer_idx = ans.data.cpu().max(dim=1)
print(answer_idx)
answer_words[answer_idx]

tensor([96])


'down'

In [50]:

def vqa_single_softmax(im_features, q_str):
    q, q_len = encode_question(q_str)
    ans = vqa_net(im_features, q.unsqueeze(0), q_len.unsqueeze(0))
    return ans.data.cpu()

def vqa(image_filename, question_arr):
    plt.imshow(Image.open(image_filename).convert('RGB')); plt.show()    
    image_features = resnet_layer4.image_to_features(image_filename)
    for question_str in question_arr:
        _, answer_idx = vqa_single_softmax(image_features, question_str).max(dim=1)
        print(question_str+" -> "+answer_words[ answer_idx ])
        print((answer_words[ answer_idx ]+' '*8)[:8]+" <- "+question_str)  

In [51]:
image_idx = 0  # 6 
image_files=['./img/penguins.jpg',
 './img/2.jpg',
 './img/ima.jpg',
 './img/cat_roof_home_architecture_building_roofs_animal_sit-536976.jpg!d']
vqa(image_files[image_idx], [
    "is there a cat in the picture",
    "is this a picture of a panda",
    "is the animal in the picture a panda or a dog",
    "what color is the panda",
    "how many cows are there",
])

tensor([[[[-1.5699, -1.5357, -1.5357,  ..., -1.0390, -1.3302, -1.3302],
          [-1.6213, -1.5870, -1.5870,  ..., -1.1075, -1.2959, -1.2617],
          [-1.6898, -1.6384, -1.6384,  ..., -1.1760, -1.2788, -1.2445],
          ...,
          [ 0.7591,  0.7591,  0.8276,  ...,  0.4166, -0.2513, -1.2103],
          [ 0.7419,  0.7077,  0.7077,  ...,  0.4851,  0.2967, -0.3198],
          [ 0.7591,  0.7762,  0.7762,  ...,  0.3481,  0.1939,  0.0741]],

         [[-1.4055, -1.3704, -1.3704,  ..., -0.8452, -1.1429, -1.1429],
          [-1.4405, -1.4055, -1.4055,  ..., -0.9153, -1.1078, -1.0728],
          [-1.4405, -1.3880, -1.3880,  ..., -0.9853, -1.0903, -1.0553],
          ...,
          [ 1.1155,  1.1155,  1.1856,  ...,  0.7654,  0.0651, -0.9328],
          [ 1.0980,  1.0630,  1.0630,  ...,  0.8179,  0.6254, -0.0049],
          [ 1.1155,  1.1331,  1.1331,  ...,  0.7304,  0.5553,  0.4328]],

         [[-1.1596, -1.1247, -1.1247,  ..., -0.7238, -1.0027, -0.9853],
          [-1.2119, -1.1770, -

#### Leave one word out 

In [52]:
def leave_one_out(image_filename, question_base):
    plt.imshow(Image.open(image_filename).convert('RGB')); plt.show()    
    image_features = resnet_layer4.image_to_features(image_filename)
    question_arr = question_base.lower().split(' ')
    for i, word_omit in enumerate(question_arr):
        question_str = ' '.join( question_arr[:i]+question_arr[i+1:] )
        score, answer_idx = vqa_single_softmax(image_features, question_str).max(dim=1)
        print(question_str+" -> "+answer_words[ answer_idx ])
        print((answer_words[ answer_idx ]+' '*8)[:8]+" <- "+question_str)  #, score

In [53]:
image_idx = 1

leave_one_out(image_files[image_idx], "is there a cat in the picture")

tensor([[[[-2.0323, -2.0152, -2.0152,  ..., -1.6213, -1.6555, -1.7754],
          [-2.0323, -2.0665, -2.0494,  ..., -1.6384, -1.6727, -1.7925],
          [-1.6555, -1.6384, -1.8268,  ..., -1.6555, -1.6898, -1.7412],
          ...,
          [-1.9980, -1.9638, -1.9638,  ...,  0.7591,  1.1015,  1.1187],
          [-1.9980, -1.9638, -1.9980,  ...,  0.8104,  0.7248,  0.6563],
          [-2.0152, -1.9980, -1.9980,  ...,  0.8789,  0.5707,  0.4679]],

         [[-1.9657, -1.9307, -1.9307,  ..., -1.5805, -1.6155, -1.7206],
          [-1.9657, -1.9832, -1.9657,  ..., -1.5980, -1.6331, -1.7206],
          [-1.5805, -1.5455, -1.7381,  ..., -1.5630, -1.5980, -1.6331],
          ...,
          [-1.9307, -1.9307, -1.9307,  ...,  0.6429,  1.0280,  1.0105],
          [-1.9307, -1.9482, -1.9657,  ...,  0.6254,  0.4853,  0.6078],
          [-1.9132, -1.9482, -1.9832,  ...,  0.5378,  0.1877,  0.3803]],

         [[-1.7696, -1.6999, -1.6999,  ..., -1.4907, -1.5256, -1.5953],
          [-1.7522, -1.7522, -

#### Leave all combos of words out ( think : Binary )

In [54]:
def leave_out_combos(image_filename, question_base):
    plt.imshow(Image.open(image_filename).convert('RGB')); plt.show()    
    image_features = resnet_layer4.image_to_features(image_filename)
    question_arr = question_base.lower().split(' ')
    for i in range(2 ** len(question_arr)):
        q_arr = [question_arr[j] for j in range(len(question_arr)) if (i & (2**j))==0 ]
        question_str = ' '.join( q_arr )
        _, answer_idx = vqa_single_softmax(image_features, question_str).max(dim=1)
        print((answer_words[ answer_idx ]+' '*8)[:8]+" <- "+question_str)

In [55]:
image_idx = 1
image_files=['./img/penguins.jpg',
 './img/2.jpg',
 './img/ima.jpg',
 './img/cat_roof_home_architecture_building_roofs_animal_sit-536976.jpg!d']
leave_out_combos(image_files[image_idx], "is there a cat in the picture")
#leave_out_combos(image_files[image_idx], "what color are cat's eyes")

tensor([[[[-2.0323, -2.0152, -2.0152,  ..., -1.6213, -1.6555, -1.7754],
          [-2.0323, -2.0665, -2.0494,  ..., -1.6384, -1.6727, -1.7925],
          [-1.6555, -1.6384, -1.8268,  ..., -1.6555, -1.6898, -1.7412],
          ...,
          [-1.9980, -1.9638, -1.9638,  ...,  0.7591,  1.1015,  1.1187],
          [-1.9980, -1.9638, -1.9980,  ...,  0.8104,  0.7248,  0.6563],
          [-2.0152, -1.9980, -1.9980,  ...,  0.8789,  0.5707,  0.4679]],

         [[-1.9657, -1.9307, -1.9307,  ..., -1.5805, -1.6155, -1.7206],
          [-1.9657, -1.9832, -1.9657,  ..., -1.5980, -1.6331, -1.7206],
          [-1.5805, -1.5455, -1.7381,  ..., -1.5630, -1.5980, -1.6331],
          ...,
          [-1.9307, -1.9307, -1.9307,  ...,  0.6429,  1.0280,  1.0105],
          [-1.9307, -1.9482, -1.9657,  ...,  0.6254,  0.4853,  0.6078],
          [-1.9132, -1.9482, -1.9832,  ...,  0.5378,  0.1877,  0.3803]],

         [[-1.7696, -1.6999, -1.6999,  ..., -1.4907, -1.5256, -1.5953],
          [-1.7522, -1.7522, -

In [56]:
def leave_out_best(image_filename, question_base):
    plt.imshow(Image.open(image_filename).convert('RGB')); plt.show()    
    image_features = resnet_layer4.image_to_features(image_filename)
    _, answer_true = vqa_single_softmax(image_features, question_base).max(dim=1)
    print((answer_words[ answer_true ]+' '*8)[:8]+" <- "+question_base)
    print()
    while True:
        question_arr = question_base.lower().split(' ')
        score_best, q_best = None, ''
        for i, word_omit in enumerate(question_arr):
            question_str = ' '.join( question_arr[:i]+question_arr[i+1:] )
            score, answer_idx = vqa_single_softmax(image_features, question_str).max(dim=1)
            if answer_idx==answer_true:
                print((answer_words[ answer_idx ]+' '*8)[:8]+" <- "+question_str)  #, score        
                if (score_best is None or score>score_best):
                    score_best, question_base = score, question_str
        print()
        if score_best is None or len(question_base)==0: break

In [57]:
image_idx = 1

leave_out_best(image_files[image_idx], "is there a cat in the picture")

tensor([[[[-2.0323, -2.0152, -2.0152,  ..., -1.6213, -1.6555, -1.7754],
          [-2.0323, -2.0665, -2.0494,  ..., -1.6384, -1.6727, -1.7925],
          [-1.6555, -1.6384, -1.8268,  ..., -1.6555, -1.6898, -1.7412],
          ...,
          [-1.9980, -1.9638, -1.9638,  ...,  0.7591,  1.1015,  1.1187],
          [-1.9980, -1.9638, -1.9980,  ...,  0.8104,  0.7248,  0.6563],
          [-2.0152, -1.9980, -1.9980,  ...,  0.8789,  0.5707,  0.4679]],

         [[-1.9657, -1.9307, -1.9307,  ..., -1.5805, -1.6155, -1.7206],
          [-1.9657, -1.9832, -1.9657,  ..., -1.5980, -1.6331, -1.7206],
          [-1.5805, -1.5455, -1.7381,  ..., -1.5630, -1.5980, -1.6331],
          ...,
          [-1.9307, -1.9307, -1.9307,  ...,  0.6429,  1.0280,  1.0105],
          [-1.9307, -1.9482, -1.9657,  ...,  0.6254,  0.4853,  0.6078],
          [-1.9132, -1.9482, -1.9832,  ...,  0.5378,  0.1877,  0.3803]],

         [[-1.7696, -1.6999, -1.6999,  ..., -1.4907, -1.5256, -1.5953],
          [-1.7522, -1.7522, -