# VQA Model

This is the front end of a split model. It precomputes the embeddings for both the image and the language channels and stores them to disk, along with the answers and some additional metadata.

In [None]:
import os
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from PIL import Image
from skimage import io
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import json
import pprint
import numpy as np
import time
from datetime import timedelta


Mounting shared Google Drive

In [None]:
# Colab-only: mount Google Drive at /content/drive (force_remount=True
# re-mounts even if it is already mounted).
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

# Make Python modules stored in the Drive folder 'aidl/lib' importable.
import sys
sys.path.append('/content/drive/My Drive/aidl/lib')

Mounted at /content/drive


In [None]:
# Copy the dataset archive from the mounted Drive into the working directory
# and extract it there.
! cp '/content/drive/My Drive/aidl/full.zip' .
! unzip full.zip

Archive:  full.zip
  inflating: mscoco_qtest.txt        
  inflating: mscoco_a.json           
  inflating: mscoco_q.json           
  inflating: mscoco_qtest_full.txt   
  inflating: mscoco_qtrain.txt       


In [None]:
# Download the COCO val2014 image archive.
# NOTE(review): no visible cell extracts val2014.zip, and the dataset class
# builds file names with the "COCO_train2014_" prefix — confirm this download
# is still needed.
! wget 'http://images.cocodataset.org/zips/val2014.zip'

--2020-11-10 18:20:59--  http://images.cocodataset.org/zips/val2014.zip
Resolving images.cocodataset.org (images.cocodataset.org)... 52.216.89.188
Connecting to images.cocodataset.org (images.cocodataset.org)|52.216.89.188|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6645013297 (6.2G) [application/zip]
Saving to: ‘val2014.zip’


2020-11-10 18:23:41 (39.1 MB/s) - ‘val2014.zip’ saved [6645013297/6645013297]



In [None]:
# This is an attempt to use the train dataset (443,757 triplets).
# Failed due to memory constraints in Google Colab
#
# Steps: download the COCO train2014 images, extract them flat into ./coco
# (-j drops the directory structure stored in the archive), then copy and
# unpack mscoco_train.zip from Drive.

! wget 'http://images.cocodataset.org/zips/train2014.zip'
! mkdir coco
! unzip -qj train2014.zip -d coco
! cp '/content/drive/My Drive/aidl/trt/mscoco_train.zip' .
! unzip mscoco_train.zip
# Overwrite the test list with the first lines of the training list (head
# prints 10 lines by default), so the "test" split is only a small placeholder
# drawn from the training IDs.
! head mscoco_qtrain.txt > mscoco_qtest.txt
# Show the line count of every .txt file in the working directory.
! wc -l *.txt

Global Parameters

In [None]:
# Number of samples per DataLoader batch.
batch_size = 30

# Root directory that holds the question lists, the JSON files and the 'coco'
# image folder. The commented-out values are alternative locations that were
# used at some point; only the last assignment is active.
#dataset_root = "/content/model1000_any"
#dataset_root = "/content/drive/My Drive/aidl"
#dataset_root = "/content/drive/My Drive/aidl/Dataset100"
#dataset_root = "/content/drive/My Drive/aidl/model5000_1_20"
#dataset_root = "/content/drive/My Drive/aidl/model10000_1_20"
#dataset_root = "/content/drive/My Drive/aidl/model1000_any"
#dataset_root = "/content/drive/My Drive/aidl/model10000_yes_no"
dataset_root = "/content"

# List the dataset root as a quick sanity check of its contents.
!ls '$dataset_root'
# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# runtime without CUDA instead of failing at the first `.to(device)` call.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

coco   mscoco_a.json  mscoco_qtest.txt	 mscoco_train.zip
drive  mscoco_q.json  mscoco_qtrain.txt  sample_data


Custom Data Loader

In [None]:
"""
     Data structure:

     root_dir
        |_ mscoco_qtrain.txt [list of question IDs for training]
        |_ mscoco_qtest.txt  [list of question IDs for testing]
        |_ mscoco_q.json     [json of questions]
        |_ mscoco_a.json     [json of annotations]
        |_ coco              [subdirectory for images]
            |_ image 1
            |_ image 2
              ...
            |_ image n

"""

class preProcess(Dataset):
    """Dataset of VQA samples backed by COCO images.

    Each item is a tuple ``(image, question text, answer class id, metadata)``.

    Expected files below ``root_dir``:

        mscoco_qtrain.txt   question IDs for training, one per line
        mscoco_qtest.txt    question IDs for testing, one per line
        mscoco_q.json       list of question records
        mscoco_a.json       list of annotation records
        coco/               images named <image_prefix><12-digit image id>.jpg
    """
#
#   _ _ i n i t _ _
#
    def __init__(self, root_dir, train=False, transform=None,
                 image_prefix="COCO_train2014_"):
        """
        Args:
            root_dir (string): root directory for the data
            train (bool, optional): choose training or testing datasets
            transform (callable, optional): Optional transform to be applied
                on a sample.
            image_prefix (string, optional): file-name prefix of the images in
                the ``coco`` subdirectory (e.g. "COCO_val2014_" when the
                validation images are used).
        """

        super().__init__()

        # Define some constants to access files
        self.image_prefix = image_prefix
        self.image_postfix = ".jpg"
        self.image_subdir = 'coco'
        self.questions_list_train_fname = 'mscoco_qtrain.txt'
        self.questions_list_test_fname = 'mscoco_qtest.txt'
        self.questions_fname = 'mscoco_q.json'
        self.annotations_fname = 'mscoco_a.json'

        # Store parameters as class attributes
        self.root_dir = root_dir
        self.train = train
        self.transform = transform

        # Load questions json file (the context manager closes the handle)
        fqfname = os.path.join(self.root_dir, self.questions_fname)
        with open(fqfname, 'r') as f:
            self.questions = json.load(f)

        # Load annotations json file
        fqfname = os.path.join(self.root_dir, self.annotations_fname)
        with open(fqfname, 'r') as f:
            self.annotations = json.load(f)

        # Question IDs of both splits; the answer vocabulary is computed over
        # their union so train and test share the same class ids.
        ql_train = self._read_question_ids(self.questions_list_train_fname)
        ql_test = self._read_question_ids(self.questions_list_test_fname)
        ql_global = set(ql_train + ql_test)

        # Index the records once by question_id so __getitem__ is a dict
        # lookup instead of a linear scan over the whole JSON list per sample.
        # setdefault keeps the FIRST record for an id, matching what a
        # first-match scan would return.
        self._question_by_id = {}
        for q in self.questions:
            if q['question_id'] in ql_global:
                self._question_by_id.setdefault(q['question_id'], q)

        self._annotation_by_id = {}
        self.annotation_map = {}        # answer text -> class id
        self.annotation_classdist = {}  # answer text -> number of occurrences
        for a in self.annotations:
            if a['question_id'] in ql_global:
                self._annotation_by_id.setdefault(a['question_id'], a)
                answer = a['multiple_choice_answer']
                if answer not in self.annotation_map:
                    # Class ids are assigned densely in order of first appearance.
                    self.annotation_map[answer] = len(self.annotation_map)
                self.annotation_classdist[answer] = self.annotation_classdist.get(answer, 0) + 1

        print("Dataset: training {} / testing {}".format(len(ql_train), len(ql_test)))

        # Depending on self.train, assign either training question list or
        #   testing question list
        if self.train:
            self.questions_list = ql_train
        else:
            self.questions_list = ql_test

    def _read_question_ids(self, fname):
        """Read one integer question ID per line from root_dir/fname, skipping blank lines."""
        with open(os.path.join(self.root_dir, fname)) as f:
            return [int(line) for line in f if line.strip()]
#
#   _ _ l e n _ _
#
    def __len__(self):
        """Number of question IDs in the selected split."""
        return len(self.questions_list)
#
#   _ _ g e t i t e m _ _
#
    def __getitem__(self, idx):
        """Return ``(image, question text, answer class id, metadata dict)`` for sample ``idx``.

        Raises:
            KeyError: if the question ID has no question or annotation record.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Get question_id from the list
        question_id = self.questions_list[idx]

        # Find corresponding question and annotation via the prebuilt indexes
        question = self._question_by_id[question_id]
        annotation = self._annotation_by_id[question_id]

        image_id = question['image_id']
        # Image files carry the image id zero-padded to 12 digits.
        file_name = self.image_prefix + str(image_id).zfill(12) + self.image_postfix
        fqfname = os.path.join(self.root_dir, self.image_subdir, file_name)

        # convert() returns a fully loaded copy, so the file handle can be
        # closed right away; it also fixes B&W images by forcing RGB.
        with Image.open(fqfname) as raw_image:
            image = raw_image.convert('RGB')

        # Apply transformation if there is any
        if self.transform:
            image = self.transform(image)

        # Translate annotation to its ID
        annotation_ID = self.annotation_map[annotation['multiple_choice_answer']]

        metadata = {
            "image_id": image_id,
            "filename": file_name,
            "question_text": question['question'],
            "question_id": question_id,
            "annotation_text": annotation['multiple_choice_answer'],
            "annotation_id": annotation_ID,
            "question_type": annotation['question_type'],
            "answer_type": annotation['answer_type'],
        }

        # return tuple: image, question text, answer id, metadata
        return image, question['question'], annotation_ID, metadata

Image Embeddings (pretrained VGG-16)

In [None]:
from torchvision import models, transforms

# Load VGG-16 with pretrained weights, move it to the target device and switch
# it to evaluation mode. eval() returns the model, so as the last expression of
# the cell it also displays the layer structure.
model = models.vgg16(pretrained=True)
model.to(device)
model.eval()

Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to /root/.cache/torch/hub/checkpoints/vgg16-397923af.pth


HBox(children=(FloatProgress(value=0.0, max=553433881.0), HTML(value='')))




VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

In [None]:
import torch.nn as nn
class ImageEmbedding(nn.Module):
    """Image feature extractor that stops part-way through a classifier head.

    Reuses the ``features``, ``avgpool`` and ``classifier`` sub-modules of
    ``base_model`` (shared, not copied) and returns the activation produced by
    classifier layer ``last_classifier_layer`` instead of the final output.

    Args:
        base_model: model exposing ``features``, ``avgpool`` and an iterable
            ``classifier`` (e.g. a torchvision VGG).
        last_classifier_layer (int, optional): index of the last classifier
            layer to apply. Defaults to 3. NOTE(review): for torchvision's
            VGG-16 this is presumably the output of the second fully connected
            layer — verify against ``model.classifier``. If the classifier has
            fewer layers, all of them are applied.
    """

    def __init__(self, base_model, last_classifier_layer=3):
        super().__init__()
        self.features = base_model.features
        self.avgpool = base_model.avgpool
        self.classifier = base_model.classifier
        self.last_classifier_layer = last_classifier_layer

    def forward(self, x):
        """Return the flattened activation after classifier layer ``last_classifier_layer``."""
        x = self.features(x)
        x = self.avgpool(x)
        # Keep the batch dimension, flatten everything else for the linear layers.
        x = torch.flatten(x, 1)
        for position, layer in enumerate(self.classifier):
            if position > self.last_classifier_layer:
                break
            x = layer(x)
        return x


Text Embeddings (Google Universal Sentence Encoder)

In [None]:
from absl import logging

import tensorflow as tf

import tensorflow_hub as hub
import matplotlib.pyplot as plt
import pandas as pd
import re

# TF-Hub handle of the Universal Sentence Encoder, version 4.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"

# Load the encoder; the resulting object is later called with a batch of
# question strings to obtain their embeddings.
sentence_embedding_generator = hub.load(module_url)
print ("module %s loaded" % module_url)

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


Precomputing and saving the embeddings (input for training the VQA model)

In [None]:
def preProcessData(dataloader, storage_device=None, log_every=1):
    """Compute image and question embeddings for every batch of ``dataloader``.

    Relies on the notebook globals ``device``, ``image_embedding_extractor``
    and ``sentence_embedding_generator``.

    Args:
        dataloader: iterable yielding ``(image, question, annotation, metadata)``
            batches, where ``metadata`` is a dict of per-sample sequences.
        storage_device (optional): device the per-batch embeddings are moved to
            before being accumulated. ``None`` (default) keeps them on
            ``device``. Passing ``'cpu'`` keeps accelerator memory flat on
            large datasets.
        log_every (int, optional): print the running batch number every
            ``log_every`` batches; 0 or ``None`` disables the output.
            Defaults to 1 (every batch).

    Returns:
        tuple: ``(image_embeddings, question_embeddings, annotations, metadatas)``.
        The first three are the per-batch tensors concatenated along dim 0;
        ``metadatas`` is a list with one plain dict per sample.
    """
    image_embeddings_batch = []
    question_embeddings_batch = []
    annotations_batch = []
    metadatas = []

    # Pure inference: make sure no autograd graph is recorded, regardless of
    # whether the caller froze the extractor's parameters.
    with torch.no_grad():
        for batch, (image, question, annotation, metadata) in enumerate(dataloader):
            if log_every and (batch + 1) % log_every == 0:
                print (batch+1)

            image = image.to(device)
            # Call the module itself (not .forward) so registered hooks run.
            image_embedding = image_embedding_extractor(image)

            # The sentence encoder returns a TensorFlow tensor; go through
            # numpy to obtain a torch tensor with one row per question.
            question_embedding = torch.from_numpy(
                sentence_embedding_generator(question).numpy()
            ).to(device)

            if storage_device is not None:
                image_embedding = image_embedding.to(storage_device)
                question_embedding = question_embedding.to(storage_device)

            image_embeddings_batch.append(image_embedding)
            question_embeddings_batch.append(question_embedding)
            annotations_batch.append(annotation)

            # The batch metadata is a dict of per-sample sequences; turn it
            # back into one dict per sample, unwrapping tensor elements into
            # Python scalars.
            rows = [{} for _ in range(len(metadata['question_id']))]
            for key, values in metadata.items():
                for row, value in zip(rows, values):
                    row[key] = value.item() if isinstance(value, torch.Tensor) else value
            metadatas.extend(rows)

    image_embeddings = torch.cat(image_embeddings_batch,dim=0)
    question_embeddings = torch.cat(question_embeddings_batch,dim=0)
    annotations = torch.cat(annotations_batch,dim=0)

    return image_embeddings, question_embeddings, annotations, metadatas

# Side length (in pixels) of the square image fed to the network.
img_size = 512

# Image pipeline: resize, center-crop to img_size x img_size, convert to a
# tensor and normalize per channel. The mean/std values are the ones commonly
# used with ImageNet-pretrained torchvision models.
transform = transforms.Compose([
                                transforms.Resize(img_size),
                                transforms.CenterCrop(img_size),
                                transforms.ToTensor(),
                                transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                     std=[0.229, 0.224, 0.225])
                            ])



# Create Dataset & Dataloader
# (training split; shuffle=False keeps the saved rows in question-list order)
ds = preProcess(dataset_root,train=True,transform=transform)
dl = DataLoader(ds,batch_size=batch_size,shuffle=False,num_workers=4)

# Persist the answer-text -> class-id mapping next to the data.
torch.save(ds.annotation_map,os.path.join(dataset_root,'annotation_map.save'))

# NOTE(review): test_dataset / test_loader are only consumed by the
# commented-out block at the end of this cell; building them loads both JSON
# files a second time — confirm they are still needed.
test_dataset = preProcess(dataset_root,train=False,transform=transform)
test_loader = DataLoader(test_dataset,batch_size=batch_size,shuffle=False,num_workers=4)

# Wrap the pretrained model and freeze all of its parameters (no training here).
image_embedding_extractor = ImageEmbedding(model)
for p in image_embedding_extractor.parameters():
    p.requires_grad = False

# Run the preprocessing over the training split and report the throughput.
start_time = time.time()
image_embeddings, question_embeddings, annotations, metadatas = preProcessData(dl)
elapsed_time = time.time() - start_time
print ("Total elapsed time.    : {}".format(str(timedelta(seconds=elapsed_time))))
print("{} samples in {:,.0f} seconds : {:4.1f} samples/sec".format(len(ds),
                                                                   elapsed_time,len(ds)/elapsed_time))
# Save the four results of the training split under dataset_root.
torch.save(image_embeddings,os.path.join(dataset_root,'image_embeddings_train.save'))
torch.save(question_embeddings,os.path.join(dataset_root,'question_embeddings_train.save'))
torch.save(annotations,os.path.join(dataset_root,'annotations_train.save'))
torch.save(metadatas,os.path.join(dataset_root,'metadatas_train.save'))

# Disabled: the same steps for the test split.
#image_embeddings, question_embeddings, annotations, metadatas = preProcessData(test_loader)

#torch.save(image_embeddings,os.path.join(dataset_root,'image_embeddings_test.save'))
#torch.save(question_embeddings,os.path.join(dataset_root,'question_embeddings_test.save'))
#torch.save(annotations,os.path.join(dataset_root,'annotations_test.save'))
#torch.save(metadatas,os.path.join(dataset_root,'metadatas_test.save'))


Dataset: training 443757 / testing 10
Dataset: training 443757 / testing 10
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258


In [None]:
# Number of processed samples (one answer class id per sample).
len(annotations)

Obtain a list of the 1000 most frequent answers and save it

In [None]:
# Count how often each answer class id occurs among the processed samples.
freq = torch.bincount(annotations)

# torch.topk raises when k exceeds the number of elements, so cap k at the
# number of answer classes actually present.
top_k = min(1000, freq.numel())
v, i = torch.topk(freq, top_k)
print("{} most frequent answer account for {:2.0f}% of the answers".format(top_k, 100*v.sum().item()/len(annotations)))

# idx2str is not defined in any visible cell; derive it by inverting the
# dataset's answer-text -> class-id mapping.
idx2str = {index: answer for answer, index in ds.annotation_map.items()}

# Convert the indices to plain ints first: tensors hash by identity, so using
# tensor elements directly as dict keys would never match.
f = [idx2str[index] for index in i.tolist()]
torch.save(f,'freq.save')

In [None]:
# Bundle the saved artefacts into a single archive in the working directory.
# NOTE(review): the preprocessing cell writes '*_train.save' files (e.g.
# 'image_embeddings_train.save'); the un-suffixed names listed here only exist
# if they were produced elsewhere — confirm which file names are expected.
!zip vgg_GUSE_embeddings.zip annotation_map.save image_embeddings.save question_embeddings.save annotations.save metadatas.save freq.save

updating: annotation_map.save (deflated 49%)
updating: image_embeddings.save (deflated 69%)
updating: question_embeddings.save (deflated 8%)
updating: annotations.save (deflated 81%)
updating: metadatas.save (deflated 73%)
  adding: freq.save (deflated 54%)


In [None]:
# Coverage check: which share of all answers falls into the 1000 most common
# answer classes?
freq = annotations.bincount()      # occurrences per answer class id
v, i = freq.topk(1000)             # counts and class ids of the top 1000

print("1000 most frequent annswer account for {:2.0f}% of the answers".format(100*v.sum().item()/len(annotations)))


1000 most frequent annswer account for 86% of the answers
