## Source code

https://github.com/MILVLG/openvqa/blob/6b9bfeb2e6462b946d7e7866ffc49dd7a8bcece3/openvqa/core/base_cfgs.py
https://github.com/MILVLG/openvqa/blob/6b9bfeb2e6462b946d7e7866ffc49dd7a8bcece3/utils/exec.py#L36
https://github.com/MILVLG/openvqa/blob/6b9bfeb2e6462b946d7e7866ffc49dd7a8bcece3/utils/test_engine.py#L49
https://github.com/MILVLG/openvqa/blob/6b9bfeb2e6462b946d7e7866ffc49dd7a8bcece3/openvqa/datasets/vqa/vqa_loader.py#L68

# Load dataset

In [1]:
import warnings
# Silence every warning category for the rest of the session so the cell
# outputs below stay readable. NOTE(review): this also hides deprecation
# warnings from torch/yaml; remove it while debugging.
warnings.simplefilter('ignore')

In [2]:
import gc
import json

import numpy as np
import torch
import torch.utils.data as Data
import yaml
from yaml import CLoader

from openvqa.datasets.dataset_loader import DatasetLoader, EvalLoader
from openvqa.datasets.vqa.eval.vqa import VQA
from openvqa.datasets.vqa.eval.vqaEval import VQAEval
from openvqa.models.model_loader import CfgLoader, ModelLoader
from run import create_parser
from utils.test_engine import test_engine

# Reuse the project's command-line parser so the notebook is configured the
# same way as a `run.py` invocation would be.
parser = create_parser()
arg = '--RUN test --MODEL mcan_large --DATASET vqa --NW 2 --CKPT_V mcan_large --CKPT_E 13'
args = parser.parse_args(arg.split(' '))

# The model's YAML file is selected by the dataset / model names given above.
cfg_file = "configs/{}/{}.yml".format(args.DATASET, args.MODEL)
with open(cfg_file, 'r') as f:
    # NOTE(review): CLoader is presumably the libyaml-backed full loader, not a
    # "safe" one; acceptable for a trusted local config file, but do not
    # point this at untrusted YAML.
    yaml_dict = yaml.load(f, Loader=CLoader)

# Instantiate the config object for the model family named in the YAML.
__C = CfgLoader(yaml_dict['MODEL_USE']).load()
args = __C.str_to_bool(args)

# Start from the YAML values and let the parsed command-line values override
# them on key collisions.
args_dict = dict(yaml_dict)
args_dict.update(__C.parse_to_dict(args))
__C.add_args(args_dict)
__C.proc()

print('Hyper Parameters:', __C, sep='\n')

Checking dataset ........
Finished!

Hyper Parameters:
{ BATCH_SIZE        }->64
{ BBOXFEAT_EMB_SIZE }->2048
{ BBOX_NORMALIZE    }->True
{ CACHE_PATH        }->./results/cache
{ CKPTS_PATH        }->./ckpts
{ CKPT_EPOCH        }->13
{ CKPT_PATH         }->None
{ CKPT_VERSION      }->mcan_large
{ DATASET           }->vqa
{ DATA_PATH         }->{'vqa': './data/vqa', 'gqa': './data/gqa', 'clevr': './data/clevr'}
{ DATA_ROOT         }->./data
{ DEVICES           }->[0]
{ DROPOUT_R         }->0.1
{ EVAL_BATCH_SIZE   }->16
{ EVAL_EVERY_EPOCH  }->False
{ FEATS_PATH        }->{'vqa': {'train': './data/vqa/feats/train2014', 'val': './data/vqa/feats/val2014', 'test': './data/vqa/feats/test2015'}, 'gqa': {'default-frcn': './data/gqa/feats/gqa-frcn', 'default-grid': './data/gqa/feats/gqa-grid'}, 'clevr': {'train': './data/clevr/feats/train', 'val': './data/clevr/feats/val', 'test': './data/clevr/feats/test'}}
{ FEAT_SIZE         }->{'vqa': {'FRCN_FEAT_SIZE': (100, 2048), 'BBOX_FEAT_SIZE': (100, 5)

In [3]:
# Build the dataset selected by the config, then pull out the attributes that
# the model constructor and the evaluation loop read later on.
dataset = DatasetLoader(__C).DataSet()

data_size, token_size, ans_size, pretrained_emb = (
    dataset.data_size,
    dataset.token_size,
    dataset.ans_size,
    dataset.pretrained_emb,
)

Finished!



In [4]:
# Instantiate the network described by the config; the embedding table and
# vocabulary / answer sizes come from the dataset built above.
net = ModelLoader(__C).Net(
    __C,
    pretrained_emb,
    token_size,
    ans_size
)

net.cuda()
# Evaluation mode: this notebook only runs forward passes.
net.eval()

# Derive the checkpoint location from the config instead of hard-coding it, so
# changing --CKPT_V / --CKPT_E (or setting CKPT_PATH) cannot silently load a
# different checkpoint than the one the config describes.
if __C.CKPT_PATH is not None:
    ckpt_path = __C.CKPT_PATH
else:
    ckpt_path = '{}/ckpt_{}/epoch{}.pkl'.format(
        __C.CKPTS_PATH, __C.CKPT_VERSION, __C.CKPT_EPOCH
    )

# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
with open(ckpt_path, 'rb') as f:
    # Deserialize onto the CPU first; load_state_dict then copies the values
    # into the parameters of the already-moved network.
    state = torch.load(f, map_location='cpu')
net.load_state_dict(state['state_dict'])

<All keys matched successfully>

In [5]:
# Sequential (unshuffled) loader so that row i of the saved attention array
# corresponds to the i-th sample the dataset yields.
loader_options = {
    'batch_size': __C.EVAL_BATCH_SIZE,
    'shuffle': False,
    'num_workers': __C.NUM_WORKERS,
    'pin_memory': __C.PIN_MEM,
}
dataloader = Data.DataLoader(dataset, **loader_options)

# Output attention weights

In [6]:
from openvqa.utils.make_mask import make_mask
import math
import torch.nn.functional as F

def get_mcan_att(v, k, q, mask, net_module):
    """Compute the attention map of a multi-head attention module.

    Only the intermediate attention weights are produced; the attended values
    are never computed. Unlike the module's regular forward pass (softmax over
    the key axis, i.e. text tokens when called as in this notebook), the
    softmax here is taken over the *query* axis (visual regions) so the map
    can be used for visualization.

    Args:
        v: Unused. Kept only so the call signature mirrors the attention
            module's ``forward(v, k, q, mask)``.
        k: Key input, fed through ``net_module.linear_k``.
        q: Query input, fed through ``net_module.linear_q``; its first
            dimension is taken as the batch size.
        mask: Optional boolean mask over query positions. It must broadcast
            against the transposed scores of shape
            (batch, heads, n_keys, n_queries); positions where it is True are
            filled with -1e9 before the softmax. ``None`` disables masking.
        net_module: Attention module exposing ``linear_k``, ``linear_q`` and a
            ``_MHAtt__C`` config with ``MULTI_HEAD`` and ``HIDDEN_SIZE``.

    Returns:
        Tensor of shape (batch, heads, n_queries, n_keys) whose entries sum
        to 1 along the query axis (dim=-2).
    """
    cfg = net_module._MHAtt__C
    n_heads = cfg.MULTI_HEAD
    head_dim = cfg.HIDDEN_SIZE // n_heads
    n_batches = q.size(0)

    def split_heads(projection, tensor):
        # (batch, seq, hidden) -> (batch, heads, seq, head_dim)
        return projection(tensor).view(
            n_batches, -1, n_heads, head_dim
        ).transpose(1, 2)

    # The value projection is intentionally skipped: it does not influence the
    # attention weights, so computing it would be wasted work on every batch.
    k = split_heads(net_module.linear_k, k)
    q = split_heads(net_module.linear_q, q)

    # Scaled dot-product scores: (batch, heads, n_queries, n_keys).
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(q.size(-1))

    if mask is not None:
        # The mask indexes query positions, so apply it with the query axis
        # last and then restore the original layout.
        scores = scores.transpose(-1, -2).masked_fill(mask, -1e9).transpose(-1, -2)

    # Normalise over the query axis (visual regions), not the key axis.
    return F.softmax(scores, dim=-2)

In [7]:
%%time
# Per-batch attention summaries, concatenated along the sample axis at the end.
atts = []

# Loop invariants: the total shown in the progress line and the index of the
# last decoder layer (the only one whose attention map is extracted).
total_steps = int(data_size / __C.EVAL_BATCH_SIZE)
last_dec_i = len(net.backbone.dec_list) - 1

# Pure inference: without no_grad() every forward pass would record an
# autograd graph that is never used, costing GPU memory and time.
with torch.no_grad():
    for step, (frcn_feat, grid_feat, bbox_feat, ques_ix, ans) in enumerate(dataloader):
        print("\rEvaluation: [step %4d/%4d]" % (
            step,
            total_steps,
        ), end='          ')

        frcn_feat = frcn_feat.cuda()
        grid_feat = grid_feat.cuda()
        bbox_feat = bbox_feat.cuda()
        ques_ix = ques_ix.cuda()

        # Language branch: mask, embedding, LSTM.
        lang_feat_mask = make_mask(ques_ix.unsqueeze(2))
        lang_feat = net.embedding(ques_ix)
        lang_feat, _ = net.lstm(lang_feat)

        # Visual branch: the adapter returns the image features and their mask.
        img_feat, img_feat_mask = net.adapter(frcn_feat, grid_feat, bbox_feat)

        y, y_mask = lang_feat, lang_feat_mask
        x, x_mask = img_feat, img_feat_mask

        # Run the full encoder stack over the language features.
        for enc in net.backbone.enc_list:
            y = enc(y, y_mask)

        # Replay the decoder stack by hand so the last layer's guided
        # attention (mhatt2) can be intercepted instead of applied.
        for dec_i, dec in enumerate(net.backbone.dec_list):
            # Self-attention sub-layer is needed in every decoder layer.
            x = dec.norm1(x + dec.dropout1(dec.mhatt1(v=x, k=x, q=x, mask=x_mask)))
            if dec_i != last_dec_i:  # intermediate layers
                x = dec.norm2(x + dec.dropout2(dec.mhatt2(v=y, k=y, q=x, mask=y_mask)))
                x = dec.norm3(x + dec.dropout3(dec.ffn(x)))
            else:  # last layer: extract the attention map only
                att = get_mcan_att(v=y, k=y, q=x, mask=x_mask, net_module=dec.mhatt2)

        # Sum the map over its last axis and average over axis 1, leaving one
        # value per sample and position along the remaining axis.
        atts.append(att.detach().cpu().numpy().sum(axis=-1).mean(axis=1))

res = np.concatenate(atts)
print(res.shape)
np.savez_compressed('att_weight_mcan', att=res)

Evaluation: [step 27987/27987]          (447793, 100)
CPU times: user 22min 34s, sys: 5min 49s, total: 28min 24s
Wall time: 24min 59s


In [8]:
# Re-assemble and re-save the attention array from the per-batch list.
# NOTE(review): this repeats the last three statements of the extraction cell;
# it is only useful if that cell was interrupted before saving.
res = np.concatenate(atts, axis=0)
print(res.shape)
np.savez_compressed('att_weight_mcan', att=res)

(447793, 100)
