In [1]:
import numpy
import pickle
import random
import json
import os
from collections import Counter
from scipy.stats import mode


from dataset_utils.text_processing import tokenize


## Store implications in imdb

In [None]:
# Dataset split to process; it is part of both the imdb and the implications file names.
file = 'minival'
# Directory of the original imdb files / directory of the implication files.
in_dir, out_dir = 'data/imdb/', 'data/imdb_imps/'

In [None]:
# Implications for this split; they are looked up by question_id further down.
# NOTE(review): unpickling runs arbitrary code -- only load files you trust.
with open(out_dir+'vqa_'+file+'_imps.pkl','rb') as imps_file:
    imps = pickle.load(imps_file)  # context manager closes the handle
# Original imdb; allow_pickle is required because its entries are Python dicts.
imdb = numpy.load(in_dir+'imdb_'+file+'2014.npy',allow_pickle=True)

In [None]:
# Flag every entry after index 0: 'is_imps' is True only when none of the
# per-answer implication lists stored for its question is empty. The
# implications are attached to the entry only in that case.
for entry in imdb[1:]:
    entry_imps = imps[entry['question_id']]
    entry['is_imps'] = all(len(v) != 0 for v in entry_imps.values())
    if entry['is_imps']:
        entry['qa_implications'] = entry_imps

In [None]:
# Convert the raw implications into two parallel per-answer dicts:
#   qa_tokens[answer]  -> list of tokenized implied questions
#   qa_answers[answer] -> list of the answers to those implied questions
for i in imdb[1:]:
    if i['is_imps']:
        qa = i['qa_implications']
        i['qa_tokens'] = {}
        i['qa_answers'] = {}
        for key, implications in qa.items():
            # Each implication is indexed as (question string, answer).
            # `tokenize` is imported by name; the `text_processing` module is
            # never bound in this notebook, so it cannot be referenced here.
            i['qa_tokens'][key] = [tokenize(imp[0]) for imp in implications]
            i['qa_answers'][key] = [imp[1] for imp in implications]
        # The raw implications are no longer needed once converted.
        i.pop('qa_implications', None)
    else:
        # No usable implications: fall back to the original question tokens,
        # paired with each distinct valid answer.
        i['qa_tokens'] = {}
        i['qa_answers'] = {}
        for key in set(i['valid_answers']):
            i['qa_tokens'][key] = [i['question_tokens']]
            i['qa_answers'][key] = [key]

In [None]:
# Written with pickle although the name ends in .npy; the cells that read it
# back use numpy.load(..., allow_pickle=True).
with open(out_dir+'imdb_'+file+'2014.npy','wb') as imdb_out:
    pickle.dump(imdb, imdb_out)  # handle is flushed and closed on exit

## Add-ons: apply several restrictions to the imdb

In [None]:
# Implication-augmented imdb, the original imdb, and the VQA v2 annotations
# (the annotations provide the answer_type of every question).
imdb = numpy.load(out_dir+'imdb_'+file+'2014.npy',allow_pickle=True)
imdb_ori = numpy.load(in_dir+'imdb_'+file+'2014.npy',allow_pickle=True)
with open('orig_data/vqa_v2.0/v2_mscoco_'+file+'2014_annotations.json','rb') as ann_file:
    q = json.load(ann_file)  # context manager closes the handle

In [None]:
# Tokenize the stored implications and drop the raw 'qa_implications' key.
# Result per entry:
#   qa_tokens[answer]  -> list of tokenized implied questions
#   qa_answers[answer] -> list of the answers to those implied questions

for i in imdb[1:]:
    if i['is_imps']:
        if 'qa_implications' not in i and 'qa_tokens' in i and 'qa_answers' in i:
            # Already converted: the conversion pops 'qa_implications', so an
            # imdb that was saved afterwards no longer carries it. Keep the
            # stored result instead of failing with a KeyError.
            continue
        qa = i['qa_implications']
        i['qa_tokens'] = {}
        i['qa_answers'] = {}
        for key, implications in qa.items():
            # Each implication is indexed as (question string, answer).
            # `tokenize` is imported by name; the `text_processing` module is
            # never bound in this notebook, so it cannot be referenced here.
            i['qa_tokens'][key] = [tokenize(imp[0]) for imp in implications]
            i['qa_answers'][key] = [imp[1] for imp in implications]
        i.pop('qa_implications', None)
    else:
        # No usable implications: fall back to the original question tokens,
        # paired with each distinct valid answer.
        i['qa_tokens'] = {}
        i['qa_answers'] = {}
        for key in set(i['valid_answers']):
            i['qa_tokens'][key] = [i['question_tokens']]
            i['qa_answers'][key] = [key]

In [None]:
# For entries without implications, every answer key other than 'yes'/'no' is
# mapped to the most frequent valid answer of the question.
for i in imdb[1:]:
    if i['is_imps']:
        continue
    other_keys = [key for key in i['qa_answers'] if key not in ['yes','no']]
    if not other_keys:
        continue
    # Majority vote over the answer strings. scipy.stats.mode is not used:
    # recent SciPy releases reject non-numeric input and return scalars that
    # cannot be indexed with [0][0]. Ties go to the smallest answer, which is
    # the value mode() used to return.
    counts = Counter(i['valid_answers'])
    top = max(counts.values())
    majority = min(ans for ans, n in counts.items() if n == top)
    for key in other_keys:
        i['qa_answers'][key] = [majority]

In [None]:
# question_id -> answer_type lookup built from the annotation records
qmap = {ann['question_id']: ann['answer_type'] for ann in q['annotations']}

In [None]:
# Positions (index 0 is never considered) of entries that have no implications
# and whose annotated answer type is not 'yes/no'; those entries are removed.
idx = [
    pos for pos in range(1, len(imdb))
    if not imdb[pos]['is_imps'] and qmap[imdb[pos]['question_id']] != 'yes/no'
]

imdb = numpy.delete(imdb, idx)

In [None]:

# Sanity check: print any remaining entry that has no implications and whose
# answer type is not 'yes/no'.
leftovers = (
    entry for entry in imdb[1:]
    if not entry['is_imps'] and qmap[entry['question_id']] != 'yes/no'
)
for entry in leftovers:
    print(entry)

In [None]:
# Remove every entry whose 'is_imps' flag is False and strip the flag from all
# entries after index 0 (pop hands back the flag while deleting it).

idx = [pos for pos in range(1, len(imdb)) if not imdb[pos].pop('is_imps')]

imdb = numpy.delete(imdb, idx)

In [None]:
# Remove entries in which at least one answer key has an empty qa_answers list

idx = [
    pos for pos in range(1, len(imdb))
    if any(len(answers) == 0 for answers in imdb[pos]['qa_answers'].values())
]

imdb = numpy.delete(imdb, idx)

In [None]:
# Written with pickle although the name ends in .npy; it is read back with
# numpy.load(..., allow_pickle=True).
with open(out_dir+'imdb_'+file+'2014.npy','wb') as imdb_out:
    pickle.dump(imdb, imdb_out)  # handle is flushed and closed on exit

## Dataset analysis

In [None]:
# Paths and split used by the analysis cells.
in_dir = 'data/imdb/'
out_dir = 'data/imdb_imps/'
file = 'train'

# Implication-augmented imdb, original imdb and the VQA v2 annotations.
imdb = numpy.load(out_dir+'imdb_'+file+'2014.npy',allow_pickle=True)
imdb_ori = numpy.load(in_dir+'imdb_'+file+'2014.npy',allow_pickle=True)
with open('orig_data/vqa_v2.0/v2_mscoco_'+file+'2014_annotations.json','rb') as ann_file:
    q = json.load(ann_file)  # context manager closes the handle

In [None]:
# qmap:           question_id -> answer_type
# question_types: answer_type -> list of question_ids, in annotation order
question_types = {}
qmap = {}

for ann in q['annotations']:
    atype, qid = ann['answer_type'], ann['question_id']
    qmap[qid] = atype
    question_types.setdefault(atype, []).append(qid)

In [None]:
# Count and share of each answer type among the original annotations.
n_annotations = len(q['annotations'])
print('Stats original vqa2.0')
print('Total number of questions: %d' %(n_annotations))
for key, qids in question_types.items():
    print('%s lenght: %d percentage: %.2f' % (key, len(qids), 100*len(qids)/n_annotations))

In [None]:
# answer_type -> question_ids of the entries kept in the filtered imdb
# (index 0 is skipped)
updated_question_types = {}

for ann in imdb[1:]:
    qid = ann['question_id']
    atype = qmap[qid]
    updated_question_types.setdefault(atype, []).append(qid)

In [None]:
# Count and share of each answer type in the filtered dataset. The denominator
# is the number of questions that were actually grouped: imdb[0] is skipped
# when grouping, so dividing by len(imdb) would make every percentage too small.
n_questions = sum(len(qids) for qids in updated_question_types.values())
print('Stats new dataset:')
for key, qids in updated_question_types.items():
    print('%s lenght: %d percentage: %.2f' % (key, len(qids), 100*len(qids)/n_questions))

## Extract vocab

In [7]:
# Name of the vocabulary file and the directory it is written to.
vocab_file_name = 'vocabulary_vqa.txt'
out_dir = '../'
# A token is kept when its count is >= min_freq.
min_freq = 0
# Question files whose questions feed the vocabulary.
input_files = [
    'orig_data/vqa_v2.0/v2_OpenEnded_mscoco_train2014_questions.json',
    'orig_data/vqa_v2.0/v2_OpenEnded_mscoco_val2014_questions.json',
    'orig_data/vqa_v2.0/v2_OpenEnded_mscoco_test2015_questions.json',
]

In [8]:
# Token counter shared by the vocabulary cells, and the concatenated question
# records of all input files.
word_count = Counter()
questions = []

# Iterate the paths directly: the enumerate index was never used and only
# overwrote the notebook-level `idx` variable.
for input_file in input_files:
    with open(input_file, 'r') as f:
        questions.extend(json.load(f)['questions'])

In [15]:
# Tokenize every question, record its token count and accumulate token
# frequencies in word_count.
question_length = []
for question in questions:
    words = tokenize(question['question'])
    question_length.append(len(words))
    word_count.update(words)

# Sorted tokens that reach min_freq, with '<unk>' placed first.
vocabulary = ['<unk>'] + sorted(word for word, count in word_count.items() if count >= min_freq)

len(vocabulary) #from original dataset!

18416

In [10]:
# Gather the question strings of every implication, over all splits.
files = ['train','val2train','minival'] #from implications
questions_imps = []

for file in files:
    # Context manager so the pickle handle of each split is closed.
    with open('data/imdb_imps/vqa_'+file+'_imps.pkl','rb') as imps_file:
        imps = pickle.load(imps_file)
    imdb_ori = numpy.load('data/imdb/imdb_'+file+'2014.npy',allow_pickle=True)

    for i in imdb_ori[1:]:
        key = i['question_id']
        # imps[key] maps to lists of implications; element 0 of an
        # implication is the implied question string.
        questions_imps.extend(imp[0] for v in imps[key].values() for imp in v)

In [17]:
# Add the tokens of the implied questions to the running counts, then rebuild
# the vocabulary from the combined counts.
for implied_question in questions_imps:
    word_count.update(tokenize(implied_question))

vocabulary = ['<unk>'] + sorted(word for word, count in word_count.items() if count >= min_freq)

In [12]:
# (number of implied question strings, number of original question records)
len(questions_imps),len(questions)

(1982279, 1105904)

In [20]:
# Write the vocabulary, one entry per line.
vocab_file = os.path.join(out_dir, vocab_file_name)
with open(vocab_file, 'w') as f:
    f.write(''.join(entry + '\n' for entry in vocabulary))

In [18]:
# number of vocabulary entries, '<unk>' included
len(vocabulary)

18416

In [19]:
# first ten entries: '<unk>' followed by the first sorted tokens
vocabulary[:10]

['<unk>', '!', '!"', '!."', '"', '"  \'', '" & "', '" \'', '" -', '" <']

## Create implications imdb (for augmentation)

In [14]:
# Paths and split used to build the implications-only imdb.
in_dir = 'data/imdb/'
out_dir = 'data/imdb_imps/'
file = 'val2train'

# NOTE(review): unpickling runs arbitrary code -- only load files you trust.
with open(out_dir+'vqa_'+file+'_imps.pkl','rb') as imps_file:
    imps = pickle.load(imps_file)  # context manager closes the handle
imdb = numpy.load(out_dir+'imdb_'+file+'2014.npy',allow_pickle=True)

In [15]:
# Flat imdb made only of implied questions: a copy of the first record of
# imdb, followed by one entry per implication of each question's most
# frequent valid answer.
imdb_just_imps = [imdb[0].copy()]

for i in imdb[1:]:
    if not i['is_imps']:
        continue

    # Majority vote over the answer strings. scipy.stats.mode is not used:
    # recent SciPy releases reject non-numeric input and return scalars that
    # cannot be indexed with [0][0]. Ties go to the smallest answer, which is
    # the value mode() used to return.
    counts = Counter(i['valid_answers'])
    top = max(counts.values())
    vans = min(ans for ans, n in counts.items() if n == top)

    # `q_tokens` rather than `q`, so the annotations dict named `q` is not
    # overwritten by the loop variable.
    for q_tokens, a, implied in zip(i['qa_tokens'][vans], i['qa_answers'][vans], imps[i['question_id']][vans]):
        cp = i.copy()  # shallow copy of the source entry
        cp.pop('qa_tokens', None)
        cp.pop('qa_answers', None)
        cp.pop('is_imps', None)
        cp['question_str'] = implied[0]
        cp['question_tokens'] = q_tokens
        # The implied answer is repeated ten times in both answer lists.
        cp['valid_answers'] = [a] * 10
        cp['all_answers'] = [a] * 10
        imdb_just_imps.append(cp)

In [16]:
# Written with pickle although the name ends in .npy.
with open(in_dir+'imdb_just_imps'+file+'2014.npy','wb') as imps_out:
    pickle.dump(imdb_just_imps, imps_out)  # handle is flushed and closed on exit

In [12]:
# (records in the implications-only imdb, records in the loaded imdb)
len(imdb_just_imps),len(imdb)

(531092, 443758)

In [13]:
# first record: the copy of imdb[0]
imdb_just_imps[0]

{'create_time': '2018-03-29 16:39',
 'dataset_name': 'vqa',
 'version': 1,
 'has_answer': True,
 'has_gt_layout': False}

## playground