In [1]:
import numpy
import pickle
import random
import json
import os
from collections import Counter
from scipy.stats import mode
import pandas as pd

from dataset_utils.text_processing import tokenize

## Store implications in imdb

In [None]:
# Locations of the original imdb files, of the implication-augmented output,
# and the dataset split this section works on.
in_dir = "data/imdb/"
out_dir = "data/imdb_imps/"
file = "minival"

In [None]:
# Load the pre-computed implications and the original imdb for the split.
# SECURITY: pickle.load / allow_pickle=True can execute arbitrary code while
# loading; only point these at files you produced yourself.
with open('data/imdb_imps/vqa_'+file+'_imps.pkl', 'rb') as imps_file:
    # Context manager closes the handle deterministically (the original
    # open(...) call was never closed).
    imps = pickle.load(imps_file)
imdb = numpy.load(in_dir+'imdb_'+file+'2014.npy', allow_pickle=True)

In [None]:
# Rule name attached to an implication -> implication category.
source_map = dict([
    ('ans=0 implies none', 'logeq'),
    ('ans>0 implies some', 'necessary_condition'),
    ('color mutex', 'mutex'),
    ('color_in_answer_must_be_in_picture', 'necessary_condition'),
    ('n+1', 'mutex'),
    ('noun_in_answer_must_be_in_picture', 'necessary_condition'),
    ('remove_modifier', 'necessary_condition'),
    ('subjectyes', 'logeq'),
    ('what', 'logeq'),
    ('where', 'logeq'),
    ('whereprep', 'logeq'),
    ('wordnet mutex', 'mutex'),
    ('wordnet_adj_mutex', 'mutex'),
    ('xory_no', 'mutex'),
    ('xory_yes', 'logeq'),
    ('yeseqcount', 'logeq'),
])

# Implication category -> one-hot vector (logeq, necessary_condition, mutex).
mp = {
    name: [1 if slot == hot else 0 for slot in range(3)]
    for hot, name in enumerate(('logeq', 'necessary_condition', 'mutex'))
}

In [None]:
# Mark every entry (the first element of imdb is skipped) with 'is_imps':
# False as soon as one of its answers has an empty implication list.
# The implications are attached only to entries flagged True.
for entry in imdb[1:]:
    entry_imps = imps[entry['question_id']]
    has_all = not any(len(imp_list) == 0 for imp_list in entry_imps.values())
    entry['is_imps'] = has_all
    if has_all:
        entry['qa_implications'] = entry_imps

In [None]:
# Turn each entry's implications into parallel per-answer lists:
#   qa_tokens[answer]  -> tokenized implied questions
#   qa_answers[answer] -> expected answers of those questions
#   imp_type[answer]   -> one-hot implication category (only when is_imps)
for entry in imdb[1:]:
    if not entry['is_imps']:
        # No implications: fall back to the original question, once per
        # distinct valid answer, expecting that same answer.
        distinct_answers = set(entry['valid_answers'])
        entry['qa_tokens'] = {
            ans: [entry['question_tokens']] for ans in distinct_answers
        }
        entry['qa_answers'] = {ans: [ans] for ans in distinct_answers}
        continue

    qa = entry['qa_implications']
    entry['qa_tokens'] = {
        ans: [tokenize(imp[0]) for imp in imp_list]
        for ans, imp_list in qa.items()
    }
    entry['qa_answers'] = {
        ans: [imp[1] for imp in imp_list]
        for ans, imp_list in qa.items()
    }
    entry['imp_type'] = {
        ans: [mp[source_map[imp[2]]] for imp in imp_list]
        for ans, imp_list in qa.items()
    }
    # The raw implication tuples are no longer needed once expanded.
    entry.pop('qa_implications', None)

In [None]:
# Persist the augmented imdb. The payload is written with pickle even though
# the file name ends in .npy.
with open(out_dir+'imdb_'+file+'2014.npy', 'wb') as out_file:
    # The context manager flushes and closes the handle; the original
    # open(...) call left it to the garbage collector.
    pickle.dump(imdb, out_file)

## Add-ons: additional restrictions on the imdb

In [None]:
# Reload the augmented imdb, the untouched original imdb and the VQA v2
# annotations for the current split.
# SECURITY: allow_pickle=True unpickles the file contents; trusted files only.
imdb = numpy.load(out_dir+'imdb_'+file+'2014.npy', allow_pickle=True)
imdb_ori = numpy.load(in_dir+'imdb_'+file+'2014.npy', allow_pickle=True)
with open('orig_data/vqa_v2.0/v2_mscoco_'+file+'2014_annotations.json', 'rb') as ann_file:
    # Closed on exit; the original open(...) handle was never closed.
    q = json.load(ann_file)

In [None]:
# tokenize and delete previous keys
#
# Builds qa_tokens / qa_answers for every entry. Entries flagged is_imps whose
# 'qa_implications' key is already gone are left untouched: the first section
# of this notebook pops that key before saving the file that is loaded here,
# so an unconditional lookup raises KeyError on such data.

for entry in imdb[1:]:
    if entry['is_imps']:
        qa = entry.get('qa_implications')
        if qa is None:
            # Already expanded in an earlier pass; keep the existing lists.
            continue
        entry['qa_tokens'] = {}
        entry['qa_answers'] = {}
        for key in qa.keys():
            entry['qa_tokens'][key] = []
            entry['qa_answers'][key] = []
            for imp in qa[key]:
                # Only `tokenize` is imported from dataset_utils.text_processing;
                # the former `text_processing.tokenize` was a NameError.
                entry['qa_tokens'][key].append(tokenize(imp[0]))
                entry['qa_answers'][key].append(imp[1])
        entry.pop('qa_implications', None)

    else:
        # No implications: one (question, answer) pair per distinct valid answer.
        entry['qa_tokens'] = {}
        entry['qa_answers'] = {}
        for key in set(entry['valid_answers']):
            entry['qa_tokens'][key] = [entry['question_tokens']]
            entry['qa_answers'][key] = [key]

In [None]:
# For entries without implications, replace the expected answer of every key
# other than 'yes'/'no' with the most frequent valid answer.
#
# The vote uses collections.Counter instead of scipy.stats.mode(...)[0][0]:
# recent SciPy releases reject non-numeric input and return scalars by
# default, so the old expression fails on string answers. Ties resolve to the
# smallest answer, which is what scipy's mode returned.
for entry in imdb[1:]:
    if entry['is_imps']:
        continue
    for key in entry['qa_answers']:
        if key in ('yes', 'no'):
            continue
        counts = Counter(entry['valid_answers'])
        # Highest count first, then lexicographically smallest answer.
        majority = min(counts, key=lambda ans: (-counts[ans], ans))
        entry['qa_answers'][key] = [majority]

In [None]:
# Lookup table: question id -> answer type, taken from the annotations.
qmap = {ann['question_id']: ann['answer_type'] for ann in q['annotations']}

In [None]:
# Positions (the first element is never considered) of entries that have no
# implications and whose answer type is not 'yes/no'; these are removed.
idx = [
    pos
    for pos in range(1, len(imdb))
    if not imdb[pos]['is_imps'] and qmap[imdb[pos]['question_id']] != 'yes/no'
]

imdb = numpy.delete(imdb, idx)

In [None]:

# Sanity check: print any entry that still has no implications and is not a
# yes/no question (presumably nothing should be printed after the deletion).
for entry in imdb[1:]:
    if entry['is_imps'] or qmap[entry['question_id']] == 'yes/no':
        continue
    print(entry)

In [None]:
# delete questions w/o any implications for all valid answers
# The 'is_imps' flag is removed from every scanned entry, kept or dropped.

idx = []
for pos in range(1, len(imdb)):
    entry = imdb[pos]
    if not entry.pop('is_imps'):
        idx.append(pos)

imdb = numpy.delete(imdb, idx)

In [None]:
# delete questions w/o implications for any valid answers
# An entry is dropped as soon as one of its qa_answers lists is empty.

idx = [
    pos
    for pos in range(1, len(imdb))
    if any(len(answers) == 0 for answers in imdb[pos]['qa_answers'].values())
]

imdb = numpy.delete(imdb, idx)

In [None]:
# Overwrite the augmented imdb with the filtered version (pickle payload,
# .npy file name).
with open(out_dir+'imdb_'+file+'2014.npy', 'wb') as out_file:
    # Context manager guarantees the data is flushed and the handle closed.
    pickle.dump(imdb, out_file)

## Dataset analysis

In [None]:
# Configuration and inputs for the dataset analysis (train split).
in_dir = 'data/imdb/'
out_dir = 'data/imdb_imps/'
file = 'train'

# SECURITY: allow_pickle=True unpickles the file contents; trusted files only.
imdb = numpy.load(out_dir+'imdb_'+file+'2014.npy', allow_pickle=True)
imdb_ori = numpy.load(in_dir+'imdb_'+file+'2014.npy', allow_pickle=True)
with open('orig_data/vqa_v2.0/v2_mscoco_'+file+'2014_annotations.json', 'rb') as ann_file:
    # Closed on exit; the original open(...) handle was never closed.
    q = json.load(ann_file)

In [None]:
# question_types: answer type -> list of question ids having that type.
# qmap:           question id -> answer type.
question_types = {}
qmap = {}

for ann in q['annotations']:
    qid, atype = ann['question_id'], ann['answer_type']
    qmap[qid] = atype
    question_types.setdefault(atype, []).append(qid)

In [None]:
# Report how many annotated questions fall into each answer type.
n_annotations = len(q['annotations'])
print('Stats original vqa2.0')
print('Total number of questions: %d' % n_annotations)
for atype, qids in question_types.items():
    share = 100*len(qids)/n_annotations
    print('%s lenght: %d percentage: %.2f' % (atype, len(qids), share))

In [None]:
# Same grouping as question_types, restricted to the questions that are still
# present in the augmented imdb (its first element is skipped).
updated_question_types = {}

for entry in imdb[1:]:
    qid = entry['question_id']
    updated_question_types.setdefault(qmap[qid], []).append(qid)

In [None]:
# Report the answer-type distribution of the augmented imdb.
print('Stats new dataset:')
# The tally above iterates imdb[1:], so the first element must not be counted
# in the denominator either; otherwise the percentages do not add up to 100.
n_questions = len(imdb) - 1
for atype, qids in updated_question_types.items():
    print('%s lenght: %d percentage: %.2f' % (atype, len(qids), 100*len(qids)/n_questions))

## Extract vocab

In [None]:
# Question files the vocabulary is extracted from.
input_files = [
    'orig_data/vqa_v2.0/v2_OpenEnded_mscoco_train2014_questions.json',
    'orig_data/vqa_v2.0/v2_OpenEnded_mscoco_val2014_questions.json',
    'orig_data/vqa_v2.0/v2_OpenEnded_mscoco_test2015_questions.json',
]
# Directory and file name of the vocabulary file, and the minimum count a
# word needs to be kept.
out_dir = '../'
min_freq = 0
vocab_file_name = 'vocabulary_vqa.txt'

In [None]:
# Start a fresh word counter and gather the questions of all input files.
word_count = Counter()
questions = []

for input_file in input_files:
    # JSON text is UTF-8; pass the encoding explicitly so the result does not
    # depend on the platform's locale encoding.
    with open(input_file, 'r', encoding='utf-8') as f:
        questions.extend(json.load(f)['questions'])

In [None]:
# Tokenize every question, recording its length and updating the word counts.
question_length = []
for question in questions:
    words = tokenize(question['question'])
    question_length.append(len(words))
    word_count.update(words)

# Alphabetically sorted words that occur at least min_freq times, with the
# unknown-word marker in front.
vocabulary = ['<unk>'] + sorted(
    word for word, freq in word_count.items() if freq >= min_freq
)

len(vocabulary)  # from original dataset!

In [None]:
files = ['train','val2train','minival'] #from implications
questions_imps = []

# Collect the question text (first element) of every implication that belongs
# to a question of the original imdb, over all listed splits.
# The loop variable is deliberately not called `file`, so the notebook-wide
# `file` setting is not overwritten by this cell.
for split in files:
    # SECURITY: pickle / allow_pickle execute code on load; trusted files only.
    with open('data/imdb_imps/vqa_'+split+'_imps.pkl', 'rb') as imps_file:
        # Closed on exit; the original open(...) leaked one handle per split.
        imps = pickle.load(imps_file)
    imdb_ori = numpy.load('data/imdb/imdb_'+split+'2014.npy', allow_pickle=True)

    for entry in imdb_ori[1:]:
        for imp_list in imps[entry['question_id']].values():
            questions_imps.extend(imp[0] for imp in imp_list)

In [None]:
# Add the words of the implication questions to the running counts.
for imp_question in questions_imps:
    word_count.update(tokenize(imp_question))

# Rebuild the vocabulary from the combined counts.
vocabulary = ['<unk>'] + sorted(
    word for word, freq in word_count.items() if freq >= min_freq
)

In [None]:
# Number of implication questions vs. number of original questions.
len(questions_imps),len(questions)

In [None]:
# Write the vocabulary to disk, one word per line.
vocab_file = os.path.join(out_dir, vocab_file_name)
with open(vocab_file, 'w') as f:
    f.writelines(word + '\n' for word in vocabulary)

In [None]:
# Vocabulary size after adding the implication questions.
len(vocabulary)

In [None]:
# First ten vocabulary entries ('<unk>' followed by the first sorted words).
vocabulary[:10]

## Create implications imdb (for augmentation)

In [None]:
# Configuration and inputs for building the implications-only imdb.
in_dir = 'data/imdb/'
out_dir = 'data/imdb_imps/'
file = 'val2train'

# SECURITY: pickle / allow_pickle execute code on load; trusted files only.
with open(out_dir+'vqa_'+file+'_imps.pkl', 'rb') as imps_file:
    # Closed on exit; the original open(...) handle was never closed.
    imps = pickle.load(imps_file)
imdb = numpy.load(out_dir+'imdb_'+file+'2014.npy', allow_pickle=True)

In [None]:
# Build an imdb that contains one entry per implication of each question's
# majority answer; the first element of the source imdb is copied over as is.
imdb_just_imps = [imdb[0].copy()]

for entry in imdb[1:]:
    # The filtering section pops 'is_imps' from every entry and removes all
    # entries without implications before saving, so a missing flag is
    # treated as True instead of raising KeyError.
    if not entry.get('is_imps', True):
        continue

    # Majority valid answer. Counter replaces scipy.stats.mode(...)[0][0],
    # which fails on string input with recent SciPy; ties resolve to the
    # smallest answer, as scipy's mode did.
    counts = Counter(entry['valid_answers'])
    vans = min(counts, key=lambda ans: (-counts[ans], ans))

    # Loop names avoid `q`, which holds the annotations elsewhere in the notebook.
    for tokens, answer, implied in zip(entry['qa_tokens'][vans],
                                       entry['qa_answers'][vans],
                                       imps[entry['question_id']][vans]):
        cp = entry.copy()
        cp.pop('qa_tokens', None)
        cp.pop('qa_answers', None)
        cp.pop('is_imps', None)
        cp['question_str'] = implied[0]
        cp['question_tokens'] = tokens
        # Ten copies of the implied answer in both answer lists.
        cp['valid_answers'] = [answer for _ in range(10)]
        cp['all_answers'] = [answer for _ in range(10)]
        imdb_just_imps.append(cp)

In [None]:
# Save the implications-only imdb next to the original imdb files (pickle
# payload, .npy file name).
with open(in_dir+'imdb_just_imps'+file+'2014.npy', 'wb') as out_file:
    # Context manager guarantees the data is flushed and the handle closed.
    pickle.dump(imdb_just_imps, out_file)

In [None]:
# Size of the implications-only imdb vs. the imdb it was built from.
len(imdb_just_imps),len(imdb)

In [None]:
# First element, copied unchanged from imdb[0].
imdb_just_imps[0]

## Data for manual annotation

In [None]:
# Load the validation imdb that the manual-annotation sample is drawn from.
in_dir = 'data/imdb/'
imdb_v = numpy.load(os.path.join(in_dir, 'imdb_val2014.npy'), allow_pickle=True)

In [None]:
# Load the validation annotations and map question id -> answer type.
with open('orig_data/vqa_v2.0/v2_mscoco_val2014_annotations.json', 'rb') as ann_file:
    # Closed on exit; the original open(...) handle was never closed.
    q_v = json.load(ann_file)

qmap = {ann['question_id']: ann['answer_type'] for ann in q_v['annotations']}

In [None]:
# Question ids already present in imdb_man (its first element is skipped).
# NOTE(review): imdb_man is only created further down in this notebook, so
# this cell fails on a fresh top-to-bottom run — confirm the intended source.
_set = {entry['question_id'] for entry in imdb_man[1:]}

In [None]:
# Number of distinct question ids collected above.
len(_set)

In [None]:
# Candidate questions for manual annotation: not of answer type 'yes/no' and
# not among the ids collected in _set.
imdb = [
    entry
    for entry in imdb_v[1:]
    if qmap[entry['question_id']] != 'yes/no' and entry['question_id'] not in _set
]

In [None]:
# Draw 10000 candidates without replacement; the fixed seed makes the draw
# repeatable.
random.seed(42)
sel = random.sample(imdb, k=10000)

In [None]:
# One row per sampled question: id, question text and majority answer.
# Questions whose majority answer is 'unknown' or '<unk>' are skipped.
data = []
for entry in sel:
    # Counter replaces scipy.stats.mode(...)[0][0], which fails on string
    # input with recent SciPy; ties resolve to the smallest answer, as
    # scipy's mode did.
    counts = Counter(entry['valid_answers'])
    answer = min(counts, key=lambda ans: (-counts[ans], ans))
    if answer in ('unknown', '<unk>'):
        continue
    data.append({
        'qid': entry['question_id'],
        'question': entry['question_str'],
        'answer': answer,
    })

In [None]:
# Annotation sheet: the sampled rows plus three empty columns to be filled in
# by the annotators.
df = pd.DataFrame(data).assign(Logeq="", Necc="", Mutex="")

In [None]:
# Split the sheet into 8 "days" of 1200 rows, each written as three files of
# 400 rows named <day>_1.xlsx, <day>_2.xlsx and <day>_3.xlsx.
direc = 'manualAnnotations_new/'
for day in numpy.arange(8):
    day_start = 1200*day
    for offset, suffix in ((0, '_1.xlsx'), (400, '_2.xlsx'), (800, '_3.xlsx')):
        first_row = day_start + offset
        df[first_row:first_row+400].to_excel(direc+str(day+1)+suffix)

In [None]:
# Also write the complete, unsplit sheet as CSV into the same directory.
df.to_csv(direc+'all.csv')

## manual to imdb

In [2]:
# Read the annotated sheet; the first column of the file becomes the index.
data = pd.read_excel('manAnnot/QA_from_val.xlsx',index_col=0)

In [4]:
# Validation imdb; its first element is reused below as the first element of imdb_man.
# SECURITY: allow_pickle=True unpickles the file contents; trusted files only.
imdb_val = numpy.load('data/imdb/imdb_val2014.npy',allow_pickle=True)

In [3]:
# Show the loaded annotation sheet.
data

Unnamed: 0,Logeq,Mutex,Necc,answer,qid,question
0,is this dude using knife to cut the cake?,is this dude using a hammer to cut the cake?,is there any cake in the picture?,knife,340069003,This bored dude is using what instrument to cu...
3,is a city bus pictured?,is the type of bus pictured a tourist bus?,is there a bus pictured?,city bus,188817001,What type of bus is pictured?
8,are there logs on the ground behind the giraffe?,is there spoon laying on the ground behind the...,is there a giraffe?,logs,573778003,What is laying on the ground behind the giraffe?
10,is the horse brown?,is the horse black?,is there a horse?,brown,405135001,What color is the horse?
11,is the surfboard green?,is the surfboard yellow?,is there a surfboard?,green,187362006,What color is the surfboard?
...,...,...,...,...,...,...
8395,is the man talking into a microphone?,is the man talking into a phone?,is the man talking into anything?,microphone,196742000,What is the man talking into?
8396,is a bike in between the trees and the parking...,is a car in between the trees and the parking ...,is anything in between the trees and the parki...,bike,152771012,What is in between the trees and the parking s...
8397,"is the gray, white, and black item in the righ...","is the gray, white, and black item in the righ...","is there a gray, white, and black item in the ...",backpack,357604002,"What is the gray, white, and black item in the..."
8398,are 4 birds on the posts?,are 5 birds on the posts?,are birds on the posts?,4,553790006,How many birds are on the posts?


In [9]:
# Build an imdb from the manual annotations: every annotated row yields three
# yes/no entries, one per implication column.
imdb_man = [imdb_val[0].copy()]

for _, row in data.iterrows():
    qid = row['qid']
    # The image id is the question id without its last three digits.
    image_id = int(qid/1000)
    image_name = 'COCO_val2014_'+str(image_id).zfill(12)

    # Fields shared by the three entries derived from this row.
    shared = {
        'image_name': image_name,
        'image_id': image_id,
        'feature_path': image_name+'.npy',
    }

    # New question id = original id followed by 1 (Logeq), 2 (Necc) or 3 (Mutex).
    pairs = zip(['Logeq','Necc','Mutex'], ['yes','yes','no'])
    for suffix, (column, answer) in enumerate(pairs, start=1):
        imdb_man.append(dict(
            shared,
            question_id=qid*10+suffix,
            question_str=row[column],
            question_tokens=tokenize(row[column]),
            valid_answers=[answer]*10,
            all_answers=[answer]*10,
        ))

In [10]:
# Save the manually annotated imdb (pickle payload, .npy file name).
with open('data/imdb_manual/imdb_val2014.npy', 'wb') as out_file:
    # Context manager guarantees the data is flushed and the handle closed.
    pickle.dump(imdb_man, out_file)

In [12]:
# First generated entry (index 0 holds the element copied from imdb_val).
imdb_man[1]

{'image_name': 'COCO_val2014_000000340069',
 'image_id': 340069,
 'feature_path': 'COCO_val2014_000000340069.npy',
 'question_id': 3400690031,
 'question_str': 'is this dude using knife to cut the cake?',
 'question_tokens': ['is',
  'this',
  'dude',
  'using',
  'knife',
  'to',
  'cut',
  'the',
  'cake'],
 'valid_answers': ['yes',
  'yes',
  'yes',
  'yes',
  'yes',
  'yes',
  'yes',
  'yes',
  'yes',
  'yes'],
 'all_answers': ['yes',
  'yes',
  'yes',
  'yes',
  'yes',
  'yes',
  'yes',
  'yes',
  'yes',
  'yes']}

## Playground

In [6]:
# Count validation questions whose majority valid answer is 'unknown' or '<unk>'.
c = 0
for entry in imdb_val[1:]:
    # Counter replaces scipy.stats.mode(...)[0][0], which fails on string
    # input with recent SciPy; ties resolve to the smallest answer, as
    # scipy's mode did.
    counts = Counter(entry['valid_answers'])
    majority = min(counts, key=lambda ans: (-counts[ans], ans))
    if majority in ('unknown', '<unk>'):
        c += 1
c

4426

In [7]:
# Fraction of validation entries whose majority answer is unknown.
# NOTE(review): the count above skips imdb_val[0] while this denominator
# includes it — confirm whether len(imdb_val) - 1 was intended.
c/len(imdb_val)

0.02064799048307714