In [1]:
from pathlib import Path
# Project root is the parent of the current working directory.
root = Path.cwd().parent
# Input JSON files are read from this directory.
DATA_PATH = root / 'data'
# Generated artifacts (pickles, text dumps) are written to this directory.
RESOURCES = root / 'resources'

In [2]:
import os
import json
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
import pickle

# Step 0: Get all data of 2017

In [3]:
def create_id_list(jsonfile):
    """Return the set of distinct ``image_id`` values in a JSON annotation file.

    Args:
        jsonfile: Path of a JSON file whose top-level object has an
            ``'annotations'`` list; every entry must carry an ``'image_id'``.

    Returns:
        set: The unique image ids found in the file.
    """
    with open(jsonfile) as handle:
        payload = json.load(handle)
    return {entry['image_id'] for entry in payload['annotations']}
# Image ids of the two 2017 splits, de-duplicated by create_id_list and stored as lists.
train_id_2017 = list(create_id_list(str(DATA_PATH / 'train2017.json')))
val_id_2017 = list(create_id_list(str(DATA_PATH / 'val2017.json')))
# train2017.json and val2017.json belong to the COCO 2017 image-captioning dataset

# Step 1: Create all item question-answer

In [3]:
def merge_json(json1, json2, type_):
    """Concatenate the ``type_`` entries of two JSON files into one list.

    Args:
        json1: Path of the first JSON file.
        json2: Path of the second JSON file.
        type_: Top-level key to read from both files (e.g. ``'annotations'``).

    Returns:
        list: Entries under ``type_`` from ``json1`` followed by those from
        ``json2``.
    """
    merged = []
    # Files are handled one after the other so json1's entries come first.
    for source in (json1, json2):
        with open(source) as handle:
            content = json.load(handle)
        merged.extend(content[type_])
    return merged

In [4]:
# Train and val annotation entries combined into one list (train entries first).
full_item_annotation = merge_json(str(DATA_PATH / 'train_annotation.json'), str(DATA_PATH / 'val_annotation.json'), type_='annotations')

In [6]:
# Train and val question entries combined into one list (train entries first).
full_item_question = merge_json(str(DATA_PATH / 'train_question.json'), str(DATA_PATH / 'val_question.json'), type_='questions')

# Step 2: Find the most frequent words and answers, then fit the label encoder

In [7]:
def get_top_words(full_item, ratio=None, top_k=None):
    """Rank the words used in answers by how often they occur.

    Every answer string is split on whitespace and each token is counted
    across all answers of all items.

    Args:
        full_item: Iterable of dicts with an ``'answers'`` list whose entries
            each have an ``'answer'`` string.
        ratio: Fraction of the distinct words to keep. Takes precedence over
            ``top_k`` when truthy.
        top_k: Maximum number of words to keep; used when ``ratio`` is not
            given (or is zero).

    Returns:
        list: ``(word, count)`` tuples sorted by descending count; ties keep
        first-seen order. When neither limit is given the complete ranking
        is returned.
    """
    counts = {}
    for item in full_item:
        for answer in item['answers']:
            for word in answer['answer'].split():
                counts[word] = counts.get(word, 0) + 1
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)

    if ratio:
        limit = int(len(ranked) * ratio)
    elif top_k is not None:
        limit = top_k
    else:
        # This path used to fall off the end and return None. Always return a
        # list instead: nothing for an explicit ratio of 0, otherwise everything.
        limit = 0 if ratio is not None else len(ranked)
    return ranked[:limit]

In [7]:
def get_top_answer(full_item, ratio=None, top_k=None):
    """Rank complete answer strings by how often they occur.

    Prints the number of distinct answers as a side effect.

    Args:
        full_item: Iterable of dicts with an ``'answers'`` list whose entries
            each have an ``'answer'`` value.
        ratio: Fraction of the distinct answers to keep. Takes precedence
            over ``top_k`` when truthy.
        top_k: Maximum number of answers to keep; used when ``ratio`` is not
            given (or is zero).

    Returns:
        list: ``(answer, count)`` tuples sorted by descending count; ties
        keep first-seen order. When neither limit is given the complete
        ranking is returned.
    """
    counts = {}
    for item in full_item:
        for answer in item['answers']:
            text = answer['answer']
            counts[text] = counts.get(text, 0) + 1
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    print("All unique answers: ", len(ranked))

    if ratio:
        limit = int(len(ranked) * ratio)
    elif top_k is not None:
        limit = top_k
    else:
        # This path used to fall off the end and return None. Always return a
        # list instead: nothing for an explicit ratio of 0, otherwise everything.
        limit = 0 if ratio is not None else len(ranked)
    return ranked[:limit]

In [8]:
# Keep only the answer strings of the 1000 most frequent answers.
top_answers = [
    answer
    for answer, _count in get_top_answer(full_item_annotation, ratio=None, top_k=1000)
]

All unique answers:  219875


In [17]:
# Words of the 1000 most frequent answer tokens.
# The original passed `full_item`, which no cell defines; get_top_words reads
# item['answers'], which the annotation items carry, so use them.
wc = get_top_words(full_item_annotation, top_k=1000)
wc = [k[0] for k in wc]

In [31]:
def dem_do_phu_tu(full_item, wc):
    """Measure the share of answers written entirely with words from ``wc``.

    Prints the covered count, the total count and the coverage percentage.

    Args:
        full_item: Iterable of dicts with an ``'answers'`` list whose entries
            each have an ``'answer'`` string.
        wc: Iterable of vocabulary words.

    Returns:
        float: Covered answers divided by all answers, or ``0.0`` when there
        are no answers at all.
    """
    vocabulary = set(wc)
    cnt = 0
    total = 0
    for item in tqdm(full_item):
        for answer in item['answers']:
            ans = answer['answer']
            total += 1
            # An answer with no tokens counts as covered: all() of nothing is True.
            if all(word in vocabulary for word in ans.split()):
                cnt += 1
    # Guard the division: an input without answers used to raise ZeroDivisionError.
    coverage = cnt / total if total else 0.0
    print('Count: ', cnt)
    print('Total answers: ', total)
    print('{}% of data use {} words'.format(coverage * 100, len(vocabulary)))
    return coverage

In [9]:
def dem_do_phu_ans(full_item, top_answer):
    """Measure the share of answers that appear in ``top_answer``.

    Prints the covered count, the total count and the coverage percentage.

    Args:
        full_item: Iterable of dicts with an ``'answers'`` list whose entries
            each have an ``'answer'`` value.
        top_answer: Iterable of answers considered covered.

    Returns:
        float: Covered answers divided by all answers, or ``0.0`` when there
        are no answers at all.
    """
    allowed = set(top_answer)
    cnt = 0
    total = 0
    for item in tqdm(full_item):
        for answer in item['answers']:
            total += 1
            if answer['answer'] in allowed:
                cnt += 1

    # Guard the division: an input without answers used to raise ZeroDivisionError.
    coverage = cnt / total if total else 0.0
    print('Count: ', cnt)
    print('Total answers: ', total)
    print('{}% of data use {} answers'.format(coverage * 100, len(allowed)))
    return coverage

In [9]:
# Word-level coverage of the annotation answers. The original passed
# `full_item`, which no cell defines.
dem_do_phu_tu(full_item_annotation, wc)

NameError: name 'dem_do_phu_tu' is not defined

In [10]:
# Fraction of all individual answers that fall inside top_answers.
dem_do_phu_ans(full_item_annotation, top_answers)

100%|██████████| 658111/658111 [00:01<00:00, 446401.74it/s]

Count:  5460952
Total answers:  6581110
82.97919347951941% of data use 1000 answers





0.8297919347951941

In [12]:
# Encoder that will be fitted on the answer strings in top_answers.
le = LabelEncoder()

In [13]:
# Fit the encoder on the retained answer strings.
le.fit(top_answers)

LabelEncoder()

In [15]:
# Spot-check that 'yes' is among the fitted classes.
'yes' in le.classes_

True

In [17]:
# Persist the fitted label encoder under RESOURCES.
with open(str(RESOURCES / 'le.pkl'), 'wb') as f:
    pickle.dump(le, f)

# Step 3: Create questionId-answer pairs, keeping only questions whose answer is in top_answers

In [24]:
# Collect the answer_type of every annotation and show the distinct values.
temp = [annotation['answer_type'] for annotation in full_item_annotation]
set(temp)

{'number', 'other', 'yes/no'}

In [22]:
# Inspect a single annotation record to see its fields.
full_item_annotation[9051]

{'answer_type': 'other',
 'answers': [{'answer': 'shower head',
   'answer_confidence': 'yes',
   'answer_id': 1},
  {'answer': 'shower and bathtub', 'answer_confidence': 'yes', 'answer_id': 2},
  {'answer': 'bathtub', 'answer_confidence': 'maybe', 'answer_id': 3},
  {'answer': 'shower', 'answer_confidence': 'yes', 'answer_id': 4},
  {'answer': 'shower', 'answer_confidence': 'yes', 'answer_id': 5},
  {'answer': 'paper', 'answer_confidence': 'yes', 'answer_id': 6},
  {'answer': 'bathtub', 'answer_confidence': 'yes', 'answer_id': 7},
  {'answer': 'person', 'answer_confidence': 'yes', 'answer_id': 8},
  {'answer': 'bathtub', 'answer_confidence': 'yes', 'answer_id': 9},
  {'answer': 'shower', 'answer_confidence': 'yes', 'answer_id': 10}],
 'image_id': 546179,
 'multiple_choice_answer': 'shower',
 'question_id': 546179002,
 'question_type': 'none of the above'}

In [19]:
def quesid_ans(full_item, top_answers, save_txt=False):
    """Map question ids to their ``multiple_choice_answer``, keeping top answers only.

    Args:
        full_item: Iterable of annotation dicts providing
            ``'multiple_choice_answer'`` and ``'question_id'``.
        top_answers: Iterable of answers that may be kept.
        save_txt: When true, also write the kept ``question_id<TAB>answer``
            lines to ``RESOURCES / 'questionId_answer.txt'``.

    Returns:
        dict: ``question_id -> answer`` for every kept item; when a question
        id repeats, the first answer seen is retained.
    """
    allowed = set(top_answers)
    kept_lines = []
    mapping = {}
    seen = 0
    for seen, record in enumerate(tqdm(full_item), start=1):
        chosen = record['multiple_choice_answer']
        if chosen not in allowed:
            continue
        qid = record['question_id']
        kept_lines.append("\t".join((str(qid), chosen)))
        if qid not in mapping:
            mapping[qid] = chosen

    if save_txt:
        text = "\n".join(kept_lines)
        with open(str(RESOURCES / 'questionId_answer.txt'), 'w') as out:
            out.write(text)

    # NOTE(review): the reported total is the item count times 10, which
    # presumably assumes ten answers per question; confirm.
    print("Total answers: ", seen * 10)
    print("Number of answer after filtering: ", len(kept_lines))
    return mapping

In [20]:
# Build the question_id -> answer mapping and also write it out as a text file.
questionId_answer = quesid_ans(full_item_annotation, top_answers, save_txt=True)

100%|██████████| 658111/658111 [00:01<00:00, 616035.34it/s]

Total answers:  6581110
Number of answer after filtering:  572794





In [21]:
# Sanity check: every stored answer must be one of the top answers.
# A set makes each membership test constant-time instead of scanning the list.
_top_answer_set = set(top_answers)
for key, val in tqdm(questionId_answer.items()):
    assert val in _top_answer_set, \
        "question {} has an answer outside top_answers: {!r}".format(key, val)

100%|██████████| 572794/572794 [00:01<00:00, 507560.40it/s]


In [22]:
# Persist the question_id -> answer mapping under RESOURCES.
with open(str(RESOURCES / 'questionID_answer.pkl'), 'wb') as f:
    pickle.dump(questionId_answer, f)

# Step 4: Create imageID_questionID

In [23]:
def imageId_question(full_item, questionId_answer, save_txt=False):
    """Group the kept question ids by the image they belong to.

    Args:
        full_item: Iterable of annotation dicts providing ``'image_id'`` and
            ``'question_id'``.
        questionId_answer: Mapping whose keys are the question ids to keep.
        save_txt: When true, also write ``image_id<TAB>question_id`` lines to
            ``RESOURCES / 'imageID_questionID.txt'``.

    Returns:
        dict: ``image_id -> list of kept question ids``. Every image id seen
        gets an entry, so the list is empty when none of its questions is kept.
    """
    kept_ids = set(questionId_answer.keys())
    grouped = {}
    rows = []
    for record in tqdm(full_item):
        img = record['image_id']
        # Register the image even if none of its questions passes the filter.
        bucket = grouped.setdefault(img, [])
        qid = record['question_id']
        if qid not in kept_ids:
            continue
        bucket.append(qid)
        rows.append('\t'.join((str(img), str(qid))))

    if save_txt:
        text = "\n".join(rows)
        with open(str(RESOURCES / 'imageID_questionID.txt'), 'w') as out:
            out.write(text)
    return grouped
            

In [24]:
# Build the image_id -> kept question ids mapping and also write it out as a text file.
imageId_quesID = imageId_question(full_item_annotation, questionId_answer, save_txt=True)

100%|██████████| 658111/658111 [00:02<00:00, 282982.40it/s]


In [25]:
# Number of distinct image ids seen in the annotations (including images with no kept question).
len(imageId_quesID)

123287

In [26]:
# Persist the image_id -> question ids mapping under RESOURCES.
with open(str(RESOURCES / 'imageID_questionID.pkl'), 'wb') as f:
    pickle.dump(imageId_quesID, f)

# Step 5: Split data according to train, val 2017

In [27]:
def create_2017_data(imageId_quesID, data2017):
    """Select the entries of ``imageId_quesID`` whose key appears in ``data2017``.

    Args:
        imageId_quesID: Mapping of image id to its list of question ids.
        data2017: Iterable of image ids belonging to one split.

    Returns:
        dict: ``image_id -> question ids`` for each id of ``data2017`` that
        exists in ``imageId_quesID``, in the order the ids are iterated.
    """
    return {
        image_id: imageId_quesID[image_id]
        for image_id in tqdm(data2017)
        if image_id in imageId_quesID
    }

In [28]:
# Restrict the image -> question ids mapping to the image ids of each 2017 split.
train2017 = create_2017_data(imageId_quesID, train_id_2017)
val2017 = create_2017_data(imageId_quesID, val_id_2017)

100%|██████████| 118287/118287 [00:00<00:00, 1142407.62it/s]
100%|██████████| 5000/5000 [00:00<00:00, 1028570.31it/s]


In [36]:
# Number of train-split image ids that have an entry in imageId_quesID.
len(train2017)

118287

In [37]:
# Number of val-split image ids that have an entry in imageId_quesID.
len(val2017)

5000

In [29]:
# Persist the train split mapping under RESOURCES.
with open(str(RESOURCES / 'train2017.pkl'), 'wb') as f:
    pickle.dump(train2017, f)
    
# Persist the val split mapping under RESOURCES.
with open(str(RESOURCES / 'val2017.pkl'), 'wb') as p:
    pickle.dump(val2017, p)