In [1]:
import os
import csv
import string
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Tokenizer
from nltk.tokenize import WhitespaceTokenizer
# Demo with different tokenizers.
# https://text-processing.com/demo/tokenize/

In [2]:
# Export mappings to file.
def save_mappings_to_csv_file(folder, filename, word_to_ix, fieldnames = ["word", "index"], show = False):
    """
    Saves mappings dictionary to a file.

    :param filename: File with encodings (absolute path + filename).
    :param word_to_ix: dictionary with word:index keys
    """
    # Expand path.
    folder = os.path.expanduser(folder)
    # Make sure directory exists.
    os.makedirs(os.path.dirname(folder +'/'), exist_ok=True)

    file_path = os.path.join(folder, filename)

    with open(file_path, mode='w+') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        # Create header.
        writer.writeheader()

        # Write word-index pairs.
        for (k,v) in word_to_ix.items():
            if show:
                print("{} : {}".format(k,v))
            writer.writerow({fieldnames[0]:k, fieldnames[1]: v})


In [3]:
def add_answer_label_to_vocabulary(answer_to_ix, answers):
    """
    Processes answers one by one, treating each as a single label.
    """
    for answer in answers:
        # If new token.
        if answer not in answer_to_ix:
            answer_to_ix[answer] = len(answer_to_ix)
            print("Adding '{}': {}".format(answer, len(answer_to_ix)-1) ) 

In [4]:
answer_to_ix = {}

In [5]:
# Set folders.
data_folder = os.path.expanduser('~/data/gqa')
print(data_folder)

/Users/tomaszkornuta/data/gqa


## Play with data

In [6]:
file = os.path.join(data_folder, "questions", 'val_balanced_questions.json')
with open(file) as f:
    dataset = json.load(f)
print(dataset.keys())

dict_keys(['05515938', '17197213', '08223573', '14778715', '14778714', '1231468', '12143164', '12143165', '19486857', '17284200', '08369253', '05515937', '15962275', '09229761', '13663260', '08321209', '16130590', '11615428', '08185113', '03916612', '08185117', '08185114', '17267240', '101024942', '08185118', '06895964', '09821879', '13571728', '00355883', '14676445', '14676447', '0328071', '03739128', '06645355', '06645358', '04285321', '13112324', '13598462', '0364186', '18555171', '0364188', '08164591', '1942734', '0464559', '0464558', '031007945', '13515212', '17728948', '10889906', '13368729', '151015980', '17284203', '0090767', '17120817', '17819046', '06435607', '06134756', '02315424', '17810030', '13831865', '04819662', '11563995', '12824169', '03260227', '10433179', '16542238', '07603522', '08203407', '09143711', '09143715', '07603528', '09195303', '00805479', '18175193', '02497337', '01163638', '13168817', '13168814', '091013819', '15381074', '18283096', '111015885', '1839631

In [7]:
print(len(dataset))
print(dataset['17197213'])

132062
{'semantic': [{'operation': 'select', 'dependencies': [], 'argument': 'helmet (3205899)'}, {'operation': 'filter vposition', 'dependencies': [0], 'argument': 'middle'}, {'operation': 'query', 'dependencies': [1], 'argument': 'color'}], 'entailed': ['17197211', '17197212', '17197214', '17197215', '17197221'], 'equivalent': ['17197213'], 'question': 'What color is the helmet in the middle of the image?', 'imageId': '2331963', 'isBalanced': True, 'groups': {'global': 'color', 'local': '10q-helmet_color'}, 'answer': 'light blue', 'semanticStr': 'select: helmet (3205899)->filter vposition: middle [0]->query: color [1]', 'annotations': {'answer': {}, 'question': {'4': '3205899'}, 'fullAnswer': {'1': '3205899'}}, 'types': {'detailed': 'directWhich', 'semantic': 'attr', 'structural': 'query'}, 'fullAnswer': 'The helmet is light blue.'}


In [8]:
item = dataset['07603522']
for key in item.keys():
    print("{}: {}".format(key, item[key]))

semantic: [{'operation': 'select', 'dependencies': [], 'argument': 'furniture (1337202)'}, {'operation': 'filter size', 'dependencies': [0], 'argument': 'not(small)'}, {'operation': 'choose name', 'dependencies': [1], 'argument': 'table|bed'}]
entailed: ['07603523']
equivalent: ['07603522', '07603523']
question: What piece of furniture is not small, the bed or the table?
imageId: 2381360
isBalanced: True
groups: {'global': 'furniture', 'local': '12c-furniture_n#small'}
answer: bed
semanticStr: select: furniture (1337202)->filter size: not(small) [0]->choose name: table|bed [1]
annotations: {'answer': {'0': '1337202'}, 'question': {'11': '1337204', '8': '1337202', '1:4': '1337204'}, 'fullAnswer': {'1': '1337202'}}
types: {'detailed': 'categoryThatChoose', 'semantic': 'cat', 'structural': 'choose'}
fullAnswer: The bed is not small.


## Extract vocabulary

### Training

In [10]:
file = os.path.join(data_folder, "questions", "train_all_questions", "train_all_questions_0.json")
with open(file) as f:
    train_dataset = json.load(f)
print(len(train_dataset))

1430536


In [22]:
for key,value in train_dataset.items():
    print("key = ",key)
    print("value = ",value)
    break

key =  07333408
value =  {'semantic': [{'operation': 'select', 'dependencies': [], 'argument': 'wall (722332)'}, {'operation': 'filter color', 'dependencies': [0], 'argument': 'white'}, {'operation': 'relate', 'dependencies': [1], 'argument': '_,on,s (722335)'}, {'operation': 'query', 'dependencies': [2], 'argument': 'name'}], 'entailed': [], 'equivalent': ['07333408'], 'question': 'What is on the white wall?', 'imageId': '2375429', 'isBalanced': True, 'groups': {'global': '', 'local': '14-wall_on,s'}, 'answer': 'pipe', 'semanticStr': 'select: wall (722332)->filter color: white [0]->relate: _,on,s (722335) [1]->query: name [2]', 'annotations': {'answer': {'0': '722335'}, 'question': {'4:6': '722332'}, 'fullAnswer': {'1': '722335', '5': '722332'}}, 'types': {'detailed': 'relS', 'semantic': 'rel', 'structural': 'query'}, 'fullAnswer': 'The pipe is on the wall.'}


In [24]:
train_answers = [item["answer"] for key, item in train_dataset.items()]
print(train_answers[0:10])
print(len(train_answers))

['pipe', 'wall', 'white', 'no', 'no', 'white', 'no', 'yes', 'yes', 'yes']
1430536


In [26]:
# Actual loop for loading all training files.
for i in range(10):
    file = os.path.join(data_folder, "questions", "train_all_questions", "train_all_questions_{}.json".format(i))
    with open(file) as f:
        train_dataset = json.load(f)
    print("Loaded {}: {}".format(i, len(train_dataset)))
    # Get all answers.
    train_answers = [item["answer"] for key, item in train_dataset.items()]
    # Add them to vocabulary.
    add_answer_label_to_vocabulary(answer_to_ix, train_answers)
print(len(answer_to_ix))

Loaded:  1430536
Loaded:  1430536
Adding 'visitors': 1547
Adding 'school': 1548
Adding 'rhinos': 1549
Adding 'syrup': 1550
Adding 'heart': 1551
Adding 'wolf': 1552
Adding 'sharks': 1553
Adding 'paper container': 1554
Adding 'drain': 1555
Adding 'uniforms': 1556
Adding 'biscuits': 1557
Adding 'life jackets': 1558
Adding 'plantains': 1559
Adding 'life preserver': 1560
Adding 'feathers': 1561
Adding 'whale': 1562
Adding 'sticks': 1563
Adding 'milk carton': 1564
Adding 'clocks': 1565
Adding 'diaper': 1566
Adding 'soap bottle': 1567
Adding 'customer': 1568
Adding 'music': 1569
Adding 'amusement park': 1570
Adding 'doors': 1571
Adding 'wildflowers': 1572
Adding 'masks': 1573
Adding 'scaffolding': 1574
Adding 'earrings': 1575
Adding 'marina': 1576
Adding 'cotton': 1577
Adding 'wig': 1578
Adding 'dog food': 1579
Adding 'shelter': 1580
Adding 'seagulls': 1581
Adding 'staircase': 1582
Adding 'action figure': 1583
Adding 'towel dispenser': 1584
Adding 'powder': 1585
Adding 'dinosaurs': 1586
Addin

### Validation

In [27]:
file = os.path.join(data_folder, "questions", "val_all_questions.json")
with open(file) as f:
    val_dataset = json.load(f)
print("Loaded : {}".format(len(val_dataset)))

Loaded : 2011853
Adding 'pikachu': 1845
Adding 'wave': 1846
Adding 'taking notes': 1847
Adding 'elevator': 1848
Adding 'shampoo': 1849
Adding 'horse hoof': 1850
Adding 'orchids': 1851
1852


In [28]:
# Get all answers.
val_answers = [item["answer"] for key, item in val_dataset.items()]
# Add them to vocabulary.
add_answer_label_to_vocabulary(answer_to_ix, val_answers)
print(len(answer_to_ix))

1852


### Test

In [29]:
file = os.path.join(data_folder, "questions", "test_all_questions.json")
with open(file) as f:
    test_dataset = json.load(f)
print("Loaded : {}".format(len(test_dataset)))

Loaded : 1340048


In [30]:
for key,value in test_dataset.items():
    print("key = ",key)
    print("value = ",value)
    break

key =  201971873
value =  {'isBalanced': False, 'question': 'Is the blanket to the right of a pillow?', 'imageId': 'n15740'}


In [31]:
# Get all answers.
test_answers = [item["answer"] for key, item in test_dataset.items()]
# Add them to vocabulary.
add_answer_label_to_vocabulary(answer_to_ix, test_answers)
print(len(answer_to_ix))
# Right, no answers here ;)

KeyError: 'answer'

## Save vocabulary

In [32]:
# Generate the name of file.
name = 'answers.all.word.mappings.csv'

print("Saving to: ",name)
# Save to both "destinations."
save_mappings_to_csv_file(data_folder, name, answer_to_ix)
save_mappings_to_csv_file('.', name, answer_to_ix, show = True)


Saving to:  answers.all.word.mappings.csv
pipe : 0
wall : 1
white : 2
no : 3
yes : 4
top : 5
silver : 6
black : 7
right : 8
man : 9
left : 10
large : 11
chair : 12
red : 13
bus : 14
sweater : 15
lady : 16
green : 17
cow : 18
herd : 19
outdoors : 20
sheep : 21
end table : 22
bottom : 23
train : 24
carrots : 25
on : 26
car : 27
coat : 28
blond : 29
snow : 30
bag : 31
jacket : 32
bench : 33
girl : 34
truck : 35
donut : 36
long : 37
elephant : 38
child : 39
closed : 40
bicycle : 41
concrete : 42
blue : 43
seat : 44
horse : 45
gray : 46
glass : 47
controller : 48
color : 49
monitor : 50
candle : 51
small : 52
zebras : 53
locomotive : 54
table : 55
jeans : 56
plate : 57
glasses : 58
metal : 59
phone : 60
full : 61
round : 62
sword : 63
vase : 64
tall : 65
bedroom : 66
thin : 67
shelf : 68
fence : 69
trees : 70
field : 71
balcony : 72
wood : 73
people : 74
clear : 75
stop sign : 76
mannequin : 77
window : 78
backpack : 79
indoors : 80
checkered : 81
dish : 82
dark : 83
brown : 84
helmet : 85
