In [1]:
import os
import csv
import string
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Tokenizer
from nltk.tokenize import WhitespaceTokenizer
# Demo with different tokenizers.
# https://text-processing.com/demo/tokenize/

In [2]:
# Build one shared vocabulary out of the words found in the CLEVR questions.
# Questions are split into tokens on whitespace only.
tokenizer = WhitespaceTokenizer()

# Translation table that deletes every character listed in string.punctuation.
table = str.maketrans('', '', string.punctuation)

# word_to_ix assigns a unique integer to each word of the vocabulary.
# Index 0 is reserved for the special <PAD> word used during padding,
# so the enumeration of "real" words starts from 1.
word_to_ix = {'<PAD>': 0}

In [3]:
# Export mappings to file.
def save_mappings_to_csv_file(folder, filename, word_to_ix, fieldnames = ["word", "index"], show = False):
    """
    Saves a mappings dictionary to a CSV file.

    The file starts with a header row built from ``fieldnames``, followed by
    one row per dictionary entry, in the dictionary's iteration order.
    The destination folder is created when it does not exist yet and an
    already existing file with the same name is overwritten.

    :param folder: Destination folder; a leading ``~`` is expanded to the user's home directory.
    :param filename: Name of the CSV file created inside ``folder``.
    :param word_to_ix: Dictionary with word:index pairs.
    :param fieldnames: Column names; the first one labels the keys, the second one the values.
    :param show: When True, every pair is additionally printed as "key : value".
    """
    # Expand path.
    folder = os.path.expanduser(folder)
    # Make sure directory exists (an empty folder name means the current
    # directory, for which there is nothing to create).
    if folder:
        os.makedirs(folder, exist_ok=True)

    file_path = os.path.join(folder, filename)

    # The csv module requires files to be opened with newline=''; otherwise the
    # writer's "\r\n" row terminator is translated once more on platforms that
    # use "\r\n" line endings, producing "\r\r\n" (blank rows when read back).
    # Plain 'w' is enough: the file is never read through this handle.
    with open(file_path, mode='w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        # Create header.
        writer.writeheader()

        # Write word-index pairs.
        for (k,v) in word_to_ix.items():
            if show:
                print("{} : {}".format(k,v))
            writer.writerow({fieldnames[0]:k, fieldnames[1]: v})


In [4]:
def add_question_words_to_vocabulary(word_to_ix, questions):
    """
    Extends the vocabulary with every not-yet-known word found in the questions.

    Each question is stripped of punctuation (using the module-level ``table``),
    lowercased and split into tokens with the module-level ``tokenizer``.
    Every new token is added to ``word_to_ix`` in place, receiving the size of
    the dictionary at insertion time as its index, and is reported on stdout.

    :param word_to_ix: Dictionary with word:index pairs; modified in place.
    :param questions: Iterable of question strings.
    """
    for raw_question in questions:
        # Drop punctuation and normalize the case in a single expression.
        cleaned = raw_question.translate(table).lower()
        for token in tokenizer.tokenize(cleaned):
            # Tokens seen before keep the index they already have.
            if token in word_to_ix:
                continue
            # The next free index equals the current size of the vocabulary.
            next_index = len(word_to_ix)
            word_to_ix[token] = next_index
            print("Adding '{}': {}".format(token, next_index) )

In [5]:
# Set folders.
# Folder with the CLEVR v1.0 data, given relative to the user's home
# directory ("~" is expanded to that directory).
data_folder = os.path.expanduser('~/data/CLEVR_v1.0')
# Show the expanded path.
print(data_folder)

/Users/tomaszkornuta/data/CLEVR_v1.0


## Extract vocabulary

### Training

In [6]:
# training
train_file = os.path.join(data_folder, "questions", 'CLEVR_train_questions.json')
# JSON text is UTF-8 encoded; pin the encoding instead of relying on the
# locale-dependent default that open() would otherwise use.
with open(train_file, encoding='utf-8') as f:
    train_dataset = json.load(f)
# Show the top-level keys of the loaded document.
print(train_dataset.keys())
# From here on train_dataset refers only to the list stored under "questions".
train_dataset = train_dataset["questions"]

dict_keys(['info', 'questions'])


In [7]:
# Number of training question records, followed by the keys of the first one.
print(len(train_dataset))
print(train_dataset[0].keys())

699989
dict_keys(['image_index', 'program', 'question_index', 'image_filename', 'question_family_index', 'split', 'answer', 'question'])


In [8]:
# Print the first training record in full to inspect its content.
print(train_dataset[0])

{'image_index': 0, 'program': [{'inputs': [], 'function': 'scene', 'value_inputs': []}, {'inputs': [0], 'function': 'filter_size', 'value_inputs': ['large']}, {'inputs': [1], 'function': 'filter_color', 'value_inputs': ['green']}, {'inputs': [2], 'function': 'count', 'value_inputs': []}, {'inputs': [], 'function': 'scene', 'value_inputs': []}, {'inputs': [4], 'function': 'filter_size', 'value_inputs': ['large']}, {'inputs': [5], 'function': 'filter_color', 'value_inputs': ['purple']}, {'inputs': [6], 'function': 'filter_material', 'value_inputs': ['metal']}, {'inputs': [7], 'function': 'filter_shape', 'value_inputs': ['cube']}, {'inputs': [8], 'function': 'count', 'value_inputs': []}, {'inputs': [3, 9], 'function': 'greater_than', 'value_inputs': []}], 'question_index': 0, 'image_filename': 'CLEVR_train_000000.png', 'question_family_index': 2, 'split': 'train', 'answer': 'yes', 'question': 'Are there more big green things than large purple shiny cubes?'}


In [9]:
# Collect the "question" text of every training record.
train_questions = [item["question"] for item in train_dataset]
# Add the words of those questions to the shared vocabulary (word_to_ix is
# updated in place; every newly added word is printed).
add_question_words_to_vocabulary(word_to_ix, train_questions)

Adding 'are': 1
Adding 'there': 2
Adding 'more': 3
Adding 'big': 4
Adding 'green': 5
Adding 'things': 6
Adding 'than': 7
Adding 'large': 8
Adding 'purple': 9
Adding 'shiny': 10
Adding 'cubes': 11
Adding 'how': 12
Adding 'many': 13
Adding 'other': 14
Adding 'of': 15
Adding 'the': 16
Adding 'same': 17
Adding 'shape': 18
Adding 'as': 19
Adding 'tiny': 20
Adding 'cyan': 21
Adding 'matte': 22
Adding 'object': 23
Adding 'is': 24
Adding 'color': 25
Adding 'sphere': 26
Adding 'cube': 27
Adding 'what': 28
Adding 'material': 29
Adding 'that': 30
Adding 'right': 31
Adding 'brown': 32
Adding 'cylinder': 33
Adding 'and': 34
Adding 'left': 35
Adding 'gray': 36
Adding 'on': 37
Adding 'side': 38
Adding 'small': 39
Adding 'rubber': 40
Adding 'behind': 41
Adding 'thing': 42
Adding 'to': 43
Adding 'metallic': 44
Adding 'size': 45
Adding 'any': 46
Adding 'have': 47
Adding 'block': 48
Adding 'blue': 49
Adding 'yellow': 50
Adding 'a': 51
Adding 'it': 52
Adding 'ball': 53
Adding 'its': 54
Adding 'in': 55
Add

### Validation

In [10]:
# validation
valid_file = os.path.join(data_folder, "questions", 'CLEVR_val_questions.json')
# JSON text is UTF-8 encoded; pin the encoding instead of relying on the
# locale-dependent default that open() would otherwise use.
with open(valid_file, encoding='utf-8') as f:
    valid_dataset = json.load(f)
# Show the top-level keys of the loaded document.
print(valid_dataset.keys())
# From here on valid_dataset refers only to the list stored under "questions".
valid_dataset = valid_dataset["questions"]

dict_keys(['info', 'questions'])


In [11]:
# Number of validation question records, followed by the keys of the first one.
print(len(valid_dataset))
print(valid_dataset[0].keys())

149991
dict_keys(['image_index', 'program', 'question_index', 'image_filename', 'question_family_index', 'split', 'answer', 'question'])


In [12]:
# Print the first validation record in full to inspect its content.
print(valid_dataset[0])

{'image_index': 0, 'program': [{'inputs': [], 'function': 'scene', 'value_inputs': []}, {'inputs': [0], 'function': 'filter_size', 'value_inputs': ['large']}, {'inputs': [1], 'function': 'filter_material', 'value_inputs': ['metal']}, {'inputs': [2], 'function': 'unique', 'value_inputs': []}, {'inputs': [3], 'function': 'same_shape', 'value_inputs': []}, {'inputs': [4], 'function': 'exist', 'value_inputs': []}], 'question_index': 0, 'image_filename': 'CLEVR_val_000000.png', 'question_family_index': 39, 'split': 'val', 'answer': 'no', 'question': 'Are there any other things that are the same shape as the big metallic object?'}


In [13]:
# Collect the "question" text of every validation record.
valid_questions = [item["question"] for item in valid_dataset]
# Extend the shared vocabulary; words that are already in word_to_ix are
# skipped, only new ones are added (and printed).
add_question_words_to_vocabulary(word_to_ix, valid_questions)

### Test

In [14]:
# test
test_file = os.path.join(data_folder, "questions", 'CLEVR_test_questions.json')
# JSON text is UTF-8 encoded; pin the encoding instead of relying on the
# locale-dependent default that open() would otherwise use.
with open(test_file, encoding='utf-8') as f:
    test_dataset = json.load(f)
# Top-level keys, the "info" entry and the first question record.
# test_dataset keeps the whole loaded document; the question records are
# read from its "questions" key.
print(test_dataset.keys())
print(test_dataset["info"])
print(test_dataset["questions"][0])

dict_keys(['info', 'questions'])
{'split': 'test', 'license': 'Creative Commons Attribution (CC BY 4.0)', 'version': '1.0', 'date': '2/14/2017'}
{'image_index': 0, 'split': 'test', 'image_filename': 'CLEVR_test_000000.png', 'question_index': 0, 'question': 'Is there anything else that is the same shape as the small brown matte object?'}


In [15]:
# Collect the "question" text of every test record.
test_questions = [item["question"] for item in test_dataset["questions"]]
# Extend the shared vocabulary; words that are already in word_to_ix are
# skipped, only new ones are added (and printed).
add_question_words_to_vocabulary(word_to_ix, test_questions)

## Save vocabulary

In [16]:
# Generate the name of file.
name = 'questions.all.word.mappings.lowercase.csv'

print("Saving to: ",name)
# Save to both "destinations": the dataset folder and the current working
# directory. Only the second call also prints the word-index pairs (show = True).
save_mappings_to_csv_file(data_folder, name, word_to_ix)
save_mappings_to_csv_file('.', name, word_to_ix, show = True)


Saving to:  questions.all.word.mappings.lowercase.csv
<PAD> : 0
are : 1
there : 2
more : 3
big : 4
green : 5
things : 6
than : 7
large : 8
purple : 9
shiny : 10
cubes : 11
how : 12
many : 13
other : 14
of : 15
the : 16
same : 17
shape : 18
as : 19
tiny : 20
cyan : 21
matte : 22
object : 23
is : 24
color : 25
sphere : 26
cube : 27
what : 28
material : 29
that : 30
right : 31
brown : 32
cylinder : 33
and : 34
left : 35
gray : 36
on : 37
side : 38
small : 39
rubber : 40
behind : 41
thing : 42
to : 43
metallic : 44
size : 45
any : 46
have : 47
block : 48
blue : 49
yellow : 50
a : 51
it : 52
ball : 53
its : 54
in : 55
front : 56
does : 57
number : 58
red : 59
spheres : 60
made : 61
metal : 62
cylinders : 63
both : 64
balls : 65
or : 66
blocks : 67
objects : 68
visible : 69
another : 70
has : 71
greater : 72
fewer : 73
less : 74
either : 75
anything : 76
else : 77
do : 78
an : 79
equal : 80
