In [1]:
import os
import csv
import string
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Tokenizer
from nltk.tokenize import WhitespaceTokenizer
# Demo with different tokenizers.
# https://text-processing.com/demo/tokenize/

In [2]:
# Build the vocabulary from the words of the questions in the dataset splits.
# Tokenizer used to split each question into words.
tokenizer = WhitespaceTokenizer()

# Translation table that deletes every character listed in string.punctuation.
table = str.maketrans('', '', string.punctuation)

# word_to_ix maps each word in the vocab to a unique integer.
# Index 0 is reserved for the special word <PAD> used during padding,
# so the "real" enumeration starts from 1.
word_to_ix = {'<PAD>': 0}

In [3]:
# Export mappings to file.
def save_mappings_to_csv_file(folder, filename, word_to_ix, fieldnames = ["word", "index"], show = False):
    """
    Saves a mappings dictionary to a CSV file (header row + one row per pair).

    :param folder: Destination directory. A leading ``~`` is expanded and the
        directory (including missing parents) is created when it does not exist.
    :param filename: Name of the CSV file created inside ``folder``.
    :param word_to_ix: Dictionary with word:index pairs.
    :param fieldnames: Two column names; the first labels the key column,
        the second labels the value column.
    :param show: When True, every pair is also printed as ``key : value``.
    """
    # Expand path.
    folder = os.path.expanduser(folder)
    # Make sure directory exists (an empty folder means the current directory).
    if folder:
        os.makedirs(folder, exist_ok=True)

    file_path = os.path.join(folder, filename)

    # newline='' is required by the csv module: without it the '\r\n' row
    # terminator is translated again on Windows, producing blank rows.
    with open(file_path, mode='w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        # Create header.
        writer.writeheader()

        # Write word-index pairs.
        for (k, v) in word_to_ix.items():
            if show:
                print("{} : {}".format(k, v))
            writer.writerow({fieldnames[0]: k, fieldnames[1]: v})


In [4]:
def add_question_words_to_vocabulary(word_to_ix, questions):
    """
    Extends the vocabulary in place with the words found in the questions.

    Each question has its punctuation removed (module-level ``table``) and is
    then split into words (module-level ``tokenizer``). Every word not yet in
    ``word_to_ix`` is assigned the next free index, i.e. the current size of
    the dictionary, and the addition is printed.

    :param word_to_ix: Dictionary with word:index pairs; modified in place.
    :param questions: Iterable of question strings.
    """
    for raw_question in questions:
        # Strip punctuation before tokenizing.
        cleaned = raw_question.translate(table)
        for token in tokenizer.tokenize(cleaned):
            # Words that are already known keep their index.
            if token in word_to_ix:
                continue
            next_index = len(word_to_ix)
            word_to_ix[token] = next_index
            print("Adding '{}': {}".format(token, next_index) )

In [5]:
# Set folders.
# Dataset root used by the cells below; '~' is expanded by os.path.expanduser.
data_folder = os.path.expanduser('~/data/CLEVR_v1.0')
print(data_folder)

/Users/tomaszkornuta/data/CLEVR_v1.0


## Extract vocabulary

### Training

In [6]:
# training
train_file = os.path.join(data_folder, "questions", 'CLEVR_train_questions.json')
# JSON text is UTF-8; state it explicitly instead of relying on the locale default.
with open(train_file, encoding="utf-8") as f:
    train_json = json.load(f)
print(train_json.keys())
# Keep only the list of question entries; the top-level dict stays in train_json.
train_dataset = train_json["questions"]

dict_keys(['info', 'questions'])


In [7]:
# Show how many entries train_dataset holds and which keys its first entry has.
print(len(train_dataset))
print(train_dataset[0].keys())

699989
dict_keys(['image_index', 'program', 'question_index', 'image_filename', 'question_family_index', 'split', 'answer', 'question'])


In [8]:
# Show the first entry of train_dataset in full.
print(train_dataset[0])

{'image_index': 0, 'program': [{'inputs': [], 'function': 'scene', 'value_inputs': []}, {'inputs': [0], 'function': 'filter_size', 'value_inputs': ['large']}, {'inputs': [1], 'function': 'filter_color', 'value_inputs': ['green']}, {'inputs': [2], 'function': 'count', 'value_inputs': []}, {'inputs': [], 'function': 'scene', 'value_inputs': []}, {'inputs': [4], 'function': 'filter_size', 'value_inputs': ['large']}, {'inputs': [5], 'function': 'filter_color', 'value_inputs': ['purple']}, {'inputs': [6], 'function': 'filter_material', 'value_inputs': ['metal']}, {'inputs': [7], 'function': 'filter_shape', 'value_inputs': ['cube']}, {'inputs': [8], 'function': 'count', 'value_inputs': []}, {'inputs': [3, 9], 'function': 'greater_than', 'value_inputs': []}], 'question_index': 0, 'image_filename': 'CLEVR_train_000000.png', 'question_family_index': 2, 'split': 'train', 'answer': 'yes', 'question': 'Are there more big green things than large purple shiny cubes?'}


In [9]:
# Collect the question text of every training entry, then extend the
# vocabulary (word_to_ix is updated in place).
train_questions = [entry["question"] for entry in train_dataset]
add_question_words_to_vocabulary(word_to_ix=word_to_ix, questions=train_questions)

Adding 'Are': 1
Adding 'there': 2
Adding 'more': 3
Adding 'big': 4
Adding 'green': 5
Adding 'things': 6
Adding 'than': 7
Adding 'large': 8
Adding 'purple': 9
Adding 'shiny': 10
Adding 'cubes': 11
Adding 'How': 12
Adding 'many': 13
Adding 'other': 14
Adding 'are': 15
Adding 'of': 16
Adding 'the': 17
Adding 'same': 18
Adding 'shape': 19
Adding 'as': 20
Adding 'tiny': 21
Adding 'cyan': 22
Adding 'matte': 23
Adding 'object': 24
Adding 'Is': 25
Adding 'color': 26
Adding 'sphere': 27
Adding 'cube': 28
Adding 'What': 29
Adding 'material': 30
Adding 'is': 31
Adding 'that': 32
Adding 'right': 33
Adding 'brown': 34
Adding 'cylinder': 35
Adding 'and': 36
Adding 'left': 37
Adding 'gray': 38
Adding 'on': 39
Adding 'side': 40
Adding 'small': 41
Adding 'rubber': 42
Adding 'behind': 43
Adding 'thing': 44
Adding 'to': 45
Adding 'metallic': 46
Adding 'The': 47
Adding 'what': 48
Adding 'size': 49
Adding 'any': 50
Adding 'have': 51
Adding 'block': 52
Adding 'blue': 53
Adding 'yellow': 54
Adding 'There': 5

### Validation

In [10]:
# validation
valid_file = os.path.join(data_folder, "questions", 'CLEVR_val_questions.json')
# JSON text is UTF-8; state it explicitly instead of relying on the locale default.
with open(valid_file, encoding="utf-8") as f:
    valid_json = json.load(f)
print(valid_json.keys())
# Keep only the list of question entries; the top-level dict stays in valid_json.
valid_dataset = valid_json["questions"]

dict_keys(['info', 'questions'])


In [11]:
# Show how many entries valid_dataset holds and which keys its first entry has.
print(len(valid_dataset))
print(valid_dataset[0].keys())

149991
dict_keys(['image_index', 'program', 'question_index', 'image_filename', 'question_family_index', 'split', 'answer', 'question'])


In [12]:
# Show the first entry of valid_dataset in full.
print(valid_dataset[0])

{'image_index': 0, 'program': [{'inputs': [], 'function': 'scene', 'value_inputs': []}, {'inputs': [0], 'function': 'filter_size', 'value_inputs': ['large']}, {'inputs': [1], 'function': 'filter_material', 'value_inputs': ['metal']}, {'inputs': [2], 'function': 'unique', 'value_inputs': []}, {'inputs': [3], 'function': 'same_shape', 'value_inputs': []}, {'inputs': [4], 'function': 'exist', 'value_inputs': []}], 'question_index': 0, 'image_filename': 'CLEVR_val_000000.png', 'question_family_index': 39, 'split': 'val', 'answer': 'no', 'question': 'Are there any other things that are the same shape as the big metallic object?'}


In [13]:
# Collect the question text of every validation entry, then extend the
# vocabulary (word_to_ix is updated in place).
valid_questions = [entry["question"] for entry in valid_dataset]
add_question_words_to_vocabulary(word_to_ix=word_to_ix, questions=valid_questions)

### Test

In [14]:
# test
test_file = os.path.join(data_folder, "questions", 'CLEVR_test_questions.json')
# JSON text is UTF-8; state it explicitly instead of relying on the locale default.
with open(test_file, encoding="utf-8") as f:
    test_dataset = json.load(f)
# Unlike the other splits, test_dataset stays the top-level dict here;
# the question list is accessed through its "questions" key.
print(test_dataset.keys())
print(test_dataset["info"])
print(test_dataset["questions"][0])

dict_keys(['info', 'questions'])
{'split': 'test', 'license': 'Creative Commons Attribution (CC BY 4.0)', 'version': '1.0', 'date': '2/14/2017'}
{'image_index': 0, 'split': 'test', 'image_filename': 'CLEVR_test_000000.png', 'question_index': 0, 'question': 'Is there anything else that is the same shape as the small brown matte object?'}


In [15]:
# Collect the question text of every test entry, then extend the
# vocabulary (word_to_ix is updated in place).
test_questions = [entry["question"] for entry in test_dataset["questions"]]
add_question_words_to_vocabulary(word_to_ix=word_to_ix, questions=test_questions)

## Save vocabulary

In [17]:
# Name of the vocabulary file.
name = 'questions.all.csv'

print("Saving to: ",name)
# Write the same vocabulary twice: once into the dataset folder and once into
# the current directory; only the second call also prints every pair.
save_mappings_to_csv_file(folder=data_folder, filename=name, word_to_ix=word_to_ix)
save_mappings_to_csv_file(folder='.', filename=name, word_to_ix=word_to_ix, show=True)


Saving to:  questions.all
<PAD> : 0
Are : 1
there : 2
more : 3
big : 4
green : 5
things : 6
than : 7
large : 8
purple : 9
shiny : 10
cubes : 11
How : 12
many : 13
other : 14
are : 15
of : 16
the : 17
same : 18
shape : 19
as : 20
tiny : 21
cyan : 22
matte : 23
object : 24
Is : 25
color : 26
sphere : 27
cube : 28
What : 29
material : 30
is : 31
that : 32
right : 33
brown : 34
cylinder : 35
and : 36
left : 37
gray : 38
on : 39
side : 40
small : 41
rubber : 42
behind : 43
thing : 44
to : 45
metallic : 46
The : 47
what : 48
size : 49
any : 50
have : 51
block : 52
blue : 53
yellow : 54
There : 55
a : 56
it : 57
ball : 58
its : 59
in : 60
front : 61
does : 62
number : 63
red : 64
spheres : 65
made : 66
metal : 67
cylinders : 68
both : 69
how : 70
balls : 71
or : 72
blocks : 73
objects : 74
visible : 75
Does : 76
another : 77
has : 78
greater : 79
fewer : 80
less : 81
either : 82
anything : 83
else : 84
Do : 85
an : 86
equal : 87
