In [1]:
import os
import csv
import string
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Tokenizer
from nltk.tokenize import WhitespaceTokenizer
# Demo with different tokenizers.
# https://text-processing.com/demo/tokenize/

In [2]:
# Export mappings to file.
def save_mappings_to_csv_file(folder, filename, word_to_ix, fieldnames = ["word", "index"], show = False):
    """
    Saves a mappings dictionary to a two-column CSV file with a header row.

    The target folder is created when it does not exist yet; an existing file
    with the same name is overwritten.

    :param folder: Directory the file is written to ("~" is expanded).
    :param filename: Name of the file created inside ``folder``.
    :param word_to_ix: Dictionary with word:index pairs; one row is written per pair.
    :param fieldnames: Two column names; the first labels the keys, the second the values.
    :param show: When True, every pair is also printed as "key : value".
    """
    # Expand path.
    folder = os.path.expanduser(folder)
    # Make sure directory exists. An empty folder means "current directory",
    # which needs no creation (and os.makedirs('') would raise).
    if folder:
        os.makedirs(folder, exist_ok=True)

    file_path = os.path.join(folder, filename)

    # newline='' hands line-ending control to the csv module; without it every
    # row ends in '\r\r\n' on platforms that translate '\n' (e.g. Windows).
    # Plain 'w' is enough because the file is never read back here.
    with open(file_path, mode='w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        # Create header.
        writer.writeheader()

        # Write word-index pairs.
        for (k, v) in word_to_ix.items():
            if show:
                print("{} : {}".format(k, v))
            writer.writerow({fieldnames[0]: k, fieldnames[1]: v})


In [3]:
def add_answer_label_to_vocabulary(answer_to_ix, answers):
    """
    Registers every previously unseen answer as a new label.

    Each answer is treated as one indivisible label (no tokenization). A new
    label receives the next free index, i.e. the current size of the
    vocabulary, and the assignment is printed.

    :param answer_to_ix: Dictionary with answer:index pairs, updated in place.
    :param answers: Iterable of answers to process, in order.
    """
    for label in answers:
        # Known labels keep the index they already have.
        if label in answer_to_ix:
            continue
        next_index = len(answer_to_ix)
        answer_to_ix[label] = next_index
        print("Adding '{}': {}".format(label, next_index))

In [4]:
# Shared answer -> index vocabulary; starts empty and is filled in place by the cells below.
answer_to_ix = dict()

In [5]:
# Root folder of the CLEVR v1.0 data, given relative to the user's home directory.
clevr_root = '~/data/CLEVR_v1.0'
# Resolve "~" and show the resulting absolute path.
data_folder = os.path.expanduser(clevr_root)
print(data_folder)

/Users/tomaszkornuta/data/CLEVR_v1.0


## Extract vocabulary

### Training

In [6]:
# training
# Load the CLEVR training questions file from <data_folder>/questions.
train_file = os.path.join(data_folder, "questions", 'CLEVR_train_questions.json')
# JSON text is UTF-8 by specification, so do not rely on the platform's default encoding.
with open(train_file, encoding="utf-8") as f:
    train_json = json.load(f)
# Top-level keys of the file.
print(train_json.keys())
# Keep the question records under their own name; the raw file content stays
# in train_json so one name never refers to two different kinds of object.
train_dataset = train_json["questions"]

dict_keys(['info', 'questions'])


In [7]:
# Number of training question records.
num_train_questions = len(train_dataset)
print(num_train_questions)
# Fields available on a single record (taken from the first one).
first_train_item = train_dataset[0]
print(first_train_item.keys())

699989
dict_keys(['image_index', 'program', 'question_index', 'image_filename', 'question_family_index', 'split', 'answer', 'question'])


In [8]:
# Show the first training record in full.
sample_train_item = train_dataset[0]
print(sample_train_item)

{'image_index': 0, 'program': [{'inputs': [], 'function': 'scene', 'value_inputs': []}, {'inputs': [0], 'function': 'filter_size', 'value_inputs': ['large']}, {'inputs': [1], 'function': 'filter_color', 'value_inputs': ['green']}, {'inputs': [2], 'function': 'count', 'value_inputs': []}, {'inputs': [], 'function': 'scene', 'value_inputs': []}, {'inputs': [4], 'function': 'filter_size', 'value_inputs': ['large']}, {'inputs': [5], 'function': 'filter_color', 'value_inputs': ['purple']}, {'inputs': [6], 'function': 'filter_material', 'value_inputs': ['metal']}, {'inputs': [7], 'function': 'filter_shape', 'value_inputs': ['cube']}, {'inputs': [8], 'function': 'count', 'value_inputs': []}, {'inputs': [3, 9], 'function': 'greater_than', 'value_inputs': []}], 'question_index': 0, 'image_filename': 'CLEVR_train_000000.png', 'question_family_index': 2, 'split': 'train', 'answer': 'yes', 'question': 'Are there more big green things than large purple shiny cubes?'}


In [9]:
# Collect the answer of every training question, in dataset order ...
train_answers = [question["answer"] for question in train_dataset]
# ... and register each distinct one as a label (every new label is printed).
add_answer_label_to_vocabulary(answers=train_answers, answer_to_ix=answer_to_ix)

Adding 'yes': 0
Adding '2': 1
Adding 'no': 2
Adding 'rubber': 3
Adding 'large': 4
Adding '0': 5
Adding 'sphere': 6
Adding 'gray': 7
Adding 'cube': 8
Adding 'blue': 9
Adding 'brown': 10
Adding '1': 11
Adding 'yellow': 12
Adding 'purple': 13
Adding 'cylinder': 14
Adding 'small': 15
Adding 'green': 16
Adding 'metal': 17
Adding '3': 18
Adding '4': 19
Adding 'cyan': 20
Adding '6': 21
Adding 'red': 22
Adding '5': 23
Adding '8': 24
Adding '7': 25
Adding '9': 26
Adding '10': 27


### Validation

In [10]:
# Load the CLEVR validation questions file from <data_folder>/questions.
valid_file = os.path.join(data_folder, "questions", 'CLEVR_val_questions.json')
# JSON text is UTF-8 by specification, so do not rely on the platform's default encoding.
with open(valid_file, encoding="utf-8") as f:
    valid_json = json.load(f)
# Top-level keys of the file.
print(valid_json.keys())
# Keep the question records under their own name; the raw file content stays
# in valid_json so one name never refers to two different kinds of object.
valid_dataset = valid_json["questions"]

dict_keys(['info', 'questions'])


In [11]:
# Number of validation question records.
num_valid_questions = len(valid_dataset)
print(num_valid_questions)
# Fields available on a single record (taken from the first one).
first_valid_item = valid_dataset[0]
print(first_valid_item.keys())

149991
dict_keys(['image_index', 'program', 'question_index', 'image_filename', 'question_family_index', 'split', 'answer', 'question'])


In [12]:
# Show the first validation record in full.
sample_valid_item = valid_dataset[0]
print(sample_valid_item)

{'image_index': 0, 'program': [{'inputs': [], 'function': 'scene', 'value_inputs': []}, {'inputs': [0], 'function': 'filter_size', 'value_inputs': ['large']}, {'inputs': [1], 'function': 'filter_material', 'value_inputs': ['metal']}, {'inputs': [2], 'function': 'unique', 'value_inputs': []}, {'inputs': [3], 'function': 'same_shape', 'value_inputs': []}, {'inputs': [4], 'function': 'exist', 'value_inputs': []}], 'question_index': 0, 'image_filename': 'CLEVR_val_000000.png', 'question_family_index': 39, 'split': 'val', 'answer': 'no', 'question': 'Are there any other things that are the same shape as the big metallic object?'}


In [13]:
# Collect the answer of every validation question, in dataset order ...
valid_answers = [question["answer"] for question in valid_dataset]
# ... and add any label the vocabulary does not contain yet. Only new labels
# are printed, so no output here means validation introduced none.
add_answer_label_to_vocabulary(answers=valid_answers, answer_to_ix=answer_to_ix)

### Test

In [14]:
# test
# Load the CLEVR test questions file from <data_folder>/questions. It is only
# inspected here; this cell adds nothing to the answer vocabulary.
test_file = os.path.join(data_folder, "questions", 'CLEVR_test_questions.json')
# JSON text is UTF-8 by specification, so do not rely on the platform's default encoding.
with open(test_file, encoding="utf-8") as f:
    test_dataset = json.load(f)
# Top-level keys, the file's metadata and the first question record.
print(test_dataset.keys())
print(test_dataset["info"])
print(test_dataset["questions"][0])

dict_keys(['info', 'questions'])
{'split': 'test', 'license': 'Creative Commons Attribution (CC BY 4.0)', 'version': '1.0', 'date': '2/14/2017'}
{'image_index': 0, 'split': 'test', 'image_filename': 'CLEVR_test_000000.png', 'question_index': 0, 'question': 'Is there anything else that is the same shape as the small brown matte object?'}


## Save vocabulary

In [15]:
# Name of the vocabulary file.
name = 'answers.all'

print("Saving to: ",name)
# Write the same mappings to two destinations: quietly into the dataset
# folder, then into the current directory while echoing every pair.
for destination, echo_pairs in ((data_folder, False), ('.', True)):
    save_mappings_to_csv_file(destination, name, answer_to_ix, show=echo_pairs)


Saving to:  answers.all
yes : 0
2 : 1
no : 2
rubber : 3
large : 4
0 : 5
sphere : 6
gray : 7
cube : 8
blue : 9
brown : 10
1 : 11
yellow : 12
purple : 13
cylinder : 14
small : 15
green : 16
metal : 17
3 : 18
4 : 19
cyan : 20
6 : 21
red : 22
5 : 23
8 : 24
7 : 25
9 : 26
10 : 27
