This notebook tests the generation of the CLEVR-with-masks dataset.

# Image generation (example)

In [None]:
%cd image_generation
!./blender/blender --background \
    --python render_images.py -- \
    --num_images 10 --use_gpu 1 --min_objects 2 --max_objects 6
%cd ..

# Single CLEVR_scenes.json generation

In [None]:
%cd image_generation
!python collect_scenes.py --date "$(date)" \
    --input_dir /dfs/user/tailin/.results/CLEVR_relation/clevr-relation-easy-mpi-0-100000/scenes \
--output /dfs/user/tailin/.results/CLEVR_relation/clevr-relation-easy-mpi-0-100000/partial/CLEVR_scenes.json
%cd ..

# Question template generation

In [None]:
!pip install pyjson5

In [None]:
# Run the relation generator script from inside its own directory, then return
# to the parent directory.
%cd relation_generator
!python generate_relations.py
%cd ..

# Question generation

In [None]:
%cd question_generation/
!python generate_questions.py \
    --input_scene_file /dfs/user/tailin/.results/CLEVR_relation/mpi-0-10000/CLEVR_scenes.json \
    --output_questions_file ./questions.json \
    --template_dir babyarc_easy --max-num-objects 6
%cd ..

# Question analysis

In [None]:
from collections import defaultdict
import json
from typing import List

from relation_generator.generate_relations import RELATIONS

def get_unique_task_string(program: List[dict]) -> str:
    """
    Parses the program for a given question and returns a unique string that identifies the
    babyARC task that it embodies.

    The result has the form ``"<relations>-<objects>"``:

    * ``<relations>`` is the sorted, ``+``-joined list of node types that appear in
      ``RELATIONS``.
    * ``<objects>`` is a ``;``-joined list with one entry per object, where each entry is
      a ``,``-joined list of ``property=value`` pairs taken from the object's
      ``filter_*`` nodes (in program order).

    This function is somewhat hacky in that it doesn't deal with the AST directly, but it
    works for the generated babyARC template programs.

    Args:
        program: The program nodes in order. Every node is a dict with a ``"type"`` key;
            ``filter_*`` nodes must also have a non-empty ``"value_inputs"`` list.

    Returns:
        The task string, e.g. ``"same_size-size=large,color=purple,material=metal"``.

    Raises:
        KeyError: If a node has no ``"type"`` key, or a filter node has no
            ``"value_inputs"`` key.
        IndexError: If a filter node has an empty ``"value_inputs"`` list.
    """
    filter_prefix = "filter_"

    inputs = []
    object_str = []
    for node in program:
        node_type = node["type"]

        # Generate a new object str every time we see a new "scene" (which implies
        # a new object)
        if node_type == "scene":
            if object_str:
                inputs.append(",".join(object_str))
                object_str = []
            continue

        # If we're not at a scene, then we're in the middle of an object
        if node_type.startswith(filter_prefix):
            # This node filters some property of the input. Let's consider it.
            # Formatting (rather than "+") keeps this working for non-string values.
            object_str.append(
                "{}={}".format(node_type[len(filter_prefix):], node["value_inputs"][0])
            )

    # The last object is always emitted, even if it collected no filters.
    inputs.append(",".join(object_str))
    relations = sorted(node["type"] for node in program if node["type"] in RELATIONS)

    return "+".join(relations) + "-" + ";".join(inputs)


# Load the question data
file = "question_generation/questions.json"
with open(file) as f:
    data = json.load(f)
question_list = data["questions"]

# Group the questions by task string. Each entry keeps the number of questions
# for that task together with the questions themselves and their images.
observed_question_types = dict()
for question in question_list:
    task_str = get_unique_task_string(question["program"])

    entry = observed_question_types.setdefault(
        task_str, {"count": 0, "questions": [], "images": []}
    )
    entry["count"] += 1
    entry["questions"].append(question)
    entry["images"].append(question["image"])

# Print the number of times each question type occurs. The loop variable is
# deliberately not called `data`, so the loaded JSON above is not overwritten.
for task_str, task_info in observed_question_types.items():
    print("{} - {}".format(task_str, task_info["count"]))

In [None]:
import pandas as pd

# One row per task string (orient='index'); the columns come from the keys of
# each entry: "count", "questions" and "images".
df = pd.DataFrame.from_dict(observed_question_types, orient='index')

In [None]:
# Images belonging to the most frequent task. The frame is indexed by task
# string, so select the first row by position with .iloc instead of [0], which
# would depend on the deprecated positional fallback of Series.__getitem__.
df.sort_values(by=["count"], ascending=False)["images"].iloc[0]

In [None]:
# Inspect the entry of one specific task; this raises KeyError if that task
# string is not present in the loaded questions.
observed_question_types["same_size-size=large,color=purple,material=metal"]

# Dataset

In [1]:
# Enable IPython's autoreload extension in mode 2 so imported modules are
# reloaded before each cell runs.
%load_ext autoreload
%autoreload 2

In [7]:
from dataset import ClevrRelationDataset

In [None]:
# Build the dataset from the mpi-0-10000 images and the questions stored next
# to them, requesting the "full-color" output type.
dataset = ClevrRelationDataset(image_dir="/dfs/user/tailin/.results/CLEVR_relation/mpi-0-10000/images",
                               question_dir="/dfs/user/tailin/.results/CLEVR_relation/mpi-0-10000",
                               output_type="full-color")

In [None]:
# Save the dataset built above to a .pt file.
dataset.save("/dfs/user/tailin/.results/CLEVR_relation/relations-dataset-2021-08-18-608-tasks.pt")

In [None]:
# Rebind `dataset` to one built from the questions in ./question_generation/;
# output_type is not passed here, so the constructor's default applies.
dataset = ClevrRelationDataset(image_dir="/dfs/user/tailin/.results/CLEVR_relation/mpi-0-10000/images",
                              question_dir="./question_generation/")

In [None]:
# Visualize the dataset

import matplotlib.pyplot as plt

el = dataset[-3]

# To look at a compound task instead, pick the first element whose task string
# contains "+":
# for el in dataset:
#     if "+" in el["task_str"]:
#         break
# else:
#     assert False

print(el["task_str"])
# print(el["questions"][0]["question"])

# At most 5 examples are drawn, so the grid gets exactly that many rows. Sizing
# the grid by len(el["inputs"]) would leave empty rows for larger tasks.
n_rows = min(len(el["inputs"]), 5)

plt.figure(figsize=(15, 50))  # specifying the overall figure size

for i in range(n_rows):
    # Left column: input image; right column: the matching output.
    # permute(1, 2, 0) presumably converts channel-first tensors to the
    # channel-last layout imshow expects -- TODO confirm against the dataset.
    plt.subplot(n_rows, 2, 2 * i + 1)
    plt.imshow(el["inputs"][i]["image"].permute(1, 2, 0))
    plt.subplot(n_rows, 2, 2 * i + 2)
    plt.imshow(el["outputs"][i].permute(1, 2, 0))

plt.show()

In [None]:
from PIL import Image
# Display one rendered image straight from disk. `plt` is not imported in this
# cell; it comes from the visualization cell above.
plt.imshow(Image.open("/dfs/user/tailin/.results/CLEVR_relation/test1/0/images/CLEVR_new_000000.png"))

# CLEVR relation "easy" dataset

## Partial dataset

### Question generation

In [None]:
%cd question_generation/
!python generate_questions.py \
    --input_scene_file /dfs/user/tailin/.results/CLEVR_relation/clevr-relation-easy-mpi-0-100000/partial/CLEVR_scenes.json \
    --output_questions_file /dfs/user/tailin/.results/CLEVR_relation/clevr-relation-easy-mpi-100000/partial/questions.json \
    --template_dir babyarc_easy --max-num-objects 6
%cd ..

In [None]:
# Build the partial "easy" dataset: images from the clevr-relation-easy run,
# questions from its partial/ directory, "full-color" output type.
dataset = ClevrRelationDataset(image_dir="/dfs/user/tailin/.results/CLEVR_relation/clevr-relation-easy-mpi-0-100000/images",
                               question_dir="/dfs/user/tailin/.results/CLEVR_relation/clevr-relation-easy-mpi-0-100000/partial",
                               output_type="full-color")

## Full dataset

### Single CLEVR_scenes.json generation

In [None]:
%cd image_generation
!python collect_scenes.py --date "$(date)" \
    --input_dir /dfs/user/tailin/.results/CLEVR_relation/clevr-relation-easy-mpi-0-100000/scenes \
--output /dfs/user/tailin/.results/CLEVR_relation/clevr-relation-easy-mpi-0-100000/full/CLEVR_scenes.json
%cd ..

In [None]:
!mkdir /dfs/user/tailin/.results/CLEVR_relation/clevr-relation-easy-mpi-0-100000/full

### Question generation

In [None]:
# Generate the "easy" questions with the babyarc_easy templates and write them
# to mpi-0-10000/easy-questions/questions.json.
# NOTE(review): this reads mpi-0-10000/CLEVR_scenes.json, not the
# clevr-relation-easy-mpi-0-100000/full/CLEVR_scenes.json collected in this
# section -- confirm that this is the intended input.
%cd question_generation/
!python generate_questions.py \
    --input_scene_file /dfs/user/tailin/.results/CLEVR_relation/mpi-0-10000/CLEVR_scenes.json \
    --output_questions_file /dfs/user/tailin/.results/CLEVR_relation/mpi-0-10000/easy-questions/questions.json \
    --template_dir babyarc_easy --max-num-objects 6
%cd ..

### Test dataset

In [None]:
# Build the "easy" dataset from the mpi-0-10000 images and the easy-questions
# directory, with the "mask-only" output type and is_easy_dataset=True.
dataset = ClevrRelationDataset(image_dir="/dfs/user/tailin/.results/CLEVR_relation/mpi-0-10000/images",
                               question_dir="/dfs/user/tailin/.results/CLEVR_relation/mpi-0-10000/easy-questions",
                               output_type="mask-only", is_easy_dataset=True)

 74%|███████▍  | 2106/2850 [35:48<09:12,  1.35it/s]

In [5]:
# Save the easy dataset to a .pt file. This needs `dataset` from the cell that
# builds it, so that cell must have finished in the current kernel first.
dataset.save("/dfs/user/tailin/.results/CLEVR_relation/relations-dataset-easy-2021-09-16-461-tasks.pt")

NameError: name 'dataset' is not defined

In [12]:
# Number of elements in the dataset.
len(dataset)

461

# Helper functions

In [2]:
from dataset import create_full_dataset, create_easy_dataset

# Load the three splits of the full dataset and check that, taken together,
# they contain all 608 tasks.
train_set, val_set, test_set = create_full_dataset()
assert sum(map(len, (train_set, val_set, test_set))) == 608

Loaded 608 tasks.


In [3]:
# Load the three splits of the easy dataset and check that, taken together,
# they contain all 461 tasks.
train_set, val_set, test_set = create_easy_dataset()
assert sum(map(len, (train_set, val_set, test_set))) == 461

Loaded 461 tasks.
