In [1]:
# %load_ext autoreload
# %autoreload 2

In [2]:
import numpy as np
import matplotlib.pyplot as plt
_include_('curriculum_vqa')

In [3]:
# Dataset locations, all under {DEV_HOME}/curriculum_vqa/data-bin.
# NOTE(review): DEV_HOME is not defined in any visible cell — presumably it is
# provided by the environment or by `_include_`; confirm before a fresh run.
# Full CLEVR v1.0 dataset; filter_dataset() reads questions and scenes from here.
CLEVR_root = f'{DEV_HOME}/curriculum_vqa/data-bin/CLEVR_v1.0'
# Target for a filtered-only copy (only referenced by a commented-out call below).
CLEVR_fil_root = f'{DEV_HOME}/curriculum_vqa/data-bin/CLEVR_small_6'
# Target for the filtered and sub-sampled copy built below.
CLEVR_mini_root = f'{DEV_HOME}/curriculum_vqa/data-bin/CLEVR_mini_6'

# ds = datasets.CLEVR(CLEVR_root, split='val')
# len(x['program']) < 6

In [6]:
import json
import shutil
import random

def filter_dataset(split, target_root, predicate_fn, subsample=None, source_root=None, seed=None):
    """Write a filtered (and optionally sub-sampled) copy of one CLEVR split.

    Reads ``{source_root}/questions/CLEVR_{split}_questions.json``, keeps the
    questions for which ``predicate_fn`` returns a truthy value, optionally
    draws a random subset, and writes the result to the same relative path
    under ``target_root``. The split's scenes file is copied over unchanged.

    Args:
        split: Split name used in the file names (e.g. 'train' or 'val').
        target_root: Directory receiving the ``questions/`` and ``scenes/``
            files; the sub-directories are created if missing.
        predicate_fn: Callable taking one question dict; truthy means "keep".
        subsample: If not None, the number of questions to draw *without
            replacement* from the filtered set. Values larger than the
            filtered set are capped, so the output never contains duplicates.
        source_root: Directory of the source dataset. Defaults to the global
            ``CLEVR_root``.
        seed: If not None, sub-sampling uses a private ``random.Random(seed)``
            so the draw is reproducible; otherwise the module-level ``random``
            state is used.
    """
    # Local import: no visible cell imports `os`, so do not rely on it being global.
    import os

    if source_root is None:
        source_root = CLEVR_root

    with open(f'{source_root}/questions/CLEVR_{split}_questions.json') as f:
        questions = json.load(f)['questions']

    print(f'Loaded {len(questions)} questions.')
    filtered_questions = [x for x in questions if predicate_fn(x)]
    print(f'We have {len(filtered_questions)} questions after filtering.')
    if subsample is not None:
        rng = random if seed is None else random.Random(seed)
        # sample() draws without replacement (choices() would allow the same
        # question to appear several times); cap k so it cannot raise.
        k = min(subsample, len(filtered_questions))
        filtered_questions = rng.sample(filtered_questions, k)
        print(f'We have {len(filtered_questions)} questions after sub-sampling.')

    os.makedirs(f'{target_root}/questions', exist_ok=True)

    with open(f'{target_root}/questions/CLEVR_{split}_questions.json', 'w') as f:
        json.dump({
            'info': 'Filtered',
            'questions': filtered_questions
        }, f)

    os.makedirs(f'{target_root}/scenes', exist_ok=True)

    # Scenes are copied whole; they are not filtered to match the kept questions.
    shutil.copy(
        f'{source_root}/scenes/CLEVR_{split}_scenes.json',
        f'{target_root}/scenes/CLEVR_{split}_scenes.json',
    )

In [7]:
# Build the mini dataset: questions whose program has fewer than 6 steps,
# sub-sampled to a fixed size per split.
for split_name, n_questions in (('train', 3000), ('val', 600)):
    filter_dataset(split_name, CLEVR_mini_root, lambda x: len(x['program']) < 6, subsample=n_questions)

Loaded 699989 questions.
We have 46258 questions after filtering.
We have 3000 questions after sub-sampling.
Loaded 149991 questions.
We have 9887 questions after filtering.
We have 600 questions after sub-sampling.


In [7]:
# filter_dataset('train', CLEVR_fil_root, lambda x: len(x['program']) < 6)

In [8]:
# list(map(lambda x: x['question'], filtered_questions))[:5]

In [9]:
# Load the full validation question list for interactive inspection.
val_questions_path = f'{CLEVR_root}/questions/CLEVR_val_questions.json'
with open(val_questions_path) as questions_file:
    questions = json.load(questions_file)['questions']


In [12]:
# Keep only the questions whose functional program has fewer than 7 steps.
predicate_fn = lambda x: len(x['program']) < 7
filtered_questions = list(filter(predicate_fn, questions))
len(filtered_questions)

21229

In [13]:
# Pick one filtered question to use as a worked example in the cells below.
# NOTE(review): index 44 is hard-coded; which question it selects depends on
# the order of the loaded questions file and on the predicate above.
program_short = filtered_questions[44]
program_short

{'image_index': 29,
 'program': [{'inputs': [], 'function': 'scene', 'value_inputs': []},
  {'inputs': [0], 'function': 'filter_size', 'value_inputs': ['small']},
  {'inputs': [1], 'function': 'filter_material', 'value_inputs': ['metal']},
  {'inputs': [2], 'function': 'unique', 'value_inputs': []},
  {'inputs': [3], 'function': 'same_shape', 'value_inputs': []},
  {'inputs': [4], 'function': 'count', 'value_inputs': []}],
 'question_index': 291,
 'image_filename': 'CLEVR_val_000029.png',
 'question_family_index': 43,
 'split': 'val',
 'answer': '1',
 'question': 'How many other objects are the same shape as the tiny metal thing?'}

In [17]:
import re
    
def tokenize_program(prog_str):
    """Split a program string into a flat list of tokens.

    Each of ``(``, ``)``, ``.`` and ``,`` becomes its own token; whitespace
    separates tokens and is dropped, as are empty fragments.

    Args:
        prog_str: Program string, e.g. the output of ``build_prog_str``.

    Returns:
        List of non-empty, non-whitespace token strings in source order.
    """
    # Raw string: the pattern value is unchanged, but `\(`, `\)` and `\s` are no
    # longer invalid escape sequences in a normal string literal (which newer
    # Python versions warn about). The capture group makes split() keep the
    # delimiters as tokens.
    pieces = re.split(r'([\(\).,\s])', prog_str)
    return [piece for piece in pieces if piece.strip() != '']

def build_prog_str(prog):
    """Render a CLEVR-style functional program as a single call-chain string.

    ``prog`` is a list of step dicts with keys ``'function'``, ``'inputs'``
    (indices of earlier steps) and ``'value_inputs'`` (literal arguments).
    The last step is treated as the answer operation and wraps everything:
    ``answer(chain_1, chain_2, ...)`` with one chain per answer input. Each
    chain is written source-first as ``scene().f('v').g()``.

    Inside a chain, a step's first input is the receiver of the call. Any
    further inputs are rendered as extra arguments, e.g.
    ``scene().filter_size('large').union(scene().filter_color('red'))``, so
    no branch of the program is dropped. For linear chains with at most one
    value input the output is the same as before.

    Args:
        prog: Non-empty list of step dicts as described above.

    Returns:
        The program as a string.

    Raises:
        IndexError: If ``prog`` is empty or a step references a missing index.
    """

    def render_chain(index):
        # Walk backwards along the first-input links, collecting one call per step.
        calls = []
        while True:
            step = prog[index]
            args = [f"'{value}'" for value in step['value_inputs']]
            # Secondary inputs are whole sub-chains of their own; render them
            # as arguments instead of silently discarding them.
            args.extend(render_chain(other) for other in step['inputs'][1:])
            calls.append(f"{step['function']}({', '.join(args)})")
            if len(step['inputs']) == 0:
                break
            index = step['inputs'][0]
        # Calls were collected sink-to-source; emit them source-first.
        return '.'.join(reversed(calls))

    answer_op = prog[-1]
    answer_op_inputs = [render_chain(i) for i in answer_op['inputs']]
    return answer_op['function'] + '(' + ', '.join(answer_op_inputs) + ')'

In [18]:
# Extract the functional program (a list of step dicts) from the example question.
prog = program_short['program']

# Render it as a single call-chain string; the bare name on the last line displays it.
prog_str = build_prog_str(prog)
prog_str

"count(scene().filter_size('small').filter_material('metal').unique().same_shape())"

In [19]:
# Round-trip check: the tokenizer only drops whitespace and empty fragments, so
# joining the tokens of this whitespace-free example reproduces prog_str.
''.join(tokenize_program(prog_str))

"count(scene().filter_size('small').filter_material('metal').unique().same_shape())"