### Overview

In this notebook, we construct a dataset for training a multi-label CNN model. The model's input is one input-output image pair from a single task, and its output is a 160-element binary vector indicating which of the 160 DSL operations the task's solver uses.

In [1]:
import json
import random
import re

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
tqdm.pandas()

def load_json(file_path: str):
    """Read a JSON file and return its parsed content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` (typically a dict or list).
    """
    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(file_path, encoding="utf-8") as f:
        return json.load(f)

In [3]:
# Read the DSL source so the names of its top-level functions can be listed.
with open("hodel/dsl.py", "r") as dsl_handle:
    dsl_source = dsl_handle.read()

# Splitting on "\ndef " yields one chunk per top-level function; the first piece
# (everything before the first definition) is discarded.
dsl_text_file = dsl_source.split("\ndef ")[1:]

# Each chunk starts with "<name>(...", so the name is the text before the first "(".
functions = [chunk.split('(')[0] for chunk in dsl_text_file]

print(f"Functions ({len(functions)}): {functions}")

Functions (160): ['identity', 'add', 'subtract', 'multiply', 'divide', 'invert', 'even', 'double', 'halve', 'flip', 'equality', 'contained', 'combine', 'intersection', 'difference', 'dedupe', 'order', 'repeat', 'greater', 'size', 'merge', 'maximum', 'minimum', 'valmax', 'valmin', 'argmax', 'argmin', 'mostcommon', 'leastcommon', 'initset', 'both', 'either', 'increment', 'decrement', 'crement', 'sign', 'positive', 'toivec', 'tojvec', 'sfilter', 'mfilter', 'extract', 'totuple', 'first', 'last', 'insert', 'remove', 'other', 'interval', 'astuple', 'product', 'pair', 'branch', 'compose', 'chain', 'matcher', 'rbind', 'lbind', 'power', 'fork', 'apply', 'rapply', 'mapply', 'papply', 'mpapply', 'prapply', 'mostcolor', 'leastcolor', 'height', 'width', 'shape', 'portrait', 'colorcount', 'colorfilter', 'sizefilter', 'asindices', 'ofcolor', 'ulcorner', 'urcorner', 'llcorner', 'lrcorner', 'crop', 'toindices', 'recolor', 'shift', 'normalize', 'dneighbors', 'ineighbors', 'neighbors', 'objects', 'partit

In [4]:
# Build one multi-hot label vector per solver: position j is 1 when the solver's
# source code references the j-th DSL function in `functions`.
with open("hodel/solvers.py", "r") as f:
    solvers_file = f.read()

# One chunk per top-level `solve_<task_id>` definition; the text before the first
# definition is discarded.
solvers_strings = solvers_file.split("\ndef ")[1:]

# One label per DSL function, derived from the list instead of being hard-coded.
NB_LABELS = len(functions)

# Whole Python identifiers only. A plain substring test would also fire on longer
# names that merely contain a function name (e.g. "box" inside "inbox"/"outbox",
# "apply" inside "rapply") and on hex task ids in the `solve_<task_id>` header
# (e.g. an id containing "add").
IDENTIFIER_PATTERN = re.compile(r"\b[A-Za-z_]\w*\b")

solver_labels = {}

for string in solvers_strings:
    taskId = string.split('solve_')[1].split('(')[0]

    # `solve_<task_id>` is a single identifier token, so the id itself can never
    # be mistaken for a DSL function name.
    used_names = set(IDENTIFIER_PATTERN.findall(string))
    solver_labels[taskId] = [int(name in used_names) for name in functions]

print(len(solver_labels))

400


In [8]:
# One row per task, one 0/1 column per DSL function.
labels_by_task = pd.DataFrame(solver_labels).T
labels_by_task.columns = functions

# Expose the task id (currently the index) as a regular column placed first.
labels_by_task['task_id'] = labels_by_task.index.values
column_order = ['task_id'] + list(functions)
dataset = labels_by_task.reset_index(drop=True)[column_order]

dataset.to_csv('hodel/function_dataset.csv', index=False)
dataset.head()

Unnamed: 0,task_id,identity,add,subtract,multiply,divide,invert,even,double,halve,...,gravitate,inbox,outbox,box,shoot,occurrences,frontiers,compress,hperiod,vperiod
0,67a3c6ac,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,68b16354,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,74dd1130,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3c9b0459,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,6150a2bd,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Load the ARC challenge/solution JSON files for the training and evaluation splits.
base_path = 'arc-prize-2024/'
train_tasks, train_sols, eval_tasks, eval_sols = [
    load_json(base_path + file_name)
    for file_name in (
        'arc-agi_training_challenges.json',
        'arc-agi_training_solutions.json',
        'arc-agi_evaluation_challenges.json',
        'arc-agi_evaluation_solutions.json',
    )
]

In [10]:
# Inspect one raw task: a dict with 'train' and 'test' lists whose entries hold
# 'input' (and, for train pairs, 'output') grids as nested lists of ints.
train_tasks['007bbfb7']

{'test': [{'input': [[7, 0, 7], [7, 0, 7], [7, 7, 0]]}],
 'train': [{'input': [[0, 7, 7], [7, 7, 7], [0, 7, 7]],
   'output': [[0, 0, 0, 0, 7, 7, 0, 7, 7],
    [0, 0, 0, 7, 7, 7, 7, 7, 7],
    [0, 0, 0, 0, 7, 7, 0, 7, 7],
    [0, 7, 7, 0, 7, 7, 0, 7, 7],
    [7, 7, 7, 7, 7, 7, 7, 7, 7],
    [0, 7, 7, 0, 7, 7, 0, 7, 7],
    [0, 0, 0, 0, 7, 7, 0, 7, 7],
    [0, 0, 0, 7, 7, 7, 7, 7, 7],
    [0, 0, 0, 0, 7, 7, 0, 7, 7]]},
  {'input': [[4, 0, 4], [0, 0, 0], [0, 4, 0]],
   'output': [[4, 0, 4, 0, 0, 0, 4, 0, 4],
    [0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 4, 0, 0, 0, 0, 0, 4, 0],
    [0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 4, 0, 4, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 4, 0, 0, 0, 0]]},
  {'input': [[0, 0, 0], [0, 0, 2], [2, 0, 2]],
   'output': [[0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 0, 0, 2],


## Generate augmented data


We will augment the training data by:
- Randomly placing the image onto a 40x40 canvas
- Flip up-down or left-right
- Remapping colors 1-9 with a random permutation (color 0 is left unchanged)

Flipping may alter a task's exact logic, but the effect is negligible: it only changes sub-categories of similar moves (e.g. move up vs. move down), not the general categories (e.g. connect vs. extend).

In [41]:
MAX_WIDTH = 40           # canvas extent along array axis 0
MAX_HEIGHT = 40          # canvas extent along array axis 1
NB_TIMES_SHUFFLES = 10   # colour permutations per task (index 0 keeps the original colours)
NB_TIMES_ROLLS = 6       # flip/placement variants per pair (index 0 is unflipped, unshifted)
PAD_VALUE = -1           # value of canvas cells not covered by the grid
SEED = 42

# Seed both RNGs used below so the generated dataset is reproducible.
random.seed(SEED)
np.random.seed(SEED)


def _remap_colors(grid, color_map):
    """Return a copy of `grid` in which every value k is replaced by color_map[k]."""
    remapped = np.copy(grid)
    for old_color, new_color in color_map.items():
        # Mask on the untouched `grid` so that successive replacements do not chain.
        remapped[grid == old_color] = new_color
    return remapped


def _flip_variant(grid, variant):
    """Return `grid` unchanged (0), flipped up-down (1) or flipped left-right (2)."""
    if variant == 1:  # flip up down for diversity, may break some task logics but negligible
        return np.flipud(grid)
    if variant == 2:  # flip left right for diversity, may break some task logics but negligible
        return np.fliplr(grid)
    return grid


def _random_offset(free_cells):
    """Draw a shift in [1, free_cells - 1]; return 0 when no such shift exists.

    `np.random.randint(1, high)` raises ValueError when high <= 1, so grids that
    leave fewer than two free cells are simply not shifted.
    """
    if free_cells <= 1:
        return 0
    return np.random.randint(1, free_cells)


def _place_on_canvas(grid, randomize):
    """Pad `grid` onto a (MAX_WIDTH, MAX_HEIGHT) canvas and repeat it over 3 channels.

    The grid is written to the top-left corner; when `randomize` is true the canvas
    is then rolled by a random offset small enough that the grid does not wrap around.
    Returns an int8 array of shape (MAX_WIDTH, MAX_HEIGHT, 3).
    """
    if grid.shape[0] > MAX_WIDTH or grid.shape[1] > MAX_HEIGHT:
        raise ValueError(
            f"grid of shape {grid.shape} does not fit a {MAX_WIDTH}x{MAX_HEIGHT} canvas"
        )
    # Values are PAD_VALUE or colours 0-9, so int8 is sufficient and keeps memory low.
    canvas = np.full((MAX_WIDTH, MAX_HEIGHT), PAD_VALUE, dtype=np.int8)
    canvas[:grid.shape[0], :grid.shape[1]] = grid
    if randomize:
        roll_x = _random_offset(MAX_WIDTH - grid.shape[0])
        roll_y = _random_offset(MAX_HEIGHT - grid.shape[1])
        canvas = np.roll(canvas, (roll_x, roll_y), axis=(0, 1))
    return np.repeat(canvas[:, :, np.newaxis], 3, axis=2)  # make 3 channels for image convention


img_ids = []
imgs = []
tids = []

original_colors = [1, 2, 3, 4, 5, 6, 7, 8, 9]

for tid in tqdm(train_tasks.keys()):

    task = train_tasks[tid]

    for j0 in range(NB_TIMES_SHUFFLES):
        # Create a mapping to randomly change color values (except 0), apply this mapping
        # for all pairs of input-output for consistency
        shuffled_colors = list(original_colors)
        if j0 != 0:  # keep the original version at j0=0
            random.shuffle(shuffled_colors)
        color_map = dict(zip(original_colors, shuffled_colors))

        for j_pair, pair in enumerate(task['train']):
            img_in_shuffled = _remap_colors(np.array(pair['input']), color_map)
            img_out_shuffled = _remap_colors(np.array(pair['output']), color_map)

            # Randomly roll the input and output images on a canvas of 40x40.
            # Input and output draw their offsets independently of each other.
            for j1 in range(NB_TIMES_ROLLS):
                variant = j1 % 3
                randomize = j1 != 0  # j1 == 0 keeps the grid in the top-left corner

                img_in_pad_rolled = _place_on_canvas(
                    _flip_variant(img_in_shuffled, variant), randomize)
                img_out_pad_rolled = _place_on_canvas(
                    _flip_variant(img_out_shuffled, variant), randomize)

                # shape for each pair [2, 40, 40, 3]
                pair_img = np.stack([img_in_pad_rolled, img_out_pad_rolled], axis=0)

                imgs.append(pair_img)
                img_ids.append(f"{tid}_pair{j_pair}_shuf{j0}_roll{j1}")
                tids.append(tid)

  0%|          | 0/400 [00:00<?, ?it/s]

In [42]:
# Convert the list of samples straight to int8 so that no full-size intermediate array
# in a wider integer dtype is materialised before the downcast.
# Final shape (samples, 2 imgs (in-out), 40 (w), 40 (h), 3 (channels))
imgs_np = np.asarray(imgs, dtype=np.int8)
print('imgs_np.shape:', imgs_np.shape)

# One metadata row per sample, in the same order as the first axis of imgs_np.
df = pd.DataFrame({'task_id': tids, 'image_id': img_ids})
assert len(df) == imgs_np.shape[0], "metadata rows and image samples are out of sync"
df.head()

imgs_np.shape: (78120, 2, 40, 40, 3)


Unnamed: 0,task_id,image_id
0,007bbfb7,007bbfb7_pair0_shuf0_roll0
1,007bbfb7,007bbfb7_pair0_shuf0_roll1
2,007bbfb7,007bbfb7_pair0_shuf0_roll2
3,007bbfb7,007bbfb7_pair0_shuf0_roll3
4,007bbfb7,007bbfb7_pair0_shuf0_roll4


In [43]:
# Show the min, max and mean number of image pairs per task id.
pairs_per_task = df.groupby('task_id').count()
pairs_per_task.min(), pairs_per_task.max(), pairs_per_task.mean()

(image_id    120
 dtype: int64,
 image_id    600
 dtype: int64,
 image_id    195.3
 dtype: float64)

In [44]:
# Persist the sample metadata (CSV) and the image tensor (NumPy binary) side by side.
metadata_path = 'hodel/augmented_image_dataset_metadata.csv'
image_data_path = 'hodel/augmented_image_dataset_imageData.npy'

df.to_csv(metadata_path, index=False)
np.save(image_data_path, imgs_np)