modify submit file
- https://www.kaggle.com/meaninglesslives/using-decision-trees-for-arc
- https://www.kaggle.com/davidbnn92/task-tagging

In [1]:
import numpy as np
import pandas as pd
import os
import json
from pathlib import Path
import matplotlib.pyplot as plt
from matplotlib import colors
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [2]:
data_path = Path('/kaggle/input/abstraction-and-reasoning-challenge/')
training_path = data_path / 'training'
evaluation_path = data_path / 'evaluation'
test_path = data_path / 'test'

In [3]:
def flattener(pred):
    str_pred = str([row for row in pred])
    str_pred = str_pred.replace(', ', '')
    str_pred = str_pred.replace('[[', '|')
    str_pred = str_pred.replace('][', '|')
    str_pred = str_pred.replace(']]', '|')
    return str_pred

# task tagging

In [4]:
def create_df(folder_path):
    task_names_list = sorted(os.listdir(folder_path))
    task_list = []
    for task_name in task_names_list: 
        task_file = str(folder_path / task_name)
        with open(task_file, 'r') as f:
            task = json.load(f)
            task_list.append(task)
    
    df = pd.DataFrame()
    df['task_name'] = task_names_list
    df['task'] = task_list
    df['number_of_train_pairs'] = df['task'].apply(lambda x: len(x['train']))
    df['number_of_test_pairs'] = df['task'].apply(lambda x: len(x['test']))
    
    # Compare image sizes
    df['inputs_all_have_same_height'] = df['task'].apply(
        lambda task: int(len(set([len(example['input']) for example in task['train']+task.get('test')])) == 1)
    )
    df['inputs_all_have_same_width'] = df['task'].apply(
        lambda task: int(len(set([len(example['input'][0]) for example in task['train']+task.get('test')])) == 1)
    )
    df['inputs_all_have_same_shape'] = df['inputs_all_have_same_height'] * df['inputs_all_have_same_width']
    #df['input_height_if_constant'] = df['task'].apply(
    #    lambda task: len(task['train'][0]['input'])
    #                 if (len(set([len(example['input']) for example in task['train']+task.get('test')])) == 1)
    #                 else np.nan
    #)
    #df['input_width_if_constant'] = df['task'].apply(
    #    lambda task: len(task['train'][0]['input'][0])
    #                 if (len(set([len(example['input'][0]) for example in task['train']+task.get('test')])) == 1)
    #                 else np.nan
    #)
    #df['outputs_all_have_same_height'] = df['task'].apply(
    #    lambda task: int(len(set([len(example['output']) for example in task['train']+task.get('test')])) == 1)
    #)
    #df['outputs_all_have_same_width'] = df['task'].apply(
    #    lambda task: int(len(set([len(example['output'][0]) for example in task['train']+task.get('test')])) == 1)
    #)
    #df['outputs_all_have_same_shape'] = df['outputs_all_have_same_height'] * df['outputs_all_have_same_width']
    #df['output_height_if_constant'] = df['task'].apply(
    #    lambda task: len(task['train'][0]['output'])
    #                 if (len(set([len(example['output']) for example in task['train']+task.get('test')])) == 1)
    #                 else np.nan
    #)
    #df['output_width_if_constant'] = df['task'].apply(
    #    lambda task: len(task['train'][0]['output'][0])
    #                 if (len(set([len(example['output'][0]) for example in task['train']+task.get('test')])) == 1)
    #                 else np.nan
    #)  
    #df['in_each_pair_shape_doesnt_change'] = df['task'].apply(
    #    lambda task: np.prod([int(len(example['input'][0])==len(example['output'][0])
    #                              and len(example['input'])==len(example['output'])
    #                             ) for example in task['train']+task.get('test')
    #                        ])
    #)
    #df['in_each_pair_shape_ratio_is_the_same'] = df['task'].apply(
    #    lambda task: (len(set([len(example['input'][0]) / len(example['output'][0])
    #                             for example in task['train']+task.get('test')]))==1) * (
    #                  len(set([len(example['input']) / len(example['output'])
    #                             for example in task['train']+task.get('test')]))==1)
    #)
    #df['o/i_height_ratio_if_constant'] = df['task'].apply(
    #    lambda task: len(task['train'][0]['output']) / len(task['train'][0]['input'])
    #                 if (len(set([len(example['input']) / len(example['output'])
    #                             for example in task['train']+task.get('test')]))==1)
    #                 else np.nan
    #)
    #df['o/i_width_ratio_if_constant'] = df['task'].apply(
    #    lambda task: len(task['train'][0]['output'][0]) / len(task['train'][0]['input'][0])
    #                 if (len(set([len(example['input'][0]) / len(example['output'][0])
    #                             for example in task['train']+task.get('test')]))==1)
    #                 else np.nan
    #)
    
    # my idea ---------
    df["same_color_sum"] = df['task'].apply(lambda task: 1 
                        if sum(sum(np.array(task['train'][0]['input'])))== sum(sum(np.array(task['train'][0]['output'])))
                        else 0)
    
    df["color_sum_ratio"] = df['task'].apply(lambda task: sum(sum(np.array(task['train'][0]['input'])))/sum(sum(np.array(task['train'][0]['output'])))
                        )

    df["color_kind_diff"] = df["task"].apply(lambda task: len(np.unique(np.array(task['train'][0]['input'])))
                                             -len(np.unique(np.array(task['train'][0]['output']))))
    

    return df

In [5]:
training_descriptive_df = create_df(training_path)
evaluation_descriptive_df = create_df(evaluation_path)
test_descriptive_df = create_df(test_path)
training_descriptive_df.head()

Unnamed: 0,task_name,task,number_of_train_pairs,number_of_test_pairs,inputs_all_have_same_height,inputs_all_have_same_width,inputs_all_have_same_shape,same_color_sum,color_sum_ratio,color_kind_diff
0,007bbfb7.json,"{'test': [{'input': [[7, 0, 7], [7, 0, 7], [7,...",5,1,1,1,1,0,0.142857,0
1,00d62c1b.json,"{'train': [{'input': [[0, 0, 0, 0, 0, 0], [0, ...",5,1,0,0,0,0,0.692308,-1
2,017c7c7b.json,"{'train': [{'input': [[0, 1, 0], [1, 1, 0], [0...",3,1,1,1,1,0,0.346154,0
3,025d127b.json,"{'train': [{'input': [[0, 0, 0, 0, 0, 0, 0, 0,...",2,1,0,0,0,0,0.956522,0
4,045e512c.json,"{'train': [{'input': [[0, 0, 0, 0, 0, 0, 0, 0,...",3,1,1,1,1,0,0.429348,0


# feature engineering and learning

In [6]:
def neighbours(color, cur_row, cur_col, nrows, ncols, radius):

    if cur_row<=radius-1: top = -1
    else: top = color[cur_row-radius][cur_col]
        
    if cur_row>=nrows-radius: bottom = -1
    else: bottom = color[cur_row+radius][cur_col]
        
    if cur_col<=radius-1: left = -1
    else: left = color[cur_row][cur_col-radius]
        
    if cur_col>=ncols-radius: right = -1
    else: right = color[cur_row][cur_col+radius]
        
    return top, bottom, left, right

def get_tl_tr(color, cur_row, cur_col, nrows, ncols, radius):
        
    if cur_row<=radius-1:
        top_left = -1
        top_right = -1
    else:
        if cur_col<=radius-1: top_left=-1
        else: top_left = color[cur_row-radius][cur_col-radius]
        if cur_col>=ncols-radius: top_right=-1
        else: top_right = color[cur_row-radius][cur_col+radius]   
        
    return top_left, top_right

def get_bl_br(color, cur_row, cur_col, nrows, ncols, radius):
        
    if cur_row>=nrows-radius:
        bottom_left = -1
        bottom_right = -1
    else:
        if cur_col<=radius-1: bottom_left=-1
        else: bottom_left = color[cur_row+radius][cur_col-radius]
        if cur_col>=ncols-radius: bottom_right=-1
        else: bottom_right = color[cur_row+radius][cur_col+radius]   
        
    return bottom_left, bottom_right

def make_features(input_color, nfeat):
    nrows, ncols = input_color.shape
    feat = np.zeros((nrows*ncols,nfeat))
    cur_idx = 0
    for i in range(nrows):
        for j in range(ncols):
            feat[cur_idx,0] = i
            feat[cur_idx,1] = j
            feat[cur_idx,2] = input_color[i][j]
            feat[cur_idx,3:7] = neighbours(input_color, i, j, nrows, ncols,1)
            feat[cur_idx,7:9] = get_tl_tr(input_color, i, j, nrows, ncols,1)
            feat[cur_idx,9] = len(np.unique(input_color[i,:]))
            feat[cur_idx,10] = len(np.unique(input_color[:,j]))
            feat[cur_idx,11] = (i+j)
            feat[cur_idx,12] = len(np.unique(input_color[i-local_neighb:i+local_neighb,
                                                         j-local_neighb:j+local_neighb]))
            
            feat[cur_idx,13:15] = get_bl_br(input_color, i, j, nrows, ncols,1)
            feat[cur_idx,15] = np.sum(input_color[i,:])
            feat[cur_idx,16] = np.sum(input_color[:,j])
            feat[cur_idx,17:21] = neighbours(input_color, i, j, nrows, ncols,2)
            #feat[cur_idx,21:23] = get_tl_tr(input_color, i, j, nrows, ncols,2)
            #feat[cur_idx,23:25] = get_bl_br(input_color, i, j, nrows, ncols,2)
            cur_idx += 1
        
    return feat

In [7]:
def features(task, mode='train'):
    num_train_pairs = len(task[mode])
    feat, target = [], []
    
    global local_neighb
    for task_num in range(num_train_pairs):
        input_color = np.array(task[mode][task_num]['input'])
        target_color = task[mode][task_num]['output']
        nrows, ncols = len(task[mode][task_num]['input']), len(task[mode][task_num]['input'][0])

        target_rows, target_cols = len(task[mode][task_num]['output']), len(task[mode][task_num]['output'][0])
        
        if (target_rows!=nrows) or (target_cols!=ncols):
            not_valid=1
            return None, None, 1

        imsize = nrows*ncols
        offset = imsize*task_num*3 #since we are using three types of aug
        feat.extend(make_features(input_color, nfeat))
        target.extend(np.array(target_color).reshape(-1,))
            
    return np.array(feat), np.array(target), 0

In [8]:
nfeat = 21
local_neighb = 5
def modelling(mode):
    print(mode)
    sample_sub = pd.read_csv(data_path/'sample_submission.csv')
    sample_sub = sample_sub.set_index('output_id')
    
    valid_scores = {}
    model_accuracies = {'ens': []}
    pred_taskids = []
    
    if mode=='eval':
        task_path = evaluation_path
    elif mode=='train':
        task_path = training_path
    elif mode=='test':
        task_path = test_path
    all_task_ids = sorted(os.listdir(task_path))

    # training -----
    for task_id in all_task_ids:
        task_file = str(task_path / task_id)
        with open(task_file, 'r') as f:
            task = json.load(f)

        feat, target, not_valid = features(task)
        if not_valid:
            print('ignoring task', task_file)
            not_valid = 0
            continue

        model = XGBClassifier(n_estimators=50, max_depth = 5, n_jobs=-1)
        model.fit(feat, target, verbose=-1)
        #model = LGBMClassifier(n_estimators=50, max_depth=10, n_jobs=-1)
        #model.fit(feat, target, verbose=-1)
    # training on input pairs is done
    
    # test predictions begins here
        num_test_pairs = len(task['test'])
        for task_num in range(num_test_pairs):
            cur_idx = 0
            input_color = np.array(task['test'][task_num]['input'])
            nrows, ncols = len(task['test'][task_num]['input']), len(
                task['test'][task_num]['input'][0])
            feat = make_features(input_color, nfeat)
            #print('Made predictions for ', task_id[:-5])
            preds = model.predict(feat).reshape(nrows,ncols)
        
            if (mode=='train') or (mode=='eval'):
                ens_acc = (np.array(task['test'][task_num]['output'])==preds).sum()/(nrows*ncols)
                model_accuracies['ens'].append(ens_acc)
                pred_taskids.append(f'{task_id[:-5]}_{task_num}')
                #print('ensemble accuracy',(np.array(task['test'][task_num]['output'])==preds).sum()/(nrows*ncols))
            
            preds = preds.astype(int).tolist()
            #plot_test(preds, task_id)
            sample_sub.loc[f'{task_id[:-5]}_{task_num}','output'] = flattener(preds)
    return sample_sub, model_accuracies, pred_taskids

train_sub, train_accuracies, train_ids= modelling('train')
eval_sub, eval_accuracies, eval_ids  = modelling('eval')
test_sub, _, _ = modelling('test')

train
ignoring task /kaggle/input/abstraction-and-reasoning-challenge/training/007bbfb7.json
ignoring task /kaggle/input/abstraction-and-reasoning-challenge/training/017c7c7b.json
ignoring task /kaggle/input/abstraction-and-reasoning-challenge/training/0520fde7.json
ignoring task /kaggle/input/abstraction-and-reasoning-challenge/training/0b148d64.json
ignoring task /kaggle/input/abstraction-and-reasoning-challenge/training/10fcaaa3.json
ignoring task /kaggle/input/abstraction-and-reasoning-challenge/training/1190e5a7.json
ignoring task /kaggle/input/abstraction-and-reasoning-challenge/training/137eaa0f.json
ignoring task /kaggle/input/abstraction-and-reasoning-challenge/training/1b2d62fb.json
ignoring task /kaggle/input/abstraction-and-reasoning-challenge/training/1c786137.json
ignoring task /kaggle/input/abstraction-and-reasoning-challenge/training/1cf80156.json
ignoring task /kaggle/input/abstraction-and-reasoning-challenge/training/1f85a75f.json
ignoring task /kaggle/input/abstracti

In [9]:
df = pd.DataFrame(train_accuracies, index=train_ids)
for c in df.columns:
    print(f'for {c} no. of complete training tasks is', (df.loc[:, c]==1).sum())

df = pd.DataFrame(eval_accuracies, index=eval_ids)
for c in df.columns:
    print(f'for {c} no. of complete evaluation tasks is', (df.loc[:, c]==1).sum())

for ens no. of complete training tasks is 29
for ens no. of complete evaluation tasks is 8


In [10]:
test_sub.reset_index(drop=False).to_csv('submission.csv')
test_sub.reset_index(drop=False).head()

Unnamed: 0,output_id,output
0,00576224_0,|32|78| |32|78| |00|00|
1,009d5c81_0,|00000000000000|00000222272222|00000200020302|...
2,00dbd492_0,|00888000000222220000|02222222220233320000|028...
3,03560426_0,|8188000000|8188000000|8282000000|8277000000|0...
4,05a7bcf2_0,|000000000020000000080000000000|00004444488888...
