- change tag and classification rules
- focus on xgboost
- https://www.kaggle.com/meaninglesslives/using-decision-trees-for-arc
- https://www.kaggle.com/davidbnn92/task-tagging
- https://www.kaggle.com/nxrprime/grid-search-with-xgboost-and-cv

In [1]:
import os
import json
import numpy as np
import pandas as pd
from pathlib import Path
from matplotlib import colors
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from itertools import permutations
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [2]:
data_path = Path('/kaggle/input/abstraction-and-reasoning-challenge/')
training_path = data_path / 'training'
evaluation_path = data_path / 'evaluation'
test_path = data_path / 'test'

In [3]:
def preprocess(task, index): # modify mistakes in data
    # 025d127b
    if index == "025d127b.json":
        for i in range(9, 12):
            for j in range(3, 8):
                task['train'][0]['output'][i][j] = 0
        for i in range(7, 10):
            for j in range(3, 6):
                task['train'][0]['output'][i][j] = 2
        task['train'][0]['output'][8][4] = 0
    # ef135b50
    elif index == "ef135b50.json":
        task['test'][0]['output'][6][4] = 9
    # bd14c3bf
    elif index == "bd14c3bf.json":
        for i in range(3):
            for j in range(5):
                if task['test'][0]['input'][i][j] == 1:
                    task['test'][0]['input'][i][j] = 2
    # a8610ef7
    elif index == "a8610ef7.json":
        for i in range(6):
            for j in range(6):
                if task['test'][0]['output'][i][j] == 8:
                    task['test'][0]['output'][i][j] = 5
        task['train'][3]['input'][0][1] = 2
        task['train'][3]['input'][5][1] = 2
    # 54db823b
    elif index == "54db823b.json":
        task['train'][0]['output'][2][3] = 3
        task['train'][0]['output'][2][4] = 9
    # e5062a87
    elif index == "e5062a87.json":
        for j in range(3, 7):
            task['train'][1]['output'][1][j] = 2
    # 1b60fb0c
    elif index == "1b60fb0c.json":
        task['train'][1]['output'][8][8] = 0
        task['train'][1]['output'][8][9] = 0
    # 82819916
    elif index == "82819916.json":
        task['train'][0]['output'][4][5] = 4
    # fea12743
    elif index == "fea12743.json":
        for i in range(11, 16):
            for j in range(6):
                if task['train'][0]['output'][i][j] == 2:
                    task['train'][0]['output'][i][j] = 8
    # 42a50994
    elif index == "42a50994.json":
        task['train'][0]['output'][1][0] = 8
        task['train'][0]['output'][0][1] = 8
    # f8be4b64
    elif index == "f8be4b64.json":
        for j in range(19):
            if task['test'][0]['output'][12][j] == 0:
                task['test'][0]['output'][12][j] = 1
        task['test'][0]['output'][12][8] = 0
    # d511f180
    elif index == "d511f180.json":
        task['train'][1]['output'][2][2] = 9
    # 10fcaaa3
    elif index == "10fcaaa3.json":
        task['train'][1]['output'][4][7] = 8
    # cbded52d
    elif index == "cbded52d.json":
        task['train'][0]['input'][4][6] = 1
    # 11852cab
    elif index == "11852cab.json":
        task['train'][0]['input'][1][2] = 3
    # 868de0fa
    elif index == "868de0fa.json":
        for j in range(2, 9):
            task['train'][2]['input'][9][j] = 0
            task['train'][2]['input'][10][j] = 1
            task['train'][2]['input'][15][j] = 0
            task['train'][2]['input'][16][j] = 1
        task['train'][2]['input'][15][2] = 1
        task['train'][2]['input'][15][8] = 1
    # 6d58a25d
    elif index == "6d58a25d.json":
        task['train'][0]['output'][10][0] = 0
        task['train'][2]['output'][6][13] = 4
    # a9f96cdd
    elif index == "a9f96cdd.json":
        task['train'][3]['output'][1][3] = 0
    # 48131b3c
    elif index == "48131b3c.json":
        task['train'][2]['output'][4][4] = 0
    # 150deff5
    elif index == "150deff5.json":
        aux = task['train'][2]['output'].copy()
        task['train'][2]['output'] = task['train'][2]['input'].copy()
        task['train'][2]['input'] = aux
    # 17cae0c1
    elif index == "17cae0c1.json":
        for i in range(3):
            for j in range(3, 6):
                task['test'][0]['output'][i][j] = 9
    # e48d4e1a
    elif index == "e48d4e1a.json":
        task['train'][3]['input'][0][9] = 5
        task['train'][3]['output'][0][9] = 0
    # 8fbca751
    elif index == "8fbca751.json":
        task['train'][1]['output'][1][3] = 2
        task['train'][1]['output'][2][3] = 8
    # 4938f0c2
    elif index == "4938f0c2.json":
        for i in range(12):
            for j in range(6,13):
                if task['train'][2]['input'][i][j]==2:
                    task['train'][2]['input'][i][j] = 0
        for i in range(5,11):
            for j in range(7):
                if task['train'][2]['input'][i][j]==2:
                    task['train'][2]['input'][i][j] = 0
    # 9aec4887
    elif index == "9aec4887.json":
        task['train'][0]['output'][1][4] = 8
    # b0f4d537
    elif index == "b0f4d537.json":
        for i in range(9):
            task['train'][0]['output'][i][3] = 0
            task['train'][0]['output'][i][4] = 1
        task['train'][0]['output'][2][3] = 3
        task['train'][0]['output'][2][4] = 3
        task['train'][0]['output'][5][3] = 2
    # aa300dc3
    elif index == "aa300dc3.json":
        task['train'][1]['input'][1][7] = 5
        task['train'][1]['output'][1][7] = 5
        task['train'][1]['input'][8][2] = 5
        task['train'][1]['output'][8][2] = 5
    # ad7e01d0
    elif index == "ad7e01d0.json":
        task['train'][0]['output'][6][7] = 0
    # a8610ef7
    elif index == "a8610ef7.json":
        task['train'][3]['input'][0][1] = 0
        task['train'][3]['input'][5][1] = 0
        task['train'][3]['output'][0][1] = 0
        task['train'][3]['output'][5][1] = 0
    # 97239e3d
    elif index == "97239e3d.json":
        task['test'][0]['input'][14][6] = 0
        task['test'][0]['input'][14][10] = 0
    # d687bc17
    elif index == "d687bc17.json":
        task['train'][2]['output'][7][1] = 4
    return task

In [4]:
def flattener(pred):
    str_pred = str([row for row in pred])
    str_pred = str_pred.replace(', ', '')
    str_pred = str_pred.replace('[[', '|')
    str_pred = str_pred.replace('][', '|')
    str_pred = str_pred.replace(']]', '|')
    return str_pred

# task tagging

In [5]:
def color_check(color_list):
    tmp = color_list[0]
    for i in range(1,len(color_list)):
        if set(tmp) != set(color_list[i]):
            return False
    return True

def create_df(folder_path):
    task_names_list = sorted(os.listdir(folder_path))
    task_list = []
    for task_name in task_names_list: 
        task_file = str(folder_path / task_name)
        with open(task_file, 'r') as f:
            task = json.load(f)
            if "test" not in str(folder_path):
                task = preprocess(task, task_name)
            task_list.append(task)
    
    df = pd.DataFrame()
    df['task_name'] = task_names_list
    df['task'] = task_list
    df['number_of_train_pairs'] = df['task'].apply(lambda x: len(x['train']))
    df['number_of_test_pairs'] = df['task'].apply(lambda x: len(x['test']))
    
    # Compare image sizes
    df['inputs_all_have_same_height'] = df['task'].apply(
        lambda task: int(len(set([len(example['input']) for example in task['train']])) == 1)
    )
    df['inputs_all_have_same_width'] = df['task'].apply(
        lambda task: int(len(set([len(example['input'][0]) for example in task['train']])) == 1)
    )
    df['inputs_all_have_same_shape'] = df['inputs_all_have_same_height'] * df['inputs_all_have_same_width']
    df['input_height_if_constant'] = df['task'].apply(
        lambda task: len(task['train'][0]['input'])
                     if (len(set([len(example['input']) for example in task['train']])) == 1)
                     else np.nan
    )
    df['input_width_if_constant'] = df['task'].apply(
        lambda task: len(task['train'][0]['input'][0])
                     if (len(set([len(example['input'][0]) for example in task['train']])) == 1)
                     else np.nan
    )
    df['outputs_all_have_same_height'] = df['task'].apply(
        lambda task: int(len(set([len(example['output']) for example in task['train']])) == 1)
    )
    df['outputs_all_have_same_width'] = df['task'].apply(
        lambda task: int(len(set([len(example['output'][0]) for example in task['train']])) == 1)
    )
    df['outputs_all_have_same_shape'] = df['outputs_all_have_same_height'] * df['outputs_all_have_same_width']
    df['output_height_if_constant'] = df['task'].apply(
        lambda task: len(task['train'][0]['output'])
                     if (len(set([len(example['output']) for example in task['train']])) == 1)
                     else np.nan
    )
    df['output_width_if_constant'] = df['task'].apply(
        lambda task: len(task['train'][0]['output'][0])
                     if (len(set([len(example['output'][0]) for example in task['train']])) == 1)
                     else np.nan
    )  
    df['in_each_pair_shape_doesnt_change'] = df['task'].apply(
        lambda task: np.prod([int(len(example['input'][0])==len(example['output'][0])
                                  and len(example['input'])==len(example['output'])
                                 ) for example in task['train']
                            ])
    )
    df['in_each_pair_shape_ratio_is_the_same'] = df['task'].apply(
        lambda task: (len(set([len(example['input'][0]) / len(example['output'][0])
                                 for example in task['train']]))==1) * (
                      len(set([len(example['input']) / len(example['output'])
                                 for example in task['train']]))==1)
    )
    df['o/i_height_ratio_if_constant'] = df['task'].apply(
        lambda task: len(task['train'][0]['output']) / len(task['train'][0]['input'])
                     if (len(set([len(example['input']) / len(example['output'])
                                 for example in task['train']]))==1)
                     else np.nan
    )
    df['o/i_width_ratio_if_constant'] = df['task'].apply(
        lambda task: len(task['train'][0]['output'][0]) / len(task['train'][0]['input'][0])
                     if (len(set([len(example['input'][0]) / len(example['output'][0])
                                 for example in task['train']]))==1)
                     else np.nan
    )
    
    # my idea ---------
    # **
    df["input_color_change_or_not"] = df['task'].apply(lambda task: color_check([list(np.unique(np.array(example['input']))) for example in task["train"]] ))
    # **
    df['color_kind_increase'] = df['task'].apply(
        lambda task: np.all([len(np.unique(np.array(example['input']))) < len(np.unique(np.array(example['output']))) for example in task['train']]))
    df['color_kind_decrease'] = df['task'].apply(
        lambda task: np.all([len(np.unique(np.array(example['input']))) > len(np.unique(np.array(example['output']))) for example in task['train']]))
    return df

training_descriptive_df = create_df(training_path)
evaluation_descriptive_df = create_df(evaluation_path)
test_descriptive_df = create_df(test_path)

In [6]:
def classification(row):
    # same shape and color doesn't change in input and color kind decrease
    if row["in_each_pair_shape_doesnt_change"] == 1 and row["o/i_height_ratio_if_constant"] ==1 and row["o/i_width_ratio_if_constant"]==1 and row.input_color_change_or_not==1 and row.color_kind_decrease==1:
        return 1
    # same shape and color doesn't change in input and color kind increase
    elif row["in_each_pair_shape_doesnt_change"] == 1 and row["o/i_height_ratio_if_constant"] ==1 and row["o/i_width_ratio_if_constant"]==1 and row.input_color_change_or_not==1 and row.color_kind_increase==1:
        return 2
    # same shape and color doesn't change in input and color kind same
    elif row["in_each_pair_shape_doesnt_change"] == 1 and row["o/i_height_ratio_if_constant"] ==1 and row["o/i_width_ratio_if_constant"]==1 and row.input_color_change_or_not==1 and row.color_kind_increase==0  and row.color_kind_decrease==0:
        return 3
    # same shape and decrease color sum → xgboost
    if row["in_each_pair_shape_doesnt_change"] == 1 and row["o/i_height_ratio_if_constant"] ==1 and row["o/i_width_ratio_if_constant"]==1 and row.input_color_change_or_not==0 and row.color_kind_decrease==1:
        return 4
    # different shape and decrease color sum
    if row["in_each_pair_shape_doesnt_change"] == 1 and row["o/i_height_ratio_if_constant"] ==1 and row["o/i_width_ratio_if_constant"]==1 and row.input_color_change_or_not==0 and row.color_kind_increase==1:
        return 5
    # different shape and increase color sum
    elif row["in_each_pair_shape_doesnt_change"] == 1 and row["o/i_height_ratio_if_constant"] ==1 and row["o/i_width_ratio_if_constant"]==1 and row.input_color_change_or_not==0 and row.color_kind_increase==0  and row.color_kind_decrease==0:
        return 6
    # otherwise
    else:
        return 7

training_descriptive_df["class"] = training_descriptive_df.apply(lambda x: classification(x), axis=1)
evaluation_descriptive_df["class"] = evaluation_descriptive_df.apply(lambda x: classification(x), axis=1)
test_descriptive_df["class"] = test_descriptive_df.apply(lambda x: classification(x), axis=1)

In [7]:
#print(training_descriptive_df["class"].value_counts(normalize=True))
#print(evaluation_descriptive_df["class"].value_counts(normalize=True))
#print(test_descriptive_df["class"].value_counts(normalize=True))

# feature engineering and learning

In [8]:
def neighbours(color, cur_row, cur_col, nrows, ncols, radius):

    if cur_row<=radius-1: top = -1
    else: top = color[cur_row-radius][cur_col]
        
    if cur_row>=nrows-radius: bottom = -1
    else: bottom = color[cur_row+radius][cur_col]
        
    if cur_col<=radius-1: left = -1
    else: left = color[cur_row][cur_col-radius]
        
    if cur_col>=ncols-radius: right = -1
    else: right = color[cur_row][cur_col+radius]
        
    return top, bottom, left, right

def get_tl_tr(color, cur_row, cur_col, nrows, ncols, radius):
        
    if cur_row<=radius-1:
        top_left = -1
        top_right = -1
    else:
        if cur_col<=radius-1: top_left=-1
        else: top_left = color[cur_row-radius][cur_col-radius]
        if cur_col>=ncols-radius: top_right=-1
        else: top_right = color[cur_row-radius][cur_col+radius]   
        
    return top_left, top_right

def get_bl_br(color, cur_row, cur_col, nrows, ncols, radius):
        
    if cur_row>=nrows-radius:
        bottom_left = -1
        bottom_right = -1
    else:
        if cur_col<=radius-1: bottom_left=-1
        else: bottom_left = color[cur_row+radius][cur_col-radius]
        if cur_col>=ncols-radius: bottom_right=-1
        else: bottom_right = color[cur_row+radius][cur_col+radius]   
        
    return bottom_left, bottom_right

def diagonal(color, cur_row, cur_col, nrows, ncols, direction):
    element = []
    element.append(color[cur_row, cur_col])
    if direction == "upper-right":
        for i in range(-nrows,nrows):
            if (cur_row + i < nrows and cur_row +i >=0) and (cur_col - i < ncols and cur_col - i >=0):
                element.append(color[cur_row+i][cur_col-i])
            else:
                continue
    else:
        for i in range(-nrows,nrows):  
            if (cur_row + i < nrows and cur_row +i >=0) and (cur_col + i < ncols and cur_col + i >=0):
                element.append(color[cur_row+i][cur_col+i])
            else:
                continue
        
    return np.array(element)

def make_features(input_color, nfeat):
    nrows, ncols = input_color.shape
    feat = np.zeros((nrows*ncols,nfeat))
    cur_idx = 0
    for i in range(nrows):
        for j in range(ncols):
            feat[cur_idx,0] = i
            feat[cur_idx,1] = j
            feat[cur_idx,2] = input_color[i][j]
            feat[cur_idx,3:7] = neighbours(input_color, i, j, nrows, ncols,1)
            feat[cur_idx,7:9] = get_tl_tr(input_color, i, j, nrows, ncols,1)
            feat[cur_idx,9] = len(np.unique(input_color[i,:]))
            feat[cur_idx,10] = len(np.unique(input_color[:,j]))
            feat[cur_idx,11] = (i+j)
            feat[cur_idx,12] = len(np.unique(input_color[i-1:i+1,j-1:j+1]))
            feat[cur_idx,13:15] = get_bl_br(input_color, i, j, nrows, ncols,1)
            feat[cur_idx,15] = np.sum(input_color[i,:])
            feat[cur_idx,16] = np.sum(input_color[:,j])
            feat[cur_idx,17:21] = neighbours(input_color, i, j, nrows, ncols,2)
            feat[cur_idx,21] = np.max(input_color[i,:])
            feat[cur_idx,22] = np.min(input_color[i,:])
            feat[cur_idx,23] = np.max(input_color[:,j])
            feat[cur_idx,24] = np.min(input_color[:,j])
            feat[cur_idx,25:29] = neighbours(input_color, i, j, nrows, ncols,3)
            feat[cur_idx,29] = np.sum(input_color[i-1:i+1,j-1:j+1])
            feat[cur_idx,30] = np.sum(input_color[i-2:i+2,j-2:j+2])
            feat[cur_idx,31] = len(input_color[i-5:i+5,j-5:j+5])
            #feat[cur_idx,32] = np.sum(input_color[i+1,:]) if i+1<nrows else -1
            #feat[cur_idx,33] = np.sum(input_color[i-1,:]) if i-1>0 else -1
            #feat[cur_idx,34] = np.sum(input_color[:,j+1]) if j+1<ncols else -1
            #feat[cur_idx,35] = np.sum(input_color[:,j-1]) if j-1>0 else -1     
            cur_idx += 1
        
    return feat

In [9]:
def features(task, mode='train'):
    num_train_pairs = len(task[mode])
    feat, target = [], []
    
    global local_neighb
    for task_num in range(num_train_pairs):
        input_color = np.array(task[mode][task_num]['input'])
        target_color = task[mode][task_num]['output']
        nrows, ncols = len(task[mode][task_num]['input']), len(task[mode][task_num]['input'][0])

        target_rows, target_cols = len(task[mode][task_num]['output']), len(task[mode][task_num]['output'][0])
        
        if (target_rows!=nrows) or (target_cols!=ncols):
            return None, None, 1

        imsize = nrows*ncols
        offset = imsize*task_num*3 #since we are using three types of aug
        feat.extend(make_features(input_color, nfeat))
        target.extend(np.array(target_color).reshape(-1,))
            
    return np.array(feat), np.array(target), 0

In [10]:
nfeat = 32
local_neighb = 5
def data_aug(tasks):
    tmp = []
    for i in range(len(tasks)):
        tmp1 = {'input': 0, 'output': 0}
        tmp1["input"], tmp1["output"] = np.fliplr(tasks[i]["input"]).tolist(), np.fliplr(tasks[i]["output"]).tolist()
        tmp2 = {'input': 0, 'output': 0}
        tmp2["input"], tmp2["output"] = np.flipud(tasks[i]["input"]).tolist(), np.flipud(tasks[i]["output"]).tolist()
        tmp3 = {'input': 0, 'output': 0}
        tmp3["input"], tmp3["output"] = np.rot90(tasks[i]["input"]).tolist(), np.rot90(tasks[i]["output"]).tolist()
        tmp4 = {'input': 0, 'output': 0}
        tmp4["input"], tmp4["output"] = np.rot90(np.fliplr(tasks[i]["input"]),1).tolist(), np.rot90(np.fliplr(tasks[i]["output"]),1).tolist()
        tmp5 = {'input': 0, 'output': 0}
        tmp5["input"], tmp5["output"] = np.rot90(np.fliplr(tasks[i]["input"]),2).tolist(), np.rot90(np.fliplr(tasks[i]["output"]),2).tolist()
        tmp6 = {'input': 0, 'output': 0}
        tmp6["input"], tmp6["output"] = np.rot90(np.fliplr(tasks[i]["input"]),3).tolist(), np.rot90(np.fliplr(tasks[i]["output"]),3).tolist()
        tmp7 = {'input': 0, 'output': 0}
        tmp7["input"], tmp7["output"] = np.rot90(np.flipud(tasks[i]["input"]),1).tolist(), np.rot90(np.flipud(tasks[i]["output"]),1).tolist()
        tmp8 = {'input': 0, 'output': 0}
        tmp8["input"], tmp8["output"] = np.rot90(np.flipud(tasks[i]["input"]),2).tolist(),np.rot90(np.flipud(tasks[i]["output"]),2).tolist()
        tmp9 = {'input': 0, 'output': 0}
        tmp9["input"], tmp9["output"] =np.rot90(np.flipud(tasks[i]["input"]),3).tolist(),np.rot90(np.flipud(tasks[i]["output"]),3).tolist()
        tmp10 = {'input': 0, 'output': 0}
        tmp10["input"], tmp10["output"] = np.fliplr(np.flipud(tasks[i]["input"])).tolist(),np.fliplr(np.flipud(tasks[i]["output"])).tolist()
        tmp11 = {'input': 0, 'output': 0}
        tmp11["input"], tmp11["output"] = np.flipud(np.fliplr(tasks[i]["input"])).tolist(),np.flipud(np.fliplr(tasks[i]["output"])).tolist()
        tmp.append(tmp1)
        tmp.append(tmp2)
        tmp.append(tmp3)
        tmp.append(tmp4)
        tmp.append(tmp5)
        tmp.append(tmp6)
        tmp.append(tmp7)
        tmp.append(tmp8)
        tmp.append(tmp9)
        tmp.append(tmp10)
        tmp.append(tmp11)
    for i in tmp:
        tasks.append(i)
    return tasks

def data_aug2(a):
    flg = 0
    new_data = []
    for job in a:
        color = [i for i in np.unique(np.array(job["input"])) if i != 0]
        color_out = [i for i in np.unique(np.array(job["output"])) if i != 0]
        if set(color) == set(color_out):
            color_pos = []
            color_posout = []
            for i in color: # for input
                tmp = np.argwhere(np.array(job["input"])==i).tolist()
                color_pos.append(tmp)
            for i in color: # for output
                tmp = np.argwhere(np.array(job["output"])==i).tolist()
                color_posout.append(tmp)
            ind = [j for j in range(len(color))]
            for i,ele in enumerate(permutations(ind)):
                if i != 0:
                    tmp1 = np.copy(job["input"])
                    for c in range(len(ele)):
                        for pos in color_pos[ele[c]]:
                            tmp1[pos[0],pos[1]] = color[c]
                    tmp2 = np.copy(job["output"])
                    for c in range(len(ele)):
                        for pos in color_posout[ele[c]]:
                            tmp2[pos[0],pos[1]] = color[c]
                    if len(new_data) >50:
                        break
                    new_data.append({"input":tmp1.tolist(), "output":tmp2.tolist()})
        else:
            return a
    for i in new_data:
        a.append(i)
    return a

def modelling(mode, kind):
    print(mode)
    count = 0
    sample_sub = pd.read_csv(data_path/'sample_submission.csv')
    sample_sub = sample_sub.set_index('output_id')
    
    valid_scores = {}
    model_accuracies = {'ens': []}
    pred_taskids = []
    
    if mode=='eval':
        task_path = evaluation_path
        df = evaluation_descriptive_df
    elif mode=='train':
        task_path = training_path
        df = training_descriptive_df
    elif mode=='test':
        task_path = test_path
        df = test_descriptive_df
    all_task_ids = sorted(os.listdir(task_path))
    # training ----------
    for task_id in all_task_ids:
        class_num = df[df.task_name==task_id]["class"].values[0]
        task_file = str(task_path / task_id)
        with open(task_file, 'r') as f:
            task = json.load(f)
        
        if mode != "test":
            task = preprocess(task, task_id)

        if class_num == 1 or class_num==3 or class_num==5 or class_num==6:
            _, _, not_valid = features(task)
            if not_valid:
                print('ignoring task', task_file)
                count += 1
                continue
            
            task["train"] = data_aug2(task["train"])
            feat, target, _ = features(task)
        elif class_num == 2 or class_num == 4:
            _, _, not_valid = features(task)
            if not_valid:
                print('ignoring task', task_file)
                count += 1
                continue
            
            task["train"] = data_aug(task["train"])
            feat, target, _ = features(task)
        else:
            feat, target, not_valid = features(task)
            if not_valid:
                print('ignoring task', task_file)
                count += 1
                continue


        if kind == "xgb":   
            model = XGBClassifier(n_estimators=50, max_depth = 5, num_leaves=10, learning_rate=0.1, n_jobs=-1)
        elif kind == "lgb":
            model = LGBMClassifier(n_estimators=60, max_depth=5, min_child_samples=1, n_jobs=-1, learning_rate=0.25)
        else:
            model = CatBoostClassifier(n_estimators=70, max_depth = 6, min_child_samples=1, learning_rate=0.25)
        model.fit(feat, target, verbose=0)
    # training on input pairs is done ----------
    
    # test predictions begins here
        num_test_pairs = len(task['test'])
        for task_num in range(num_test_pairs):
            cur_idx = 0
            input_color = np.array(task['test'][task_num]['input'])
            nrows, ncols = len(task['test'][task_num]['input']), len(task['test'][task_num]['input'][0])
            feat = make_features(input_color, nfeat)
            preds = model.predict(feat).reshape(nrows,ncols)
        
            if (mode=='train') or (mode=='eval'):
                ens_acc = (np.array(task['test'][task_num]['output'])==preds).sum()/(nrows*ncols)
                model_accuracies['ens'].append(ens_acc)
                pred_taskids.append(f'{task_id[:-5]}_{task_num}')
                print(str(class_num) + ", " + str(task_id) + ' ensemble accuracy',(np.array(task['test'][task_num]['output'])==preds).sum()/(nrows*ncols))
            else:
                preds = preds.astype(int).tolist()
                sample_sub.loc[f'{task_id[:-5]}_{task_num}','output'] = flattener(preds)
    print(str(count)+" tasks were ignored.")
    return sample_sub, model_accuracies, pred_taskids

_, train_xgb_accuracies, train_ids= modelling('train', 'xgb')
_, eval_xgb_accuracies, eval_ids  = modelling('eval', 'xgb')
#_, train_lgb_accuracies, train_ids= modelling('train', 'lgb')
#_, eval_lgb_accuracies, eval_ids  = modelling('eval', 'lgb')
#_, train_cat_accuracies, train_ids= modelling('train', 'cat')
#_, eval_cat_accuracies, eval_ids  = modelling('eval', 'cat')

train
ignoring task /kaggle/input/abstraction-and-reasoning-challenge/training/007bbfb7.json
2, 00d62c1b.json ensemble accuracy 0.9375
ignoring task /kaggle/input/abstraction-and-reasoning-challenge/training/017c7c7b.json
6, 025d127b.json ensemble accuracy 0.82
6, 045e512c.json ensemble accuracy 0.9297052154195011
ignoring task /kaggle/input/abstraction-and-reasoning-challenge/training/0520fde7.json
4, 05269061.json ensemble accuracy 0.6530612244897959
3, 05f2a901.json ensemble accuracy 0.8818181818181818
6, 06df4c85.json ensemble accuracy 0.9349112426035503
2, 08ed6ac7.json ensemble accuracy 0.8148148148148148
1, 09629e4f.json ensemble accuracy 0.7107438016528925
6, 0962bcdd.json ensemble accuracy 0.7638888888888888
6, 0a938d79.json ensemble accuracy 0.6767676767676768
ignoring task /kaggle/input/abstraction-and-reasoning-challenge/training/0b148d64.json
5, 0ca9ddb6.json ensemble accuracy 0.8518518518518519
6, 0d3d703e.json ensemble accuracy 1.0
4, 0dfd9992.json ensemble accuracy 0.64

In [11]:
df = pd.DataFrame(train_xgb_accuracies, index=train_ids)
for c in df.columns:
    print(f'for {c} no. of complete training tasks is', (df.loc[:, c]==1).sum())
    print(df[df.loc[:, c]==1].index)
    print(f'for {c} no. of complete training tasks is', (df.loc[:, c]>0.9).sum())

df = pd.DataFrame(eval_xgb_accuracies, index=eval_ids)
for c in df.columns:
    print(f'for {c} no. of complete evaluation tasks is', (df.loc[:, c]==1).sum())
    print(df[df.loc[:, c]==1].index)
    print(f'for {c} no. of complete training tasks is', (df.loc[:, c]>0.9).sum())

for ens no. of complete training tasks is 31
Index(['0d3d703e_0', '2281f1f4_0', '23581191_0', '25d8a9c8_0', '25ff71a9_0',
       '25ff71a9_1', '2bee17df_0', '32597951_0', '3618c87e_0', '3aa6fb7a_0',
       '4258a5f9_0', '50cb2852_0', '67385a82_0', '6c434453_0', '6d75e8bb_0',
       '6f8cd79b_0', '794b24be_0', '794b24be_1', 'a79310a0_0', 'a9f96cdd_0',
       'aedd82e4_0', 'b1948b0a_0', 'b60334d2_0', 'b6afb2da_0', 'bb43febb_0',
       'bdad9b1f_0', 'c0f76784_0', 'c8f0f002_0', 'd037b0a7_0', 'd4f3cd78_0',
       'dc433765_0'],
      dtype='object')
for ens no. of complete training tasks is 114
for ens no. of complete evaluation tasks is 15
Index(['0e671a1a_0', '137f0df0_0', '140c817e_0', '1c0d0a4b_0', '319f2597_0',
       '516b51b7_0', '5b526a93_0', '60a26a3e_0', '6ea4a07e_0', '6ea4a07e_1',
       '84f2aca1_0', 'ae58858e_0', 'd37a1ef5_0', 'da2b0fe3_0', 'e0fb7511_0'],
      dtype='object')
for ens no. of complete training tasks is 107


In [12]:
#df = pd.DataFrame(train_lgb_accuracies, index=train_ids)
#for c in df.columns:
#    print(f'for {c} no. of complete training tasks is', (df.loc[:, c]==1).sum())
#    print(df[df.loc[:, c]==1].index)
#    print(f'for {c} no. of complete training tasks is', (df.loc[:, c]>0.9).sum())


#df = pd.DataFrame(eval_lgb_accuracies, index=eval_ids)
#for c in df.columns:
#    print(f'for {c} no. of complete evaluation tasks is', (df.loc[:, c]==1).sum())
#    print(df[df.loc[:, c]==1].index)
#    print(f'for {c} no. of complete training tasks is', (df.loc[:, c]>0.9).sum())

In [13]:
#df = pd.DataFrame(train_cat_accuracies, index=train_ids)
#for c in df.columns:
#    print(f'for {c} no. of complete training tasks is', (df.loc[:, c]==1).sum())
#    print(df[df.loc[:, c]==1].index)
#    print(f'for {c} no. of complete training tasks is', (df.loc[:, c]>0.9).sum())

#df = pd.DataFrame(eval_cat_accuracies, index=eval_ids)
#for c in df.columns:
#    print(f'for {c} no. of complete evaluation tasks is', (df.loc[:, c]==1).sum())
#    print(df[df.loc[:, c]==1].index)
#    print(f'for {c} no. of complete training tasks is', (df.loc[:, c]>0.9).sum())

# Prediction

In [14]:
test_xgb, _, _ = modelling('test', 'xgb')
test_xgb.to_csv("submission.csv", index=False)

test
ignoring task /kaggle/input/abstraction-and-reasoning-challenge/test/00576224.json
ignoring task /kaggle/input/abstraction-and-reasoning-challenge/test/0692e18c.json
ignoring task /kaggle/input/abstraction-and-reasoning-challenge/test/0934a4d8.json
ignoring task /kaggle/input/abstraction-and-reasoning-challenge/test/0a1d4ef5.json
ignoring task /kaggle/input/abstraction-and-reasoning-challenge/test/0bb8deee.json
ignoring task /kaggle/input/abstraction-and-reasoning-challenge/test/0c786b71.json
ignoring task /kaggle/input/abstraction-and-reasoning-challenge/test/0c9aba6e.json
ignoring task /kaggle/input/abstraction-and-reasoning-challenge/test/12997ef3.json
ignoring task /kaggle/input/abstraction-and-reasoning-challenge/test/136b0064.json
ignoring task /kaggle/input/abstraction-and-reasoning-challenge/test/15696249.json
ignoring task /kaggle/input/abstraction-and-reasoning-challenge/test/195ba7dc.json
ignoring task /kaggle/input/abstraction-and-reasoning-challenge/test/1990f7a8.json

In [15]:
#final_sub = test_xgb.reset_index()
#final_sub = final_sub.sort_values(by="output_id")

#test_lgb = test_lgb.sort_values(by="output_id")
#test_cat = test_cat.sort_values(by="output_id")
#out1 = final_sub["output"].astype(str).values
#out2 = test_lgb["output"].astype(str).values
#out3 = test_cat["output"].astype(str).values

#merge_output = []
#for o1, o2, o3 in zip(out1, out2, out3):
#    o = o1.strip().split(" ")[:1] + o2.strip().split(" ")[:1] + o2.strip().split(" ")[:1]
#    o = " ".join(o[:3])
#    merge_output.append(o)
#final_sub["output"] = merge_output
#final_sub["output"] = final_sub["output"].astype(str)
#final_sub.to_csv("submission.csv", index=False)
#final_sub.head()

# example try

In [16]:
nfeat = 32
local_neighb = 5
def data_aug(tasks):
    tmp = []
    for i in range(len(tasks)):
        tmp1 = {'input': 0, 'output': 0}
        tmp1["input"] = np.fliplr(tasks[i]["input"]).tolist()
        tmp1["output"] = np.fliplr(tasks[i]["output"]).tolist()
        tmp2 = {'input': 0, 'output': 0}
        tmp2["input"] = np.flipud(tasks[i]["input"]).tolist()
        tmp2["output"] = np.flipud(tasks[i]["output"]).tolist()
        tmp3 = {'input': 0, 'output': 0}
        tmp3["input"] = np.rot90(tasks[i]["input"]).tolist()
        tmp3["output"] = np.rot90(tasks[i]["output"]).tolist()
        tmp4= {'input': 0, 'output': 0}
        tmp4["input"] = np.transpose(tasks[i]["input"]).tolist()
        tmp4["output"] = np.transpose(tasks[i]["output"]).tolist()
        tmp.append(tmp1)
        tmp.append(tmp2)
        tmp.append(tmp3)
        tmp.append(tmp4)
    for i in tmp:
        tasks.append(i)
    return tasks

def data_aug2(a):
    flg = 0
    new_data = []
    for job in a:
        color = [i for i in np.unique(np.array(job["input"])) if i != 0]
        color_out = [i for i in np.unique(np.array(job["output"])) if i != 0]
        if set(color) == set(color_out):
            color_pos = []
            color_posout = []
            for i in color: # for input
                tmp = np.argwhere(np.array(job["input"])==i).tolist()
                color_pos.append(tmp)
            for i in color: # for output
                tmp = np.argwhere(np.array(job["output"])==i).tolist()
                color_posout.append(tmp)
            ind = [j for j in range(len(color))]
            for i,ele in enumerate(permutations(ind)):
                if i != 0:
                    tmp1 = np.copy(job["input"])
                    for c in range(len(ele)):
                        for pos in color_pos[ele[c]]:
                            tmp1[pos[0],pos[1]] = color[c]
                    tmp2 = np.copy(job["output"])
                    for c in range(len(ele)):
                        for pos in color_posout[ele[c]]:
                            tmp2[pos[0],pos[1]] = color[c]
                    if len(new_data) >100:
                        break
                    new_data.append({"input":tmp1.tolist(), "output":tmp2.tolist()})
        else:
            return a
    for i in new_data:
        a.append(i)
    return a

def make_features(input_color, nfeat): # for class 2 and 4
    nrows, ncols = input_color.shape
    feat = np.zeros((nrows*ncols,nfeat))
    cur_idx = 0
    for i in range(nrows):
        for j in range(ncols):
            feat[cur_idx,0] = i
            feat[cur_idx,1] = j
            feat[cur_idx,2] = input_color[i][j]
            feat[cur_idx,3:7] = neighbours(input_color, i, j, nrows, ncols,1)
            feat[cur_idx,7:9] = get_tl_tr(input_color, i, j, nrows, ncols,1)
            feat[cur_idx,9] = len(np.unique(input_color[i,:]))
            feat[cur_idx,10] = len(np.unique(input_color[:,j]))
            feat[cur_idx,11] = (i+j)
            feat[cur_idx,12] = len(np.unique(input_color[i-1:i+1,j-1:j+1]))
            feat[cur_idx,13:15] = get_bl_br(input_color, i, j, nrows, ncols,1)
            feat[cur_idx,15] = np.sum(input_color[i,:])
            feat[cur_idx,16] = np.sum(input_color[:,j])
            feat[cur_idx,17:21] = neighbours(input_color, i, j, nrows, ncols,2)
            feat[cur_idx,21] = np.max(input_color[i,:])
            feat[cur_idx,22] = np.min(input_color[i,:])
            feat[cur_idx,23] = np.max(input_color[:,j])
            feat[cur_idx,24] = np.min(input_color[:,j])
            feat[cur_idx,25:29] = neighbours(input_color, i, j, nrows, ncols,3)
            feat[cur_idx,29] = np.sum(input_color[i-1:i+1,j-1:j+1])
            feat[cur_idx,30] = np.sum(input_color[i-2:i+2,j-2:j+2])
            feat[cur_idx,31] = len(input_color[i-5:i+5,j-5:j+5])          
            cur_idx += 1
        
    return feat

def features(task, mode='train'):
    num_train_pairs = len(task[mode])
    feat, target = [], []
    
    global local_neighb
    for task_num in range(num_train_pairs):
        input_color = np.array(task[mode][task_num]['input'])
        target_color = task[mode][task_num]['output']
        nrows, ncols = len(task[mode][task_num]['input']), len(task[mode][task_num]['input'][0])

        target_rows, target_cols = len(task[mode][task_num]['output']), len(task[mode][task_num]['output'][0])
        
        if (target_rows!=nrows) or (target_cols!=ncols):
            return None, None, 1

        imsize = nrows*ncols
        offset = imsize*task_num*3 #since we are using three types of aug
        feat.extend(make_features(input_color, nfeat))
        target.extend(np.array(target_color).reshape(-1,))
            
    return np.array(feat), np.array(target), 0

def plot_task(task1,task2):
    """
    Plots the first train and test pairs of a specified task,
    using same color scheme as the ARC app
    """
    cmap = colors.ListedColormap(
        ['#000000', '#0074D9','#FF4136','#2ECC40','#FFDC00',
         '#AAAAAA', '#F012BE', '#FF851B', '#7FDBFF', '#870C25'])
    norm = colors.Normalize(vmin=0, vmax=9)
    fig, axs = plt.subplots(1, 2, figsize=(8,8))
    axs[0].imshow(task1, cmap=cmap, norm=norm)
    axs[0].axis('off')
    axs[0].set_title('Input')
    axs[1].imshow(task2, cmap=cmap, norm=norm)
    axs[1].axis('off')
    axs[1].set_title('Output')
    plt.tight_layout()
    plt.show()

def modelling(mode, kind):
    print(mode)
    count = 0
    sample_sub = pd.read_csv(data_path/'sample_submission.csv')
    sample_sub = sample_sub.set_index('output_id')
    
    valid_scores = {}
    model_accuracies = {'ens': []}
    pred_taskids = []
    
    if mode=='eval':
        task_path = evaluation_path
        df = evaluation_descriptive_df
    elif mode=='train':
        task_path = training_path
        df = training_descriptive_df
    elif mode=='test':
        task_path = test_path
        df = test_descriptive_df
    all_task_ids = sorted(os.listdir(task_path))
    # training ----------
    for task_id in all_task_ids:
        if task_id == "68b16354.json":
            class_num = df[df.task_name==task_id]["class"].values[0]
            print(class_num)
            task_file = str(task_path / task_id)
            with open(task_file, 'r') as f:
                task = json.load(f)
        
            if mode != "test":
                task = preprocess(task, task_id)

            if class_num == 4 or class_num==8 or class_num==2:
                _, _, not_valid = features(task)
                if not_valid:
                    print('ignoring task', task_file)
                    count += 1
                    continue
            
                task["train"] = data_aug(task["train"])
                feat, target, _ = features(task)
            else:
                feat, target, not_valid = features(task)
                if not_valid:
                    print('ignoring task', task_file)
                    count += 1
                    continue


            if kind == "xgb":   
                model = XGBClassifier(n_estimators=200, max_depth = 5, num_leaves=15, learning_rate=0.1, n_jobs=-1)
            elif kind == "lgb":
                model = LGBMClassifier(n_estimators=60, max_depth=5, min_child_samples=1, n_jobs=-1, learning_rate=0.25)
            else:
                model = CatBoostClassifier(n_estimators=70, max_depth = 6, min_child_samples=1, learning_rate=0.25)
            model.fit(feat, target)
    # training on input pairs is done ----------
    
    # test predictions begins here
            num_test_pairs = len(task['test'])
            for task_num in range(num_test_pairs):
                cur_idx = 0
                input_color = np.array(task['test'][task_num]['input'])
                nrows, ncols = len(task['test'][task_num]['input']), len(task['test'][task_num]['input'][0])
                feat = make_features(input_color, nfeat)
                preds = model.predict(feat).reshape(nrows,ncols)
                plot_task(input_color, preds)
        
                if (mode=='train') or (mode=='eval'):
                    ens_acc = (np.array(task['test'][task_num]['output'])==preds).sum()/(nrows*ncols)
                    model_accuracies['ens'].append(ens_acc)
                    pred_taskids.append(f'{task_id[:-5]}_{task_num}')
                    print('ensemble accuracy',(np.array(task['test'][task_num]['output'])==preds).sum()/(nrows*ncols))
                else:
                    preds = preds.astype(int).tolist()
                    sample_sub.loc[f'{task_id[:-5]}_{task_num}','output'] = flattener(preds)
    print(str(count)+" tasks were ignored.")
    return sample_sub, model_accuracies, pred_taskids

#_, _, _ = modelling('train', 'xgb')
#_, _, _ = modelling('eval', 'xgb')