- Add new variables and improve the local score to 32 and 13.
- Check an ensemble of XGBoost, LightGBM and CatBoost.
- https://www.kaggle.com/meaninglesslives/using-decision-trees-for-arc
- https://www.kaggle.com/davidbnn92/task-tagging

In [1]:
import numpy as np
import pandas as pd
import os
import json
from pathlib import Path
import matplotlib.pyplot as plt
from matplotlib import colors
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [2]:
# Root directory of the competition data and the three task sub-folders read below.
data_path = Path('/kaggle/input/abstraction-and-reasoning-challenge/')
training_path = data_path / 'training'
evaluation_path = data_path / 'evaluation'
test_path = data_path / 'test'

In [3]:
def flattener(pred):
    """Serialise a grid (list of rows) into the pipe-delimited submission string.

    The grid is rendered with ``str`` and the list punctuation is then
    rewritten: element separators are dropped and every row boundary becomes
    ``|``, e.g. ``[[1, 2], [3, 4]]`` -> ``'|12|34|'``.
    """
    text = str(list(pred))
    # Order matters: separators must be removed before the bracket pairs are
    # collapsed into row delimiters.
    substitutions = ((', ', ''), ('[[', '|'), ('][', '|'), (']]', '|'))
    for old, new in substitutions:
        text = text.replace(old, new)
    return text

# task tagging

In [4]:
def create_df(folder_path):
    """Load every task file in ``folder_path`` and describe it with summary columns.

    Each entry returned by ``os.listdir(folder_path)`` is opened and parsed as
    JSON; the parsed task is expected to provide ``'train'`` and ``'test'``
    lists whose elements hold ``'input'`` and ``'output'`` grids (lists of rows).
    All descriptive columns are computed from the ``'train'`` pairs only.

    Parameters
    ----------
    folder_path : path-like supporting the ``/`` operator
        Directory containing the task files.

    Returns
    -------
    pandas.DataFrame
        One row per task with the task name, the parsed task, pair counts,
        shape-consistency flags / values and colour-sum based flags.
    """
    task_names_list = sorted(os.listdir(folder_path))
    task_list = []
    for task_name in task_names_list: 
        task_file = str(folder_path / task_name)
        with open(task_file, 'r') as f:
            task = json.load(f)
            task_list.append(task)
    
    df = pd.DataFrame()
    df['task_name'] = task_names_list
    df['task'] = task_list
    df['number_of_train_pairs'] = df['task'].apply(lambda x: len(x['train']))
    df['number_of_test_pairs'] = df['task'].apply(lambda x: len(x['test']))
    
    # Compare image sizes across the train pairs of each task.
    # Flags are 0/1 integers; the "*_if_constant" columns hold the shared value
    # or NaN when the train pairs disagree.
    df['inputs_all_have_same_height'] = df['task'].apply(
        lambda task: int(len(set([len(example['input']) for example in task['train']])) == 1)
    )
    df['inputs_all_have_same_width'] = df['task'].apply(
        lambda task: int(len(set([len(example['input'][0]) for example in task['train']])) == 1)
    )
    df['inputs_all_have_same_shape'] = df['inputs_all_have_same_height'] * df['inputs_all_have_same_width']
    df['input_height_if_constant'] = df['task'].apply(
        lambda task: len(task['train'][0]['input'])
                     if (len(set([len(example['input']) for example in task['train']])) == 1)
                     else np.nan
    )
    df['input_width_if_constant'] = df['task'].apply(
        lambda task: len(task['train'][0]['input'][0])
                     if (len(set([len(example['input'][0]) for example in task['train']])) == 1)
                     else np.nan
    )
    df['outputs_all_have_same_height'] = df['task'].apply(
        lambda task: int(len(set([len(example['output']) for example in task['train']])) == 1)
    )
    df['outputs_all_have_same_width'] = df['task'].apply(
        lambda task: int(len(set([len(example['output'][0]) for example in task['train']])) == 1)
    )
    df['outputs_all_have_same_shape'] = df['outputs_all_have_same_height'] * df['outputs_all_have_same_width']
    df['output_height_if_constant'] = df['task'].apply(
        lambda task: len(task['train'][0]['output'])
                     if (len(set([len(example['output']) for example in task['train']])) == 1)
                     else np.nan
    )
    df['output_width_if_constant'] = df['task'].apply(
        lambda task: len(task['train'][0]['output'][0])
                     if (len(set([len(example['output'][0]) for example in task['train']])) == 1)
                     else np.nan
    )  
    # 1 only if every train pair has identical input and output height and width.
    df['in_each_pair_shape_doesnt_change'] = df['task'].apply(
        lambda task: np.prod([int(len(example['input'][0])==len(example['output'][0])
                                  and len(example['input'])==len(example['output'])
                                 ) for example in task['train']
                            ])
    )
    # Truthy when both the width ratio and the height ratio (input / output)
    # take a single value across all train pairs.
    df['in_each_pair_shape_ratio_is_the_same'] = df['task'].apply(
        lambda task: (len(set([len(example['input'][0]) / len(example['output'][0])
                                 for example in task['train']]))==1) * (
                      len(set([len(example['input']) / len(example['output'])
                                 for example in task['train']]))==1)
    )
    # The stored value is output / input taken from the first train pair; the
    # constancy test uses input / output, which is constant in the same cases.
    df['o/i_height_ratio_if_constant'] = df['task'].apply(
        lambda task: len(task['train'][0]['output']) / len(task['train'][0]['input'])
                     if (len(set([len(example['input']) / len(example['output'])
                                 for example in task['train']]))==1)
                     else np.nan
    )
    df['o/i_width_ratio_if_constant'] = df['task'].apply(
        lambda task: len(task['train'][0]['output'][0]) / len(task['train'][0]['input'][0])
                     if (len(set([len(example['input'][0]) / len(example['output'][0])
                                 for example in task['train']]))==1)
                     else np.nan
    )
    
    # Additional colour-based descriptors ("colour sum" = sum of all cell values).
    # True when input and output grids have the same total in every train pair.
    df["same_color_sum"] = df['task'].apply(lambda task: 
                        np.all([int(sum(sum(np.array(example['input'])))== sum(sum(np.array(example['output'])))) for example in task['train']]))
    
    # Same comparison restricted to the border: first row + first column +
    # last row + last column (corner cells are therefore counted twice).
    df["same_color_sum_in_edge"] = df['task'].apply(lambda task: 
                        np.all([int(sum(np.array(example['input'])[0,:]) +sum(np.array(example['input'])[:,0]) + 
                                    sum(np.array(example['input'])[-1,:]) +sum(np.array(example['input'])[:,-1])
                                    == 
                                    sum(np.array(example['output'])[0,:]) +sum(np.array(example['output'])[:,0]) + 
                                    sum(np.array(example['output'])[-1,:]) +sum(np.array(example['output'])[:,-1])) for example in task['train']]))
    
    # Per train pair: (number of distinct input values) - (number of distinct output values).
    df["io_color_kind_diff"] = df['task'].apply(lambda task: [len(np.unique(np.array(example['input']))) - len(np.unique(np.array(example['output']))) for example in task['train']])
    # The shared difference if all train pairs agree, otherwise -1.
    # NOTE(review): -1 is also a legitimate difference, so the sentinel is ambiguous.
    df["io_color_kind_diff_constant"] = df['io_color_kind_diff'].apply(lambda task: np.unique(np.array(task))[0] if len(np.unique(np.array(task)))==1 else -1)
    # True when no train output contains a cell with value <= 0.
    df["output_not_include_0"] = df['task'].apply(lambda task: np.all([np.all(np.array(example['output']) > 0) for example in task['train']]))
    # True when the output total is strictly larger / smaller than the input
    # total in every train pair.
    df["increase_color_sum"] = df['task'].apply(lambda task: 
                        np.all([int(sum(sum(np.array(example['input']))) < sum(sum(np.array(example['output'])))) for example in task['train']]))
    df["decrease_color_sum"] = df['task'].apply(lambda task: 
                        np.all([int(sum(sum(np.array(example['input']))) > sum(sum(np.array(example['output'])))) for example in task['train']]))
    

    return df

# Build one descriptive frame per task folder (training, evaluation, test).
training_descriptive_df = create_df(training_path)
evaluation_descriptive_df = create_df(evaluation_path)
test_descriptive_df = create_df(test_path)

In [5]:
def classification(row):
    """Map one descriptive-frame row to a rule-based task class (1-8).

    Classes 1-4 require every train pair to keep its shape (flag == 1 and both
    output/input ratios == 1); classes 5-7 require the shape flag to be 0;
    anything else is class 8.
    """
    keeps_shape = (row["in_each_pair_shape_doesnt_change"] == 1
                   and row["o/i_height_ratio_if_constant"] == 1
                   and row["o/i_width_ratio_if_constant"] == 1)

    if keeps_shape:
        # same shape and same color sum -> xgboost
        if row.same_color_sum == 1:
            return 1
        if row.increase_color_sum == 1:
            # same shape, increasing color sum, black (0) present in output -> xgboost
            if row.output_not_include_0 == 0:
                return 2
            # same shape, increasing color sum, no black in output
            if row.output_not_include_0 == 1:
                return 3
        # same shape and decreasing color sum -> xgboost
        if row.decrease_color_sum == 1:
            return 4

    if row["in_each_pair_shape_doesnt_change"] == 0:
        # different shape and decreasing color sum
        if row.decrease_color_sum == 1:
            return 5
        # different shape and increasing color sum
        if row.increase_color_sum == 1:
            return 6
        # different shape and same color sum
        if row.same_color_sum == 1:
            return 7

    # otherwise
    return 8
# Label every task of each split with its rule-based class from classification().
training_descriptive_df["class"] = training_descriptive_df.apply(lambda x: classification(x), axis=1)
evaluation_descriptive_df["class"] = evaluation_descriptive_df.apply(lambda x: classification(x), axis=1)
test_descriptive_df["class"] = test_descriptive_df.apply(lambda x: classification(x), axis=1)

In [6]:
# Relative frequency of each class, printed for the training, evaluation and test frames in turn.
print(training_descriptive_df["class"].value_counts(normalize=True))
print(evaluation_descriptive_df["class"].value_counts(normalize=True))
print(test_descriptive_df["class"].value_counts(normalize=True))

2    0.3650
5    0.2000
8    0.1075
4    0.1050
6    0.0925
1    0.0600
3    0.0475
7    0.0225
Name: class, dtype: float64
2    0.3650
5    0.2000
8    0.0950
4    0.0950
6    0.0925
1    0.0700
3    0.0600
7    0.0225
Name: class, dtype: float64
2    0.31
5    0.23
4    0.18
6    0.11
8    0.09
3    0.04
1    0.03
7    0.01
Name: class, dtype: float64


# feature engineering and learning

In [7]:
def neighbours(color, cur_row, cur_col, nrows, ncols, radius):
    """Return the (top, bottom, left, right) values ``radius`` cells away.

    Any neighbour that would fall outside the ``nrows`` x ``ncols`` grid is
    reported as -1.
    """
    row_above, row_below = cur_row - radius, cur_row + radius
    col_before, col_after = cur_col - radius, cur_col + radius

    top = color[row_above][cur_col] if row_above >= 0 else -1
    bottom = color[row_below][cur_col] if row_below < nrows else -1
    left = color[cur_row][col_before] if col_before >= 0 else -1
    right = color[cur_row][col_after] if col_after < ncols else -1

    return top, bottom, left, right

def get_tl_tr(color, cur_row, cur_col, nrows, ncols, radius):
    """Return the (top-left, top-right) diagonal values ``radius`` cells away.

    A value is -1 when the corresponding cell lies outside the grid.
    """
    row_above = cur_row - radius
    if row_above < 0:
        # The whole row above is out of the grid, so both diagonals are missing.
        return -1, -1

    col_before, col_after = cur_col - radius, cur_col + radius
    top_left = color[row_above][col_before] if col_before >= 0 else -1
    top_right = color[row_above][col_after] if col_after < ncols else -1
    return top_left, top_right

def get_bl_br(color, cur_row, cur_col, nrows, ncols, radius):
    """Return the (bottom-left, bottom-right) diagonal values ``radius`` cells away.

    A value is -1 when the corresponding cell lies outside the grid.
    """
    row_below = cur_row + radius
    if row_below >= nrows:
        # The whole row below is out of the grid, so both diagonals are missing.
        return -1, -1

    col_before, col_after = cur_col - radius, cur_col + radius
    bottom_left = color[row_below][col_before] if col_before >= 0 else -1
    bottom_right = color[row_below][col_after] if col_after < ncols else -1
    return bottom_left, bottom_right

def diagonal(color, cur_row, cur_col, nrows, ncols, direction):
    """Collect the values on a diagonal through (cur_row, cur_col).

    With ``direction == "upper-right"`` the column decreases as the row
    increases; for any other value both increase together. The result starts
    with the current cell, followed by every in-grid cell of the diagonal in
    order of increasing row, so the current cell appears twice.
    """
    col_step = -1 if direction == "upper-right" else 1

    cells = [color[cur_row, cur_col]]
    for offset in range(-nrows, nrows):
        row = cur_row + offset
        col = cur_col + col_step * offset
        if 0 <= row < nrows and 0 <= col < ncols:
            cells.append(color[row][col])

    return np.array(cells)

def fliplr_check(color, cur_row, cur_col, r):
    """Test whether the window around a cell is left-right symmetric.

    The window is ``color[cur_row-r:cur_row+r, cur_col-r:cur_col+r]`` with the
    start indices clamped at 0. Without the clamp a negative start would be
    interpreted by NumPy as counting from the end of the axis and select the
    wrong (or an empty) region for cells closer than ``r`` to the top/left edge.

    Returns
    -------
    numpy.bool_ or int
        Whether the window equals its left-right mirror image, or -1 when the
        (border-truncated) window is not square.
    """
    row_start = max(cur_row - r, 0)
    col_start = max(cur_col - r, 0)
    pixmap = color[row_start:cur_row + r, col_start:cur_col + r]
    if pixmap.shape[0] != pixmap.shape[1]:
        return -1
    return np.all(pixmap == np.fliplr(pixmap))
    
def flipud_check(color, cur_row, cur_col, r):
    """Test whether the window around a cell is up-down symmetric.

    The window is ``color[cur_row-r:cur_row+r, cur_col-r:cur_col+r]`` with the
    start indices clamped at 0. Without the clamp a negative start would be
    interpreted by NumPy as counting from the end of the axis and select the
    wrong (or an empty) region for cells closer than ``r`` to the top/left edge.

    Returns
    -------
    numpy.bool_ or int
        Whether the window equals its up-down mirror image, or -1 when the
        (border-truncated) window is not square.
    """
    row_start = max(cur_row - r, 0)
    col_start = max(cur_col - r, 0)
    pixmap = color[row_start:cur_row + r, col_start:cur_col + r]
    if pixmap.shape[0] != pixmap.shape[1]:
        return -1
    return np.all(pixmap == np.flipud(pixmap))
    
def flip90_check(color, cur_row, cur_col, r):
    """Test whether the window around a cell is unchanged by ``np.rot90``.

    The window is ``color[cur_row-r:cur_row+r, cur_col-r:cur_col+r]`` with the
    start indices clamped at 0. Without the clamp a negative start would be
    interpreted by NumPy as counting from the end of the axis and select the
    wrong (or an empty) region for cells closer than ``r`` to the top/left edge.

    Returns
    -------
    numpy.bool_ or int
        Whether the window equals its 90-degree rotation, or -1 when the
        (border-truncated) window is not square.
    """
    row_start = max(cur_row - r, 0)
    col_start = max(cur_col - r, 0)
    pixmap = color[row_start:cur_row + r, col_start:cur_col + r]
    if pixmap.shape[0] != pixmap.shape[1]:
        return -1
    return np.all(pixmap == np.rot90(pixmap))

def _clamped_window(grid, row, col, before, after):
    """Slice ``grid[row-before:row+after, col-before:col+after]`` without wrap-around.

    NumPy treats a negative slice start as an offset from the end of the axis,
    so the start indices are clamped at 0 to keep the window at the border.
    """
    return grid[max(row - before, 0):row + after, max(col - before, 0):col + after]


def make_features(input_color, nfeat): # for class 2 and 4
    """Build one feature row per cell of a 2-D grid, in row-major order.

    Column layout (columns >= 32 are left at zero):
        0-2    row index, column index, cell value
        3-6    top/bottom/left/right neighbours at distance 1 (-1 if outside)
        7-8    top-left / top-right diagonal neighbours at distance 1
        9-10   number of distinct values in the cell's row / column
        11     row index + column index
        12     number of distinct values in the window [i-1:i+1, j-1:j+1]
        13-14  bottom-left / bottom-right diagonal neighbours at distance 1
        15-16  sum of the cell's row / column
        17-20  neighbours at distance 2
        21-24  row max, row min, column max, column min
        25-28  neighbours at distance 3
        29     sum of the window [i-1:i+1, j-1:j+1]
        30     sum of the window [i-2:i+2, j-2:j+2]
        31     number of rows in the window [i-5:i+5, j-5:j+5]

    Window starts are clamped at 0; previously a negative start wrapped to the
    end of the grid, so cells near the top/left border got an empty or
    unrelated window.

    Parameters
    ----------
    input_color : numpy.ndarray
        2-D grid of cell values.
    nfeat : int
        Number of feature columns to allocate; must be at least 32.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(nrows * ncols, nfeat)``.

    Raises
    ------
    ValueError
        If ``nfeat`` is smaller than the 32 columns written here.
    """
    if nfeat < 32:
        raise ValueError(f"nfeat must be at least 32, got {nfeat}")

    nrows, ncols = input_color.shape
    feat = np.zeros((nrows*ncols, nfeat))
    if feat.shape[0] == 0:
        # No cells: nothing to describe (and min/max of an empty line would fail).
        return feat

    def line_stats(line):
        # (distinct count, sum, max, min) of one row or column
        return len(np.unique(line)), np.sum(line), np.max(line), np.min(line)

    # Row / column statistics do not depend on the other coordinate, so they
    # are computed once instead of once per cell.
    row_stats = [line_stats(input_color[i, :]) for i in range(nrows)]
    col_stats = [line_stats(input_color[:, j]) for j in range(ncols)]

    cur_idx = 0
    for i in range(nrows):
        row_unique, row_sum, row_max, row_min = row_stats[i]
        for j in range(ncols):
            col_unique, col_sum, col_max, col_min = col_stats[j]
            near = _clamped_window(input_color, i, j, 1, 1)

            feat[cur_idx,0] = i
            feat[cur_idx,1] = j
            feat[cur_idx,2] = input_color[i][j]
            feat[cur_idx,3:7] = neighbours(input_color, i, j, nrows, ncols,1)
            feat[cur_idx,7:9] = get_tl_tr(input_color, i, j, nrows, ncols,1)
            feat[cur_idx,9] = row_unique
            feat[cur_idx,10] = col_unique
            feat[cur_idx,11] = (i+j)
            feat[cur_idx,12] = len(np.unique(near))
            feat[cur_idx,13:15] = get_bl_br(input_color, i, j, nrows, ncols,1)
            feat[cur_idx,15] = row_sum
            feat[cur_idx,16] = col_sum
            feat[cur_idx,17:21] = neighbours(input_color, i, j, nrows, ncols,2)
            feat[cur_idx,21] = row_max
            feat[cur_idx,22] = row_min
            feat[cur_idx,23] = col_max
            feat[cur_idx,24] = col_min
            feat[cur_idx,25:29] = neighbours(input_color, i, j, nrows, ncols,3)
            feat[cur_idx,29] = np.sum(near)
            feat[cur_idx,30] = np.sum(_clamped_window(input_color, i, j, 2, 2))
            # NOTE(review): len() of a 2-D window is its row count; a distinct-value
            # count may have been intended here - confirm before changing.
            feat[cur_idx,31] = len(_clamped_window(input_color, i, j, 5, 5))
            cur_idx += 1

    return feat

In [8]:
def features(task, mode='train'):
    """Build the per-cell training matrix and targets for one task.

    Every pair in ``task[mode]`` contributes one feature row per input cell
    (via ``make_features`` with the module-level ``nfeat``) and the flattened
    output grid as targets.

    Parameters
    ----------
    task : dict
        Parsed task; ``task[mode]`` is a list of dicts with ``'input'`` and
        ``'output'`` grids (lists of rows).
    mode : str
        Key of the pair list to use.

    Returns
    -------
    tuple
        ``(feat, target, 0)`` with ``feat`` and ``target`` as numpy arrays, or
        ``(None, None, 1)`` as soon as a pair's output shape differs from its
        input shape (per-cell targets are undefined in that case).
    """
    feat, target = [], []

    for pair in task[mode]:
        input_grid, target_grid = pair['input'], pair['output']
        input_color = np.array(input_grid)
        nrows, ncols = len(input_grid), len(input_grid[0])
        target_rows, target_cols = len(target_grid), len(target_grid[0])

        if (target_rows!=nrows) or (target_cols!=ncols):
            return None, None, 1

        feat.extend(make_features(input_color, nfeat))
        target.extend(np.array(target_grid).reshape(-1,))

    return np.array(feat), np.array(target), 0

In [9]:
nfeat = 32  # number of feature columns allocated per cell by make_features
local_neighb = 5  # kept for backward compatibility; not read by modelling
def modelling(mode, kind):
    """Fit one gradient-boosting classifier per task and predict its test grids.

    For every task file of the selected split a fresh model is trained on the
    per-cell features of the task's train pairs and then applied to each test
    input. Tasks for which ``features`` reports a shape mismatch are skipped.

    Parameters
    ----------
    mode : {'train', 'eval', 'test'}
        Which task folder to process. Accuracies are only recorded for
        'train' and 'eval'.
    kind : str
        'xgb' -> XGBClassifier, 'lgb' -> LGBMClassifier, anything else ->
        CatBoostClassifier.

    Returns
    -------
    tuple
        ``(sample_sub, model_accuracies, pred_taskids)``: the submission frame
        with predicted outputs filled in, a dict ``{'ens': [...]}`` of per-grid
        cell accuracies, and the matching ``<task>_<n>`` ids.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the supported splits.
    """
    split_paths = {'eval': evaluation_path, 'train': training_path, 'test': test_path}
    if mode not in split_paths:
        # Previously an unknown mode only failed later with an UnboundLocalError.
        raise ValueError(f"mode must be one of {sorted(split_paths)}, got {mode!r}")

    print(mode)
    count = 0
    sample_sub = pd.read_csv(data_path/'sample_submission.csv')
    sample_sub = sample_sub.set_index('output_id')

    model_accuracies = {'ens': []}
    pred_taskids = []

    task_path = split_paths[mode]
    has_truth = mode in ('train', 'eval')
    all_task_ids = sorted(os.listdir(task_path))

    for task_id in all_task_ids:
        task_file = str(task_path / task_id)
        with open(task_file, 'r') as f:
            task = json.load(f)

        # training -----
        feat, target, not_valid = features(task)
        if not_valid:
            print('ignoring task', task_file)
            count += 1
            continue

        if kind == "xgb":
            # NOTE(review): `num_leaves` is a LightGBM parameter name; confirm XGBoost honours it.
            model = XGBClassifier(n_estimators=50, max_depth = 5, num_leaves=10, learning_rate=0.1, n_jobs=-1)
        elif kind == "lgb":
            model = LGBMClassifier(n_estimators=60, max_depth=4, n_jobs=-1, learning_rate=0.25)
        else:
            model = CatBoostClassifier(n_estimators=50, max_depth = 6, learning_rate=0.25)
        model.fit(feat, target, verbose=0)
        # training on input pairs is done

        # test predictions begin here
        task_stem = task_id[:-5]  # strip the 5-character file extension ('.json')
        for task_num, test_pair in enumerate(task['test']):
            input_grid = test_pair['input']
            input_color = np.array(input_grid)
            nrows, ncols = len(input_grid), len(input_grid[0])
            preds = model.predict(make_features(input_color, nfeat)).reshape(nrows,ncols)

            if has_truth:
                truth = np.array(test_pair['output'])
                # A test output of a different shape cannot match cell by cell;
                # score it 0 instead of comparing incompatible arrays.
                if truth.shape == preds.shape:
                    ens_acc = (truth==preds).sum()/(nrows*ncols)
                else:
                    ens_acc = 0.0
                model_accuracies['ens'].append(ens_acc)
                pred_taskids.append(f'{task_stem}_{task_num}')

            preds = preds.astype(int).tolist()
            sample_sub.loc[f'{task_stem}_{task_num}','output'] = flattener(preds)
    print(str(count)+" tasks were ignored.")
    return sample_sub, model_accuracies, pred_taskids

#_, train_xgb_accuracies, train_ids= modelling('train', 'xgb')
#_, eval_xgb_accuracies, eval_ids  = modelling('eval', 'xgb')
#_, train_lgb_accuracies, train_ids= modelling('train', 'lgb')
#_, eval_lgb_accuracies, eval_ids  = modelling('eval', 'lgb')
#_, train_cat_accuracies, train_ids= modelling('train', 'cat')
#_, eval_cat_accuracies, eval_ids  = modelling('eval', 'cat')

In [10]:
#df = pd.DataFrame(train_xgb_accuracies, index=train_ids)
#for c in df.columns:
#    print(f'for {c} no. of complete training tasks is', (df.loc[:, c]==1).sum())
#    print(df[df.loc[:, c]==1].index)

#df = pd.DataFrame(eval_xgb_accuracies, index=eval_ids)
#for c in df.columns:
#    print(f'for {c} no. of complete evaluation tasks is', (df.loc[:, c]==1).sum())
#    print(df[df.loc[:, c]==1].index)

In [11]:
#df = pd.DataFrame(train_lgb_accuracies, index=train_ids)
#for c in df.columns:
#    print(f'for {c} no. of complete training tasks is', (df.loc[:, c]==1).sum())
#    print(df[df.loc[:, c]==1].index)

#df = pd.DataFrame(eval_lgb_accuracies, index=eval_ids)
#for c in df.columns:
#    print(f'for {c} no. of complete evaluation tasks is', (df.loc[:, c]==1).sum())
#    print(df[df.loc[:, c]==1].index)

In [12]:
#df = pd.DataFrame(train_cat_accuracies, index=train_ids)
#for c in df.columns:
#    print(f'for {c} no. of complete training tasks is', (df.loc[:, c]==1).sum())
#    print(df[df.loc[:, c]==1].index)

#df = pd.DataFrame(eval_cat_accuracies, index=eval_ids)
#for c in df.columns:
#    print(f'for {c} no. of complete evaluation tasks is', (df.loc[:, c]==1).sum())
#    print(df[df.loc[:, c]==1].index)

In [13]:
nfeat = 32  # number of feature columns allocated per cell by make_features
local_neighb = 5  # kept for backward compatibility; not read by ensemble
def ensemble(mode):
    """Fit XGBoost, LightGBM and CatBoost per task and submit all three predictions.

    For every task file of the selected split the three classifiers are
    trained on the per-cell features of the task's train pairs. Each test
    input is predicted by all three models and the flattened predictions are
    written to the submission frame joined by single spaces (one attempt per
    model). Tasks for which ``features`` reports a shape mismatch are skipped.

    Parameters
    ----------
    mode : {'train', 'eval', 'test'}
        Which task folder to process. Accuracies are only recorded for
        'train' and 'eval'.

    Returns
    -------
    tuple
        ``(sample_sub, model_accuracies, pred_taskids)``. ``model_accuracies``
        holds per-grid cell accuracies under 'xgb', 'lgb' and 'cat', and under
        'ens' the best of the three, since every model's prediction is
        submitted as its own attempt. ``pred_taskids`` are the matching
        ``<task>_<n>`` ids.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the supported splits.
    """
    split_paths = {'eval': evaluation_path, 'train': training_path, 'test': test_path}
    if mode not in split_paths:
        # Previously an unknown mode only failed later with an UnboundLocalError.
        raise ValueError(f"mode must be one of {sorted(split_paths)}, got {mode!r}")

    print(mode)
    count = 0
    sample_sub = pd.read_csv(data_path/'sample_submission.csv')
    sample_sub = sample_sub.set_index('output_id')

    model_accuracies = {'ens': [], 'xgb': [], 'lgb': [], 'cat': []}
    pred_taskids = []

    task_path = split_paths[mode]
    has_truth = mode in ('train', 'eval')
    all_task_ids = sorted(os.listdir(task_path))

    for task_id in all_task_ids:
        task_file = str(task_path / task_id)
        with open(task_file, 'r') as f:
            task = json.load(f)

        # training -----
        feat, target, not_valid = features(task)
        if not_valid:
            print('ignoring task', task_file)
            count += 1
            continue

        # NOTE(review): `num_leaves` is a LightGBM parameter name; confirm XGBoost honours it.
        model_xgb = XGBClassifier(n_estimators=50, max_depth = 5, num_leaves=10, learning_rate=0.1, n_jobs=-1)
        model_lgb = LGBMClassifier(n_estimators=60, max_depth=4, n_jobs=-1, learning_rate=0.25)
        model_cat = CatBoostClassifier(n_estimators=50, max_depth = 6, learning_rate=0.25)
        model_xgb.fit(feat, target, verbose=0)
        model_lgb.fit(feat, target, verbose=0)
        model_cat.fit(feat, target, verbose=0)
        # training on input pairs is done

        # test predictions begin here
        task_stem = task_id[:-5]  # strip the 5-character file extension ('.json')
        for task_num, test_pair in enumerate(task['test']):
            input_grid = test_pair['input']
            input_color = np.array(input_grid)
            nrows, ncols = len(input_grid), len(input_grid[0])
            feat = make_features(input_color, nfeat)
            preds_xgb = model_xgb.predict(feat).reshape(nrows,ncols)
            preds_lgb = model_lgb.predict(feat).reshape(nrows,ncols)
            preds_cat = model_cat.predict(feat).reshape(nrows,ncols)

            if has_truth:
                # The original code compared against an undefined `preds` here
                # (NameError); each model is now scored on its own prediction.
                truth = np.array(test_pair['output'])
                attempt_accs = []
                for name, preds in (('xgb', preds_xgb), ('lgb', preds_lgb), ('cat', preds_cat)):
                    # A test output of a different shape cannot match cell by cell.
                    if truth.shape == preds.shape:
                        acc = (truth==preds).sum()/(nrows*ncols)
                    else:
                        acc = 0.0
                    model_accuracies[name].append(acc)
                    attempt_accs.append(acc)
                model_accuracies['ens'].append(max(attempt_accs))
                pred_taskids.append(f'{task_stem}_{task_num}')

            preds_list = [flattener(preds_xgb.astype(int).tolist()), flattener(preds_lgb.astype(int).tolist()), flattener(preds_cat.astype(int).tolist()),]
            sample_sub.loc[f'{task_stem}_{task_num}','output'] = " ".join(preds_list)
    print(str(count)+" tasks were ignored.")
    return sample_sub, model_accuracies, pred_taskids

#_, train_xgb_accuracies, train_ids= ensemble('train')
#_, eval_xgb_accuracies, eval_ids  = ensemble('eval')

In [14]:
# Run the three-model ensemble on the test tasks, save the result as the
# submission file and preview its first rows.
test_sub, _, _ = ensemble('test')
test_sub.to_csv('submission.csv')
test_sub.head()

test
ignoring task /kaggle/input/abstraction-and-reasoning-challenge/test/00576224.json
ignoring task /kaggle/input/abstraction-and-reasoning-challenge/test/0692e18c.json
ignoring task /kaggle/input/abstraction-and-reasoning-challenge/test/0934a4d8.json
ignoring task /kaggle/input/abstraction-and-reasoning-challenge/test/0a1d4ef5.json
ignoring task /kaggle/input/abstraction-and-reasoning-challenge/test/0bb8deee.json
ignoring task /kaggle/input/abstraction-and-reasoning-challenge/test/0c786b71.json
ignoring task /kaggle/input/abstraction-and-reasoning-challenge/test/0c9aba6e.json
ignoring task /kaggle/input/abstraction-and-reasoning-challenge/test/12997ef3.json
ignoring task /kaggle/input/abstraction-and-reasoning-challenge/test/136b0064.json
ignoring task /kaggle/input/abstraction-and-reasoning-challenge/test/15696249.json
ignoring task /kaggle/input/abstraction-and-reasoning-challenge/test/195ba7dc.json
ignoring task /kaggle/input/abstraction-and-reasoning-challenge/test/1990f7a8.json

Unnamed: 0_level_0,output
output_id,Unnamed: 1_level_1
00576224_0,|32|78| |32|78| |00|00|
009d5c81_0,|00000000000000|00000222222222|00000200020307|...
00dbd492_0,|00000000000222220000|02222222220233320000|020...
03560426_0,|8188000000|8188000000|8282000000|8202000000|0...
05a7bcf2_0,|000000000020000000080000000000|00004444488888...
