In [1]:
import math
import numpy as np
import pandas as pd
import seaborn as sns
import warnings as wr
wr.filterwarnings('ignore')
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import category_encoders as ce
from collections import deque

# sklearn imports
import sklearn
import json

from scipy.stats import uniform
from scipy.cluster.hierarchy import dendrogram, linkage

from sklearn import metrics
from sklearn import pipeline
from sklearn import linear_model
from sklearn import preprocessing
from sklearn import neural_network
from sklearn import model_selection
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.compose import ColumnTransformer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import LeavePOut
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.feature_selection import RFE

from sklearn.model_selection import train_test_split 
from sklearn.neighbors import KNeighborsClassifier 

from sklearn.datasets import make_blobs
from sklearn.cluster import AgglomerativeClustering

from tqdm.auto import tqdm

# Seed constant for reproducibility. None of the cells visible in this part of
# the notebook read it — presumably it is passed to stochastic routines later
# on (TODO confirm).
random_seed = 42

In [2]:
# Kaggle notebook boilerplate.
# This Python 3 environment comes with many helpful analytics libraries installed.
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
#
# Walk the read-only input directory and print the full path of every file
# found, so the dataset file names are visible before they are loaded.
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/); it is
# preserved as output when you create a version using "Save & Run All".
# You can also write temporary files to /kaggle/temp/, but they won't be saved
# outside of the current session.

/kaggle/input/arc-prize-2024/arc-agi_training_solutions.json
/kaggle/input/arc-prize-2024/arc-agi_evaluation_solutions.json
/kaggle/input/arc-prize-2024/arc-agi_evaluation_challenges.json
/kaggle/input/arc-prize-2024/sample_submission.json
/kaggle/input/arc-prize-2024/arc-agi_training_challenges.json
/kaggle/input/arc-prize-2024/arc-agi_test_challenges.json


In [3]:
# Load the JSON data.
# `solutions_data` and `challenges_data` hold the parsed contents of the
# training solutions and training challenges files respectively.
with open('/kaggle/input/arc-prize-2024/arc-agi_training_solutions.json', 'r') as file:
    solutions_data = json.load(file)
with open('/kaggle/input/arc-prize-2024/arc-agi_training_challenges.json', 'r') as file:
    challenges_data = json.load(file)

# The same challenges file is read a second time, this time directly into a
# pandas DataFrame.
training_challenges_df = pd.read_json("/kaggle/input/arc-prize-2024/arc-agi_training_challenges.json")

In [4]:
def combine_dicts(dict1, dict2):
    """Merge per-task solutions with their challenge definitions.

    Parameters
    ----------
    dict1 : dict
        Maps a key to its solution payload.
    dict2 : dict
        Maps a key to a dict that has at least ``'test'`` and ``'train'``
        entries.

    Returns
    -------
    dict
        One entry per key of ``dict1`` that also appears in ``dict2`` (in
        ``dict1`` iteration order). Each entry has the form
        ``{'solution': [{'output': dict1[key]}], 'test': ..., 'train': ...}``.
        Keys missing from ``dict2`` are silently skipped. The values are
        referenced, not copied.
    """
    return {
        task_id: {
            'solution': [{'output': solution}],
            'test': dict2[task_id]['test'],
            'train': dict2[task_id]['train'],
        }
        for task_id, solution in dict1.items()
        if task_id in dict2
    }
# Merge the solutions and challenges into a single dict keyed by task id.
training_dict = combine_dicts(solutions_data, challenges_data)

# Print only the first entry as a sanity check.
# NOTE: because of `enumerate`, `key` is the running index (0, 1, ...) and
# `value` is the whole `(task_id, task_dict)` tuple, not the task dict alone.
i = 0
for key, value in enumerate(training_dict.items()):
    if i >= 1:
        break
    print(f"key is {key}: value is{value}")
    i += 1

key is 0: value is('007bbfb7', {'solution': [{'output': [[[7, 0, 7, 0, 0, 0, 7, 0, 7], [7, 0, 7, 0, 0, 0, 7, 0, 7], [7, 7, 0, 0, 0, 0, 7, 7, 0], [7, 0, 7, 0, 0, 0, 7, 0, 7], [7, 0, 7, 0, 0, 0, 7, 0, 7], [7, 7, 0, 0, 0, 0, 7, 7, 0], [7, 0, 7, 7, 0, 7, 0, 0, 0], [7, 0, 7, 7, 0, 7, 0, 0, 0], [7, 7, 0, 7, 7, 0, 0, 0, 0]]]}], 'test': [{'input': [[7, 0, 7], [7, 0, 7], [7, 7, 0]]}], 'train': [{'input': [[0, 7, 7], [7, 7, 7], [0, 7, 7]], 'output': [[0, 0, 0, 0, 7, 7, 0, 7, 7], [0, 0, 0, 7, 7, 7, 7, 7, 7], [0, 0, 0, 0, 7, 7, 0, 7, 7], [0, 7, 7, 0, 7, 7, 0, 7, 7], [7, 7, 7, 7, 7, 7, 7, 7, 7], [0, 7, 7, 0, 7, 7, 0, 7, 7], [0, 0, 0, 0, 7, 7, 0, 7, 7], [0, 0, 0, 7, 7, 7, 7, 7, 7], [0, 0, 0, 0, 7, 7, 0, 7, 7]]}, {'input': [[4, 0, 4], [0, 0, 0], [0, 4, 0]], 'output': [[4, 0, 4, 0, 0, 0, 4, 0, 4], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 4, 0, 0, 0, 0, 0, 4, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 4, 0, 4, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 

In [5]:

def create_dataframe(data):
    """Flatten the combined task dict into one DataFrame row per task.

    Parameters
    ----------
    data : dict
        Maps a task id to a dict with ``'solution'``, ``'test'`` and
        ``'train'`` entries (the structure built by ``combine_dicts``).

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``solution``, ``test`` plus ``train_input_<k>`` /
        ``train_output_<k>`` for every training pair (``k`` starts at 1).
        Tasks with fewer training pairs than the widest task get missing
        values in the extra columns.

    Notes
    -----
    Only the first solution grid and the first test input of each task are
    kept; any further ones are ignored.
    """
    def to_record(task_id, task):
        # Fixed columns first so they lead the resulting frame.
        row = {
            'id': task_id,
            'solution': task['solution'][0]['output'][0],  # first solution only
            'test': task['test'][0]['input'],  # first test input only
        }
        # One input/output column pair per training example, numbered from 1.
        for position, pair in enumerate(task['train'], start=1):
            row[f'train_input_{position}'] = pair['input']
            row[f'train_output_{position}'] = pair['output']
        return row

    return pd.DataFrame([to_record(task_id, task) for task_id, task in data.items()])


# Build the per-task frame from the merged dict, then preview the first five rows.
transform_df = create_dataframe(training_dict)
transform_df.head(5)

Unnamed: 0,id,solution,test,train_input_1,train_output_1,train_input_2,train_output_2,train_input_3,train_output_3,train_input_4,...,train_input_6,train_output_6,train_input_7,train_output_7,train_input_8,train_output_8,train_input_9,train_output_9,train_input_10,train_output_10
0,007bbfb7,"[[7, 0, 7, 0, 0, 0, 7, 0, 7], [7, 0, 7, 0, 0, ...","[[7, 0, 7], [7, 0, 7], [7, 7, 0]]","[[0, 7, 7], [7, 7, 7], [0, 7, 7]]","[[0, 0, 0, 0, 7, 7, 0, 7, 7], [0, 0, 0, 7, 7, ...","[[4, 0, 4], [0, 0, 0], [0, 4, 0]]","[[4, 0, 4, 0, 0, 0, 4, 0, 4], [0, 0, 0, 0, 0, ...","[[0, 0, 0], [0, 0, 2], [2, 0, 2]]","[[0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, ...","[[6, 6, 0], [6, 0, 0], [0, 6, 6]]",...,,,,,,,,,,
1,00d62c1b,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0], [0, 0, 3, 0, 0, 0], [0, 3...","[[0, 0, 0, 0, 0, 0], [0, 0, 3, 0, 0, 0], [0, 3...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 3, 0, ...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 3, 0, ...","[[0, 0, 0, 0, 0, 3, 0, 0, 0, 0], [0, 0, 0, 0, ...","[[0, 0, 0, 0, 0, 3, 0, 0, 0, 0], [0, 0, 0, 0, ...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 3, 3, ...",...,,,,,,,,,,
2,017c7c7b,"[[2, 2, 2], [0, 2, 0], [0, 2, 0], [2, 2, 2], [...","[[1, 1, 1], [0, 1, 0], [0, 1, 0], [1, 1, 1], [...","[[0, 1, 0], [1, 1, 0], [0, 1, 0], [0, 1, 1], [...","[[0, 2, 0], [2, 2, 0], [0, 2, 0], [0, 2, 2], [...","[[0, 1, 0], [1, 0, 1], [0, 1, 0], [1, 0, 1], [...","[[0, 2, 0], [2, 0, 2], [0, 2, 0], [2, 0, 2], [...","[[0, 1, 0], [1, 1, 0], [0, 1, 0], [0, 1, 0], [...","[[0, 2, 0], [2, 2, 0], [0, 2, 0], [0, 2, 0], [...",,...,,,,,,,,,,
3,025d127b,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 4, 4, ...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 4, 4, 4, ...","[[0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 6, 6, 6, 0, ...","[[0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 6, 6, 6, ...","[[0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 8, 8, 8, 8, ...","[[0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 8, 8, 8, ...",,,,...,,,,,,,,,,
4,045e512c,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6,...",,...,,,,,,,,,,


## Adding simple object-description columns

In [6]:
def find_shapes_without_wrap(input_grid):
    """Split a grid into 4-connected regions of equal value (no edge wrap).

    Every cell belongs to exactly one region; no value is treated as
    background. Regions are discovered by scanning the grid row by row and
    flood-filling breadth-first from each cell that has not been seen yet.

    Parameters
    ----------
    input_grid : sequence of sequences
        2-D grid indexed as ``input_grid[row][col]``. The width is taken from
        the first row.

    Returns
    -------
    list
        One ``[flat_indices, size, value]`` entry per region, where
        ``flat_indices`` lists the member cells as ``row * width + col`` in
        breadth-first visiting order, ``size`` is the number of cells and
        ``value`` is the shared cell value. An empty grid yields ``[]``.
    """
    from collections import deque

    n_rows = len(input_grid)
    n_cols = len(input_grid[0]) if n_rows else 0
    seen = [[False] * len(row) for row in input_grid]
    steps = ((-1, 0), (1, 0), (0, -1), (0, 1))  # up, down, left, right

    def flood_fill(origin, colour):
        """Collect all cells reachable from ``origin`` that hold ``colour``."""
        seen[origin[0]][origin[1]] = True
        members = [origin]
        frontier = deque([origin])

        while frontier:
            r, c = frontier.popleft()
            for dr, dc in steps:
                rr, cc = r + dr, c + dc
                # Neighbours outside the grid are dropped rather than wrapped.
                if not (0 <= rr < n_rows and 0 <= cc < n_cols):
                    continue
                if seen[rr][cc] or input_grid[rr][cc] != colour:
                    continue
                seen[rr][cc] = True
                frontier.append((rr, cc))
                members.append((rr, cc))

        return members

    shapes = []
    for r in range(n_rows):
        for c in range(n_cols):
            if seen[r][c]:
                continue
            colour = input_grid[r][c]
            members = flood_fill((r, c), colour)
            flat = [mr * n_cols + mc for mr, mc in members]
            shapes.append([flat, len(members), colour])

    return shapes


In [7]:

def find_shapes_with_wrap(input_grid):
    """Split a grid into 4-connected regions of equal value, wrapping at edges.

    Same idea as the non-wrapping variant, except that the grid is treated as
    a torus: the neighbour of a cell on one border is the cell on the opposite
    border of the same row/column. Every cell belongs to exactly one region;
    no value is treated as background.

    Parameters
    ----------
    input_grid : sequence of sequences
        2-D grid indexed as ``input_grid[row][col]``. The width is taken from
        the first row.

    Returns
    -------
    list
        One ``[flat_indices, size, value]`` entry per region, where
        ``flat_indices`` lists the member cells as ``row * width + col`` in
        breadth-first visiting order. An empty grid yields ``[]`` (previously
        this raised ``IndexError``), matching ``find_shapes_without_wrap``.
    """
    rows = len(input_grid)
    if rows == 0:
        # Nothing to scan; also avoids indexing input_grid[0] below.
        return []
    cols = len(input_grid[0])

    visited = [[False] * cols for _ in range(rows)]
    shapes = []

    def bfs_with_wrap(start, num):
        """Breadth-first flood fill from ``start`` over cells equal to ``num``."""
        queue = deque([start])
        visited[start[0]][start[1]] = True
        shape_positions = [start]

        while queue:
            x, y = queue.popleft()
            for dx, dy in ((-1, 0), (1, 0), (0, -1), (0, 1)):
                # Modulo maps out-of-range coordinates onto the opposite edge.
                nx, ny = (x + dx) % rows, (y + dy) % cols
                if not visited[nx][ny] and input_grid[nx][ny] == num:
                    visited[nx][ny] = True
                    queue.append((nx, ny))
                    shape_positions.append((nx, ny))

        return shape_positions

    for i in range(rows):
        for j in range(cols):
            if not visited[i][j]:
                num = input_grid[i][j]
                shape_positions = bfs_with_wrap((i, j), num)
                flat_positions = [x * cols + y for x, y in shape_positions]
                shapes.append([flat_positions, len(flat_positions), num])

    return shapes


In [8]:
def apply_shape_functions(transform_df):
    """Add shape-description columns for each training input/output grid.

    For every pair index ``i`` from 1 to 10 whose ``train_input_<i>`` and
    ``train_output_<i>`` columns both exist, four new columns are written:
    ``shape_info_no_wrap_input_<i>``, ``shape_info_no_wrap_output_<i>``,
    ``shape_info_wrap_input_<i>`` and ``shape_info_wrap_output_<i>``. They hold
    the results of ``find_shapes_without_wrap`` / ``find_shapes_with_wrap`` on
    the corresponding grid. Rows where the source grid is missing are left
    without a value.

    Parameters
    ----------
    transform_df : pandas.DataFrame
        Frame with ``train_input_<i>`` / ``train_output_<i>`` columns. It is
        modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``transform_df`` object, with the new columns added.
    """
    for i in range(1, 11):
        input_col, output_col = f'train_input_{i}', f'train_output_{i}'

        # Skip pair indices that this frame does not have.
        available = transform_df.columns
        if input_col not in available or output_col not in available:
            continue

        # Only rows that actually carry a grid are processed.
        mask_input = transform_df[input_col].notna()
        mask_output = transform_df[output_col].notna()

        # (target column, source column, row mask, shape finder), listed in the
        # order in which the new columns are appended to the frame.
        jobs = (
            (f'shape_info_no_wrap_input_{i}', input_col, mask_input, find_shapes_without_wrap),
            (f'shape_info_no_wrap_output_{i}', output_col, mask_output, find_shapes_without_wrap),
            (f'shape_info_wrap_input_{i}', input_col, mask_input, find_shapes_with_wrap),
            (f'shape_info_wrap_output_{i}', output_col, mask_output, find_shapes_with_wrap),
        )
        for target_col, source_col, rows, finder in jobs:
            transform_df.loc[rows, target_col] = transform_df.loc[rows, source_col].apply(finder)

    return transform_df

# Apply the function to the DataFrame (the frame is extended in place and
# the same object is returned).
transform_df = apply_shape_functions(transform_df)

# Display the first row of the DataFrame
transform_df.head(1)

Unnamed: 0,id,solution,test,train_input_1,train_output_1,train_input_2,train_output_2,train_input_3,train_output_3,train_input_4,...,shape_info_wrap_input_8,shape_info_wrap_output_8,shape_info_no_wrap_input_9,shape_info_no_wrap_output_9,shape_info_wrap_input_9,shape_info_wrap_output_9,shape_info_no_wrap_input_10,shape_info_no_wrap_output_10,shape_info_wrap_input_10,shape_info_wrap_output_10
0,007bbfb7,"[[7, 0, 7, 0, 0, 0, 7, 0, 7], [7, 0, 7, 0, 0, ...","[[7, 0, 7], [7, 0, 7], [7, 7, 0]]","[[0, 7, 7], [7, 7, 7], [0, 7, 7]]","[[0, 0, 0, 0, 7, 7, 0, 7, 7], [0, 0, 0, 7, 7, ...","[[4, 0, 4], [0, 0, 0], [0, 4, 0]]","[[4, 0, 4, 0, 0, 0, 4, 0, 4], [0, 0, 0, 0, 0, ...","[[0, 0, 0], [0, 0, 2], [2, 0, 2]]","[[0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, ...","[[6, 6, 0], [6, 0, 0], [0, 6, 6]]",...,,,,,,,,,,


In [9]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering
