In [2]:
import json
import numpy as np
import pandas as pd

train_challenge = './kaggle/input/arc-prize-2024/arc-agi_training_challenges.json'
train_solution = "./kaggle/input/arc-prize-2024/arc-agi_training_solutions.json"

eval_challenge = './kaggle/input/arc-prize-2024/arc-agi_evaluation_challenges.json'
eval_solution = './kaggle/input/arc-prize-2024/arc-agi_evaluation_solutions.json'

test_challenge = './kaggle/input/arc-prize-2024/arc-agi_test_challenges.json'
test_solution = './kaggle/input/arc-prize-2024/sample_submission.json'


def DataMaker(challenge_data, solution_data):
    
    # Loading the data that contains the "challenge"
    challenge = pd.read_json(challenge_data)
    
    # Loading the data that contains the "Solution"
    with open(solution_data) as json_data:
        solution = json.load(json_data) 
        
    # getting alll the id values present in the dataset
    all_ids = list(challenge.columns)
    
    # concatinating along the test the way it is done for the train part
    for i in all_ids:
        
        # Getting the value of each cell for challenge dataset
        substitute = challenge[f'{i}']['test'][0]
        
        # Creating a new "output" key value pair
        substitute['output'] = solution[f'{i}'][0] 
        
        # Changing the value to "input : []" and "output : []"
        # instead of "input : []"
        challenge[f'{i}']['test'] = substitute       
        
        
    return challenge

def InputOutputDataset(df):
    
    all_ids = list(df.columns)
    new_df = pd.DataFrame(columns= ['id','input','output','input_shape','output_shape'])
    for i in all_ids:
        size = len(df[i]['train'])
        for j in range(size) :
            ip = df[i]['train'][j]['input']
            op = df[i]['train'][j]['output']
            ip_shape = np.array(df[i]['train'][j]['input']).shape
            op_shape = np.array(df[i]['train'][j]['output']).shape
            temp_df = pd.DataFrame()
            temp_df['id'] = [f'{i}_train_{j}']
            temp_df['input'] = [ip]
            temp_df['output'] = [op]
            temp_df['input_shape'] = [ip_shape]
            temp_df['output_shape'] = [op_shape]

            new_df = new_df._append(temp_df,ignore_index = True)
    
        ip = df[i]['test']['input']
        op = df[i]['test']['output']
        ip_shape = np.array(df[i]['test']['input']).shape
        op_shape = np.array(df[i]['test']['output']).shape
        temp_df = pd.DataFrame()
        temp_df['id'] = [f'{i}_test']
        temp_df['input'] = [ip]
        temp_df['output'] = [op]
        temp_df['input_shape'] = [ip_shape]
        temp_df['output_shape'] = [op_shape]
        new_df = new_df._append(temp_df,ignore_index = True)

    return new_df

new_train_data = DataMaker(train_challenge,train_solution)
new_train_data = InputOutputDataset(new_train_data)

new_train_data.head()

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  challenge[f'{i}']['test'] = substitute


Unnamed: 0,id,input,output,input_shape,output_shape
0,007bbfb7_train_0,"[[0, 7, 7], [7, 7, 7], [0, 7, 7]]","[[0, 0, 0, 0, 7, 7, 0, 7, 7], [0, 0, 0, 7, 7, ...","(3, 3)","(9, 9)"
1,007bbfb7_train_1,"[[4, 0, 4], [0, 0, 0], [0, 4, 0]]","[[4, 0, 4, 0, 0, 0, 4, 0, 4], [0, 0, 0, 0, 0, ...","(3, 3)","(9, 9)"
2,007bbfb7_train_2,"[[0, 0, 0], [0, 0, 2], [2, 0, 2]]","[[0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, ...","(3, 3)","(9, 9)"
3,007bbfb7_train_3,"[[6, 6, 0], [6, 0, 0], [0, 6, 6]]","[[6, 6, 0, 6, 6, 0, 0, 0, 0], [6, 0, 0, 6, 0, ...","(3, 3)","(9, 9)"
4,007bbfb7_train_4,"[[2, 2, 2], [0, 0, 0], [0, 2, 2]]","[[2, 2, 2, 2, 2, 2, 2, 2, 2], [0, 0, 0, 0, 0, ...","(3, 3)","(9, 9)"


"12997ef3": 
[
    [
        [0, 2, 2, 0, 8, 8, 0, 3, 3, 0, 6, 6], 
        [2, 2, 0, 8, 8, 0, 3, 3, 0, 6, 6, 0], 
        [2, 2, 0, 8, 8, 0, 3, 3, 0, 6, 6, 0]
    ], 
    
    [
        [0, 7, 0], 
        [7, 7, 7], 
        [0, 7, 0], 
        [0, 6, 0], 
        [6, 6, 6], 
        [0, 6, 0], 
        [0, 3, 0], 
        [3, 3, 3], 
        [0, 3, 0], 
        [0, 2, 0], 
        [2, 2, 2], 
        [0, 2, 0]
    ]
]

|id|input|output|example_input|example_output|
|-|-|-|-|-|
1|[[1]]|[[2]]|[[],[],[]]|[[1],[2],[3]]|
1|[[1,2],[3,4],[5,6]]|[[2,3],[4,5],[6,7]]|[[],[],[]]|[[1],[2],[3]]|

In [32]:
i = 5

def load_json(file_path):
    with open(file_path) as f:
        data = json.load(f)
    return data

challenges = load_json(test_challenge)
solution = load_json(eval_solution)

data = []
max_len=0
for key, value in challenges.items():
    print(key)
    print(value)
    
    for i in range(len(value['test'])):
        # 각 반복마다 새로운 입력과 출력을 저장
        task_input = value['test'][i]['input']
        task_output = solution[key][i]
    
        # 학습 데이터의 예제 입력 및 출력 수집
        example_input = [ex['input'] for ex in value['train']]
        example_output = [ex['output'] for ex in value['train']]
        max_len = max(len(example_input), max_len)
        # 데이터프레임으로 변환될 데이터를 리스트에 저장
        data.append({
            'id': key,
            'input': task_input,
            'output': task_output,
            'ex_input': example_input,
            'ex_output': example_output
        })
print(max_len)
# 리스트를 데이터프레임으로 변환
df = pd.DataFrame(data)


007bbfb7
{'test': [{'input': [[7, 0, 7], [7, 0, 7], [7, 7, 0]]}], 'train': [{'input': [[0, 7, 7], [7, 7, 7], [0, 7, 7]], 'output': [[0, 0, 0, 0, 7, 7, 0, 7, 7], [0, 0, 0, 7, 7, 7, 7, 7, 7], [0, 0, 0, 0, 7, 7, 0, 7, 7], [0, 7, 7, 0, 7, 7, 0, 7, 7], [7, 7, 7, 7, 7, 7, 7, 7, 7], [0, 7, 7, 0, 7, 7, 0, 7, 7], [0, 0, 0, 0, 7, 7, 0, 7, 7], [0, 0, 0, 7, 7, 7, 7, 7, 7], [0, 0, 0, 0, 7, 7, 0, 7, 7]]}, {'input': [[4, 0, 4], [0, 0, 0], [0, 4, 0]], 'output': [[4, 0, 4, 0, 0, 0, 4, 0, 4], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 4, 0, 0, 0, 0, 0, 4, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 4, 0, 4, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 4, 0, 0, 0, 0]]}, {'input': [[0, 0, 0], [0, 0, 2], [2, 0, 2]], 'output': [[0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 2], [0, 0, 0, 0, 0, 0, 2, 0, 2], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 2, 0, 0, 0, 0, 0, 2]

KeyError: '007bbfb7'

In [29]:
t = df.iloc[9]['ex_input']


In [None]:
# 특정 컬럼에서 중복된 값을 가지는 행 찾기 (예: 'output' 컬럼)
duplicate_rows = df[df.duplicated(['ex_input'], keep=False)]
print("max_len:", max_len)
duplicate_rows
