In [13]:
import numpy as np
import pandas as pd
import json
import torch
from torch.utils.data import Dataset, DataLoader

def load_json(file_path):
    with open(file_path) as f:
        data = json.load(f)
    return data

train_challenge = './kaggle/input/arc-prize-2024/arc-agi_training_challenges.json'
train_solution = './kaggle/input/arc-prize-2024/arc-agi_training_solutions.json'

challenges = load_json(train_challenge)
solution = load_json(train_solution)

data = []

for key, value in challenges.items():
    for i in range(len(value['test'])):
        # 각 반복마다 새로운 입력과 출력을 저장
        task_input = value['test'][i]['input']
        task_output = solution[key][i]
    
        # 학습 데이터의 예제 입력 및 출력 수집
        example_input = [ex['input'] for ex in value['train']]
        example_output = [ex['output'] for ex in value['train']]
        # 데이터프레임으로 변환될 데이터를 리스트에 저장
        data.append({
            'id': key,
            'input': task_input,
            'output': task_output,
            'ex_input': example_input,
            'ex_output': example_output
        })

# 리스트를 데이터프레임으로 변환
df = pd.DataFrame(data)

# 30x30으로 패딩하는 함수
def pad_to_30x30(tensor):
    # 현재 tensor의 크기 얻기
    if tensor.dim() == 2:  # 2D tensor의 경우 (단일 채널 이미지)
        tensor = tensor.unsqueeze(0)  # 채널 차원 추가
    c, h, w = tensor.shape
    # 패딩 크기 계산
    pad_h = max(0, 30 - h)
    pad_w = max(0, 30 - w)
    
    # 패딩 적용
    if pad_h > 0 or pad_w > 0:
        tensor = torch.nn.functional.pad(tensor, (0, pad_w, 0, pad_h), mode='constant', value=0)
    
    # 자르기
    tensor = tensor[:, :30, :30]
    return tensor

def augment_data(input_data, output_data, target_samples):
    augmented_inputs = []
    augmented_outputs = []
    
    while len(augmented_inputs) < target_samples:
        for inp, out in zip(input_data, output_data):
            # 증강된 입력과 출력을 패딩하여 30x30 크기로 맞추기
            augmented_input = pad_to_30x30(torch.tensor(inp, dtype=torch.float32))
            augmented_output = pad_to_30x30(torch.tensor(out, dtype=torch.float32))
            augmented_inputs.append(augmented_input)
            augmented_outputs.append(augmented_output)
            if len(augmented_inputs) >= target_samples:
                break
    
    return augmented_inputs[:target_samples], augmented_outputs[:target_samples]

def augment_df(df, num_samples=50):
    for idx, row in df.iterrows():
        ex_inputs = row['ex_input']
        ex_outputs = row['ex_output']
        if len(ex_inputs) < num_samples:
            augmented_inputs, augmented_outputs = augment_data(ex_inputs, ex_outputs, num_samples)
            df.at[idx, 'ex_input'] = augmented_inputs
            df.at[idx, 'ex_output'] = augmented_outputs
        elif len(ex_inputs) > num_samples:
            df.at[idx, 'ex_input'] = ex_inputs[:num_samples]
            df.at[idx, 'ex_output'] = ex_outputs[:num_samples]
    return df

df_augmented = augment_df(df, num_samples=50)

class CustomDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        item = self.df.iloc[idx]
        input_tensor = pad_to_30x30(torch.tensor(item['input'], dtype=torch.float32))
        output_tensor = pad_to_30x30(torch.tensor(item['output'], dtype=torch.float32))
        ex_inputs = torch.stack([pad_to_30x30(torch.tensor(ex, dtype=torch.float32)) for ex in item['ex_input']])
        ex_outputs = torch.stack([pad_to_30x30(torch.tensor(ex, dtype=torch.float32)) for ex in item['ex_output']])
        return item['id'], input_tensor, output_tensor, ex_inputs, ex_outputs

def collate_fn(batch):
    batch_dict = {}
    for item_id, input_tensor, output_tensor, ex_inputs, ex_outputs in batch:
        if item_id not in batch_dict:
            batch_dict[item_id] = {
                'inputs': [],
                'outputs': [],
                'ex_inputs': [],
                'ex_outputs': []
            }
        batch_dict[item_id]['inputs'].append(input_tensor)
        batch_dict[item_id]['outputs'].append(output_tensor)
        batch_dict[item_id]['ex_inputs'].append(ex_inputs)
        batch_dict[item_id]['ex_outputs'].append(ex_outputs)
    
    final_batch = []
    for item_id, data in batch_dict.items():
        final_batch.append((
            item_id,
            torch.stack(data['inputs']),
            torch.stack(data['outputs']),
            torch.stack(data['ex_inputs']),
            torch.stack(data['ex_outputs'])
        ))

    return final_batch

# Dataset과 DataLoader 생성
dataset = CustomDataset(df_augmented)
data_loader = DataLoader(dataset, batch_size=3, collate_fn=collate_fn)

# DataLoader 사용 예시
for batch in data_loader:
    for item in batch:
        item_id, inputs, outputs, ex_inputs, ex_outputs = item
        print(f'ID: {item_id}')
        print(f'Input shape: {inputs.shape}')
        print(f'Output shape: {outputs.shape}')
        print(f'Example Input shape: {ex_inputs.shape}')
        print(f'Example Output shape: {ex_outputs.shape}')


  ex_inputs = torch.stack([pad_to_30x30(torch.tensor(ex, dtype=torch.float32)) for ex in item['ex_input']])
  ex_outputs = torch.stack([pad_to_30x30(torch.tensor(ex, dtype=torch.float32)) for ex in item['ex_output']])


ID: 007bbfb7
Input shape: torch.Size([1, 1, 30, 30])
Output shape: torch.Size([1, 1, 30, 30])
Example Input shape: torch.Size([1, 50, 1, 30, 30])
Example Output shape: torch.Size([1, 50, 1, 30, 30])
ID: 00d62c1b
Input shape: torch.Size([1, 1, 30, 30])
Output shape: torch.Size([1, 1, 30, 30])
Example Input shape: torch.Size([1, 50, 1, 30, 30])
Example Output shape: torch.Size([1, 50, 1, 30, 30])
ID: 017c7c7b
Input shape: torch.Size([1, 1, 30, 30])
Output shape: torch.Size([1, 1, 30, 30])
Example Input shape: torch.Size([1, 50, 1, 30, 30])
Example Output shape: torch.Size([1, 50, 1, 30, 30])
ID: 025d127b
Input shape: torch.Size([1, 1, 30, 30])
Output shape: torch.Size([1, 1, 30, 30])
Example Input shape: torch.Size([1, 50, 1, 30, 30])
Example Output shape: torch.Size([1, 50, 1, 30, 30])
ID: 045e512c
Input shape: torch.Size([1, 1, 30, 30])
Output shape: torch.Size([1, 1, 30, 30])
Example Input shape: torch.Size([1, 50, 1, 30, 30])
Example Output shape: torch.Size([1, 50, 1, 30, 30])
ID: 0

"12997ef3": 
[
    [
        [0, 2, 2, 0, 8, 8, 0, 3, 3, 0, 6, 6], 
        [2, 2, 0, 8, 8, 0, 3, 3, 0, 6, 6, 0], 
        [2, 2, 0, 8, 8, 0, 3, 3, 0, 6, 6, 0]
    ], 
    
    [
        [0, 7, 0], 
        [7, 7, 7], 
        [0, 7, 0], 
        [0, 6, 0], 
        [6, 6, 6], 
        [0, 6, 0], 
        [0, 3, 0], 
        [3, 3, 3], 
        [0, 3, 0], 
        [0, 2, 0], 
        [2, 2, 2], 
        [0, 2, 0]
    ]
]

|id|input|output|example_input|example_output|
|-|-|-|-|-|
1|[[1]]|[[2]]|[[],[],[]]|[[1],[2],[3]]|
1|[[1,2],[3,4],[5,6]]|[[2,3],[4,5],[6,7]]|[[],[],[]]|[[1],[2],[3]]|

In [None]:
i = 5

def load_json(file_path):
    with open(file_path) as f:
        data = json.load(f)
    return data

challenges = load_json(test_challenge)
solution = load_json(eval_solution)

data = []
max_len=0
for key, value in challenges.items():
    print(key)
    print(value)
    
    for i in range(len(value['test'])):
        # 각 반복마다 새로운 입력과 출력을 저장
        task_input = value['test'][i]['input']
        task_output = solution[key][i]
    
        # 학습 데이터의 예제 입력 및 출력 수집
        example_input = [ex['input'] for ex in value['train']]
        example_output = [ex['output'] for ex in value['train']]
        max_len = max(len(example_input), max_len)
        # 데이터프레임으로 변환될 데이터를 리스트에 저장
        data.append({
            'id': key,
            'input': task_input,
            'output': task_output,
            'ex_input': example_input,
            'ex_output': example_output
        })
print(max_len)
# 리스트를 데이터프레임으로 변환
df = pd.DataFrame(data)


007bbfb7
{'test': [{'input': [[7, 0, 7], [7, 0, 7], [7, 7, 0]]}], 'train': [{'input': [[0, 7, 7], [7, 7, 7], [0, 7, 7]], 'output': [[0, 0, 0, 0, 7, 7, 0, 7, 7], [0, 0, 0, 7, 7, 7, 7, 7, 7], [0, 0, 0, 0, 7, 7, 0, 7, 7], [0, 7, 7, 0, 7, 7, 0, 7, 7], [7, 7, 7, 7, 7, 7, 7, 7, 7], [0, 7, 7, 0, 7, 7, 0, 7, 7], [0, 0, 0, 0, 7, 7, 0, 7, 7], [0, 0, 0, 7, 7, 7, 7, 7, 7], [0, 0, 0, 0, 7, 7, 0, 7, 7]]}, {'input': [[4, 0, 4], [0, 0, 0], [0, 4, 0]], 'output': [[4, 0, 4, 0, 0, 0, 4, 0, 4], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 4, 0, 0, 0, 0, 0, 4, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 4, 0, 4, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 4, 0, 0, 0, 0]]}, {'input': [[0, 0, 0], [0, 0, 2], [2, 0, 2]], 'output': [[0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 2], [0, 0, 0, 0, 0, 0, 2, 0, 2], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 2, 0, 0, 0, 0, 0, 2]

KeyError: '007bbfb7'

In [None]:
t = df.iloc[9]['ex_input']


In [None]:
# 특정 컬럼에서 중복된 값을 가지는 행 찾기 (예: 'output' 컬럼)
duplicate_rows = df[df.duplicated(['ex_input'], keep=False)]
print("max_len:", max_len)
duplicate_rows
