# Create new data partitions

## Goal

Let's create a new train/val partition that keeps only 100 evaluation samples for validation. That way I can use more data for training, and I will be able to better tune the parameters of the test-time fine-tuning.

## Imports

In [None]:
import os
import json
import random
import matplotlib.pyplot as plt
from matplotlib import colors
import matplotlib as mpl
import numpy as np

# Create and immediately close a throwaway figure before touching rcParams.
# NOTE(review): presumably a workaround so the notebook plotting backend is
# initialised before the defaults below are overridden — confirm.
plt.plot()
plt.close('all')
# Global plotting defaults: figure size (20, 5), line width 3, font size 16.
plt.rcParams["figure.figsize"] = (20, 5)
mpl.rcParams['lines.linewidth'] = 3
mpl.rcParams['font.size'] = 16

## Load data

In [None]:
def load_arc_data_with_solutions(filepath):
    """Load a challenges JSON file and attach the test solutions when available.

    The solutions path is derived by replacing ``challenges.json`` with
    ``solutions.json`` in ``filepath``. If that yields a different path that
    exists on disk, each test sample of every task receives an ``'output'``
    entry taken from the solutions file (matched by task id and test index).
    Otherwise a message is printed and the data is returned as loaded.

    Args:
        filepath: Path to the challenges JSON file.

    Returns:
        The loaded dictionary, with test outputs filled in when a solutions
        file was found.
    """
    with open(filepath, 'r') as challenges_file:
        data = json.load(challenges_file)

    solutions_filepath = filepath.replace('challenges.json', 'solutions.json')
    has_solutions_file = (
        solutions_filepath != filepath and os.path.exists(solutions_filepath)
    )
    if not has_solutions_file:
        print('No solutions file found, the solutions should already be in the data')
        return data

    with open(solutions_filepath, 'r') as solutions_file:
        solutions = json.load(solutions_file)

    for task_id in data:
        test_samples = data[task_id]['test']
        for position, test_sample in enumerate(test_samples):
            test_sample['output'] = solutions[task_id][position]
    return data

In [None]:
# Load the evaluation and training challenge files; load_arc_data_with_solutions
# merges the test outputs in when a matching solutions file exists.
eval_data = load_arc_data_with_solutions('/mnt/hdd0/Kaggle/arc24/data/arc-agi_evaluation_challenges.json')
train_data = load_arc_data_with_solutions('/mnt/hdd0/Kaggle/arc24/data/arc-agi_training_challenges.json')
# Display the number of entries in each dataset.
len(eval_data), len(train_data)

In [None]:
# Directory where the generated partition files are written; created if missing.
output_dir = '/mnt/hdd0/Kaggle/arc24/data/new_partitions'
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Merge both datasets into one dictionary and save it. Evaluation entries are
# inserted first; on a duplicate id the training entry overrides the value.
all_data = dict(eval_data)
all_data.update(train_data)
with open(f'{output_dir}/arc-agi_all_challenges.json', 'w') as f:
    json.dump(all_data, f)

## Create new partition

In [None]:
def create_new_partition(random_seed, n_val=100):
    """Split the evaluation set into a validation subset and extra training data.

    Randomly selects ``n_val`` entries from the global ``eval_data`` as the
    validation set. The remaining evaluation entries are added to a copy of
    the global ``train_data``. Both dictionaries are written to ``output_dir``
    as ``val_rs{random_seed}.json`` and ``train_rs{random_seed}.json``, and a
    short summary is printed.

    Args:
        random_seed: Seed for the random selection; also used in the output
            file names.
        n_val: Number of evaluation entries kept for validation.

    Raises:
        ValueError: Raised by NumPy when ``n_val`` is larger than the number
            of entries in ``eval_data`` (sampling is without replacement).
    """
    # A private RandomState draws exactly the same sample as
    # np.random.seed(random_seed) followed by np.random.choice(...), but it
    # does not reseed the global NumPy generator as a side effect.
    rng = np.random.RandomState(random_seed)
    # .tolist() turns the NumPy string scalars back into plain Python str.
    val_ids = rng.choice(list(eval_data.keys()), n_val, replace=False).tolist()
    # Set membership is O(1); testing `k in ndarray` scans the array each time.
    val_id_set = set(val_ids)

    val_data = {k: eval_data[k] for k in val_ids}
    new_train_data = train_data.copy()
    new_train_data.update(
        {k: value for k, value in eval_data.items() if k not in val_id_set}
    )

    with open(f'{output_dir}/val_rs{random_seed}.json', 'w') as f:
        json.dump(val_data, f)

    with open(f'{output_dir}/train_rs{random_seed}.json', 'w') as f:
        json.dump(new_train_data, f)

    print(f'Created partition {random_seed}')
    print(f'Val size: {len(val_data)}')
    print(f'Train size: {len(new_train_data)}')


In [None]:
# Generate the partition for random seed 7 with the default validation size (n_val=100).
create_new_partition(7)