# Create new data partitions

## Goal

Let's create a new train/val partition that keeps only 100 evaluation samples for validation. That way I can use more data for training, and I will be able to better tune the parameters of the test-time fine-tuning.

## Imports

In [None]:
import os
import json
import random
import matplotlib.pyplot as plt
from matplotlib import colors
import matplotlib as mpl
import numpy as np

# Draw and discard an empty figure before touching rcParams.
# NOTE(review): presumably this forces the notebook backend to initialise so
# the settings below are not reset afterwards; confirm before removing.
plt.plot()
plt.close('all')

# Notebook-wide plotting defaults: wide figures, thick lines, larger text.
mpl.rcParams.update({
    "figure.figsize": (20, 5),
    'lines.linewidth': 3,
    'font.size': 16,
})

## Load data

In [None]:
# Load the evaluation challenges first, then the training challenges.
with open('/mnt/hdd0/Kaggle/arc24/data/arc-agi_evaluation_challenges.json') as eval_file:
    eval_data = json.load(eval_file)

with open('/mnt/hdd0/Kaggle/arc24/data/arc-agi_training_challenges.json') as train_file:
    train_data = json.load(train_file)

# Cell output: number of top-level entries in each file (evaluation, training).
(len(eval_data), len(train_data))

In [None]:
# Folder that receives the generated partition files; created on first run
# and left untouched if it already exists.
output_dir = '/mnt/hdd0/Kaggle/arc24/data/new_partitions'
os.makedirs(name=output_dir, exist_ok=True)

## Create new partition

In [None]:
def create_new_partition(random_seed, n_val=100):
    """Split the evaluation set into a validation set and extra training data.

    Samples ``n_val`` ids from the module-level ``eval_data`` without
    replacement and writes two JSON files into ``output_dir``:

    * ``val_rs{random_seed}.json``: the sampled evaluation entries.
    * ``train_rs{random_seed}.json``: every entry of ``train_data`` plus the
      evaluation entries that were not sampled.

    The sizes of both partitions are printed after the files are written.

    Args:
        random_seed: Seed for the sampling; also used in the output file names.
        n_val: Number of evaluation entries to hold out for validation.

    Raises:
        ValueError: If ``n_val`` exceeds the number of evaluation entries
            (raised by the NumPy sampler).
    """
    # A dedicated RandomState draws the same ids as seeding the global
    # generator with this seed, but leaves NumPy's global random state alone.
    rng = np.random.RandomState(random_seed)
    # .tolist() turns the NumPy string scalars back into plain ``str`` keys.
    val_ids = rng.choice(list(eval_data), n_val, replace=False).tolist()
    # Set lookup is O(1); testing membership against the NumPy array would
    # scan it element-wise for every evaluation key.
    val_id_set = set(val_ids)

    val_data = {task_id: eval_data[task_id] for task_id in val_ids}

    # Shallow copy so the module-level ``train_data`` is not modified.
    # NOTE(review): an evaluation id that also exists in ``train_data`` would
    # overwrite the training entry here; confirm the two id sets are disjoint.
    new_train_data = dict(train_data)
    new_train_data.update(
        {task_id: task for task_id, task in eval_data.items() if task_id not in val_id_set}
    )

    with open(f'{output_dir}/val_rs{random_seed}.json', 'w') as f:
        json.dump(val_data, f)

    with open(f'{output_dir}/train_rs{random_seed}.json', 'w') as f:
        json.dump(new_train_data, f)

    print(f'Created partition {random_seed}')
    print(f'Val size: {len(val_data)}')
    print(f'Train size: {len(new_train_data)}')


In [None]:
# Generate the partition for seed 7 with the default validation size.
create_new_partition(random_seed=7)