# Generate new inputs v2

## Goal

In this second notebook the inputs are generated with the `inference` script. I will use this notebook
to visualize the generated inputs and to optimize the data and parameters.

## Imports

In [None]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import json
import sys
import os
import random
import numpy as np
from itertools import islice

sys.path.append(os.path.realpath('../scripts/'))
from evaluation import plot_grid
from arc24.data import load_arc_data_with_solutions
from voting import get_unique_matrices_and_counts_sorted

# Notebook-wide plotting defaults.
# An empty plot is drawn and closed before the settings are changed,
# presumably so the backend is initialised first and does not overwrite them
# afterwards -- TODO confirm this is still needed.
plt.plot()
plt.close('all')
# plt.rcParams and mpl.rcParams are the same object, so one update is enough.
mpl.rcParams.update({
    'figure.figsize': (25, 4),
    'lines.linewidth': 3,
    'font.size': 16,
})

## Create dataset for inference

I have to create a new version of the data that uses all the available samples as test samples.
That way I will create wrong predictions for all the samples of each task.

In [None]:
# Load the ARC training challenges. The helper name suggests the solutions are
# merged into each task -- confirm in arc24.data.
training_data = load_arc_data_with_solutions('/mnt/hdd0/Kaggle/arc24/data/arc-agi_training_challenges.json')

In [None]:
def create_all_test_dataset(dataset):
    """Build a leave-one-out version of ``dataset``.

    Every sample of a task (train samples first, then test samples) becomes
    the single test sample of a new task keyed ``<task_id>_<index>``; all the
    other samples of the original task become its train samples.

    Args:
        dataset: mapping ``task_id -> {'train': [...], 'test': [...]}``.

    Returns:
        dict with one entry per sample of every task. A summary line with the
        number of created entries is printed as a side effect.
    """
    expanded = {}
    for task_id, task in dataset.items():
        pool = task['train'] + task['test']
        expanded.update({
            f'{task_id}_{position}': {
                'train': pool[:position] + pool[position + 1:],
                'test': [held_out],
            }
            for position, held_out in enumerate(pool)
        })
    print(f'Created dataset with {len(expanded)} samples from {len(dataset)} tasks')
    return expanded

In [None]:
# Expand every training task into one leave-one-out task per sample.
training_all_test_dataset = create_all_test_dataset(training_data)

In [None]:
# Save the expanded dataset as JSON; this is the file passed as --dataset_path
# to the inference script.
with open('/mnt/hdd0/Kaggle/arc24/data/all_test/training.json', 'w') as f:
    json.dump(training_all_test_dataset, f)

## Inference

```bash

# The temperature is kept in a variable because it is used both as the
# sampling parameter and as part of the output filename.
export temperature=9e-1
python inference.py \
--model_path /mnt/hdd0/Kaggle/arc24/models/20240925_submission_models/06_continue-full-fine-tuning-Qwen2.5-0.5B-Instruct_lr1.5e-5_1e5steps_2gpus_8192msl/checkpoint-100000 \
--prompt_version output-from-examples-v1 \
--dataset_path /mnt/hdd0/Kaggle/arc24/data/all_test/training.json \
--output_filepath /mnt/hdd0/Kaggle/arc24/debug/outputs_for_verifiying/first_steps_t${temperature}.json \
--predictions_per_task 8 \
--temperature ${temperature}
```

1200 seconds to generate 8 predictions per task for the 1718 tasks.

## Visualize generated outputs

In [None]:
def visualize_generated_outputs(filepath, max_plots, random_seed=None):
    """Plot the distinct predictions generated for a random subset of tasks.

    Args:
        filepath: path to a JSON file read as a mapping ``task_id -> list`` of
            dicts, each dict holding the generations of one sample (presumably
            the file written by the inference script -- confirm).
        max_plots: maximum number of tasks to sample. Samples whose generations
            collapse to at most one distinct non-empty output are skipped, so
            fewer figures than this may be shown.
        random_seed: seed for the task selection; ``None`` gives a
            non-reproducible selection.
    """
    with open(filepath, 'r') as f:
        data = json.load(f)
    # A private generator picks the same tasks as seeding the global one would,
    # without clobbering the random state used by the rest of the notebook.
    rng = random.Random(random_seed)
    task_ids = rng.sample(list(data), min(len(data), max_plots))

    for task_id in task_ids:
        for sample_generations in data[task_id]:
            # Drop empty generations before looking for distinct outputs.
            outputs = [output for output in sample_generations.values() if output]
            outputs, counts = get_unique_matrices_and_counts_sorted(outputs)
            if len(outputs) <= 1:
                # Nothing to compare when every generation agrees.
                continue
            print(task_id)
            for plot_idx, (output, count) in enumerate(zip(outputs, counts), 1):
                # The column count is the number of raw generations, not of
                # distinct outputs, so the subplot width is the same for every
                # figure built from the same number of generations.
                plt.subplot(1, len(sample_generations), plot_idx)
                plot_grid(output)
                plt.title(f'Count: {count}')
            plt.show()

In [None]:
# Show the distinct predictions of up to 10 randomly chosen tasks; the fixed
# seed makes the same tasks come up on every run.
visualize_generated_outputs('/mnt/hdd0/Kaggle/arc24/debug/outputs_for_verifiying/first_steps_t1.4.json', 10, random_seed=42)

## Measure available training samples

Let's measure how many training samples we have generated. A training sample is a unique prediction that differs from the ground truth.

In [None]:
def measure_available_training_samples(inference_filepath,
                                       dataset_filepath='/mnt/hdd0/Kaggle/arc24/data/all_test/training.json'):
    """Count, per task, the distinct predictions that differ from the ground truth.

    Only the first entry of each task in the inference file is inspected, and
    it is compared against the output of the first test sample of the same
    task in the dataset.

    Args:
        inference_filepath: path to a JSON file read as a mapping
            ``task_id -> list`` of dicts of predictions.
        dataset_filepath: path to the JSON dataset that holds the ground truth
            under ``[task_id]['test'][0]['output']``.

    Returns:
        dict mapping each task id of the inference file to its number of
        distinct, non-empty, wrong predictions. The total and the mean per
        task are printed as a side effect; the mean is reported as 0 when the
        inference file has no tasks.
    """
    with open(inference_filepath, 'r') as f:
        inference = json.load(f)
    with open(dataset_filepath, 'r') as f:
        dataset = json.load(f)

    available_training_samples = {}
    for task_id, task_predictions in inference.items():
        # Empty predictions are discarded before deduplication.
        predictions = [prediction for prediction in task_predictions[0].values() if prediction]
        unique_predictions, _ = get_unique_matrices_and_counts_sorted(predictions)
        # The ground truth is looked up lazily so a task without usable
        # predictions never touches the dataset.
        available_training_samples[task_id] = sum(
            1 for prediction in unique_predictions
            if prediction != dataset[task_id]['test'][0]['output']
        )

    total = sum(available_training_samples.values())
    # Guard against an empty inference file, which would divide by zero.
    mean = total / len(available_training_samples) if available_training_samples else 0.0
    print(f'Available training samples: {total}')
    print(f'Mean available training samples per task: {mean:.2f}')
    return available_training_samples

In [None]:
# Per-task count of distinct predictions that differ from the ground truth.
available_training_samples = measure_available_training_samples('/mnt/hdd0/Kaggle/arc24/debug/outputs_for_verifiying/first_steps_t1.4.json')

In [None]:
# Bin edges at -0.5, 0.5, ..., 31.5 give unit-width bins centred on the
# integers 0..31, i.e. one bar per possible count.
plt.hist(available_training_samples.values(), bins=np.arange(-0.5, 32))
plt.xlabel('Number of available training samples per task')
plt.ylabel('Number of tasks')
# The trailing semicolon keeps the notebook from echoing the returned object.
plt.title('Distribution of available training samples per task');

## TODO

- [ ] Measure available training samples per inference file. (predictions that are wrong)
- [ ] Do I have a way to concatenate predictions?
- [ ] Study temperature influence