# Generate new inputs v2

## Goal

In this second notebook the inputs are generated with the `inference` script; I will use this notebook
to visualize the generated inputs and to optimize the data and parameters.

## Imports

In [None]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import json
import sys
import os
import glob
import random
import numpy as np
from itertools import islice
from tqdm.auto import tqdm

sys.path.append(os.path.realpath('../scripts/'))
from evaluation import plot_grid
from arc24.data import load_arc_data_with_solutions
from voting import get_unique_matrices_and_counts_sorted

# Create and immediately discard an empty figure before changing the defaults.
# NOTE(review): presumably a workaround so the notebook backend is initialised
# first and does not override the rcParams set below — confirm.
plt.plot()
plt.close('all')
# Notebook-wide plotting defaults: wide figures, thicker lines and a larger font.
plt.rcParams["figure.figsize"] = (25, 4)
mpl.rcParams['lines.linewidth'] = 3
mpl.rcParams['font.size'] = 16

## Create dataset for inference

I have to create a new version of the data in which every available sample is used, in turn, as the test sample
(the remaining samples of the task are kept as train samples).
That way I will be able to generate wrong predictions for all the samples of each task.

In [None]:
# Load the ARC training challenges from disk.
# NOTE(review): going by its name, the loader also attaches the solutions to the
# test samples — confirm in arc24.data.
training_data = load_arc_data_with_solutions('/mnt/hdd0/Kaggle/arc24/data/arc-agi_training_challenges.json')

In [None]:
def create_all_test_dataset(dataset):
    """
    Build a leave-one-out version of a dataset.

    The train and test samples of each task are pooled (train first). Every
    sample of the pool becomes the only test sample of a new task named
    ``{task_id}_{idx}``, whose train samples are all the other samples of the
    pool in their original order.

    Args:
        dataset: dict mapping task id to a dict with 'train' and 'test' lists.

    Returns:
        dict mapping ``{task_id}_{idx}`` to ``{'train': [...], 'test': [sample]}``.
        A summary line with the number of created samples is printed as well.
    """
    leave_one_out = {}
    for task_id, task in dataset.items():
        pool = task['train'] + task['test']
        for held_out_idx, held_out in enumerate(pool):
            remaining = [sample for position, sample in enumerate(pool) if position != held_out_idx]
            leave_one_out[f'{task_id}_{held_out_idx}'] = {
                'train': remaining,
                'test': [held_out],
            }
    print(f'Created dataset with {len(leave_one_out)} samples from {len(dataset)} tasks')
    return leave_one_out

In [None]:
# Leave-one-out version of the training set: one new task per original sample.
training_all_test_dataset = create_all_test_dataset(training_data)

In [None]:
# Save the leave-one-out training set so the inference script can use it as
# its dataset. The target folder is created first: on a fresh machine
# `open(..., 'w')` would otherwise fail with FileNotFoundError.
training_all_test_filepath = '/mnt/hdd0/Kaggle/arc24/data/all_test/training.json'
os.makedirs(os.path.dirname(training_all_test_filepath), exist_ok=True)
with open(training_all_test_filepath, 'w') as f:
    json.dump(training_all_test_dataset, f)

In [None]:
# Same leave-one-out conversion for the evaluation set.
evaluation_data = load_arc_data_with_solutions('/mnt/hdd0/Kaggle/arc24/data/arc-agi_evaluation_challenges.json')
evaluation_all_test_dataset = create_all_test_dataset(evaluation_data)
# Create the target folder first so this cell also works when the folder does
# not exist yet (otherwise `open(..., 'w')` raises FileNotFoundError).
evaluation_all_test_filepath = '/mnt/hdd0/Kaggle/arc24/data/all_test/evaluation.json'
os.makedirs(os.path.dirname(evaluation_all_test_filepath), exist_ok=True)
with open(evaluation_all_test_filepath, 'w') as f:
    json.dump(evaluation_all_test_dataset, f)

## Inference

### Inference command

```bash

# Generate 8 predictions per task for the all-test training dataset with the
# Qwen2.5-0.5B checkpoint. The temperature is also embedded in the output file name.
export temperature=9e-1
python inference.py \
--model_path /mnt/hdd0/Kaggle/arc24/models/20240925_submission_models/06_continue-full-fine-tuning-Qwen2.5-0.5B-Instruct_lr1.5e-5_1e5steps_2gpus_8192msl/checkpoint-100000 \
--prompt_version output-from-examples-v1 \
--dataset_path /mnt/hdd0/Kaggle/arc24/data/all_test/training.json \
--output_filepath /mnt/hdd0/Kaggle/arc24/debug/outputs_for_verifiying/first_steps_t${temperature}.json \
--predictions_per_task 8 \
--temperature ${temperature}


# Generate 8 predictions per task for the all-test evaluation dataset, using a
# different checkpoint (Qwen2-0.5B, checkpoint-80000) than the training-set command.
export temperature=9e-1
python inference.py \
--model_path /mnt/hdd0/Kaggle/arc24/models/20240921_optimal_train_duration/01_full-fine-tuning-Qwen2-0.5B-Instruct_lr5e-5_8e4steps_2gpus_8192msl/checkpoint-80000 \
--prompt_version output-from-examples-v1 \
--dataset_path /mnt/hdd0/Kaggle/arc24/data/all_test/evaluation.json \
--output_filepath /mnt/hdd0/Kaggle/arc24/debug/outputs_for_verifiying/evaluation_x8_t${temperature}.json \
--predictions_per_task 8 \
--temperature ${temperature}


# Copy the contents of the MEGA temp folder into the models folder (empty
# directories are skipped), then run easy_inference_and_evaluation.py on the
# all-test training dataset for every checkpoint of the 20241022_no_training
# experiment at four temperatures, 8 predictions per task each.
rsync -avP --prune-empty-dirs /mnt/hdd0/MEGA/projects/temp/ /mnt/hdd0/Kaggle/arc24/models/
for checkpoint_folder in /mnt/hdd0/Kaggle/arc24/models/20241022_no_training/*/checkpoint-*; do
    for temperature in 0.1 0.5 1 1.5; do
    	python easy_inference_and_evaluation.py "${checkpoint_folder}" --dataset_path /mnt/hdd0/Kaggle/arc24/data/all_test/training.json --predictions_per_task 8 --temperature ${temperature}
    done
done
```

It takes around 1200 seconds to generate 8 predictions per task for the 1718 tasks.

### Combine multiple prediction files

In [None]:
def combine_multiple_prediction_files(output_filepath, *input_filepaths):
    """
    Merge several prediction files into a single file with the unique predictions of each task.

    Every input file must be a JSON dict ``{task_id: [{attempt_name: prediction, ...}, ...]}``.
    Only the first element of each task's list is read. The tasks of the first
    file define which tasks are combined, so every other file must contain
    them as well (a missing task raises KeyError).

    Empty predictions are dropped before deduplication, in the same way the
    other functions of this notebook ignore them when reading prediction files.

    Args:
        output_filepath: path of the JSON file that will be written.
        *input_filepaths: paths of the prediction files to merge.

    Raises:
        ValueError: if no input file is given.
    """
    if not input_filepaths:
        # Fail with a clear message instead of an IndexError on predictions[0] below
        raise ValueError('At least one input prediction file is required, but none was given')
    predictions = []
    for filepath in tqdm(input_filepaths, desc='loading files'):
        with open(filepath, 'r') as f:
            predictions.append(json.load(f))
    combined_predictions = dict()
    for task_id in tqdm(predictions[0], desc='grouping predictions'):
        outputs = []
        for prediction in predictions:
            # Keep only non-empty predictions
            outputs.extend(output for output in prediction[task_id][0].values() if output)
        if outputs:
            unique_outputs, _ = get_unique_matrices_and_counts_sorted(outputs)
        else:
            # No usable prediction for this task: keep the task with no attempts
            unique_outputs = []
        combined_predictions[task_id] = [
            {f'attempt_{idx}': output for idx, output in enumerate(unique_outputs, 1)}]
    with open(output_filepath, 'w') as f:
        json.dump(combined_predictions, f)

In [None]:
# Collect the prediction files of the 20241022_no_training checkpoints plus the
# first_steps files. glob returns paths in arbitrary order, and the first file
# defines the task order and the order in which predictions are accumulated,
# so each group is sorted to make the combined file reproducible.
input_filepaths = sorted(glob.glob('/mnt/hdd0/Kaggle/arc24/evaluations/20241022_no_training/*/checkpoint-*/inference_all-test-training_*.json'))
input_filepaths.extend(sorted(glob.glob('/mnt/hdd0/Kaggle/arc24/debug/outputs_for_verifiying/first_steps*.json')))
# The first pattern also matches the *_task_results.json files, which are not prediction files
input_filepaths = [filepath for filepath in input_filepaths if not filepath.endswith('_task_results.json')]
assert input_filepaths, 'No prediction files were found, check the glob patterns'
combine_multiple_prediction_files("/mnt/hdd0/Kaggle/arc24/evaluations/20241022_no_training/unique_predictions.json", *input_filepaths)

## Visualize generated outputs

In [None]:
def visualize_generated_outputs(filepath, max_plots, random_seed=None, dataset_filepath='/mnt/hdd0/Kaggle/arc24/data/all_test/training.json'):
    """
    Plot the unique predictions of a random selection of tasks.

    For every selected task and every test sample of the task, the non-empty
    predictions are deduplicated and plotted side by side with their count.
    A prediction equal to the ground truth output of that test sample is
    labelled as correct. Samples with at most one unique prediction are skipped.

    Args:
        filepath: JSON file with the predictions, ``{task_id: [{attempt_name: prediction}, ...]}``.
        max_plots: maximum number of tasks to select.
        random_seed: seed for the task selection, None for a non-reproducible selection.
        dataset_filepath: JSON file with the dataset used to generate the predictions,
            it provides the ground truth outputs.
    """
    with open(dataset_filepath, 'r') as f:
        dataset = json.load(f)

    with open(filepath, 'r') as f:
        data = json.load(f)
    # A dedicated generator selects the same tasks as seeding the global one,
    # without changing the state of the `random` module for the rest of the notebook
    rng = random.Random(random_seed)
    task_ids = rng.sample(list(data.keys()), min(len(data), max_plots))

    for task_id in task_ids:
        for sample_idx, sample_generations in enumerate(data[task_id]):
            outputs = [output for output in sample_generations.values() if output]
            outputs, counts = get_unique_matrices_and_counts_sorted(outputs)
            if len(outputs) <= 1:
                continue
            # Compare against the ground truth of this test sample, not always the first one
            ground_truth = dataset[task_id]['test'][sample_idx]['output']
            print(task_id)
            for plot_idx, (output, count) in enumerate(zip(outputs, counts), 1):
                # One column per attempt keeps the grid size constant across tasks,
                # unique predictions never outnumber the attempts
                plt.subplot(1, len(sample_generations), plot_idx)
                plot_grid(output)
                title = f'Count: {count}'
                if output == ground_truth:
                    title = f'Correct! {title}'
                plt.title(title)
            plt.show()

In [None]:
# Show up to 10 randomly selected tasks (fixed seed) from the first_steps
# predictions; the file name indicates they were sampled at temperature 1.4.
visualize_generated_outputs('/mnt/hdd0/Kaggle/arc24/debug/outputs_for_verifiying/first_steps_t1.4.json', 10, random_seed=42)

In [None]:
# Same visualization for checkpoint-20000 of a LoRA model of the 20241022_no_training experiment.
# NOTE(review): `x008_t1e+00` in the file name presumably means 8 predictions per task at temperature 1 — confirm.
filepath = '/mnt/hdd0/Kaggle/arc24/evaluations/20241022_no_training/01_lora064-Qwen2.5-0.5B-Instruct_lr1e-4_bs16_20000steps_2gpus_8192msl/checkpoint-20000/inference_all-test-training_x008_t1e+00.json'
visualize_generated_outputs(filepath, 10, random_seed=4)

## Measure available training samples

Let's measure how many training samples we have generated. A training sample is a unique prediction that is different from the ground truth.

In [None]:
def measure_available_training_samples(inference_filepath,
                                       dataset_filepath='/mnt/hdd0/Kaggle/arc24/data/all_test/training.json'):
    """
    Count, for each task, the unique non-empty predictions that differ from the ground truth.

    Only the first test sample of each task is considered, both in the
    inference file and in the dataset. The total and the mean per task are printed.

    Args:
        inference_filepath: JSON file with the predictions, ``{task_id: [{attempt_name: prediction}, ...]}``.
        dataset_filepath: JSON file with the dataset used to generate the predictions,
            it provides the ground truth outputs.

    Returns:
        dict mapping task id to its number of available training samples.
    """
    with open(inference_filepath, 'r') as f:
        inference = json.load(f)
    with open(dataset_filepath, 'r') as f:
        dataset = json.load(f)

    available_training_samples = dict()
    for task_id in inference:
        predictions = [prediction for prediction in inference[task_id][0].values() if prediction]
        unique_predictions, _ = get_unique_matrices_and_counts_sorted(predictions)
        available_training_samples[task_id] = sum(
            1 for prediction in unique_predictions
            if prediction != dataset[task_id]['test'][0]['output'])
    total_samples = sum(available_training_samples.values())
    if available_training_samples:
        mean_samples = total_samples / len(available_training_samples)
    else:
        # An inference file without tasks has no defined mean, avoid ZeroDivisionError
        mean_samples = float('nan')
    print(f'Available training samples: {total_samples}')
    print(f'Mean available training samples per task: {mean_samples:.2f}')
    return available_training_samples

In [None]:
# first_steps predictions on the default (all-test training) dataset;
# the file name indicates they were sampled at temperature 1.2.
available_training_samples = measure_available_training_samples('/mnt/hdd0/Kaggle/arc24/debug/outputs_for_verifiying/first_steps_t1.2.json')

In [None]:
# checkpoint-10000 of a LoRA model of the 20241022_no_training experiment, default dataset.
# NOTE(review): `x008_t2e+00` in the file name presumably means 8 predictions per task at temperature 2 — confirm.
filepath = '/mnt/hdd0/Kaggle/arc24/evaluations/20241022_no_training/01_lora064-Qwen2.5-0.5B-Instruct_lr1e-4_bs16_10000steps_2gpus_8192msl/checkpoint-10000/inference_all-test-training_x008_t2e+00.json'
available_training_samples = measure_available_training_samples(filepath)

In [None]:
# Evaluation set: the ground truth must come from the all-test evaluation
# dataset instead of the default training one.
available_training_samples = measure_available_training_samples(
    '/mnt/hdd0/Kaggle/arc24/debug/outputs_for_verifiying/evaluation_x8_t9e-1.json',
    '/mnt/hdd0/Kaggle/arc24/data/all_test/evaluation.json')

In [None]:
# File written by combine_multiple_prediction_files: the unique predictions of
# all the combined inference files.
filepath = "/mnt/hdd0/Kaggle/arc24/evaluations/20241022_no_training/unique_predictions.json"
available_training_samples = measure_available_training_samples(filepath)

In [None]:
# Histogram of the number of available training samples per task, for the
# result of the last measure_available_training_samples call.
samples_per_task = list(available_training_samples.values())
# Bin edges at -0.5, 0.5, ... so every integer count falls in the centre of its own bin
bin_edges = np.arange(-0.5, max(samples_per_task) + 1)
plt.hist(samples_per_task, bins=bin_edges)
plt.title('Distribution of available training samples per task')
plt.xlabel('Number of available training samples per task')
plt.ylabel('Number of tasks')
plt.grid();

## TODO

- [ ] Measure available training samples per inference file. (predictions that are wrong)
- [ ] Do I have a way to concatenate predictions?
- [ ] Study temperature influence