# Evaluation

## Goal

This notebook will take the inference of a model and evaluate and visualize it.

This will help to:

- understand the failures of the model
- find a better way to combine the model predictions

## Configuration

In [None]:
class cfg:
    """Notebook configuration: locations of the files read by the cells below."""
    # JSON file with the model predictions; it is opened and parsed with json.load.
    # The commented-out lines are alternative prediction files that can be swapped in.
    solutions_filepath: str = '/mnt/hdd0/MEGA/AI/22_Kaggle/arc24/scripts/submission_x512.json'
    # solutions_filepath: str = '/mnt/hdd0/MEGA/AI/22_Kaggle/arc24/scripts/submission_qwen15_x128_voting.json'
    # solutions_filepath: str = '/mnt/hdd0/MEGA/AI/22_Kaggle/arc24/scripts/submission_qwen05_x128_t01.json'
    # solutions_filepath: str = '/mnt/hdd0/MEGA/AI/22_Kaggle/arc24/scripts/submission_qwen05_x128_t08.json'
    # solutions_filepath: str = '/mnt/hdd0/MEGA/AI/22_Kaggle/arc24/scripts/submission.json'
    # solutions_filepath: str = '/mnt/hdd0/MEGA/AI/22_Kaggle/arc24/scripts/submission_qwen15.json'
    # Dataset file handed to load_arc_data_with_solutions to obtain the tasks and their solutions.
    dataset_filepath: str = '/mnt/hdd0/Kaggle/arc24/data/arc-agi_evaluation_challenges.json'

## Imports

In [None]:
import sys
import os
import glob
import json
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from tqdm.auto import tqdm

# add path to python path
sys.path.append(os.path.realpath('../scripts/'))

from evaluation import (
    load_arc_data_with_solutions, evaluate,
    study_effect_of_the_number_of_solutions,
    study_attempt_accuracy,
    visualize_tasks_and_predictions)

# Draw and discard an empty figure before customising the defaults
# (presumably so the plotting backend is initialised first and does not
# override the settings below -- TODO confirm).
plt.plot()
plt.close('all')

# plt.rcParams and mpl.rcParams are the same object, so one update call
# applies the three notebook-wide defaults.
mpl.rcParams.update({
    'figure.figsize': (25, 4),
    'lines.linewidth': 3,
    'font.size': 16,
})

## Evaluation

In [None]:
# Stop guard: halts "Run All" at this point. A bare `raise` outside an
# `except` block only works by accident (it produces
# "RuntimeError: No active exception to reraise"), so raise the same
# exception type explicitly with a message that explains what happened.
# NOTE(review): presumably the cells below are meant to be run manually -- confirm.
raise RuntimeError('Execution stopped on purpose: run the following cells manually.')

In [None]:
# Tasks with their solutions, as returned by the helper from the evaluation module.
data = load_arc_data_with_solutions(cfg.dataset_filepath)

# Model predictions to be evaluated (default text read mode).
with open(cfg.solutions_filepath) as f:
    solutions = json.load(f)

In [None]:
# Score the predictions against the ground truth. The trailing ';' hides the
# returned value so only what `evaluate` itself reports is shown
# (presumably the metrics line pasted in the log below -- verify in scripts/evaluation.py).
evaluate(data, solutions);

```
# python inference.py --max_predictions_per_task=8 --output_filepath=submission_10_b.json --n_tasks=10
# 10 tasks, 8 predictions, 7m17s
accuracy: 0.0%	correct_pixels: 71.3%	correct_size: 80.0%	pass_n: 0.0%	unanswered: 1.2%

# updated implementation, group prompts of task: 1m07s (x7 faster)
accuracy: 0.0%	correct_pixels: 71.2%	correct_size: 80.0%	pass_n: 0.0%	unanswered: 1.2%

# even faster implementation by grouping all prompts: 24s (x18 faster)
accuracy: 0.0%	correct_pixels: 71.3%	correct_size: 80.0%	pass_n: 0.0%	unanswered: 1.2%
# permute train samples
accuracy: 0.0%	correct_pixels: 69.9%	correct_size: 80.0%	pass_n: 0.0%	unanswered: 2.5%

# Same but on all the tasks, 7min
accuracy: 4.4%	correct_pixels: 77.0%	correct_size: 88.0%	pass_n: 12.0%	unanswered: 2.6%
accuracy: 4.4%	correct_pixels: 77.1%	correct_size: 88.0%	pass_n: 12.0%	unanswered: 2.6%
# with the latest implementation: 1m50
accuracy: 4.4%	correct_pixels: 77.1%	correct_size: 88.0%	pass_n: 12.0%	unanswered: 2.6%
# with train sample permutation
accuracy: 4.1%	correct_pixels: 79.0%	correct_size: 90.0%	pass_n: 11.0%	unanswered: 2.8%
# with color augmentation and train sample permutation (this shows that data augmentation is not harmful)
accuracy: 4.9%	correct_pixels: 78.3%	correct_size: 90.0%	pass_n: 16.0%	unanswered: 2.4%
# with x4 times more augmentation (32 predictions per sample), 8min
accuracy: 4.9%	correct_pixels: 83.9%	correct_size: 92.0%	pass_n: 23.0%	unanswered: 2.7%
# with x16 more augmentation (128 predictions per sample), 34min
accuracy: 4.6%	correct_pixels: 85.7%	correct_size: 94.0%	pass_n: 23.5%	unanswered: 2.5%
# with 512 predictions per sample, around 3h30
accuracy: 4.5%	correct_pixels: 87.8%	correct_size: 95.0%	pass_n: 26.0%	unanswered: 2.7%

#beam search with best_of=2, 50min
accuracy: 5.9%	correct_pixels: 78.7%	correct_size: 90.0%	pass_n: 13.5%	unanswered: 2.8%
#beam search with best_of=4, 1h30
accuracy: 6.0%	correct_pixels: 77.5%	correct_size: 89.0%	pass_n: 13.5%	unanswered: 2.8%
#beam search with best_of=8
CUDA error: an illegal memory access was encountered

#qwen 1.5 32 predictions per sample, 12 min, submission_qwen15_x32.json
accuracy: 4.9%	correct_pixels: 83.7%	correct_size: 92.0%	pass_n: 21.0%	unanswered: 3.4%
#qwen 1.5 128 predictions per sample, 55 min
accuracy: 5.0%	correct_pixels: 86.0%	correct_size: 93.0%	pass_n: 25.0%	unanswered: 3.4%
```

In [None]:
# Helper imported from the `evaluation` module; judging by its name it studies how
# the metrics change with the number of predictions per task. The exact meaning of
# n_tries=40 is defined by the helper -- TODO confirm in scripts/evaluation.py.
study_effect_of_the_number_of_solutions(solutions, data, n_tries=40)

In [None]:
# Helper imported from the `evaluation` module; presumably reports the metrics of
# each attempt separately (see the "Attempt 1 / Attempt 2" lines in the log below).
study_attempt_accuracy(solutions, data)

Notice how the unanswered predictions disappear after fixing the voting bug.

```
# before fixing voting bug
accuracy: 6.2%	correct_pixels: 72.6%	correct_size: 85.5%	pass_n: 12.5%	unanswered: 4.0%
Attempt 1 accuracy: 7.5%	correct_pixels: 68.8%	correct_size: 83.0%	pass_n: 7.5%	unanswered: 6.0%	
Attempt 2 accuracy: 5.0%	correct_pixels: 66.0%	correct_size: 81.0%	pass_n: 5.0%	unanswered: 2.0%	

#after fixing voting bug
accuracy: 6.2%	correct_pixels: 72.6%	correct_size: 85.5%	pass_n: 12.5%	unanswered: 0.0%
Attempt 1 accuracy: 7.5%	correct_pixels: 69.7%	correct_size: 84.0%	pass_n: 7.5%	unanswered: 0.0%
Attempt 2 accuracy: 5.0%	correct_pixels: 65.9%	correct_size: 81.0%	pass_n: 5.0%	unanswered: 0.0%
```

In [None]:
# Stop guard: halts "Run All" at this point. A bare `raise` outside an
# `except` block only works by accident (it produces
# "RuntimeError: No active exception to reraise"), so raise the same
# exception type explicitly with a message that explains what happened.
# NOTE(review): presumably the cells below are meant to be run manually -- confirm.
raise RuntimeError('Execution stopped on purpose: run the following cells manually.')

In [None]:
# Helper imported from the `evaluation` module; only_correct=True presumably restricts
# the visualization to correctly solved tasks -- TODO confirm in scripts/evaluation.py.
visualize_tasks_and_predictions(solutions, data, only_correct=True)

## Temperature analysis

In [None]:
# Notebook-level ground truth; temperature_analysis reads this name as a global.
ground_truth = load_arc_data_with_solutions(cfg.dataset_filepath)

In [None]:
def temperature_analysis(model, submissions_dir='/mnt/hdd0/MEGA/AI/22_Kaggle/arc24/scripts'):
    """Plot how the sampling temperature affects the metrics of a model.

    Every file named ``submission_{model}_x128_t*.json`` inside
    ``submissions_dir`` is evaluated against the notebook-level
    ``ground_truth`` and the global metrics 'accuracy', 'pass_n' and
    'unanswered' are plotted against the temperature.

    The temperature is taken from the file name: the number that follows the
    last ``_t`` is divided by 10 (e.g. ``..._t08.json`` -> 0.8).

    Args:
        model: model tag used in the submission file names (e.g. 'qwen05').
        submissions_dir: folder that contains the submission files. Defaults
            to the folder that was previously hard-coded.

    Raises:
        FileNotFoundError: if no submission file matches the pattern, instead
            of silently drawing empty plots.
    """
    pattern = os.path.join(submissions_dir, f'submission_{model}_x128_t*.json')
    filepaths = sorted(glob.glob(pattern))
    if not filepaths:
        raise FileNotFoundError(f'No submission files match the pattern: {pattern}')

    results = []
    for filepath in tqdm(filepaths):
        # Parse only the file name and split on the LAST '_t', so a model tag
        # or folder that happens to contain '_t' cannot corrupt the value.
        tag = os.path.basename(filepath).rsplit('_t', 1)[1].split('.json')[0]
        with open(filepath, 'r') as f:
            solutions = json.load(f)
        # evaluate returns (global_metrics, task_metrics); keep the global ones.
        results.append((float(tag) / 10, evaluate(ground_truth, solutions, verbose=False)[0]))

    # Order by numeric temperature: alphabetical file order is only correct
    # when the tags are zero-padded, and the points are joined by a line.
    results.sort(key=lambda item: item[0])
    temperature = [value for value, _ in results]
    metrics = [global_metrics for _, global_metrics in results]

    keys = ['accuracy', 'pass_n', 'unanswered']
    for plot_idx, key in enumerate(keys):
        plt.subplot(1, len(keys), plot_idx + 1)
        plt.plot(temperature, [m[key] for m in metrics], 'o-')
        plt.title(key)
        plt.grid()
        plt.xlabel('temperature')
    plt.suptitle(f'Effect of the temperature on the model {model}')
    plt.tight_layout()

In [None]:
# Temperature sweep for the submission files tagged 'qwen05'.
temperature_analysis(model='qwen05')

In [None]:
# Temperature sweep for the submission files tagged 'qwen15'.
temperature_analysis(model='qwen15')

## Study influence of output grid shape

In [None]:
# Reload ground truth and predictions so this section can be run on its own.
data = load_arc_data_with_solutions(cfg.dataset_filepath)
with open(cfg.solutions_filepath) as f:
    solutions = json.load(f)

# evaluate returns the aggregated metrics and the per-task metrics.
global_metrics, task_metrics = evaluate(data, solutions, verbose=False)
task_ids = list(task_metrics)

# Last expression of the cell: display the aggregated metrics.
global_metrics

In [None]:
def get_output_grid_shape(data):
    """Return the mean output grid shape of every task.

    Args:
        data: mapping from task id to a task dict with 'train' and 'test'
            lists; every sample in those lists has an 'output' grid.

    Returns:
        dict mapping each task id to a numpy array with the element-wise mean
        of the output shapes over the test and train samples of that task
        (for 2D grids: mean rows, mean cols).
    """
    def _mean_output_shape(task):
        # Test samples first, then train samples, as a single list.
        samples = task['test'] + task['train']
        return np.mean([np.array(item['output']).shape for item in samples], axis=0)

    return {task_id: _mean_output_shape(task) for task_id, task in data.items()}

# Mean output grid shape per task id, used as scatter coordinates further down.
output_shapes = get_output_grid_shape(data)

In [None]:
# Show the names of the available global metrics.
list(global_metrics.keys())

In [None]:
plt.figure(figsize=(25, 5))

keys = ['accuracy', 'correct_pixels', 'pass_n', 'unanswered'] #'correct_size', 
# The coordinates do not depend on the metric, so compute them once:
# index 1 of the mean shape is the number of columns, index 0 the number of rows.
x = [output_shapes[task_id][1] for task_id in task_ids]
y = [output_shapes[task_id][0] for task_id in task_ids]
for plot_idx, key in enumerate(keys):
    # Size the grid by the metrics actually plotted; using len(global_metrics)
    # made the layout depend on metrics that are not drawn.
    plt.subplot(1, len(keys), plot_idx + 1)
    # One point per task, coloured by the value of the task-level metric.
    c = [task_metrics[task_id][key] for task_id in task_ids]
    plt.scatter(x, y, c=c, cmap='viridis', alpha=0.5)
    plt.colorbar(orientation='horizontal')
    plt.title(key)
    plt.xlabel('cols')
    plt.ylabel('rows')
    # Anchor both axes at the origin.
    plt.xlim(0)
    plt.ylim(0)
    plt.grid()
plt.suptitle('Effect of the output shape on the model performance')
plt.tight_layout()

## Pseudo beam-search analysis

In [None]:
# Notebook-level ground truth; pseudo_beam_search_analysis_v1 reads this name as a global.
ground_truth = load_arc_data_with_solutions(cfg.dataset_filepath)

def pseudo_beam_search_analysis_v1(
        filepath_pattern='/mnt/hdd0/MEGA/AI/22_Kaggle/arc24/scripts/submission_qwen05_x8_T01_n*.json'):
    """Scatter the evaluation metrics against the ``n`` encoded in the file names.

    Every file matching ``filepath_pattern`` is evaluated against the
    notebook-level ``ground_truth`` and the global metrics 'accuracy',
    'pass_n' and 'unanswered' are plotted against ``n``, the integer that
    follows the last ``_n`` of the file name (e.g. ``..._n8.json`` -> 8).

    Args:
        filepath_pattern: glob pattern of the submission files. Defaults to
            the pattern that was previously hard-coded.

    Raises:
        FileNotFoundError: if no file matches the pattern, instead of
            silently drawing empty plots.
    """
    filepaths = sorted(glob.glob(filepath_pattern))
    if not filepaths:
        raise FileNotFoundError(f'No submission files match the pattern: {filepath_pattern}')

    metrics, n = [], []
    for filepath in tqdm(filepaths):
        with open(filepath, 'r') as f:
            solutions = json.load(f)
        # evaluate returns (global_metrics, task_metrics); keep the global ones.
        metrics.append(evaluate(ground_truth, solutions, verbose=False)[0])
        # Parse only the file name and split on the LAST '_n', so a folder
        # that happens to contain '_n' cannot corrupt the value.
        n.append(int(os.path.basename(filepath).rsplit('_n', 1)[1].split('.json')[0]))

    keys = ['accuracy', 'pass_n', 'unanswered']
    for plot_idx, key in enumerate(keys):
        plt.subplot(1, len(keys), plot_idx + 1)
        plt.scatter(n, [m[key] for m in metrics])
        plt.title(key)
        plt.grid()
        plt.xlabel('n')
    plt.tight_layout()

# Run the analysis over the submission files matched by the function's glob pattern.
pseudo_beam_search_analysis_v1()

No clear result, let's do another analysis with different temperatures.

In [None]:
# NOTE: a bare file name is not valid as a statement here (it raised NameError when
# the cell was run), so it is kept as a comment.
# NOTE(review): presumably a reminder of the submission file to analyse next -- confirm.
# submission_qwen05_x8_T01_n8.json

## TODO

- [x] Analyze the number of successful predictions per task, that is the unanswered metric!
- [x] How the number of predictions affects the metrics
- [x] Sort the tasks by accuracy, correct pixels and correct size
- [x] Visualize the tasks, sorted by accuracy
- [ ] Visualize the effect of grid shape on the metrics
- [x] Accuracy of each attempt