# Evaluation

## Goal

This notebook takes the inference results of a model, evaluates them and visualizes them.

This will help to:

- understand the failures of the model
- find a better way to combine the model predictions

## Configuration

In [None]:
class cfg:
    """Paths used by the cells that evaluate a single predictions file."""
    # JSON file with the model predictions; it is opened and parsed with json.load
    # in the evaluation cells. The commented-out lines below are alternative files
    # that can be swapped in by hand.
    solutions_filepath: str = '/mnt/hdd0/MEGA/AI/22_Kaggle/arc24/scripts/submission_debug.json'
    # solutions_filepath: str = '/mnt/hdd0/MEGA/AI/22_Kaggle/arc24/scripts/submission_qwen15_x128_voting.json'
    # solutions_filepath: str = '/mnt/hdd0/MEGA/AI/22_Kaggle/arc24/scripts/submission_qwen05_x128_t01.json'
    # solutions_filepath: str = '/mnt/hdd0/MEGA/AI/22_Kaggle/arc24/scripts/submission_qwen05_x128_t08.json'
    # solutions_filepath: str = '/mnt/hdd0/MEGA/AI/22_Kaggle/arc24/scripts/submission.json'
    # solutions_filepath: str = '/mnt/hdd0/MEGA/AI/22_Kaggle/arc24/scripts/submission_qwen15.json'
    # Dataset file handed to load_arc_data_with_solutions to build the ground truth.
    dataset_filepath: str = '/mnt/hdd0/Kaggle/arc24/data/new_partitions/arc-agi_all_challenges.json'

## Imports

In [None]:
import sys
import os
import glob
import json
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
from tqdm.auto import tqdm

# add path to python path
sys.path.append(os.path.realpath('../scripts/'))

from evaluation import (
    load_arc_data_with_solutions, evaluate,
    study_effect_of_the_number_of_solutions,
    study_attempt_accuracy,
    print_metrics,
    visualize_tasks_and_predictions)
from voting import (
    select_most_voted_solutions,
    select_most_voted_solutions_solving_ties_with_logprob
)

# Notebook-wide matplotlib defaults.
# NOTE(review): the empty plot followed by close('all') looks like a warm-up so that
# the rcParams set below are not overridden by the notebook backend — confirm it is
# still needed.
plt.plot()
plt.close('all')
plt.rcParams["figure.figsize"] = (25, 4)  # default (width, height) in inches
mpl.rcParams['lines.linewidth'] = 3
mpl.rcParams['font.size'] = 16

## Code

In [None]:
def evaluate_multiple_checkpoints(parent_folder):
    """Evaluate every inference file found under ``parent_folder`` and tabulate the metrics.

    Each sub-folder of ``parent_folder`` is scanned for ``*/inference*.json`` files.
    Files ending in ``voting.json`` or ``task_results.json`` are skipped. For every
    remaining file the global metrics returned by ``evaluate`` are computed, then
    extended with:

    - ``n``: the length of the first element of the first task's predictions
      (NOTE(review): presumably the number of predictions per test sample — confirm).
    - ``vote_1`` / ``vote_2``: the ``pass_n`` metric obtained after selecting 1 or 2
      solutions with ``select_most_voted_solutions_solving_ties_with_logprob`` from the
      sibling ``*_task_results.json`` file.

    Folder names, file names and metrics are printed while running.

    Args:
        parent_folder: directory whose sub-folders contain the checkpoint folders.

    Returns:
        pandas.DataFrame with one row per inference file (indexed by file path) and
        one column per metric, excluding ``max_correct_pixels`` and
        ``any_correct_size``. Inside each sub-folder the rows follow the numeric
        checkpoint step.

    Raises:
        ValueError: if a matched path has no integer between ``checkpoint-`` and
            ``/inference``.
        FileNotFoundError: if the ``*_task_results.json`` sibling file is missing.
    """
    def _checkpoint_step(filepath):
        # '<...>/checkpoint-6000/inference.json' -> 6000
        return int(filepath.split('checkpoint-')[-1].split('/inference')[0])

    ground_truth = load_arc_data_with_solutions(cfg.dataset_filepath)
    all_metrics = dict()
    for folder in sorted(glob.glob(os.path.join(parent_folder, '*'))):
        print(folder)
        filepaths = [
            filepath for filepath in glob.glob(os.path.join(folder, '*/inference*.json'))
            if not filepath.endswith(('voting.json', 'task_results.json'))
        ]
        # Sort by numeric checkpoint step (checkpoint-500 before checkpoint-1000).
        # A plain string sort would order them lexicographically; the path is only
        # used to break ties between files of the same checkpoint.
        filepaths.sort(key=lambda filepath: (_checkpoint_step(filepath), filepath))
        for filepath in filepaths:
            with open(filepath, 'r') as f:
                solutions = json.load(f)
            print(filepath)
            metrics = evaluate(ground_truth, solutions, verbose=False)[0]
            metrics['n'] = len(list(solutions.values())[0][0])
            with open(filepath.replace('.json', '_task_results.json'), 'r') as f:
                task_results = json.load(f)
            for i in range(1, 3):
                voted_solutions = select_most_voted_solutions_solving_ties_with_logprob(task_results, i)
                metrics[f'vote_{i}'] = evaluate(ground_truth, voted_solutions, verbose=False)[0]['pass_n']

            print_metrics(metrics)
            all_metrics[filepath] = {
                key: value for key, value in metrics.items()
                if key not in ('max_correct_pixels', 'any_correct_size')
            }
        print()
    return pd.DataFrame(all_metrics).T

## Evaluation

In [None]:
# A bare `raise` with no active exception fails with RuntimeError.
# NOTE(review): presumably a deliberate guard so that "Run All" stops here — confirm.
raise

In [None]:
# Ground truth comes from the dataset file; the predictions to score come from the
# solutions file selected in cfg.
data = load_arc_data_with_solutions(cfg.dataset_filepath)
with open(cfg.solutions_filepath, 'r') as solutions_file:
    solutions = json.load(solutions_file)

In [None]:
# Score the loaded predictions against the ground truth; the trailing ';' keeps the
# notebook from echoing the returned value.
evaluate(data, solutions);

```
# python inference.py --max_predictions_per_task=8 --output_filepath=submission_10_b.json --n_tasks=10
# 10 tasks, 8 predictions, 7m17s
accuracy: 0.0%	correct_pixels: 71.3%	correct_size: 80.0%	pass_n: 0.0%	unanswered: 1.2%

# updated implementation, group prompts of task: 1m07s (x7 faster)
accuracy: 0.0%	correct_pixels: 71.2%	correct_size: 80.0%	pass_n: 0.0%	unanswered: 1.2%

# even faster implementation by grouping all prompts: 24s (x18 faster)
accuracy: 0.0%	correct_pixels: 71.3%	correct_size: 80.0%	pass_n: 0.0%	unanswered: 1.2%
# permute train samples
accuracy: 0.0%	correct_pixels: 69.9%	correct_size: 80.0%	pass_n: 0.0%	unanswered: 2.5%

# Same but on all the tasks, 7min
accuracy: 4.4%	correct_pixels: 77.0%	correct_size: 88.0%	pass_n: 12.0%	unanswered: 2.6%
accuracy: 4.4%	correct_pixels: 77.1%	correct_size: 88.0%	pass_n: 12.0%	unanswered: 2.6%
# with the latest implementation: 1m50
accuracy: 4.4%	correct_pixels: 77.1%	correct_size: 88.0%	pass_n: 12.0%	unanswered: 2.6%
# with train sample permutation
accuracy: 4.1%	correct_pixels: 79.0%	correct_size: 90.0%	pass_n: 11.0%	unanswered: 2.8%
# with color augmentation and train sample permutation (this shows that data augmentation is not harmful)
accuracy: 4.9%	correct_pixels: 78.3%	correct_size: 90.0%	pass_n: 16.0%	unanswered: 2.4%
# with x4 times more augmentation (32 predictions per sample), 8min
accuracy: 4.9%	correct_pixels: 83.9%	correct_size: 92.0%	pass_n: 23.0%	unanswered: 2.7%
# with x16 more augmentation (128 predictions per sample), 34min
accuracy: 4.6%	correct_pixels: 85.7%	correct_size: 94.0%	pass_n: 23.5%	unanswered: 2.5%
# with 512 predictions per sample, around 3h30
accuracy: 4.5%	correct_pixels: 87.8%	correct_size: 95.0%	pass_n: 26.0%	unanswered: 2.7%

#beam search with best_of=2, 50min
accuracy: 5.9%	correct_pixels: 78.7%	correct_size: 90.0%	pass_n: 13.5%	unanswered: 2.8%
#beam search with best_of=4, 1h30
accuracy: 6.0%	correct_pixels: 77.5%	correct_size: 89.0%	pass_n: 13.5%	unanswered: 2.8%
#beam search with best_of=8
CUDA error: an illegal memory access was encountered

#qwen 1.5 32 predictions per sample, 12 min, submission_qwen15_x32.json
accuracy: 4.9%	correct_pixels: 83.7%	correct_size: 92.0%	pass_n: 21.0%	unanswered: 3.4%
#qwen 1.5 128 predictions per sample, 55 min
accuracy: 5.0%	correct_pixels: 86.0%	correct_size: 93.0%	pass_n: 25.0%	unanswered: 3.4%
```

In [None]:
# NOTE(review): helper from the project's evaluation module; judging by its name it
# shows how the metrics change with the number of predictions, repeated n_tries
# times — confirm against its implementation.
study_effect_of_the_number_of_solutions(solutions, data, n_tries=40)

In [None]:
# Global metrics ([0] of the evaluate result) after select_most_voted_solutions is
# applied to the predictions with 2 as its second argument.
evaluate(data, select_most_voted_solutions(solutions, 2), verbose=False)[0]

In [None]:
# NOTE(review): helper from the project's evaluation module; presumably it reports the
# metrics of each attempt separately (see the pasted log below) — confirm.
study_attempt_accuracy(solutions, data)

Notice how the unanswered predictions disappear after fixing the voting bug.

```
# before fixing voting bug
accuracy: 6.2%	correct_pixels: 72.6%	correct_size: 85.5%	pass_n: 12.5%	unanswered: 4.0%
Attempt 1 accuracy: 7.5%	correct_pixels: 68.8%	correct_size: 83.0%	pass_n: 7.5%	unanswered: 6.0%	
Attempt 2 accuracy: 5.0%	correct_pixels: 66.0%	correct_size: 81.0%	pass_n: 5.0%	unanswered: 2.0%	

#after fixing voting bug
accuracy: 6.2%	correct_pixels: 72.6%	correct_size: 85.5%	pass_n: 12.5%	unanswered: 0.0%
Attempt 1 accuracy: 7.5%	correct_pixels: 69.7%	correct_size: 84.0%	pass_n: 7.5%	unanswered: 0.0%
Attempt 2 accuracy: 5.0%	correct_pixels: 65.9%	correct_size: 81.0%	pass_n: 5.0%	unanswered: 0.0%
```

In [None]:
# A bare `raise` with no active exception fails with RuntimeError.
# NOTE(review): presumably a deliberate guard so that "Run All" does not reach the
# visualization and the analyses below — confirm.
raise

In [None]:
# NOTE(review): helper from the project's evaluation module; only_correct=True
# presumably restricts the display to correctly solved tasks — confirm.
visualize_tasks_and_predictions(solutions, data, only_correct=True)

## Temperature analysis

In [None]:
# Module-level ground truth read by temperature_analysis below.
ground_truth = load_arc_data_with_solutions(cfg.dataset_filepath)

In [None]:
def temperature_analysis(model):
    """Plot accuracy, pass_n and unanswered against the sampling temperature.

    Every file matching ``submission_{model}_x128_t*.json`` in the scripts folder is
    evaluated against the module-level ``ground_truth``. The temperature is parsed
    from the digits after ``_t`` in the file path and divided by 10.

    Args:
        model: model tag used in the submission file names (e.g. 'qwen05').
    """
    pattern = f'/mnt/hdd0/MEGA/AI/22_Kaggle/arc24/scripts/submission_{model}_x128_t*.json'
    temperatures, scores = [], []
    for path in tqdm(sorted(glob.glob(pattern))):
        with open(path, 'r') as handle:
            predictions = json.load(handle)
        scores.append(evaluate(ground_truth, predictions, verbose=False)[0])
        encoded_temperature = path.split('_t')[1].split('.json')[0]
        temperatures.append(float(encoded_temperature)/10)

    panel_keys = ['accuracy', 'pass_n', 'unanswered']
    for position, metric_name in enumerate(panel_keys, start=1):
        plt.subplot(1, len(panel_keys), position)
        values = [score[metric_name] for score in scores]
        plt.plot(temperatures, values, 'o-')
        plt.title(metric_name)
        plt.grid()
        plt.xlabel('temperature')
    plt.suptitle(f'Effect of the temperature on the model {model}')
    plt.tight_layout()

In [None]:
# Temperature sweep over the files matching submission_qwen05_x128_t*.json.
temperature_analysis(model='qwen05')

In [None]:
# Temperature sweep over the files matching submission_qwen15_x128_t*.json.
temperature_analysis(model='qwen15')

## Study influence of output grid shape

In [None]:
# Reload ground truth and predictions, then keep both the global metrics and the
# per-task metrics; the task ids fix the ordering used by the scatter plots below.
data = load_arc_data_with_solutions(cfg.dataset_filepath)
with open(cfg.solutions_filepath, 'r') as solutions_file:
    solutions = json.load(solutions_file)
global_metrics, task_metrics = evaluate(data, solutions, verbose=False)
task_ids = list(task_metrics)
global_metrics

In [None]:
def get_output_grid_shape(data):
    """Return the mean output grid shape of every task.

    Args:
        data: mapping of task id to a task dict with 'train' and 'test' lists, whose
            samples each hold an 'output' grid.

    Returns:
        dict mapping each task id to a numpy array with the element-wise mean of the
        output shapes over the test and train samples of that task.
    """
    def _mean_shape(task):
        samples = task['test'] + task['train']
        return np.mean([np.array(sample['output']).shape for sample in samples], axis=0)

    return {task_id: _mean_shape(task) for task_id, task in data.items()}

# Mean output shape per task id, used as the coordinates of the scatter plots below.
output_shapes = get_output_grid_shape(data)

In [None]:
# Show the names of the metrics available for plotting.
list(global_metrics.keys())

In [None]:
# One scatter panel per metric: each point is a task placed at its mean output grid
# shape (cols on x, rows on y) and colored by the task's value for that metric.
plt.figure(figsize=(25, 5))

keys = ['accuracy', 'correct_pixels', 'pass_n', 'unanswered'] #'correct_size', 
# The point coordinates are the same for every panel, so they are computed once.
x = [output_shapes[task_id][1] for task_id in task_ids]
y = [output_shapes[task_id][0] for task_id in task_ids]
for plot_idx, key in enumerate(keys):
    # Size the grid by the number of plotted metrics. Using the number of entries in
    # global_metrics would leave empty columns and shrink the panels.
    plt.subplot(1, len(keys), plot_idx + 1)
    c = [task_metrics[task_id][key] for task_id in task_ids]
    plt.scatter(x, y, c=c, cmap='viridis', alpha=0.5)
    plt.colorbar(orientation='horizontal')
    plt.title(key)
    plt.xlabel('cols')
    plt.ylabel('rows')
    plt.xlim(0)
    plt.ylim(0)
    plt.grid()
plt.suptitle('Effect of the output shape on the model performance')
plt.tight_layout()

## Pseudo beam-search analysis

In [None]:
ground_truth = load_arc_data_with_solutions(cfg.dataset_filepath)

def pseudo_beam_search_analysis_v1():
    """Scatter accuracy, pass_n and unanswered against the ``n`` value of each file.

    Evaluates every file matching ``submission_qwen05_x8_T01_n*.json`` against the
    module-level ``ground_truth``; ``n`` is the integer found after ``_n`` in the
    file path.
    """
    pattern = '/mnt/hdd0/MEGA/AI/22_Kaggle/arc24/scripts/submission_qwen05_x8_T01_n*.json'
    n_values, scores = [], []
    for path in tqdm(sorted(glob.glob(pattern))):
        with open(path, 'r') as handle:
            predictions = json.load(handle)
        scores.append(evaluate(ground_truth, predictions, verbose=False)[0])
        n_values.append(int(path.split('_n')[1].split('.json')[0]))

    panel_keys = ['accuracy', 'pass_n', 'unanswered']
    for position, metric_name in enumerate(panel_keys, start=1):
        plt.subplot(1, len(panel_keys), position)
        plt.scatter(n_values, [score[metric_name] for score in scores])
        plt.title(metric_name)
        plt.grid()
        plt.xlabel('n')
    plt.tight_layout()

pseudo_beam_search_analysis_v1()

No clear result, let's do another analysis with different temperatures.

In [None]:
ground_truth = load_arc_data_with_solutions(cfg.dataset_filepath)

def pseudo_beam_search_analysis_v2(x):
    """Plot accuracy, pass_n and unanswered against temperature for ``x`` predictions.

    Evaluates every file matching ``submission_qwen05_x{x}_n20_T??.json`` against the
    module-level ``ground_truth`` and prints each file path. The temperature is
    parsed from the two characters after ``_T`` and divided by 10.

    Args:
        x: value that fills the ``x{x}`` part of the submission file names.
    """
    pattern = f'/mnt/hdd0/MEGA/AI/22_Kaggle/arc24/scripts/submission_qwen05_x{x}_n20_T??.json'
    temperatures, scores = [], []
    for path in tqdm(sorted(glob.glob(pattern))):
        with open(path, 'r') as handle:
            predictions = json.load(handle)
        print(path)
        scores.append(evaluate(ground_truth, predictions, verbose=False)[0])
        encoded_temperature = path.split('_T')[1].split('.json')[0]
        temperatures.append(float(encoded_temperature)/10)

    panel_keys = ['accuracy', 'pass_n', 'unanswered']
    for position, metric_name in enumerate(panel_keys, start=1):
        plt.subplot(1, len(panel_keys), position)
        values = [score[metric_name] for score in scores]
        plt.plot(temperatures, values, 'o-')
        plt.title(metric_name)
        plt.grid()
        plt.xlabel('temperature')
    plt.suptitle(f'Effect of the temperature on the model for {x} predictions')
    plt.tight_layout()

pseudo_beam_search_analysis_v2(8)

In [None]:
# Same temperature sweep over the files matching submission_qwen05_x16_n20_T??.json.
pseudo_beam_search_analysis_v2(16)

In [None]:
# Score one specific submission file twice: first the raw predictions (printed), then
# the result of select_most_voted_solutions with 2 as its second argument (shown as
# the cell output).
with open('/mnt/hdd0/MEGA/AI/22_Kaggle/arc24/scripts/submission_qwen05_x128_n20_T08.json', 'r') as f:
    solutions = json.load(f)
print(evaluate(ground_truth, solutions, verbose=False)[0])
evaluate(ground_truth, select_most_voted_solutions(solutions, 2), verbose=False)[0]

In [None]:
# Same comparison for the qwen15 x128 submission: raw predictions (printed) versus
# the output of select_most_voted_solutions with 2 as its second argument (cell output).
with open('/mnt/hdd0/MEGA/AI/22_Kaggle/arc24/scripts/submission_qwen15_x128.json', 'r') as f:
    solutions = json.load(f)
print(evaluate(ground_truth, solutions, verbose=False)[0])
evaluate(ground_truth, select_most_voted_solutions(solutions, 2), verbose=False)[0]

## Validation loss vs metrics

In [None]:
ground_truth = load_arc_data_with_solutions(cfg.dataset_filepath)
# Every inference file of the grid-encoders experiments, in sorted path order; files
# ending in 'voting.json' or 'task_results.json' are left out.
filepaths = [
    filepath
    for filepath in sorted(glob.glob('/mnt/hdd0/Kaggle/arc24/evaluations/20240826_grid_encoders/*/*/inference*.json'))
    if not filepath.endswith(('voting.json', 'task_results.json'))
]
for filepath in filepaths:
    with open(filepath, 'r') as solutions_file:
        solutions = json.load(solutions_file)
    print(filepath)
    metrics = evaluate(ground_truth, solutions, verbose=False)[0]
    print_metrics(metrics)
    # Optional: metrics after voting.
    # metrics = evaluate(ground_truth, select_most_voted_solutions(solutions, 2), verbose=False)[0]
    # print_metrics(metrics)

In [None]:
ground_truth = load_arc_data_with_solutions(cfg.dataset_filepath)
# Inference files of the experiment folders starting with '06', in sorted path order;
# files ending in 'voting.json' or 'task_results.json' are left out.
filepaths = [
    filepath
    for filepath in sorted(glob.glob('/mnt/hdd0/Kaggle/arc24/evaluations/20240826_grid_encoders/06*/*/inference*.json'))
    if not filepath.endswith(('voting.json', 'task_results.json'))
]
# NOTE(review): the last remaining file is dropped on purpose; the reason is not
# visible here — confirm.
filepaths = filepaths[:-1]
for filepath in filepaths:
    with open(filepath, 'r') as solutions_file:
        solutions = json.load(solutions_file)
    print(filepath)
    metrics = evaluate(ground_truth, solutions, verbose=False)[0]
    print_metrics(metrics)
    # Optional: metrics after voting.
    # metrics = evaluate(ground_truth, select_most_voted_solutions(solutions, 2), verbose=False)[0]
    # print_metrics(metrics)

In [None]:
# Hard-coded metrics per checkpoint; every list has one entry per value in
# "checkpoint_steps".
# NOTE(review): presumably transcribed by hand from the output of the previous cell
# plus the training validation loss — confirm the source.
metrics = {
    "checkpoint_steps": [500, 1000, 2000, 2650, 3000, 3450, 4000, 4850, 6000],
    "accuracy": [0.4, 0.6, 1.1, 1.1, 1.3, 1.4, 2.2, 2.3, 2.3],
    "correct_pixels": [58.3, 62.1, 63.8, 64.2, 65.0, 65.3, 65.4, 65.8, 66.7],
    "max_correct_pixels": [76.7, 76.5, 80.6, 80.6, 80.5, 82.4, 83.0, 81.6, 81.7],
    "correct_size": [78.0, 81.1, 83.4, 82.4, 84.0, 84.2, 84.3, 84.2, 84.9],
    "any_correct_size": [88.5, 89.0, 90.0, 90.5, 90.0, 92.0, 92.0, 90.0, 89.5],
    "pass_n": [6.5, 5.5, 10.0, 11.5, 10.5, 11.0, 15.0, 15.5, 15.0],
    "unanswered": [7.1, 5.1, 5.6, 5.7, 4.3, 4.3, 4.4, 4.8, 4.1],
    'val_loss': [0.237, 0.198, 0.169, 0.171, 0.162, 0.162, 0.159, 0.148, 0.159]
}

# Figure 1: per-prediction metrics against the checkpoint step.
keys = ['accuracy', 'correct_pixels', 'correct_size', 'unanswered']
for plot_idx, key in enumerate(keys):
    plt.subplot(1, len(keys), plot_idx + 1)
    plt.plot(metrics['checkpoint_steps'], metrics[key], 'o-', label=key)
    plt.grid()
    plt.xlabel('checkpoint_steps')
    plt.ylabel(key)
plt.suptitle('Effect of the number of checkpoint steps on the model performance')
plt.show()

# Figure 2: the remaining metrics and the validation loss against the checkpoint step.
keys = ['pass_n', 'any_correct_size', 'max_correct_pixels', 'val_loss']
for plot_idx, key in enumerate(keys):
    plt.subplot(1, len(keys), plot_idx + 1)
    plt.plot(metrics['checkpoint_steps'], metrics[key], 'o-', label=key)
    plt.grid()
    plt.xlabel('checkpoint_steps')
    plt.ylabel(key)
plt.suptitle('Effect of the number of checkpoint steps on the model performance')
plt.show()

# Figure 3: the metrics of figure 1 against the validation loss. The title names the
# validation loss because that is the x axis here, not the checkpoint step.
keys = ['accuracy', 'correct_pixels', 'correct_size', 'unanswered']
for plot_idx, key in enumerate(keys):
    plt.subplot(1, len(keys), plot_idx + 1)
    plt.plot(metrics['val_loss'], metrics[key], 'o-', label=key)
    plt.grid()
    plt.xlabel('val_loss')
    plt.ylabel(key)
plt.suptitle('Relation between the validation loss and the model performance')
plt.show()

## Evaluate multiple checkpoints

In [None]:
# Evaluate all the checkpoints of the grid-encoders experiments; the returned table is
# shown as the cell output.
evaluate_multiple_checkpoints('/mnt/hdd0/Kaggle/arc24/evaluations/20240826_grid_encoders')

In [None]:
# Baseline
# NOTE(review): the path and metric lines below look like reference numbers pasted
# from an earlier run, kept to compare with the output of the call that follows — confirm.
#/mnt/hdd0/Kaggle/arc24/evaluations/20240826_grid_encoders/04_row-number-and-grid-shape_Qwen2-0.5B-Instruct_lr1e-4_r32_6e3steps/checkpoint-6000/inference.json
#accuracy: 2.8%	correct_pixels: 66.3%	max_correct_pixels: 82.2%	correct_size: 84.2%	any_correct_size: 91.0%	pass_n: 18.5%	unanswered: 2.8%
#accuracy: 3.5%	correct_pixels: 67.4%	max_correct_pixels: 72.2%	correct_size: 85.4%	any_correct_size: 87.9%	pass_n: 7.1%	unanswered: 2.5%	
evaluate_multiple_checkpoints('/mnt/hdd0/Kaggle/arc24/evaluations/20240828_grid_encoders_ttft')

In [None]:
# Evaluate all the checkpoints found under the 20240901_data_scaling evaluation folder.
evaluate_multiple_checkpoints('/mnt/hdd0/Kaggle/arc24/evaluations/20240901_data_scaling')

In [None]:
# Evaluate all the checkpoints found under the 20240902_external_data evaluation folder.
evaluate_multiple_checkpoints('/mnt/hdd0/Kaggle/arc24/evaluations/20240902_external_data')

In [None]:
# Keep the returned table (one row per inference file, indexed by its path) in `df`
# so that the next cell can group it.
df = evaluate_multiple_checkpoints('/mnt/hdd0/Kaggle/arc24/evaluations/20240905_external_data_v2')
df

In [None]:
# Label each row with its experiment folder: the index holds inference file paths, and
# element [-3] of a path is the folder above the checkpoint directory. The 'msl_b' and
# 'msl_c' name variants are folded into a single 'msl' label.
df['group'] = [x.split('/')[-3].replace('msl_c', 'msl').replace('msl_b', 'msl') for x in df.index]
# Average the metrics per group using only the rows with n == 32.
# NOTE(review): the percent format is applied to every column, including `n` — confirm
# that this is intended.
df[df.n == 32].groupby('group').mean().style.format("{:.2%}")

In [None]:
# Evaluate the 20240910_predict_inputs checkpoints and show the table as percentages.
# NOTE(review): '{:.2%}' multiplies by 100 before adding '%', so this assumes the
# metrics are fractions in [0, 1]; it also applies to the `n` column — confirm.
df = evaluate_multiple_checkpoints('/mnt/hdd0/Kaggle/arc24/evaluations/20240910_predict_inputs')
df.style.format("{:.2%}")

In [None]:
# Evaluate the 20240903_submission_models checkpoints and show the table as percentages.
# NOTE(review): '{:.2%}' multiplies by 100 before adding '%', so this assumes the
# metrics are fractions in [0, 1]; it also applies to the `n` column — confirm.
df = evaluate_multiple_checkpoints('/mnt/hdd0/Kaggle/arc24/evaluations/20240903_submission_models')
df.style.format("{:.2%}")

## TODO

- [x] Analyze number of successful predictions per task, that is the unanswered metric!
- [x] How the number of predictions affects the metrics
- [x] Sort the tasks by accuracy, correct pixels and correct size
- [x] Visualize the tasks, sorted by accuracy
- [x] Visualize the effect of grid shape in the metrics
- [x] Accuracy of each attempt
- [x] Dynamically choose ground truth based on number of predicted tasks (There's no need)
- [x] Show vote_2 metric always
- [x] Also show vote_1 metric