# Evaluation

## Goal

This notebook takes the predictions produced by a model at inference time, evaluates them against the ground truth, and visualizes the results.

This will help to:

- understand the failures of the model
- find a better way to combine the model predictions

## Imports

In [None]:
import os
import json
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import colors
import matplotlib as mpl

# Create an empty plot and close it right away before changing the defaults.
# NOTE(review): presumably a workaround so that the rcParams set below are not
# reset when the notebook plotting backend initializes on first use — confirm.
plt.plot()
plt.close('all')
# Notebook-wide plotting defaults: wide figures, thicker lines and a larger font.
plt.rcParams["figure.figsize"] = (15, 4)
mpl.rcParams['lines.linewidth'] = 3
mpl.rcParams['font.size'] = 16

## Configuration

In [None]:
class cfg:
    """Notebook configuration: paths of the predictions and of the dataset."""
    # JSON file with the predictions to evaluate. The class previously also
    # assigned '.../scripts/submission_10.json' to this attribute, but that
    # value was dead because this assignment overrode it.
    solutions_filepath: str = '/mnt/hdd0/MEGA/AI/22_Kaggle/arc24/scripts/submission.json'
    # JSON file with the challenges; a sibling '...solutions.json' file is
    # looked up from this path by `load_arc_data_with_solutions`.
    dataset_filepath: str = '/mnt/hdd0/Kaggle/arc24/data/arc-agi_evaluation_challenges.json'

## Code

### Data

In [None]:
def load_arc_data_with_solutions(filepath):
    """
    Loads the tasks stored in `filepath` and, when available, attaches the
    ground truth outputs to the test samples.

    The solutions file is searched by replacing 'challenges.json' with
    'solutions.json' in `filepath`. If that yields a different path that exists,
    every test sample of every task receives an 'output' entry taken from
    `solutions[task_id][test_idx]`. Otherwise a notice is printed and the data
    is returned exactly as loaded.
    """
    with open(filepath, 'r') as challenges_file:
        tasks = json.load(challenges_file)

    solutions_path = filepath.replace('challenges.json', 'solutions.json')
    has_solutions_file = solutions_path != filepath and os.path.exists(solutions_path)
    if not has_solutions_file:
        print('No solutions file found, the solutions should already be in the data')
        return tasks

    with open(solutions_path, 'r') as solutions_file:
        task_solutions = json.load(solutions_file)
    for task_id, task in tasks.items():
        for test_idx, test_sample in enumerate(task['test']):
            # Solutions are stored as one list per task, aligned with the test samples
            test_sample['output'] = task_solutions[task_id][test_idx]
    return tasks

### Plots

In [None]:
def plot_task(task):
    """
    Draws all the samples of a task on a grid of subplots with two rows.

    Inputs go on the first row and outputs (when the sample has one) on the
    second row, one column per sample. Train samples come first, followed by
    the test samples, and each column is titled with its split and index.
    """
    n_train = len(task['train'])
    all_samples = task['train'] + task['test']
    n_columns = len(all_samples)
    for column, sample in enumerate(all_samples):
        plt.subplot(2, n_columns, column + 1)
        plot_grid(sample['input'])
        title = f'Train {column}' if column < n_train else f'Test {column - n_train}'
        plt.title(title)
        if 'output' not in sample:
            continue
        # Second row: same column, shifted by one full row of subplots
        plt.subplot(2, n_columns, n_columns + column + 1)
        plot_grid(sample['output'])

def plot_grids(grids):
    """Draws the given grids side by side on a single row of subplots."""
    n_grids = len(grids)
    for position, grid in enumerate(grids, start=1):
        plt.subplot(1, n_grids, position)
        plot_grid(grid)

def plot_grid(grid):
    """
    Draws a single grid on the current axes.

    Cell values are mapped to a fixed palette of ten colors (values 0 to 9),
    and light grey lines are drawn on the cell borders.
    """
    palette = [
        '#000000', '#0074D9', '#FF4136', '#2ECC40', '#FFDC00',
        '#AAAAAA', '#F012BE', '#FF851B', '#7FDBFF', '#870C25',
    ]
    cells = np.array(grid)
    plt.imshow(
        cells,
        cmap=colors.ListedColormap(palette),
        norm=colors.Normalize(vmin=0, vmax=9),
    )
    plt.grid(True, which='both', color='lightgrey', linewidth=0.5)
    n_rows, n_cols = cells.shape[0], cells.shape[1]
    # Unlabeled ticks on the cell borders so the grid lines separate the cells
    plt.xticks(np.arange(-0.5, n_cols), [])
    plt.yticks(np.arange(-0.5, n_rows), [])
    plt.xlim(-0.5, n_cols - 0.5)

### Evaluation

In [None]:
def analyze_number_of_predictions_per_task(data, texts):
    """
    Computes and plots how many predictions were made for each task.

    For every task in `data` the number of predictions is the number of entries
    in `texts[task_id]['responses']` divided by the number of test samples of
    the task. A histogram of those values is drawn on the current figure, with
    bin edges going from 1.5 to 8.5 in steps of one.

    Returns a dictionary that maps each task id to its number of predictions.
    """
    predictions_per_task = {
        task_id: len(texts[task_id]['responses'])/len(task['test'])
        for task_id, task in data.items()
    }
    plt.title('Distribution of the number of predictions per task')
    plt.hist(predictions_per_task.values(), bins=np.arange(1.5, 9))
    plt.xlabel('number of predictions')
    plt.ylabel('count')
    return predictions_per_task

In [None]:
def evaluate(ground_truth, solutions, *, verbose=False):
    """
    Evaluates the predictions for every task in `solutions`.

    Computes the following metrics for each test sample (see `evaluate_grid`),
    averages them per task and then over all the tasks:

    - Accuracy
    - Correct pixels
    - Correct size
    - Unanswered

    The per-task metrics are printed sorted from best to worst, followed by the
    aggregated metrics, and everything is saved with `save_metrics`.

    Parameters:

    - ground_truth: dictionary that maps each task id to a task, every test
      sample of the task needs to have an 'output' grid
    - solutions: dictionary that maps each task id to a list with one entry per
      test sample, each entry is a dictionary whose values are the predicted grids
    - verbose: if True the task, the metrics of each test sample and the
      predicted grids are also shown while evaluating. This replaces a former
      second definition of this function that was shadowed and never executed.
    """
    metrics = dict()
    for task_id in solutions:
        task = ground_truth[task_id]
        if verbose:
            plot_task(task)
            plt.suptitle(f'{task_id}')
            plt.show()
        task_metrics = []
        for test_idx, test_sample in enumerate(task['test']):
            correct_grid = test_sample['output']
            predicted_grids = list(solutions[task_id][test_idx].values())
            # Empty predictions are passed on purpose: evaluate_grid needs them
            # to compute the unanswered metric
            task_metrics.append(evaluate_grid(correct_grid, predicted_grids))
            if verbose:
                print_metrics(task_metrics[-1], f'{task_id}_{test_idx} ')
                # Empty predictions cannot be drawn, so they are skipped in the plot
                plot_grids([correct_grid] + [grid for grid in predicted_grids if grid])
                plt.suptitle(f'{task_id}_{test_idx}')
                plt.show()
        metrics[task_id] = average_metrics(task_metrics)
    print_sorted_task_metrics(metrics)
    print('\n'*2 + '# Aggregated metrics:')
    global_metrics = average_metrics(list(metrics.values()))
    print_metrics(global_metrics)
    save_metrics(metrics)
    # The summary is printed a second time, as the original code did
    print_metrics(global_metrics)


def print_sorted_task_metrics(metrics):
    """
    Prints one line of metrics per task, best tasks first.

    Tasks are ranked in descending order by accuracy, ties are broken by
    correct pixels and then by correct size.
    """
    def ranking(task_id):
        scores = metrics[task_id]
        return scores['accuracy'], scores['correct_pixels'], scores['correct_size']

    for task_id in sorted(metrics, key=ranking, reverse=True):
        print_metrics(metrics[task_id], f'Task {task_id} ')


def plot_metrics_distribution(metrics):
    """
    Plots one histogram per metric showing how its values are distributed.

    `metrics` can be either a sequence of metric dictionaries or a dictionary
    whose values are metric dictionaries (for example the per-task metrics
    built by `evaluate`, which are keyed by task id). The metric names are
    taken from the first element. Nothing is plotted if `metrics` is empty.
    """
    # Indexing with metrics[0] fails with a KeyError for dictionaries keyed by
    # task id, so normalize the input to a list first
    if isinstance(metrics, dict):
        records = list(metrics.values())
    else:
        records = list(metrics)
    if not records:
        return
    for key in records[0]:
        values = [record[key] for record in records]
        plt.title(f'Distribution of {key}')
        plt.hist(values, bins=np.linspace(0, 1, 10))
        plt.xlabel(key)
        plt.ylabel('count')
        plt.show()

def average_metrics(metrics):
    """
    Averages a list of metric dictionaries.

    Returns a dictionary with the keys of the first element, where each value
    is the mean of that key over all the elements of the list.
    """
    metric_names = metrics[0]
    return {name: np.mean([sample[name] for sample in metrics]) for name in metric_names}

def save_metrics(metrics, *, filepath='metrics.json'):
    """
    Saves the metrics to a json file, adding the average over all the entries.

    The file contains every entry of `metrics` plus a 'global_metrics' entry
    with the average computed by `average_metrics`.

    Parameters:

    - metrics: dictionary that maps each task id to its metrics dictionary.
      It is not modified, the 'global_metrics' entry is added to a copy.
    - filepath: path of the output file, defaults to 'metrics.json'
    """
    # Work on a shallow copy so the dictionary of the caller is left untouched
    metrics_to_save = dict(metrics)
    metrics_to_save['global_metrics'] = average_metrics(list(metrics.values()))
    with open(filepath, 'w') as f:
        json.dump(metrics_to_save, f)

def print_metrics(metrics, prefix=''):
    """
    Prints all the metrics on a single line as percentages with one decimal.

    Each metric is written as 'name: value%' followed by a tab, and the whole
    line is preceded by `prefix`.
    """
    formatted = [f'{name}: {score*100:.1f}%\t' for name, score in metrics.items()]
    print(f'{prefix}' + ''.join(formatted))

def evaluate_grid(correct_grid, predicted_grids):
    """
    Scores the predictions made for a single test sample.

    Parameters:

    - correct_grid: the ground truth grid
    - predicted_grids: list with the predicted grids, empty (falsy) predictions
      are counted as unanswered and ignored for the other metrics

    Returns a dictionary of floats with the following metrics, the first three
    take the best value over all the valid predictions:

    - accuracy: 1 if any prediction is identical to the correct grid, else 0
    - correct_pixels: highest fraction of matching cells among the predictions
      that have the correct shape, 0 if there is none
    - correct_size: 1 if any prediction has the shape of the correct grid, else 0
    - unanswered: fraction of the predictions that are empty, 1 if there are no
      predictions at all
    """
    correct_grid = np.array(correct_grid)
    valid_predicted_grids = [grid for grid in predicted_grids if grid]
    n_predictions = len(predicted_grids)
    if n_predictions:
        unanswered = (n_predictions - len(valid_predicted_grids))/n_predictions
    else:
        # Without any prediction the ratio is undefined (division by zero),
        # the sample is considered completely unanswered
        unanswered = 1.0
    metrics = dict(accuracy=0.0, correct_pixels=0.0, correct_size=0.0, unanswered=unanswered)
    for predicted_grid in valid_predicted_grids:
        predicted_grid = np.array(predicted_grid)
        if correct_grid.shape != predicted_grid.shape:
            # Cells can only be compared when both grids have the same shape
            continue
        matches = predicted_grid == correct_grid
        metrics['correct_size'] = 1.0
        # Plain floats keep the types of the metrics consistent
        metrics['accuracy'] = max(metrics['accuracy'], float(np.all(matches)))
        metrics['correct_pixels'] = max(metrics['correct_pixels'], float(np.mean(matches)))
    return metrics

## Evaluation

In [None]:
# Predictions to evaluate, read from the file configured in cfg.
with open(cfg.solutions_filepath, 'r') as f:
    solutions = json.load(f)
# Tasks of the dataset; the ground truth outputs of the test samples are merged
# in when a matching solutions file exists next to the challenges file.
data = load_arc_data_with_solutions(cfg.dataset_filepath)

In [None]:
# Score the predictions of every task against the ground truth, print the
# per-task and aggregated metrics, and save them to metrics.json.
evaluate(data, solutions)

## TODO

- [x] Analyze the number of successful predictions per task; that is the `unanswered` metric!
- [ ] How the number of predictions affects the metrics
- [ ] Sort the tasks by accuracy, correct pixels and correct size
- [ ] Print the shape of the grid alongside the metrics. Is there any relationship? Are smaller tasks solved more frequently?