# How to use a verifier?

## Goal

Let's discover how we can use a verifier to improve prediction selection.

## Imports

In [None]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import json
import sys
import os
import glob
import random
import numpy as np
from itertools import islice
from tqdm.auto import tqdm

sys.path.append(os.path.realpath('../scripts/'))
from arc24.data import load_arc_data_with_solutions
from evaluation import (
    load_arc_data_with_solutions,
    evaluate,
    plot_grid,
    print_metrics,)
from voting import (
    select_most_voted_solutions,
    select_most_voted_solutions_solving_ties_with_logprob,
    get_unique_matrices_and_counts_sorted
)

# Draw and immediately discard an empty figure before configuring matplotlib
# (presumably so the notebook backend is initialised first - TODO confirm).
plt.plot()
plt.close('all')
# Notebook-wide plotting defaults: wide figures, thicker lines, larger text.
# NOTE: plt.rcParams and mpl.rcParams refer to the same settings object.
mpl.rcParams.update({
    'figure.figsize': (25, 4),
    'lines.linewidth': 3,
    'font.size': 16,
})

## Select prediction models

We first have to select which prediction models we are going to use. They should be models that haven't been trained with the test set.

The following 3 models, all trained without the evaluation dataset, could be used for the task:

- /mnt/hdd0/Kaggle/arc24/models/20240921_optimal_train_duration/01_full-fine-tuning-Qwen2-0.5B-Instruct_lr5e-5_8e4steps_2gpus_8192msl
- /mnt/hdd0/Kaggle/arc24/models/20240921_optimal_train_duration/05_LoRA-032-Qwen2-0.5B-Instruct_lr1e-4_4e4steps_2gpus_8192msl
- /mnt/hdd0/Kaggle/arc24/models/20240921_optimal_train_duration/05_LoRA-128-Qwen2-0.5B-Instruct_lr5e-5_4e4steps_2gpus_8192msl

## How good is voting?

Let's see how good voting is for the models selected above.

In [None]:
def evaluate_voting_accuracy(filepath, ground_truth_filepath='/mnt/hdd0/Kaggle/arc24/data/new_partitions/arc-agi_all_challenges.json'):
    """
    Evaluate a predictions file and measure how well voting selects the solutions.

    Args:
        filepath: path to the JSON file with the predictions. A sibling file,
            obtained by replacing '.json' with '_task_results.json', is also read.
        ground_truth_filepath: path passed to `load_arc_data_with_solutions`.

    Returns:
        The first element returned by `evaluate` for the raw predictions,
        extended with the following keys:
        - 'n': length of the first entry of the first task in the predictions
          file (presumably the number of predictions per sample - TODO confirm).
        - 'naive_voting': {top_n: pass_n} when selecting with
          `select_most_voted_solutions`.
        - 'advanced_voting': {top_n: pass_n} when selecting with
          `select_most_voted_solutions_solving_ties_with_logprob`.
        top_n takes the values 1 and 2, and pass_n falls back to 0 when
        `evaluate` does not report it.
    """
    ground_truth = load_arc_data_with_solutions(ground_truth_filepath)

    def pass_n_for(selected_solutions):
        # Only the 'pass_n' entry of the first element returned by evaluate is needed
        selection_metrics = evaluate(ground_truth, selected_solutions, verbose=False)[0]
        return selection_metrics.get('pass_n', 0)

    with open(filepath, 'r') as predictions_file:
        solutions = json.load(predictions_file)
    print(filepath)

    metrics = evaluate(ground_truth, solutions, verbose=False)[0]
    first_task_predictions = list(solutions.values())[0]
    metrics['n'] = len(first_task_predictions[0])

    top_n_values = (1, 2)
    metrics['naive_voting'] = {
        top_n: pass_n_for(select_most_voted_solutions(solutions, top_n))
        for top_n in top_n_values
    }

    # The advanced voting works on the task results file, not on the predictions
    task_results_filepath = filepath.replace('.json', '_task_results.json')
    with open(task_results_filepath, 'r') as task_results_file:
        task_results = json.load(task_results_file)
    metrics['advanced_voting'] = {
        top_n: pass_n_for(select_most_voted_solutions_solving_ties_with_logprob(task_results, top_n))
        for top_n in top_n_values
    }
    return metrics

In [None]:
# Voting metrics of the full fine-tuning model on its `x064` inference file
# (presumably 64 predictions per sample - TODO confirm)
evaluate_voting_accuracy('/mnt/hdd0/Kaggle/arc24/evaluations/20240921_optimal_train_duration/01_full-fine-tuning-Qwen2-0.5B-Instruct_lr5e-5_8e4steps_2gpus_8192msl/checkpoint-80000/inference_evaluation_x064.json')

In [None]:
# Each pair is (short model name, path to its `x032` inference predictions).
# Keeping them side by side avoids misaligning the two lists.
names, filepaths = map(list, zip(
    ('LoRA-032', '/mnt/hdd0/Kaggle/arc24/evaluations/20240921_optimal_train_duration/05_LoRA-032-Qwen2-0.5B-Instruct_lr1e-4_4e4steps_2gpus_8192msl/checkpoint-40000/inference_evaluation_x032.json'),
    ('LoRA-128', '/mnt/hdd0/Kaggle/arc24/evaluations/20240921_optimal_train_duration/05_LoRA-128-Qwen2-0.5B-Instruct_lr5e-5_4e4steps_2gpus_8192msl/checkpoint-40000/inference_evaluation_x032.json'),
    ('full fine-tuning', '/mnt/hdd0/Kaggle/arc24/evaluations/20240921_optimal_train_duration/01_full-fine-tuning-Qwen2-0.5B-Instruct_lr5e-5_8e4steps_2gpus_8192msl/checkpoint-80000/inference_evaluation_x032.json'),
))

In [None]:
# Compute the voting metrics of every selected model, indexed by its short name
all_metrics = dict()
for name, filepath in zip(names, filepaths):
    all_metrics[name] = evaluate_voting_accuracy(filepath)

In [None]:
# Show the raw metrics dictionary of each model
for name, metrics in all_metrics.items():
    print(name, metrics)

In [None]:
# Relative voting accuracy: the pass_n obtained with advanced voting divided by
# the pass_n of the raw predictions, one subplot per top_n.
plt.figure(figsize=(25, 5))
for plot_idx, top_n in enumerate([1, 2], 1):
    plt.subplot(1, 2, plot_idx)
    categories = [
        f'{name} \npass_32={metrics["pass_n"]:.1%}'
        for name, metrics in all_metrics.items()
    ]
    values = [
        metrics['advanced_voting'][top_n]/metrics['pass_n']
        for metrics in all_metrics.values()
    ]
    plt.bar(categories, values)
    plt.grid()
    plt.title(f'Top_n={top_n}')
    plt.ylabel('Voting accuracy')
    # Start the y axis at 0.5 to make the differences between models visible
    plt.ylim(bottom=0.5)
plt.suptitle('Voting accuracy of selecting the correct solution among top_n most voted solutions')
plt.tight_layout()

For these models, voting is able to select the best response in the first position around 60% of the time, and within the top two positions around 70% of the time.

To see how well the verifiers can select the correct answer I could focus on a single model, optimize everything for that model and then check if it works as well for the other models.
For the optimization, I could exclude all the tasks that do not have a correct answer.

In [None]:
# Absolute accuracy (pass_n) of naive and advanced voting for every model,
# one subplot per top_n.
plt.figure(figsize=(25, 5))
voting_methods = ('naive_voting', 'advanced_voting')
for plot_idx, top_n in enumerate([1, 2], 1):
    plt.subplot(1, 2, plot_idx)
    # One bar per (model, voting method) pair, in model order
    bars = [
        (f'{name} \n{key}\npass_32={metrics["pass_n"]:.1%}', metrics[key][top_n])
        for name, metrics in all_metrics.items()
        for key in voting_methods
    ]
    categories = [label for label, _ in bars]
    values = [height for _, height in bars]
    plt.bar(categories, values)
    plt.grid()
    plt.title(f'Top_n={top_n}')
    plt.ylabel('Absolute accuracy')
    # Start the y axis at 0.15 to make the differences between bars visible
    plt.ylim(bottom=0.15)
plt.suptitle('Absolute accuracy of the system when using voting')
plt.tight_layout()

## Study distribution of the number of unique responses

In [None]:
def study_distribution_of_the_number_of_unique_responses(filepath, name, ground_truth_filepath='/mnt/hdd0/Kaggle/arc24/data/new_partitions/arc-agi_all_challenges.json'):
    """
    Plot the distribution of the number of unique responses per test sample,
    separating the samples that contain the correct answer from those that do not.

    Args:
        filepath: path to the JSON file with the predictions. It is read as a
            mapping from task id to a list (one item per test sample) of
            dictionaries whose values are the predictions.
        name: label of the model, only used in the title of the plot.
        ground_truth_filepath: path passed to `load_arc_data_with_solutions`.

    Returns:
        A tuple `(correct, incorrect)` with the number of unique responses of
        each sample, split by whether the ground truth output is among them.
    """
    ground_truth = load_arc_data_with_solutions(ground_truth_filepath)
    with open(filepath, 'r') as f:
        predictions = json.load(f)
    correct, incorrect = [], []
    for task_id, task_predictions in predictions.items():
        for sample_idx, sample_predictions in enumerate(task_predictions):
            # Falsy (e.g. empty) predictions are ignored
            valid_predictions = [prediction for prediction in sample_predictions.values() if prediction]
            unique_predictions, _ = get_unique_matrices_and_counts_sorted(valid_predictions)
            expected_output = ground_truth[task_id]['test'][sample_idx]['output']
            if expected_output in unique_predictions:
                correct.append(len(unique_predictions))
            else:
                incorrect.append(len(unique_predictions))
    # The bin edges have to reach max_count + 1 so every count gets its own bin.
    # With the previous fixed edges 0..31 a count of 31 was merged with 30 and a
    # count of 32 (possible when using 32 predictions) was silently dropped.
    # The edges are unchanged when all the counts are at most 30.
    max_unique_responses = max(correct + incorrect, default=0)
    bins = range(0, max(32, max_unique_responses + 2))
    plt.title(f'Distribution of the number of unique responses for {name}')
    plt.hist(correct, bins=bins, alpha=0.5, label='correct', density=True)
    plt.hist(incorrect, bins=bins, alpha=0.5, label='incorrect', density=True)
    plt.xlabel('Number of unique responses')
    plt.legend()
    return correct, incorrect

In [None]:
# Each pair is (short model name, path to its `x032` inference predictions)
names, filepaths = map(list, zip(
    ('LoRA-032', '/mnt/hdd0/Kaggle/arc24/evaluations/20240921_optimal_train_duration/05_LoRA-032-Qwen2-0.5B-Instruct_lr1e-4_4e4steps_2gpus_8192msl/checkpoint-40000/inference_evaluation_x032.json'),
    ('LoRA-128', '/mnt/hdd0/Kaggle/arc24/evaluations/20240921_optimal_train_duration/05_LoRA-128-Qwen2-0.5B-Instruct_lr5e-5_4e4steps_2gpus_8192msl/checkpoint-40000/inference_evaluation_x032.json'),
    ('full fine-tuning', '/mnt/hdd0/Kaggle/arc24/evaluations/20240921_optimal_train_duration/01_full-fine-tuning-Qwen2-0.5B-Instruct_lr5e-5_8e4steps_2gpus_8192msl/checkpoint-80000/inference_evaluation_x032.json'),
))
# One histogram figure per model
for name, filepath in zip(names, filepaths):
    study_distribution_of_the_number_of_unique_responses(filepath, name)
    plt.show()

We can see that the distribution of the number of unique responses differs between tasks that have the correct answer and tasks that do not. On average, tasks with a correct response tend to have a smaller number of unique responses.

But at the same time there are cases with a lot of unique responses that have the correct answer.

What we can learn from this data is that the method should be able to handle a number of responses equal to the number of predictions.

## First steps with verifier

## TODO

- [] 