# How to use a verifier?

## Goal

Let's discover how we can use a verifier to improve prediction selection.

## Imports

In [None]:
# Use this to reload changes in python scripts
%load_ext autoreload
%autoreload 2

In [None]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import json
import sys
import os
import glob
import random
import numpy as np
from itertools import islice
from tqdm.auto import tqdm

sys.path.append(os.path.realpath('../scripts/'))
from arc24.data import load_arc_data_with_solutions
from evaluation import (
    load_arc_data_with_solutions,
    evaluate,
    plot_grid,
    print_metrics,)
from voting import (
    select_most_voted_solutions,
    select_most_voted_solutions_solving_ties_with_logprob,
    get_unique_matrices_and_counts_sorted
)

# Create an empty plot and close every figure right away before touching rcParams.
# NOTE(review): presumably a workaround so the notebook plotting backend is
# initialised before the defaults below are applied -- confirm it is still needed.
plt.plot()
plt.close('all')
# Notebook-wide plot defaults: wide figures, thicker lines and a bigger font.
plt.rcParams["figure.figsize"] = (25, 4)
mpl.rcParams['lines.linewidth'] = 3
mpl.rcParams['font.size'] = 16

## Select prediction models

First we have to select which prediction models we are going to use. They should be models that haven't been trained on the test set.

These 3 different models, all trained without the evaluation dataset, could be used for the task:

- /mnt/hdd0/Kaggle/arc24/models/20240921_optimal_train_duration/01_full-fine-tuning-Qwen2-0.5B-Instruct_lr5e-5_8e4steps_2gpus_8192msl
- /mnt/hdd0/Kaggle/arc24/models/20240921_optimal_train_duration/05_LoRA-032-Qwen2-0.5B-Instruct_lr1e-4_4e4steps_2gpus_8192msl
- /mnt/hdd0/Kaggle/arc24/models/20240921_optimal_train_duration/05_LoRA-128-Qwen2-0.5B-Instruct_lr5e-5_4e4steps_2gpus_8192msl

## How good is voting?

Let's see how good voting is for the models selected above.

In [None]:
def evaluate_voting_accuracy(filepath, ground_truth_filepath='/mnt/hdd0/Kaggle/arc24/data/new_partitions/arc-agi_all_challenges.json'):
    """Measure how well voting selects the correct solution for one inference file.

    Args:
        filepath: path to a json file with the predictions. A sibling file
            named like `filepath` but with '.json' replaced by
            '_task_results.json' is also read for the advanced voting.
        ground_truth_filepath: path to the dataset with the solutions.

    Returns:
        The metrics dict returned by `evaluate` for the raw predictions,
        extended with the keys:
        - 'n': number of entries of the first test sample of the first task
          (presumably the number of predictions per sample -- confirm).
        - 'naive_voting': {1: pass_n, 2: pass_n} using `select_most_voted_solutions`.
        - 'advanced_voting': {1: pass_n, 2: pass_n} using
          `select_most_voted_solutions_solving_ties_with_logprob`.
    """
    top_n_values = (1, 2)

    def pass_n_of(selection):
        # The first item returned by `evaluate` is the metrics dict; a missing
        # 'pass_n' entry counts as 0
        return evaluate(ground_truth, selection, verbose=False)[0].get('pass_n', 0)

    with open(filepath, 'r') as predictions_file:
        predictions = json.load(predictions_file)
    ground_truth = load_arc_data_with_solutions(ground_truth_filepath)

    print(filepath)
    metrics = evaluate(ground_truth, predictions, verbose=False)[0]
    first_task_predictions = list(predictions.values())[0]
    metrics['n'] = len(first_task_predictions[0])

    metrics['naive_voting'] = {
        top_n: pass_n_of(select_most_voted_solutions(predictions, top_n))
        for top_n in top_n_values
    }

    task_results_filepath = filepath.replace('.json', '_task_results.json')
    with open(task_results_filepath, 'r') as task_results_file:
        task_results = json.load(task_results_file)
    metrics['advanced_voting'] = {
        top_n: pass_n_of(select_most_voted_solutions_solving_ties_with_logprob(task_results, top_n))
        for top_n in top_n_values
    }
    return metrics

In [None]:
# Quick check on a single model first: the full fine-tuning `x064` inference file
evaluate_voting_accuracy('/mnt/hdd0/Kaggle/arc24/evaluations/20240921_optimal_train_duration/01_full-fine-tuning-Qwen2-0.5B-Instruct_lr5e-5_8e4steps_2gpus_8192msl/checkpoint-80000/inference_evaluation_x064.json')

In [None]:
# `x032` inference files of the three candidate models
filepaths = [
    '/mnt/hdd0/Kaggle/arc24/evaluations/20240921_optimal_train_duration/05_LoRA-032-Qwen2-0.5B-Instruct_lr1e-4_4e4steps_2gpus_8192msl/checkpoint-40000/inference_evaluation_x032.json',
    '/mnt/hdd0/Kaggle/arc24/evaluations/20240921_optimal_train_duration/05_LoRA-128-Qwen2-0.5B-Instruct_lr5e-5_4e4steps_2gpus_8192msl/checkpoint-40000/inference_evaluation_x032.json',
    '/mnt/hdd0/Kaggle/arc24/evaluations/20240921_optimal_train_duration/01_full-fine-tuning-Qwen2-0.5B-Instruct_lr5e-5_8e4steps_2gpus_8192msl/checkpoint-80000/inference_evaluation_x032.json',
]
# Short label for each file above, in the same order
names = ['LoRA-032', 'LoRA-128', 'full fine-tuning']

In [None]:
# Voting metrics of every model, indexed by its short name
all_metrics = dict()
# NOTE: `predictions` holds the path of the predictions file here, not the loaded data
for name, predictions in zip(names, filepaths):
    all_metrics[name] = evaluate_voting_accuracy(predictions)

In [None]:
# Show the raw metrics dict of each model
for name, metrics in all_metrics.items():
    print(name, metrics)

In [None]:
# Relative voting accuracy: one subplot per top_n, one bar per model
plt.figure(figsize=(25, 5))
for plot_idx, top_n in enumerate([1, 2], 1):
    plt.subplot(1, 2, plot_idx)
    categories, values = [], []
    for name, metrics in all_metrics.items():
        # Only the advanced voting is shown in this figure
        for key in ['advanced_voting']:
            # The label hard-codes "pass_32"; the value shown is metrics["pass_n"]
            categories.append(f'{name} \npass_32={metrics["pass_n"]:.1%}')
            # Voting score divided by pass_n of the raw predictions
            values.append(metrics[key][top_n]/metrics['pass_n'])
    plt.bar(categories, values)
    plt.grid()
    plt.ylabel('Voting accuracy')
    plt.title(f'Top_n={top_n}')
    # The y axis does not start at zero
    plt.ylim(bottom=0.5)
plt.suptitle('Voting accuracy of selecting the correct solution among top_n most voted solutions')
plt.tight_layout()

For these models, voting is able to select the best response in the first position around 60% of the time, and within the top two positions around 70% of the time.

To see how well the verifiers can select the correct answer I could focus on a single model, optimize everything for that model and then check if it works as well for the other models.
For the optimization I could exclude all the tasks that do not have a correct answer.

In [None]:
# Absolute voting accuracy: one subplot per top_n, one bar per model and voting method
plt.figure(figsize=(25, 5))
for plot_idx, top_n in enumerate([1, 2], 1):
    plt.subplot(1, 2, plot_idx)
    categories, values = [], []
    for name, metrics in all_metrics.items():
        for key in ['naive_voting', 'advanced_voting']:
            # The label hard-codes "pass_32"; the value shown is metrics["pass_n"]
            categories.append(f'{name} \n{key}\npass_32={metrics["pass_n"]:.1%}')
            # Plotted as stored, without dividing by pass_n
            values.append(metrics[key][top_n])
    plt.bar(categories, values)
    plt.grid()
    plt.ylabel('Absolute accuracy')
    plt.title(f'Top_n={top_n}')
    # The y axis does not start at zero
    plt.ylim(bottom=0.15)
plt.suptitle('Absolute accuracy of the system when using voting')
plt.tight_layout()

## Study distribution of the number of unique responses

In [None]:
def study_distribution_of_the_number_of_unique_responses(predictions, name, ground_truth_filepath='/mnt/hdd0/Kaggle/arc24/data/new_partitions/arc-agi_all_challenges.json'):
    """Plot the distribution of the number of unique responses per test sample.

    Two normalized histograms are drawn on the current matplotlib figure: one
    for the test samples whose ground-truth output is among the unique
    responses ('correct') and one for the rest ('incorrect').

    Args:
        predictions: dict task_id -> list (one item per test sample) of dicts
            whose values are the predicted grids, or the path to a json file
            with that content.
        name: label used in the plot title.
        ground_truth_filepath: path to the dataset with the solutions.

    Returns:
        Tuple (correct, incorrect) with the number of unique responses of
        every test sample of each group.
    """
    if isinstance(predictions, str):
        with open(predictions, 'r') as f:
            predictions = json.load(f)

    ground_truth = load_arc_data_with_solutions(ground_truth_filepath)
    correct, incorrect = [], []
    for task_id, task_predictions in predictions.items():
        for sample_idx, sample_predictions in enumerate(task_predictions):
            # Empty predictions are dropped before counting unique responses
            non_empty_predictions = [prediction for prediction in sample_predictions.values() if prediction]
            unique_predictions, _ = get_unique_matrices_and_counts_sorted(non_empty_predictions)
            if ground_truth[task_id]['test'][sample_idx]['output'] in unique_predictions:
                correct.append(len(unique_predictions))
            else:
                incorrect.append(len(unique_predictions))

    # Bin edges must reach max_count + 1, otherwise the samples with the highest
    # number of unique responses fall outside the histogram and are ignored.
    # The previous fixed range is kept as the minimum so the plots of different
    # models stay comparable.
    max_unique_responses = max(correct + incorrect, default=0)
    bins = range(0, max(32, max_unique_responses + 2))
    plt.title(f'Distribution of the number of unique responses for {name}')
    plt.hist(correct, bins=bins, alpha=0.5, label='correct', density=True)
    plt.hist(incorrect, bins=bins, alpha=0.5, label='incorrect', density=True)
    plt.xlabel('Number of unique responses')
    plt.legend()
    return correct, incorrect

In [None]:
# `x032` inference files of the three candidate models and their short labels
filepaths = [
    '/mnt/hdd0/Kaggle/arc24/evaluations/20240921_optimal_train_duration/05_LoRA-032-Qwen2-0.5B-Instruct_lr1e-4_4e4steps_2gpus_8192msl/checkpoint-40000/inference_evaluation_x032.json',
    '/mnt/hdd0/Kaggle/arc24/evaluations/20240921_optimal_train_duration/05_LoRA-128-Qwen2-0.5B-Instruct_lr5e-5_4e4steps_2gpus_8192msl/checkpoint-40000/inference_evaluation_x032.json',
    '/mnt/hdd0/Kaggle/arc24/evaluations/20240921_optimal_train_duration/01_full-fine-tuning-Qwen2-0.5B-Instruct_lr5e-5_8e4steps_2gpus_8192msl/checkpoint-80000/inference_evaluation_x032.json',
]
names = ['LoRA-032', 'LoRA-128', 'full fine-tuning']
# One figure per model; `predictions` is the path of the file here
for predictions, name in zip(filepaths, names):
    study_distribution_of_the_number_of_unique_responses(predictions, name); plt.show()

We can see that the distribution of the number of unique responses is different between tasks that have the correct answer and tasks that do not. On average, tasks with a correct response tend to have a smaller number of unique responses.

But at the same time there are cases with a lot of unique responses that have the correct answer.

What we can learn from this data is that the method should be able to handle a number of responses equal to the number of predictions.

## First steps with verifier

### Keep only tasks with at least a correct answer

In [None]:
def keep_only_tasks_with_at_least_a_correct_answer(filepath, ground_truth_filepath='/mnt/hdd0/Kaggle/arc24/data/new_partitions/arc-agi_all_challenges.json'):
    """Load predictions and drop the tasks that lack a correct answer.

    A task is kept only when, for every one of its test samples, the
    ground-truth output is among the predictions of that sample.

    Args:
        filepath: path to a json file with the predictions, a dict
            task_id -> list (one item per test sample) of dicts whose values
            are the predicted grids.
        ground_truth_filepath: path to the dataset with the solutions.

    Returns:
        Dict with the same structure as the loaded predictions, restricted to
        the kept tasks and preserving their original order.
    """
    ground_truth = load_arc_data_with_solutions(ground_truth_filepath)
    with open(filepath, 'r') as predictions_file:
        all_predictions = json.load(predictions_file)

    def every_sample_is_solved(task_id, task_predictions):
        # `all` stops at the first sample without a correct prediction
        return all(
            ground_truth[task_id]['test'][sample_idx]['output'] in attempts.values()
            for sample_idx, attempts in enumerate(task_predictions)
        )

    return {
        task_id: task_predictions
        for task_id, task_predictions in all_predictions.items()
        if every_sample_is_solved(task_id, task_predictions)
    }

In [None]:
def evaluate_voting_accuracy(predictions, ground_truth_filepath='/mnt/hdd0/Kaggle/arc24/data/new_partitions/arc-agi_all_challenges.json'):
    """Measure naive voting accuracy for predictions that are already loaded.

    This redefines the earlier function of the same name: it receives the
    predictions dict instead of a file path and skips the advanced voting.

    Args:
        predictions: dict task_id -> list (one item per test sample) of
            prediction dicts.
        ground_truth_filepath: path to the dataset with the solutions.

    Returns:
        The metrics dict returned by `evaluate` for the raw predictions,
        extended with 'n' (number of entries of the first test sample of the
        first task) and 'naive_voting' ({1: pass_n, 2: pass_n}).
    """
    ground_truth = load_arc_data_with_solutions(ground_truth_filepath)

    def metrics_of(candidate_predictions):
        # The first item returned by `evaluate` is the metrics dict
        return evaluate(ground_truth, candidate_predictions, verbose=False)[0]

    metrics = metrics_of(predictions)
    first_task_predictions = list(predictions.values())[0]
    metrics['n'] = len(first_task_predictions[0])
    metrics['naive_voting'] = {
        top_n: metrics_of(select_most_voted_solutions(predictions, top_n)).get('pass_n', 0)
        for top_n in (1, 2)
    }
    return metrics

In [None]:
# `x032` inference files of the three candidate models; only the first one
# (LoRA-032) is used in this cell
filepaths = [
    '/mnt/hdd0/Kaggle/arc24/evaluations/20240921_optimal_train_duration/05_LoRA-032-Qwen2-0.5B-Instruct_lr1e-4_4e4steps_2gpus_8192msl/checkpoint-40000/inference_evaluation_x032.json',
    '/mnt/hdd0/Kaggle/arc24/evaluations/20240921_optimal_train_duration/05_LoRA-128-Qwen2-0.5B-Instruct_lr5e-5_4e4steps_2gpus_8192msl/checkpoint-40000/inference_evaluation_x032.json',
    '/mnt/hdd0/Kaggle/arc24/evaluations/20240921_optimal_train_duration/01_full-fine-tuning-Qwen2-0.5B-Instruct_lr5e-5_8e4steps_2gpus_8192msl/checkpoint-80000/inference_evaluation_x032.json',
]
# Filter the predictions of the first file, report how many tasks remain and
# evaluate voting on them
predictions = keep_only_tasks_with_at_least_a_correct_answer(filepaths[0])
print(len(predictions))
evaluate_voting_accuracy(predictions)

In [None]:
# Save the filtered predictions to the debug folder as json
with open('/mnt/hdd0/Kaggle/arc24/debug/05_LoRA-032-Qwen2-0.5B-Instruct_lr1e-4_4e4steps_2gpus_8192msl_checkpoint-40000_inference_evaluation_x032_just_correct_tasks.json', 'w') as f:
    json.dump(predictions, f)

This works correctly: when leaving only tasks with at least one correct answer, the `pass_n` metric rises very close to 100%, as expected.

In [None]:
def leave_only_unique_predictions(predictions):
    """Reduce the predictions of every test sample to its unique responses.

    Args:
        predictions: dict task_id -> list (one item per test sample) of dicts
            whose values are the predicted grids.

    Returns:
        Dict task_id -> list (one item per test sample) with the first value
        returned by `get_unique_matrices_and_counts_sorted` for the non-empty
        predictions of the sample.
    """
    def unique_non_empty(sample_predictions):
        # Empty predictions are discarded before removing duplicates
        candidates = [grid for grid in sample_predictions.values() if grid]
        unique_grids, _counts = get_unique_matrices_and_counts_sorted(candidates)
        return unique_grids

    return {
        task_id: [unique_non_empty(sample_predictions) for sample_predictions in task_predictions]
        for task_id, task_predictions in predictions.items()
    }

In [None]:
# Deduplicate the filtered predictions; the displayed value is the number of tasks
unique_predictions = leave_only_unique_predictions(predictions)
len(unique_predictions)

### Merge LoRA with the model

```bash
python merge_lora.py  --base_model_path /home/gbarbadillo/data/Qwen2-0.5B-Instruct --lora_path /mnt/hdd0/Kaggle/arc24/models/20241023_first_verifiers/05_verify-and-select_lora032-Qwen2-0.5B-Instruct_lr5e-5_bs32_8000steps_2gpus_8192msl/checkpoint-8000 --output_path /home/gbarbadillo/data/Qwen2-0.5B-Instruct-verifier
```

### Use a verifier model to rank the predictions

In [None]:
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer
from arc24.encoders import create_grid_encoder

# Output folder of the `merge_lora.py` command shown above
model_path = '/home/gbarbadillo/data/Qwen2-0.5B-Instruct-verifier'

# Tokenizer and grid encoder are later passed to the prompt creation
tokenizer = AutoTokenizer.from_pretrained(model_path)
grid_encoder = create_grid_encoder('GridShapeEncoder(RowNumberEncoder(MinimalGridEncoder()))')
# vLLM engine used to run the verifier
llm = LLM(
    model=model_path,
    trust_remote_code=True,
    dtype='half',
    tensor_parallel_size=2, # to use 2 gpus
    max_model_len=10240,
    #kv_cache_dtype='fp8_e5m2', I have disabled kv cache quantization because it is hurtful
    enforce_eager=True, # without this 13.9GB of memory is used on each GPU, with this is 13.3GB,
    disable_log_stats=True,
    max_num_seqs=255, # default is supposed to be 256 I have used it to solve some weird illegal memory error
    swap_space=4, # CPU swap space size (GiB) per GPU, has great influence on RAM but I haven't noticed any performance difference
)

In [None]:
from inference import get_sampling_params, generate_outputs_with_batches

# A single generation per prompt with temperature 0 and at most 5 output tokens
# NOTE(review): presumably 5 tokens is enough because the verifier is only
# expected to answer yes/no -- confirm against the prompt template
sampling_params = get_sampling_params(best_of=1, temperature=0, n=1, max_output_tokens=5)

In [None]:
# Dataset with the solutions, used to build the prompts and to evaluate the selection
ground_truth = load_arc_data_with_solutions('/mnt/hdd0/Kaggle/arc24/data/new_partitions/arc-agi_all_challenges.json')

In [None]:
from arc24.data_augmentation import apply_data_augmentation, get_random_color_map, get_random_geometric_augmentation_params
from arc24.prompting import create_prompts_from_task
from itertools import product

def create_prompts(predictions, ground_truth, grid_encoder, tokenizer, prompt_version, verifications_per_prediction):
    """Build the verification prompts for every candidate prediction.

    Each prediction is placed as the test output of its task, the task is
    randomly augmented (geometric transformation plus color map) and then
    converted into a prompt. This is repeated `verifications_per_prediction`
    times per prediction, drawing new random augmentation parameters each time.

    Args:
        predictions: dict task_id -> list (one item per test sample) of
            iterables with the candidate predictions of the sample.
        ground_truth: dict task_id -> task; `task['test'][i]['input']` is read.
        grid_encoder: forwarded to `create_prompts_from_task`.
        tokenizer: forwarded to `create_prompts_from_task`.
        prompt_version: forwarded to `create_prompts_from_task`.
        verifications_per_prediction: number of prompts created per prediction.

    Returns:
        List of dicts with the keys 'task_id', 'data_augmentation_kwargs',
        'prompt', 'sample_idx' and 'prediction_idx'.
    """
    # TODO: I don't have to revert data augmentation, so I don't care about saving it
    # TODO: merge ground truth and predictions to create prompts
    def build_prompt(task_id, sample_idx, prediction_idx, prediction):
        # Shallow copy: the 'test' key is rebound on the copy, not on the ground-truth task
        candidate_task = ground_truth[task_id].copy()
        test_input = candidate_task['test'][sample_idx]['input']
        candidate_task['test'] = [dict(input=test_input, output=prediction)]
        # Geometric parameters are drawn first and the color map second
        augmentation = get_random_geometric_augmentation_params()
        augmentation['color_map'] = get_random_color_map(change_background_probability=0.1)
        augmented = apply_data_augmentation(candidate_task, **augmentation)
        augmented['test_output'] = augmented['test'][0]['output']
        task_prompts = create_prompts_from_task(
            augmented, grid_encoder=grid_encoder, tokenizer=tokenizer,
            is_train_prompt=False, prompt_version=prompt_version)
        return {
            'task_id': task_id,
            'data_augmentation_kwargs': augmentation,
            'prompt': task_prompts[0],
            'sample_idx': sample_idx,
            'prediction_idx': prediction_idx,
        }

    prompts = []
    tasks_with_progress = tqdm(predictions.items(), total=len(predictions), desc='Creating prompts')
    for task_id, task_predictions in tasks_with_progress:
        for sample_idx, sample_predictions in enumerate(task_predictions):
            for prediction_idx, prediction in enumerate(sample_predictions):
                prompts.extend(
                    build_prompt(task_id, sample_idx, prediction_idx, prediction)
                    for _ in range(verifications_per_prediction))
    return prompts

# Create 64 verification prompts for each unique prediction
prompts = create_prompts(unique_predictions, ground_truth, grid_encoder, tokenizer,
                         prompt_version='verify-output-from-examples-v0',
                         verifications_per_prediction=64)

In [None]:
from inference import generate_outputs_with_batches
# Run the verifier over all the prompts, in batches of 512
outputs = generate_outputs_with_batches(llm, prompts, sampling_params, batch_size=512)

Verifying outputs takes on average 27s/it. By comparison the inference with a model generating output grids takes around 39s/it (for model `20241022_no_training/01_lora064-Qwen2.5-0.5B-Instruct_lr1e-4_bs16_20000steps_2gpus_8192msl/checkpoint-20000/inference_all-test-training_x040_t1e+00.json`)

So it is clearly faster, 30% faster, but I would have expected it to be much faster because we are generating very few output tokens compared to generating a grid. It seems that the long input prompt dominates the inference time.

It has taken just 10 minutes to make 8 verifications per prediction for 120 tasks. In Kaggle it would be slower, but I believe it is fast enough.

In [None]:
# Distinct texts generated by the verifier and how many times each one appears
np.unique([output.outputs[0].text for output in outputs], return_counts=True)

Great, the outputs have only two values: yes or no. We have to collect the predictions and aggregate the results per prediction.

In [None]:
def aggregate_verification_predictions(outputs, prompts, unique_predictions):
    """Count the positive verifications received by every prediction.

    Args:
        outputs: generations of the verifier, aligned with `prompts`; the
            text is read from `output.outputs[0].text`.
        prompts: list of dicts with the keys 'task_id', 'sample_idx' and
            'prediction_idx' that identify the verified prediction.
        unique_predictions: dict task_id -> list (one item per test sample)
            of predictions; only its structure and lengths are used.

    Returns:
        Dict task_id -> list (one item per test sample) of numpy arrays with
        the number of outputs whose text is exactly 'yes' for each prediction.
    """
    is_positive = [generation.outputs[0].text == 'yes' for generation in outputs]
    # Print the global count of negative and positive verifications
    print(np.unique(is_positive, return_counts=True))

    # One zero-initialized counter per prediction, mirroring `unique_predictions`
    vote_counts = {}
    for task_id, task_predictions in unique_predictions.items():
        vote_counts[task_id] = [np.zeros(len(candidates)) for candidates in task_predictions]

    for accepted, prompt in zip(is_positive, prompts):
        if not accepted:
            continue
        sample_votes = vote_counts[prompt['task_id']][prompt['sample_idx']]
        sample_votes[prompt['prediction_idx']] += 1
    return vote_counts

# Number of 'yes' verifications of every unique prediction
aggregated_verifications = aggregate_verification_predictions(outputs, prompts, unique_predictions)

Let's visualize the distribution of verifications.

In [None]:
# Flatten the verification counts of all the tasks and samples into a single list
aggregated_verifications_values = []
for task_id, task_verifications in aggregated_verifications.items():
    for sample_verifications in task_verifications:
        aggregated_verifications_values.extend(sample_verifications)
# Bin edges start at 0.5, so predictions with zero verifications are left out
# of the histogram; each bin is centered on an integer count
plt.hist(aggregated_verifications_values, bins=np.arange(0.5, max(aggregated_verifications_values)+1), alpha=0.5, density=True)
plt.xlabel('Number of verifications')
plt.title('Distribution of the number of verifications for prediction');

This distribution looks good! Let's jump to evaluation and see how good this first approach is.

In [None]:
def select_predictions_with_verifications(unique_predictions, aggregated_verifications, n):
    """Select, for every test sample, the `n` predictions with most verifications.

    Ties are broken deterministically in favor of the prediction that appears
    first in `unique_predictions`.
    NOTE(review): `unique_predictions` presumably comes sorted from most to
    least voted, so ties would favor the most voted prediction -- confirm.

    Args:
        unique_predictions: dict task_id -> list (one item per test sample)
            of candidate predictions.
        aggregated_verifications: dict task_id -> list (one item per test
            sample) with the number of positive verifications of each
            candidate, aligned with `unique_predictions`.
        n: maximum number of predictions to select per test sample.

    Returns:
        Dict task_id -> list (one item per test sample) of dicts
        {'attempt_1': prediction, 'attempt_2': prediction, ...} ordered from
        most to least verified. A sample with fewer than `n` candidates gets
        fewer attempts.
    """
    selected_predictions = dict()
    for task_id, task_predictions in unique_predictions.items():
        selected_predictions[task_id] = []
        for sample_predictions, sample_verifications in zip(task_predictions, aggregated_verifications[task_id]):
            # A stable sort over the negated scores gives descending order while
            # keeping the input order between candidates with the same score.
            # Reversing an ascending argsort would hand ties to the last candidate.
            scores = np.asarray(sample_verifications, dtype=float)
            ranking = np.argsort(-scores, kind='stable')[:n]
            selected_predictions[task_id].append(
                {f'attempt_{attempt_idx}': sample_predictions[idx] for attempt_idx, idx in enumerate(ranking, 1)})
    return selected_predictions

In [None]:
# pass_n of the verifier-based selection when keeping the top 1 and top 2 predictions
for n_top in [1, 2]:
    selected_predictions = select_predictions_with_verifications(unique_predictions, aggregated_verifications, n_top)
    print(f'Accuracy for n_top {n_top}: {evaluate(ground_truth, selected_predictions, verbose=False)[0]["pass_n"]:.1%}')

- Baseline accuracy for voting is 60.8% and 70.8%: `'naive_voting': {1: 0.6083333333333333, 2: 0.7083333333333334}}`
- With 4 verifications per prediction we get 46.7% and 65.8%
- With just 8 verifications per prediction we get 54.2% and 71.7%, that is very promising. (10m25 of inference)
- With 16 verifications per prediction we get 58.3% and 75.8% (20m56 of inference)
- With 32 verifications per prediction we get 66.2% and 80% (42m)
- With 64 verifications per prediction we get 66.2% and 77.5%, so it seems we have reached the plateau

## Move to script

```bash
python verify_predictions.py \
--model-path /home/gbarbadillo/data/Qwen2-0.5B-Instruct-verifier \
--output-path /mnt/hdd0/Kaggle/arc24/debug/verifier_selected_predictions.json \
--dataset-path /mnt/hdd0/Kaggle/arc24/data/new_partitions/arc-agi_all_challenges.json \
--predictions-path  /mnt/hdd0/Kaggle/arc24/debug/05_LoRA-032-Qwen2-0.5B-Instruct_lr1e-4_4e4steps_2gpus_8192msl_checkpoint-40000_inference_evaluation_x032_just_correct_tasks.json \
--verifications-per-prediction 8
python evaluation.py /mnt/hdd0/Kaggle/arc24/debug/verifier_selected_predictions.json

/mnt/hdd0/Kaggle/arc24/evaluations/20240921_optimal_train_duration/05_LoRA-032-Qwen2-0.5B-Instruct_lr1e-4_4e4steps_2gpus_8192msl/checkpoint-40000/inference_evaluation_x032.json

```


## TODO

- [ ] 