# Evaluation

## Goal

This notebook takes the predictions produced by a model at inference time, evaluates them and visualizes the results.

This will help to:

- understand the failures of the model
- find a better way to combine the model predictions

## Configuration

In [None]:
class cfg:
    """Notebook configuration: locations of the input files.

    Exactly one ``solutions_filepath`` is active. The alternatives are kept
    commented out so they can be swapped in quickly; previously several of
    them were left active and only the last assignment took effect.
    """
    # File with the predictions that will be evaluated.
    # solutions_filepath: str = '/mnt/hdd0/Kaggle/arc24/evaluations/20240914_overfit_to_train/02_full-fine-tuning-Qwen2-0.5B-Instruct_lr5e-5_1e5steps/checkpoint-100000/inference_training_x032.json'
    # solutions_filepath: str = '/mnt/hdd0/MEGA/AI/22_Kaggle/arc24/scripts/submission_qwen15_x128_voting.json'
    # solutions_filepath: str = '/mnt/hdd0/MEGA/AI/22_Kaggle/arc24/scripts/submission_qwen05_x128_t01.json'
    # solutions_filepath: str = '/mnt/hdd0/MEGA/AI/22_Kaggle/arc24/scripts/submission_qwen05_x128_t08.json'
    # solutions_filepath: str = '/mnt/hdd0/MEGA/AI/22_Kaggle/arc24/scripts/submission.json'
    # solutions_filepath: str = '/mnt/hdd0/MEGA/AI/22_Kaggle/arc24/scripts/submission_qwen15.json'
    # solutions_filepath: str = '/mnt/hdd0/Kaggle/arc24/debug/first_predictions/checkpoint-14000/inference_evaluation_x032.json'
    solutions_filepath: str = '/mnt/hdd0/Kaggle/arc24/evaluations/20241014_omni-arc_improvements/02_omni-arc-269-Qwen2.5-0.5B-Instruct_lora064_lr1e-4_bs32_20000steps_7168msl/checkpoint-20000/inference_evaluation_x008_t7e-01.json'
    # solutions_filepath: str = '/mnt/hdd0/Kaggle/arc24/debug/third_model/checkpoint-26000/inference_evaluation_x008_t7e-1.json'
    # File passed to load_arc_data_with_solutions to get the ground truth.
    dataset_filepath: str = '/mnt/hdd0/Kaggle/arc24/data/new_partitions/arc-agi_all_challenges.json'

## Imports

In [None]:
import sys
import os
import glob
import json
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
from tqdm.auto import tqdm

# add path to python path
sys.path.append(os.path.realpath('../scripts/'))

from evaluation import (
    load_arc_data_with_solutions, evaluate,
    study_effect_of_the_number_of_solutions,
    study_attempt_accuracy,
    print_metrics,
    visualize_tasks_and_predictions)
from voting import (
    select_most_voted_solutions,
    select_most_voted_solutions_solving_ties_with_logprob
)
from arc24.prompting import pretty_print_prompt

# Create an empty plot and close every figure before changing the defaults.
# NOTE(review): presumably this forces the plotting backend to initialise so
# that the settings below are not overridden later - confirm.
plt.plot()
plt.close('all')
# Notebook-wide plotting defaults: wide and short figures, thick lines, big font.
mpl.rcParams.update({
    'figure.figsize': (25, 3),
    'lines.linewidth': 3,
    'font.size': 16,
})

## Code

In [None]:
def evaluate_multiple_checkpoints(parent_folder):
    """Evaluate every predictions file found two levels below a folder.

    Files are searched with the pattern ``parent_folder/*/*/inference*.json``;
    files ending with ``voting.json`` or ``task_results.json`` are skipped.
    Each predictions file is evaluated against the ground truth loaded from
    ``cfg.dataset_filepath`` and its sibling ``*_task_results.json`` file is
    used to compute the voting metrics.

    Args:
        parent_folder: folder whose direct sub-folders are visited.

    Returns:
        pandas.DataFrame indexed by predictions filepath with the columns
        accuracy, pass_n, vote_2, vote_1, correct_pixels, correct_size,
        unanswered and n (number of entries stored for the first test sample
        of the first task). When no predictions file is found an empty
        DataFrame with those columns is returned.

    Raises:
        ValueError: if the folder that contains a predictions file is not
            named ``checkpoint-<int>``.
    """
    columns = ['accuracy', 'pass_n', 'vote_2', 'vote_1', 'correct_pixels', 'correct_size', 'unanswered', 'n']
    ground_truth = load_arc_data_with_solutions(cfg.dataset_filepath)
    all_metrics = dict()
    for folder in sorted(glob.glob(os.path.join(parent_folder, '*'))):
        print(folder)
        filepaths = [filepath for filepath in glob.glob(os.path.join(folder, '*/inference*.json'))
                     if not filepath.endswith(('voting.json', 'task_results.json'))]
        # Sort by checkpoint number (ties broken by path). The filtering used to
        # re-sort the list alphabetically, which placed checkpoint-1000 before
        # checkpoint-500 and discarded the numeric order.
        filepaths.sort(key=lambda x: (int(x.split('checkpoint-')[-1].split('/inference')[0]), x))
        for filepath in filepaths:
            with open(filepath, 'r') as f:
                solutions = json.load(f)
            print(filepath)
            # evaluate returns a pair; the first element holds the global metrics
            metrics = evaluate(ground_truth, solutions, verbose=False)[0]
            metrics['n'] = len(list(solutions.values())[0][0])
            with open(filepath.replace('.json', '_task_results.json'), 'r') as f:
                task_results = json.load(f)
            for i in range(1, 3):
                # vote_i: pass_n obtained when only the i most voted solutions are kept
                voted_solutions = select_most_voted_solutions_solving_ties_with_logprob(task_results, i)
                metrics[f'vote_{i}'] = evaluate(ground_truth, voted_solutions, verbose=False)[0].get('pass_n', 0)

            print_metrics(metrics)
            all_metrics[filepath] = {key: value for key, value in metrics.items()
                                     if key not in ('max_correct_pixels', 'any_correct_size')}
        print()
    if not all_metrics:
        # Selecting the columns of an empty DataFrame would raise a KeyError
        return pd.DataFrame(columns=columns)
    df = pd.DataFrame(all_metrics).T
    return df[columns]

In [None]:
def get_temperature_from_filepath(filepath):
    """Parse the sampling temperature encoded at the end of a file name.

    The temperature is expected as a ``_t<number>.json`` suffix, for example
    ``inference_evaluation_x008_t7e-01.json`` gives 0.7. Only the end of the
    path is inspected, so other ``_t`` occurrences (``_to_train``,
    ``_task_results``, ...) no longer break the parsing.

    Args:
        filepath: path of the predictions file.

    Returns:
        The temperature as a float, or 0 when the name has no such suffix.
    """
    import re  # local import, the notebook does not import re at the top

    match = re.search(r'_t([0-9]+(?:\.[0-9]+)?(?:[eE][+-]?[0-9]+)?)\.json$', filepath)
    if match is None:
        return 0
    return float(match.group(1))

In [None]:
def inspect_task_prompt(task_results, task_id, result_idx=0):
    """Pretty print the prompt followed by the response of one result of a task.

    Args:
        task_results: list of dicts with the keys 'task_id', 'prompt' and 'response'.
        task_id: only the results whose 'task_id' equals this value are considered.
        result_idx: position of the result to show among the results of the task.

    Raises:
        IndexError: if the task has no result at position ``result_idx``.
    """
    matching_results = list(filter(lambda item: item['task_id'] == task_id, task_results))
    chosen = matching_results[result_idx]
    full_text = chosen['prompt'] + chosen['response']
    pretty_print_prompt(full_text, default_color='white')

In [None]:
# A bare `raise` without an active exception raises RuntimeError, so a
# "Run All" stops at this cell.
# NOTE(review): presumably a deliberate stop so the cells below are only run
# by hand - confirm.
raise

## Evaluation

In [None]:
# Load the predictions selected in cfg and the dataset with its solutions.
with open(cfg.solutions_filepath, 'r') as f:
    solutions = json.load(f)
data = load_arc_data_with_solutions(cfg.dataset_filepath)
# The trailing semicolon hides the returned value from the cell output.
evaluate(data, solutions);

In [None]:
# Taller figure than the notebook default of (25, 3).
plt.figure(figsize=(25, 5))
# NOTE(review): presumably plots the metrics against the number of predictions used - confirm in scripts/evaluation.py.
study_effect_of_the_number_of_solutions(solutions, data, n_tries=40)

In [None]:
# Metrics after voting with select_most_voted_solutions (second argument 2);
# [0] keeps the first element returned by evaluate (the global metrics).
evaluate(data, select_most_voted_solutions(solutions, 2), verbose=False)[0]

In [None]:
# NOTE(review): presumably reports the accuracy of each attempt - confirm in scripts/evaluation.py.
study_attempt_accuracy(solutions, data)

In [None]:
# Show the tasks together with up to 4 predictions each (max_predictions=4).
# NOTE(review): the effect of only_correct and ascending is defined in scripts/evaluation.py - confirm.
visualize_tasks_and_predictions(solutions, data, only_correct=True, ascending=True, max_predictions=4)

## Temperature analysis

In [None]:
# Global ground truth read by temperature_analysis below.
ground_truth = load_arc_data_with_solutions(cfg.dataset_filepath)

In [None]:
def temperature_analysis(model):
    """Plot accuracy, pass_n and unanswered against the sampling temperature.

    Evaluates every ``submission_{model}_x128_t*.json`` file of the scripts
    folder against the global ``ground_truth``. The temperature is the number
    that follows ``_t`` in the path divided by 10.

    Args:
        model: model name used to build the file pattern, f.e. 'qwen05'.
    """
    pattern = f'/mnt/hdd0/MEGA/AI/22_Kaggle/arc24/scripts/submission_{model}_x128_t*.json'
    results, temperatures = [], []
    for path in tqdm(sorted(glob.glob(pattern))):
        with open(path, 'r') as handle:
            predictions = json.load(handle)
        results.append(evaluate(ground_truth, predictions, verbose=False)[0])
        encoded_temperature = path.split('_t')[1].split('.json')[0]
        temperatures.append(float(encoded_temperature)/10)

    metric_names = ['accuracy', 'pass_n', 'unanswered']
    n_plots = len(metric_names)
    # one subplot per metric, all of them in a single row
    for position, name in enumerate(metric_names, start=1):
        plt.subplot(1, n_plots, position)
        values = [result[name] for result in results]
        plt.plot(temperatures, values, 'o-')
        plt.title(name)
        plt.grid()
        plt.xlabel('temperature')
    plt.suptitle(f'Effect of the temperature on the model {model}')
    plt.tight_layout()

In [None]:
# Temperature study for the files matching submission_qwen05_x128_t*.json
temperature_analysis(model='qwen05')

In [None]:
# Temperature study for the files matching submission_qwen15_x128_t*.json
temperature_analysis(model='qwen15')

## Study influence of output grid shape

In [None]:
# Reload predictions and dataset so this section does not depend on the cells above.
with open(cfg.solutions_filepath, 'r') as f:
    solutions = json.load(f)
data = load_arc_data_with_solutions(cfg.dataset_filepath)
# evaluate returns the global metrics and a dict of metrics keyed by task id
global_metrics, task_metrics = evaluate(data, solutions, verbose=False)
task_ids = list(task_metrics.keys())
global_metrics

In [None]:
def get_output_grid_shape(data):
    """Compute the mean shape of the output grids of each task.

    Args:
        data: dict that maps each task id to a task. A task has 'test' and
            'train' lists of samples and every sample has an 'output' grid.

    Returns:
        dict that maps each task id to a numpy array with the mean of the
        output shapes over the test and train samples of the task.
    """
    def mean_output_shape(task):
        samples = task['test'] + task['train']
        sample_shapes = [np.array(sample['output']).shape for sample in samples]
        return np.mean(sample_shapes, axis=0)

    return {task_id: mean_output_shape(task) for task_id, task in data.items()}

# Mean output shape of each task, read by the scatter plots below.
output_shapes = get_output_grid_shape(data)

In [None]:
# Names of the metrics that are available for plotting
list(global_metrics.keys())

In [None]:
# Scatter plot of the mean output shape of every task, coloured by each metric.
plt.figure(figsize=(25, 5))

keys = ['accuracy', 'correct_pixels', 'pass_n', 'unanswered']  # 'correct_size' is left out
# The position of a task does not depend on the metric, so compute it once.
x = [output_shapes[task_id][1] for task_id in task_ids]
y = [output_shapes[task_id][0] for task_id in task_ids]
for plot_idx, key in enumerate(keys):
    # One column per plotted metric. The grid used to have len(global_metrics)
    # columns, which left empty space at the right of the figure.
    plt.subplot(1, len(keys), plot_idx + 1)
    c = [task_metrics[task_id][key] for task_id in task_ids]
    plt.scatter(x, y, c=c, cmap='viridis', alpha=0.5)
    plt.colorbar(orientation='horizontal')
    plt.title(key)
    plt.xlabel('cols')
    plt.ylabel('rows')
    plt.xlim(0)
    plt.ylim(0)
    plt.grid()
plt.suptitle('Effect of the output shape on the model performance')
plt.tight_layout()

## Pseudo beam-search analysis

In [None]:
ground_truth = load_arc_data_with_solutions(cfg.dataset_filepath)

def pseudo_beam_search_analysis_v1():
    """Scatter accuracy, pass_n and unanswered against the value of n.

    Evaluates every ``submission_qwen05_x8_T01_n*.json`` file of the scripts
    folder against the global ``ground_truth``; n is the integer that follows
    ``_n`` in the path.
    """
    pattern = '/mnt/hdd0/MEGA/AI/22_Kaggle/arc24/scripts/submission_qwen05_x8_T01_n*.json'
    results, n_values = [], []
    for path in tqdm(sorted(glob.glob(pattern))):
        with open(path, 'r') as handle:
            predictions = json.load(handle)
        results.append(evaluate(ground_truth, predictions, verbose=False)[0])
        n_values.append(int(path.split('_n')[1].split('.json')[0]))

    metric_names = ['accuracy', 'pass_n', 'unanswered']
    n_plots = len(metric_names)
    # one subplot per metric, all of them in a single row
    for position, name in enumerate(metric_names, start=1):
        plt.subplot(1, n_plots, position)
        values = [result[name] for result in results]
        plt.scatter(n_values, values)
        plt.title(name)
        plt.grid()
        plt.xlabel('n')
    plt.tight_layout()

pseudo_beam_search_analysis_v1()

No clear result, let's do another analysis with different temperatures.

In [None]:
ground_truth = load_arc_data_with_solutions(cfg.dataset_filepath)

def pseudo_beam_search_analysis_v2(x):
    """Plot accuracy, pass_n and unanswered against the temperature.

    Evaluates every ``submission_qwen05_x{x}_n20_T??.json`` file of the
    scripts folder against the global ``ground_truth``. The temperature is the
    number that follows ``_T`` in the path divided by 10.

    Args:
        x: value that follows ``_x`` in the file names, also shown in the title.
    """
    pattern = f'/mnt/hdd0/MEGA/AI/22_Kaggle/arc24/scripts/submission_qwen05_x{x}_n20_T??.json'
    results, temperatures = [], []
    for path in tqdm(sorted(glob.glob(pattern))):
        with open(path, 'r') as handle:
            predictions = json.load(handle)
        print(path)
        results.append(evaluate(ground_truth, predictions, verbose=False)[0])
        encoded_temperature = path.split('_T')[1].split('.json')[0]
        temperatures.append(float(encoded_temperature)/10)

    metric_names = ['accuracy', 'pass_n', 'unanswered']
    n_plots = len(metric_names)
    # one subplot per metric, all of them in a single row
    for position, name in enumerate(metric_names, start=1):
        plt.subplot(1, n_plots, position)
        values = [result[name] for result in results]
        plt.plot(temperatures, values, 'o-')
        plt.title(name)
        plt.grid()
        plt.xlabel('temperature')
    plt.suptitle(f'Effect of the temperature on the model for {x} predictions')
    plt.tight_layout()

pseudo_beam_search_analysis_v2(8)

In [None]:
# Same analysis for the files matching submission_qwen05_x16_n20_T??.json
pseudo_beam_search_analysis_v2(16)

In [None]:
# Compare the metrics of all the predictions (printed) with the metrics after
# voting with select_most_voted_solutions (displayed as the cell output).
with open('/mnt/hdd0/MEGA/AI/22_Kaggle/arc24/scripts/submission_qwen05_x128_n20_T08.json', 'r') as f:
    solutions = json.load(f)
print(evaluate(ground_truth, solutions, verbose=False)[0])
evaluate(ground_truth, select_most_voted_solutions(solutions, 2), verbose=False)[0]

In [None]:
# Same comparison (all the predictions vs voting) for the qwen15 submission.
with open('/mnt/hdd0/MEGA/AI/22_Kaggle/arc24/scripts/submission_qwen15_x128.json', 'r') as f:
    solutions = json.load(f)
print(evaluate(ground_truth, solutions, verbose=False)[0])
evaluate(ground_truth, select_most_voted_solutions(solutions, 2), verbose=False)[0]

## Validation loss vs metrics

In [None]:
# Print the metrics of every predictions file of the grid encoders experiments.
ground_truth = load_arc_data_with_solutions(cfg.dataset_filepath)
filepaths = sorted(glob.glob('/mnt/hdd0/Kaggle/arc24/evaluations/20240826_grid_encoders/*/*/inference*.json'))
# Skip the voting and task results files, they are not plain predictions files
filepaths = [filepath for filepath in filepaths if not filepath.endswith('voting.json') and not filepath.endswith('task_results.json')]
for filepath in filepaths:
    with open(filepath, 'r') as f:
        solutions = json.load(f)
    print(filepath)
    metrics = evaluate(ground_truth, solutions, verbose=False)[0]
    print_metrics(metrics)
    # Disabled: metrics after voting
    # metrics = evaluate(ground_truth, select_most_voted_solutions(solutions, 2), verbose=False)[0]
    # print_metrics(metrics)

In [None]:
# Print the metrics of the experiments whose folder name starts with 06.
ground_truth = load_arc_data_with_solutions(cfg.dataset_filepath)
filepaths = sorted(glob.glob('/mnt/hdd0/Kaggle/arc24/evaluations/20240826_grid_encoders/06*/*/inference*.json'))
# Skip the voting and task results files; [:-1] also drops the last remaining file
filepaths = [filepath for filepath in filepaths if not filepath.endswith('voting.json') and not filepath.endswith('task_results.json')][:-1]
for filepath in filepaths:
    with open(filepath, 'r') as f:
        solutions = json.load(f)
    print(filepath)
    metrics = evaluate(ground_truth, solutions, verbose=False)[0]
    print_metrics(metrics)
    # Disabled: metrics after voting
    # metrics = evaluate(ground_truth, select_most_voted_solutions(solutions, 2), verbose=False)[0]
    # print_metrics(metrics)

In [None]:
# Hard-coded metrics for a series of checkpoints.
# NOTE(review): presumably transcribed from the outputs of the cells above - confirm.
metrics = {
    "checkpoint_steps": [500, 1000, 2000, 2650, 3000, 3450, 4000, 4850, 6000],
    "accuracy": [0.4, 0.6, 1.1, 1.1, 1.3, 1.4, 2.2, 2.3, 2.3],
    "correct_pixels": [58.3, 62.1, 63.8, 64.2, 65.0, 65.3, 65.4, 65.8, 66.7],
    "max_correct_pixels": [76.7, 76.5, 80.6, 80.6, 80.5, 82.4, 83.0, 81.6, 81.7],
    "correct_size": [78.0, 81.1, 83.4, 82.4, 84.0, 84.2, 84.3, 84.2, 84.9],
    "any_correct_size": [88.5, 89.0, 90.0, 90.5, 90.0, 92.0, 92.0, 90.0, 89.5],
    "pass_n": [6.5, 5.5, 10.0, 11.5, 10.5, 11.0, 15.0, 15.5, 15.0],
    "unanswered": [7.1, 5.1, 5.6, 5.7, 4.3, 4.3, 4.4, 4.8, 4.1],
    'val_loss': [0.237, 0.198, 0.169, 0.171, 0.162, 0.162, 0.159, 0.148, 0.159]
}

# Each entry describes one figure: key used for the x axis, keys plotted (one
# subplot each) and title. The last figure plots against val_loss, so it gets
# its own title instead of the copy-pasted "checkpoint steps" one.
plot_specs = [
    ('checkpoint_steps', ['accuracy', 'correct_pixels', 'correct_size', 'unanswered'],
     'Effect of the number of checkpoint steps on the model performance'),
    ('checkpoint_steps', ['pass_n', 'any_correct_size', 'max_correct_pixels', 'val_loss'],
     'Effect of the number of checkpoint steps on the model performance'),
    ('val_loss', ['accuracy', 'correct_pixels', 'correct_size', 'unanswered'],
     'Relation between the validation loss and the model performance'),
]
for x_key, keys, title in plot_specs:
    for plot_idx, key in enumerate(keys):
        plt.subplot(1, len(keys), plot_idx + 1)
        plt.plot(metrics[x_key], metrics[key], 'o-', label=key)
        plt.grid()
        plt.xlabel(x_key)
        plt.ylabel(key)
    plt.suptitle(title)
    plt.show()

## Prompt inspection

In [None]:
# Task results file to inspect. Every assignment overwrites the previous one,
# so only the last filepath is loaded; the others are kept as a history.
filepath = '/mnt/hdd0/Kaggle/arc24/evaluations/20240928_smolLM/01_full-fine-tuning-SmolLM-135M-Instruct_lr4e-4_1e4steps_1gpus_8192msl/checkpoint-10000/inference_evaluation_x032_task_results.json'
filepath = '/mnt/hdd0/Kaggle/arc24/evaluations/20240929_smolLM/01_full-fine-tuning-SmolLM-135M-Instruct_lr8e-4_1e3steps_1gpus_8192msl/checkpoint-1000/inference_evaluation_x034_task_results.json'
filepath = '/mnt/hdd0/Kaggle/arc24/evaluations/20241009_optimize_code_generation/01_omni-arc-400-code-from-examples-v2-Qwen2.5-0.5B-Instruct_lora128_lr1e-4_bs32_2000steps_2gpus_8192msl/checkpoint-2000/inference_evaluation_x008_task_results.json'
filepath = '/mnt/hdd0/Kaggle/arc24/evaluations/20241011_non-instruct_models/02_baseline_instruct/checkpoint-100/inference_smaller_5_tasks_x008_task_results.json'
filepath = '/mnt/hdd0/Kaggle/arc24/evaluations/20241011_non-instruct_models/03_full-fine-tune/checkpoint-100/inference_smaller_5_tasks_x009_task_results.json'
filepath = '/mnt/hdd0/Kaggle/arc24/evaluations/20241011_non-instruct_models/07_final_experiment_with_lora_longer/checkpoint-500/inference_smaller_5_tasks_x008_task_results.json'
with open(filepath, 'r') as f:
    task_results = json.load(f)
# Number of loaded results
len(task_results)

In [None]:
# Inspect the task of the first result: print a link to the task and show
# the result at position 7 among the results of that task.
task_id = task_results[0]['task_id']
print(f'https://arcprize.org/play?task={task_id}')
inspect_task_prompt(task_results, task_id, result_idx=7)

In [None]:
# Show the first result of the task with id 00576224
inspect_task_prompt(task_results, '00576224', result_idx=0)

## Deeper study on the number of predictions

In [None]:
def collect_all_predictions(folder):
    """Merge the predictions of all the inference files of a folder.

    The files matching ``inference*.json`` are used, except those ending with
    ``voting.json`` or ``task_results.json``. The first file (in alphabetical
    order) is the base and the predictions of the other files are appended to
    each test sample with the names ``attempt_<k>``, where k continues the
    numbering of the attempts already stored.

    Args:
        folder: folder with the inference files.

    Returns:
        dict that maps each task id to a list with one dict per test sample,
        from attempt name to prediction.

    Raises:
        FileNotFoundError: if the folder has no predictions file.
        KeyError: if a file has a task that is not in the first file.
    """
    # Sorted so the attempt numbering does not depend on the order returned by
    # the filesystem, as done with every other glob of this notebook.
    filepaths = sorted(
        filepath for filepath in glob.glob(os.path.join(folder, 'inference*.json'))
        if not filepath.endswith(('voting.json', 'task_results.json')))
    if not filepaths:
        raise FileNotFoundError(f'No inference predictions found in {folder}')
    with open(filepaths[0], 'r') as f:
        predictions = json.load(f)
    for filepath in tqdm(filepaths[1:], desc='loading predictions'):
        with open(filepath, 'r') as f:
            additional_predictions = json.load(f)
        for task_id, solutions in additional_predictions.items():
            for test_idx, solution in enumerate(solutions):
                collected = predictions[task_id][test_idx]
                for prediction in solution.values():
                    # the next attempt number is the current number of attempts + 1
                    collected[f'attempt_{len(collected) + 1}'] = prediction
    print(f'Collected {len(list(predictions.keys()))} tasks, each with {len(list(predictions.values())[0][0])} predictions')
    return predictions

In [None]:
# Checkpoints of trainings of different duration (5e3 to 8e4 steps)
folders = [
    '/mnt/hdd0/Kaggle/arc24/evaluations/20240921_optimal_train_duration/01_full-fine-tuning-Qwen2-0.5B-Instruct_lr5e-5_5e3steps_2gpus_8192msl/checkpoint-5000',
    '/mnt/hdd0/Kaggle/arc24/evaluations/20240921_optimal_train_duration/01_full-fine-tuning-Qwen2-0.5B-Instruct_lr5e-5_1e4steps_2gpus_8192msl/checkpoint-10000',
    '/mnt/hdd0/Kaggle/arc24/evaluations/20240921_optimal_train_duration/01_full-fine-tuning-Qwen2-0.5B-Instruct_lr5e-5_2e4steps_2gpus_8192msl/checkpoint-20000',
    '/mnt/hdd0/Kaggle/arc24/evaluations/20240921_optimal_train_duration/01_full-fine-tuning-Qwen2-0.5B-Instruct_lr5e-5_4e4steps_2gpus_8192msl/checkpoint-40000',
    '/mnt/hdd0/Kaggle/arc24/evaluations/20240921_optimal_train_duration/01_full-fine-tuning-Qwen2-0.5B-Instruct_lr5e-5_8e4steps_2gpus_8192msl/checkpoint-80000',
]
# `data` must have been loaded by one of the previous sections
for folder in folders:
    print(folder)
    predictions = collect_all_predictions(folder=folder)
    plt.figure(figsize=(25, 5))
    # The folder name encodes the steps in scientific notation, f.e. 5e3steps -> 5000
    training_steps = int(float(folder.split('steps_')[0].split('_')[-1]))
    study_effect_of_the_number_of_solutions(predictions, data, n_tries=100, title=f'{training_steps} training steps', min_predictions=16)

## Evaluate multiple checkpoints

### First evaluations

In [None]:
# The returned DataFrame is displayed as the cell output
evaluate_multiple_checkpoints('/mnt/hdd0/Kaggle/arc24/evaluations/20240826_grid_encoders')

In [None]:
# Baseline metrics kept as a reference to compare with the output of this cell
#/mnt/hdd0/Kaggle/arc24/evaluations/20240826_grid_encoders/04_row-number-and-grid-shape_Qwen2-0.5B-Instruct_lr1e-4_r32_6e3steps/checkpoint-6000/inference.json
#accuracy: 2.8%	correct_pixels: 66.3%	max_correct_pixels: 82.2%	correct_size: 84.2%	any_correct_size: 91.0%	pass_n: 18.5%	unanswered: 2.8%
#accuracy: 3.5%	correct_pixels: 67.4%	max_correct_pixels: 72.2%	correct_size: 85.4%	any_correct_size: 87.9%	pass_n: 7.1%	unanswered: 2.5%	
evaluate_multiple_checkpoints('/mnt/hdd0/Kaggle/arc24/evaluations/20240828_grid_encoders_ttft')

In [None]:
# The returned DataFrame is displayed as the cell output
evaluate_multiple_checkpoints('/mnt/hdd0/Kaggle/arc24/evaluations/20240901_data_scaling')

In [None]:
# The returned DataFrame is displayed as the cell output
evaluate_multiple_checkpoints('/mnt/hdd0/Kaggle/arc24/evaluations/20240902_external_data')

In [None]:
# Keep the results in df, the next cell groups them by experiment
df = evaluate_multiple_checkpoints('/mnt/hdd0/Kaggle/arc24/evaluations/20240905_external_data_v2')
df

In [None]:
# The group is the third path component from the end (the experiment folder)
# with the suffixes msl_c and msl_b replaced by msl, so those folders share a group.
df['group'] = [x.split('/')[-3].replace('msl_c', 'msl').replace('msl_b', 'msl') for x in df.index]
# Mean of each group using only the rows with n == 32, shown as percentages
df[df.n == 32].groupby('group').mean().style.format("{:.2%}")

In [None]:
df = evaluate_multiple_checkpoints('/mnt/hdd0/Kaggle/arc24/evaluations/20240910_predict_inputs')
# NOTE(review): the percentage format is applied to every column, including n - confirm that is wanted.
df.style.format("{:.2%}")

In [None]:
df = evaluate_multiple_checkpoints('/mnt/hdd0/Kaggle/arc24/evaluations/20240903_submission_models')
# NOTE(review): the percentage format is applied to every column, including n - confirm that is wanted.
df.style.format("{:.2%}")

In [None]:
df = evaluate_multiple_checkpoints('/mnt/hdd0/Kaggle/arc24/evaluations/20240913_more_data/')
# NOTE(review): the percentage format is applied to every column, including n - confirm that is wanted.
df.style.format("{:.2%}")

In [None]:
df = evaluate_multiple_checkpoints('/mnt/hdd0/Kaggle/arc24/evaluations/20240914_overfit_to_train/')
# NOTE(review): the percentage format is applied to every column, including n - confirm that is wanted.
df.style.format("{:.2%}")

In [None]:
df = evaluate_multiple_checkpoints('/mnt/hdd0/Kaggle/arc24/evaluations/20240921_overfit_to_train/')
# The folder name encodes the steps in scientific notation, f.e. 1e5steps -> 100000
df['training_steps'] = [int(float(x.split('steps_')[0].split('_')[-1])) for x in df.index]
# The dataset is inferred from the name of the inference file
df['dataset'] = list(map(lambda x: 'training' if 'inference_training' in x else 'evaluation', df.index))
# Only the listed columns are formatted as percentages
df.style.format("{:.2%}", subset=['accuracy', 'pass_n', 'vote_2', 'vote_1', 'correct_pixels', 'correct_size', 'unanswered'])

In [None]:
df = evaluate_multiple_checkpoints('/mnt/hdd0/Kaggle/arc24/evaluations/20240921_optimal_train_duration/')
# The folder name encodes the steps in scientific notation, f.e. 5e3steps -> 5000.0
df['training_steps'] = [float(x.split('steps_')[0].split('_')[-1]) for x in df.index]
# Restrict the percentage format to the metric columns so that training_steps
# and n are not rendered as percentages.
df.style.format("{:.2%}", subset=['accuracy', 'pass_n', 'vote_2', 'vote_1', 'correct_pixels', 'correct_size', 'unanswered'])

In [None]:
df = evaluate_multiple_checkpoints('/mnt/hdd0/Kaggle/arc24/evaluations/20240929_smolLM/')
# The folder name encodes the steps in scientific notation, f.e. 1e3steps -> 1000.0
df['training_steps'] = [float(x.split('steps_')[0].split('_')[-1]) for x in df.index]
# Restrict the percentage format to the metric columns so that training_steps
# and n are not rendered as percentages.
df.style.format("{:.2%}", subset=['accuracy', 'pass_n', 'vote_2', 'vote_1', 'correct_pixels', 'correct_size', 'unanswered'])

In [None]:
df = evaluate_multiple_checkpoints('/mnt/hdd0/Kaggle/arc24/evaluations/20241006_omniarc_validation/')
# The number that precedes "steps_" in the path is the number of training steps
df['training_steps'] = [float(x.split('steps_')[0].split('_')[-1]) for x in df.index]
# Restrict the percentage format to the metric columns so that training_steps
# and n are not rendered as percentages.
df.style.format("{:.2%}", subset=['accuracy', 'pass_n', 'vote_2', 'vote_1', 'correct_pixels', 'correct_size', 'unanswered'])

In [None]:
df = evaluate_multiple_checkpoints('/mnt/hdd0/Kaggle/arc24/evaluations/20241007_batch_size/')
# Batch size: text between the last "_bs" and the first "_lr" of the path
df['batch_size'] = [int(x.split('_lr')[0].split('_bs')[-1]) for x in df.index]
# Learning rate: text between the last "_lr" and the first "_Qwen" of the path
df['lr'] = [float(x.split('_Qwen')[0].split('_lr')[-1]) for x in df.index]
df.sort_values(['batch_size', 'lr'])

In [None]:
df = evaluate_multiple_checkpoints('/mnt/hdd0/Kaggle/arc24/debug')
# NOTE(review): the percentage format is applied to every column, including n - confirm that is wanted.
df.style.format("{:.2%}")

### Optimize code generation

In [None]:
df = evaluate_multiple_checkpoints('/mnt/hdd0/Kaggle/arc24/evaluations/20241009_optimize_code_generation/')
# The number that precedes "steps_" in the path is the number of training steps
df['training_steps'] = [float(x.split('steps_')[0].split('_')[-1]) for x in df.index]
df['temperature'] = [get_temperature_from_filepath(x) for x in df.index]
# The experiment is the third path component from the end
df['experiment'] = [x.split('/')[-3] for x in df.index]
# Restrict the percentage format to the metric columns: the experiment column
# holds strings, which cannot be rendered with a "%" format, and
# training_steps, temperature and n are not percentages.
df.style.format("{:.2%}", subset=['accuracy', 'pass_n', 'vote_2', 'vote_1', 'correct_pixels', 'correct_size', 'unanswered'])

In [None]:
# Rows whose temperature is exactly 0.7
df[df.temperature == 0.7]

In [None]:
# pass_n against temperature, one line per experiment (experiment 04 is excluded).
# NOTE(review): the lines are labelled but no legend is created in this cell.
for experiment, sub_df in df[df.experiment != '04_omni-arc-150-output-from-examples-v1-Qwen2.5-0.5B-Instruct_lora128_lr1e-4_bs32_2000steps_2gpus_8192msl'].groupby('experiment'):
    plt.plot(sub_df['temperature'], sub_df['pass_n'], 'o-', label=experiment)

In [None]:
# Mean pass_n, accuracy and vote_1 for each temperature (experiment 04 is excluded)
df[df.experiment != '04_omni-arc-150-output-from-examples-v1-Qwen2.5-0.5B-Instruct_lora128_lr1e-4_bs32_2000steps_2gpus_8192msl'][['temperature', 'pass_n', 'accuracy', 'vote_1']].groupby('temperature').mean()

In [None]:
# Mean pass_n for each temperature (experiment 04 is excluded)
results = df[df.experiment != '04_omni-arc-150-output-from-examples-v1-Qwen2.5-0.5B-Instruct_lora128_lr1e-4_bs32_2000steps_2gpus_8192msl'][['temperature', 'pass_n']].groupby('temperature').mean()
# This bare expression is not the last one of the cell, so it displays nothing
results
plt.figure(figsize=(10, 5))
plt.plot(results.index, results['pass_n'], 'o-')
plt.grid()
plt.xlabel('temperature')
plt.ylabel('pass_n')
plt.title('Effect of the temperature on the model performance')

In [None]:
# First 5 rows with temperature 0, sorted by the number of training steps.
# NOTE(review): head(5) presumably selects one group of experiments - confirm which rows it keeps.
results = df[df.temperature == 0].head(5).sort_values('training_steps')

plt.figure(figsize=(10, 5))
plt.plot(results['training_steps'], results['pass_n'], 'o-')
plt.grid()
plt.xlabel('training steps')
plt.ylabel('pass_n')
plt.title('Effect of the training steps on the model performance')

In [None]:
# Maximum pass_n, vote_2 and vote_1 of each experiment
df.groupby('experiment')[['pass_n', 'vote_2', 'vote_1']].max()

In [None]:
# Rows of a single experiment
df[df.experiment == '03_omni-arc-333-code-from-examples-v2-Qwen2.5-0.5B-Instruct_lora128_lr1e-4_bs32_8000steps_2gpus_8192msl']

In [None]:
# Mean pass_n at temperature 0 of the experiments whose name starts with 01 or 02
df[(df.temperature == 0) & (df.experiment.apply(lambda x: x.startswith('01') or x.startswith('02')))]['pass_n'].mean()

In [None]:
# Mean pass_n at temperature 0 of the experiments whose name starts with 03 or 05
df[(df.temperature == 0) & (df.experiment.apply(lambda x: x.startswith('03') or x.startswith('05') ))]['pass_n'].mean()

In [None]:
# All the rows with temperature 0
df[(df.temperature == 0)]

### Other

In [None]:
df = evaluate_multiple_checkpoints('/mnt/hdd0/Kaggle/arc24/evaluations/20241012_coding_models/')
# These two columns are parsed from the full filepath, so they must be
# computed before the index is replaced below.
df['training_steps'] = [float(x.split('steps_')[0].split('_')[-1]) for x in df.index]
df['temperature'] = [get_temperature_from_filepath(x) for x in df.index]
# df['experiment'] = [x.split('/')[-3] for x in df.index]
# Replace the index by the third path component from the end (the experiment folder)
df.index = [x.split('/')[-3] for x in df.index]
# Every experiment without '1.5B' in its name is labelled as 7B
df['model'] = ['1.5B' if '1.5B' in x else '7B' for x in df.index]
# Integer that follows the last "lora" of the experiment name
df['lora'] = [int(x.split('lora')[-1].split('_')[0]) for x in df.index]
# df.style.format("{:.2%}")
df.sort_values(['model', 'training_steps', 'lora'])

In [None]:
df = evaluate_multiple_checkpoints('/mnt/hdd0/Kaggle/arc24/evaluations/20241014_omni-arc_improvements/')
# NOTE(review): the percentage format is applied to every column, including n - confirm that is wanted.
df.style.format("{:.2%}")

## TODO

- [x] Analyze the number of successful predictions per task, that is the unanswered metric!
- [x] How the number of predictions affects the metrics
- [x] Sort the tasks by accuracy, correct pixels and correct size
- [x] Visualize the tasks, sorted by accuracy
- [x] Visualize the effect of grid shape in the metrics
- [x] Accuracy of each attempt
- [x] Dynamically choose ground truth based on number of predicted tasks (There's no need)
- [x] Show vote_2 metric always
- [x] Also show vote_1 metric