# ARC AGI Benchmark Task Solutions Testing

This notebook tests the solve functions against the ARC AGI benchmark tasks stored as JSON files in the `dsl/test_set` folder.

We'll test 5 different tasks:
- 05f2a901
- 1cf80156
- 1e0a9b12
- 2bcee788
- 7ddcd7ec

For each task, we'll:
1. Load the task data (train and test examples)
2. Apply the solve function
3. Compare with expected outputs
4. Calculate accuracy score

In [6]:
import sys
import os
import json
import numpy as np
from typing import Tuple

# add dsl folder to path
sys.path.insert(0, os.path.join(os.getcwd(), 'dsl'))

# import all DSL functions
from dsl import *
from constants import *
from arc_types import *

## Define Solve Functions

These are the solve functions that combine atomic transformations to solve specific ARC tasks.

In [7]:
def solve_1cf80156(I):
    """Solve task 1cf80156 by cropping the grid to one extracted object.

    Calls the DSL's ``objects`` on the input grid, takes the ``first``
    result, and returns the ``subgrid`` of ``I`` for that result.
    NOTE(review): presumably the task grids contain a single foreground
    object, so "first" is unambiguous -- confirm against the task data.
    """
    found_objects = objects(I, T, T, T)
    target = first(found_objects)
    return subgrid(target, I)


def solve_1e0a9b12(I):
    """Solve task 1e0a9b12 by reordering the grid along one axis.

    The grid is rotated with ``rot270``, ``order`` (with ``identity`` bound
    as its right-hand argument) is applied to every element of the rotated
    grid, and the result is rotated back with ``rot90``.
    NOTE(review): presumably this sorts each original column so that
    non-zero cells settle at one end -- confirm against the DSL.
    """
    rotated = rot270(I)
    order_by_identity = rbind(order, identity)
    reordered = apply(order_by_identity, rotated)
    return rot90(reordered)


def solve_2bcee788(I):
    """Solve task 2bcee788: recolour the background and paint a mirrored copy.

    Steps, in the order the code performs them:
      1. Replace the grid's most common colour with THREE.
      2. Pick the largest and smallest objects (by ``size``).
      3. Crop the largest object out of the recoloured grid and mirror it;
         which mirror is used depends on whether the smallest object
         satisfies ``hline``.
      4. Drop the mirrored cells whose first component equals THREE, shift
         the rest by an offset derived from the largest object's corner,
         its shape and the relative ``position`` of the two objects, and
         paint them onto the recoloured grid.
    NOTE(review): the geometric meaning of each DSL helper is assumed from
    its name -- confirm against the DSL module.
    """
    background = mostcolor(I)
    all_objects = objects(I, T, F, T)
    recoloured = replace(I, background, THREE)

    biggest = argmax(all_objects, size)
    smallest = argmin(all_objects, size)

    relative = position(biggest, smallest)
    relative_first = first(relative)
    relative_last = last(relative)

    biggest_crop = subgrid(biggest, recoloured)
    smallest_is_hline = hline(smallest)
    # Both mirrors are computed up front; ``branch`` then selects one.
    crop_hmirrored = hmirror(biggest_crop)
    crop_vmirrored = vmirror(biggest_crop)
    mirrored = branch(smallest_is_hline, crop_hmirrored, crop_vmirrored)

    # Only one component of the step is non-zero, chosen by the same flag.
    step_first = branch(smallest_is_hline, relative_first, ZERO)
    step_last = branch(smallest_is_hline, ZERO, relative_last)

    mirrored_cells = asobject(mirrored)
    first_is_three = matcher(first, THREE)
    first_is_not_three = compose(flip, first_is_three)
    kept_cells = sfilter(mirrored_cells, first_is_not_three)

    corner = ulcorner(biggest)
    extent = shape(biggest)
    step = astuple(step_first, step_last)
    scaled_step = multiply(extent, step)
    offset = add(corner, scaled_step)

    placed_cells = shift(kept_cells, offset)
    return paint(recoloured, placed_cells)


def solve_05f2a901(I):
    """Solve task 05f2a901 by moving the TWO-coloured object.

    Takes the first object filtered by colour TWO and the first filtered by
    colour EIGHT, asks ``gravitate`` for an offset from the former towards
    the latter, and returns the grid with the TWO object moved by it.
    NOTE(review): presumably ``gravitate`` yields the shift that brings the
    two objects next to each other -- confirm against the DSL.
    """
    all_objects = objects(I, T, F, T)
    two_coloured = colorfilter(all_objects, TWO)
    mover = first(two_coloured)
    eight_coloured = colorfilter(all_objects, EIGHT)
    anchor = first(eight_coloured)
    offset = gravitate(mover, anchor)
    return move(I, mover, offset)


def solve_7ddcd7ec(I):
    """Solve task 7ddcd7ec by filling cells produced by ``shoot``.

    Objects are split into those passing ``sizefilter(..., ONE)`` and the
    rest. For each size-ONE object, ``shoot`` is called with the object's
    ``center`` and its ``position`` relative to the first remaining object.
    The merged results are filled with that remaining object's colour.
    NOTE(review): presumably ``shoot`` yields a line of cells extending away
    from the main object -- confirm against the DSL.
    """
    all_objects = objects(I, T, F, T)
    single_cells = sizefilter(all_objects, ONE)
    larger_objects = difference(all_objects, single_cells)
    main_object = first(larger_objects)
    fill_colour = color(main_object)
    position_from_main = lbind(position, main_object)
    shoot_from = fork(shoot, center, position_from_main)
    cells_to_fill = mapply(shoot_from, single_cells)
    return fill(I, fill_colour, cells_to_fill)

## Utility Functions

Helper functions to load task data and compare results.

In [8]:
def load_task(task_id: str) -> dict:
    """Load one task definition from the ``dsl/test_set`` folder.

    Args:
        task_id: File stem of the task; ``dsl/test_set/<task_id>.json`` is
            read relative to the current working directory.

    Returns:
        The parsed JSON document.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        json.JSONDecodeError: If the file is not valid JSON.
    """
    task_path = os.path.join('dsl', 'test_set', f'{task_id}.json')
    # JSON is UTF-8 by specification; do not depend on the platform's
    # default text encoding.
    with open(task_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def grid_to_tuple(grid: list) -> Grid:
    """Return ``grid`` as a tuple of row tuples, the form passed to the solve functions."""
    return tuple(map(tuple, grid))


def tuple_to_list(grid: Grid) -> list:
    """Convert a tuple grid into a list of rows for comparison.

    Tuple rows become lists; rows of any other type are kept as they are.
    A ``grid`` that is not a tuple is returned unchanged (same object).
    """
    if not isinstance(grid, tuple):
        return grid

    rows = []
    for row in grid:
        if isinstance(row, tuple):
            row = list(row)
        rows.append(row)
    return rows


def compare_grids(output: Grid, expected: list) -> Tuple[bool, float]:
    """
    Compare a produced grid with the expected grid, cell by cell.

    Rows may be tuples or lists; only their lengths and cell values matter,
    so no conversion between the two is needed.

    Args:
        output: Grid produced by a solve function.
        expected: Reference grid (list of rows) from the task file.

    Returns:
        ``(is_exact_match, pixel_accuracy)``. A shape mismatch -- a different
        number of rows, or any pair of rows with different lengths -- yields
        ``(False, 0.0)``. Grids with no cells at all yield ``(True, 1.0)``.
    """
    if len(output) != len(expected):
        return False, 0.0

    # Pair rows by index rather than zipping the grids, so an unordered
    # non-grid output (e.g. a set) fails loudly instead of being compared
    # in arbitrary order.
    row_pairs = [(output[i], exp_row) for i, exp_row in enumerate(expected)]

    # Every row must match in width, not just the first one: extra cells in
    # a later row must not be ignored, and a short row must not raise
    # IndexError during the cell scan.
    if any(len(out_row) != len(exp_row) for out_row, exp_row in row_pairs):
        return False, 0.0

    total_pixels = sum(len(exp_row) for _, exp_row in row_pairs)
    if total_pixels == 0:
        return True, 1.0

    correct_pixels = sum(
        1
        for out_row, exp_row in row_pairs
        for out_cell, exp_cell in zip(out_row, exp_row)
        if out_cell == exp_cell
    )

    # Decide exactness on the integer counts, not on the float ratio.
    return correct_pixels == total_pixels, correct_pixels / total_pixels


def _evaluate_examples(examples, solve_func):
    """Run ``solve_func`` on each example and print one status line per example.

    Returns ``(records, exact_matches)``: a list with one result dict per
    example (``example_idx``, ``is_exact``, ``accuracy`` and, when the solver
    or the comparison raised, ``error``) and the number of exact matches.
    """
    records = []
    exact_matches = 0

    for idx, example in enumerate(examples):
        input_grid = grid_to_tuple(example['input'])
        expected_output = example['output']

        try:
            output = solve_func(input_grid)
            is_exact, accuracy = compare_grids(output, expected_output)
        except Exception as e:
            # A crashing solver counts as a failed example with 0% accuracy;
            # the remaining examples are still evaluated.
            print(f"  Example {idx+1}: ERROR - {str(e)}")
            records.append({
                'example_idx': idx,
                'is_exact': False,
                'accuracy': 0.0,
                'error': str(e)
            })
            continue

        records.append({
            'example_idx': idx,
            'is_exact': is_exact,
            'accuracy': accuracy
        })

        status = "PASS" if is_exact else f"FAIL ({accuracy*100:.1f}%)"
        print(f"  Example {idx+1}: {status}")

        if is_exact:
            exact_matches += 1

    return records, exact_matches


def test_task(task_id: str, solve_func):
    """
    Test a solve function on a specific task.

    Loads the task with ``load_task``, runs ``solve_func`` on every training
    and test example, prints per-example status lines and a summary, and
    returns the collected results.

    Args:
        task_id: Identifier passed to ``load_task``.
        solve_func: Callable that takes a tuple grid and returns the output grid.

    Returns:
        Dict with keys ``task_id``, ``train_examples``, ``test_examples``,
        ``train_accuracy``, ``test_accuracy``, ``train_exact_matches`` and
        ``test_exact_matches``. Each accuracy is the mean per-example pixel
        accuracy, or 0.0 when that split has no examples.
    """
    print(f"\n{'='*60}")
    print(f"Testing Task: {task_id}")
    print(f"{'='*60}")

    task_data = load_task(task_id)
    results = {
        'task_id': task_id,
        'train_examples': [],
        'test_examples': [],
        'train_accuracy': 0.0,
        'test_accuracy': 0.0,
        'train_exact_matches': 0,
        'test_exact_matches': 0
    }

    # Training examples
    print(f"\nTraining Examples ({len(task_data['train'])} examples):")
    results['train_examples'], results['train_exact_matches'] = _evaluate_examples(
        task_data['train'], solve_func)
    if results['train_examples']:
        results['train_accuracy'] = np.mean([ex['accuracy'] for ex in results['train_examples']])

    # Test examples
    print(f"\nTest Examples ({len(task_data['test'])} examples):")
    results['test_examples'], results['test_exact_matches'] = _evaluate_examples(
        task_data['test'], solve_func)
    if results['test_examples']:
        results['test_accuracy'] = np.mean([ex['accuracy'] for ex in results['test_examples']])

    # Summary
    print(f"\n Task Summary:")
    print(f"  Training: {results['train_exact_matches']}/{len(task_data['train'])} exact matches, "
          f"avg accuracy: {results['train_accuracy']*100:.1f}%")
    print(f"  Test: {results['test_exact_matches']}/{len(task_data['test'])} exact matches, "
          f"avg accuracy: {results['test_accuracy']*100:.1f}%")

    return results

## Run Tests on All Tasks

In [9]:
# (task id, solve function) pairs, evaluated in this order
tasks = [
    ('05f2a901', solve_05f2a901),
    ('1cf80156', solve_1cf80156),
    ('1e0a9b12', solve_1e0a9b12),
    ('2bcee788', solve_2bcee788),
    ('7ddcd7ec', solve_7ddcd7ec),
]

# one entry per task: the dict returned by test_task, or a stub with an
# 'error' key and zero accuracies when test_task itself raised
all_results = []

for task_id, solve_func in tasks:
    try:
        results = test_task(task_id, solve_func)
    except Exception as e:
        print(f"\n Failed to test task {task_id}: {e}")
        all_results.append(dict(
            task_id=task_id,
            error=str(e),
            train_accuracy=0.0,
            test_accuracy=0.0,
        ))
    else:
        all_results.append(results)

banner = "=" * 60
print(f"\n{banner}")
print("Testing Complete!")
print(banner)


Testing Task: 05f2a901

Training Examples (3 examples):
  Example 1: PASS
  Example 2: PASS
  Example 3: PASS

Test Examples (1 examples):
  Example 1: PASS

 Task Summary:
  Training: 3/3 exact matches, avg accuracy: 100.0%
  Test: 1/1 exact matches, avg accuracy: 100.0%

Testing Task: 1cf80156

Training Examples (3 examples):
  Example 1: PASS
  Example 2: PASS
  Example 3: PASS

Test Examples (1 examples):
  Example 1: PASS

 Task Summary:
  Training: 3/3 exact matches, avg accuracy: 100.0%
  Test: 1/1 exact matches, avg accuracy: 100.0%

Testing Task: 1e0a9b12

Training Examples (3 examples):
  Example 1: PASS
  Example 2: PASS
  Example 3: PASS

Test Examples (1 examples):
  Example 1: PASS

 Task Summary:
  Training: 3/3 exact matches, avg accuracy: 100.0%
  Test: 1/1 exact matches, avg accuracy: 100.0%

Testing Task: 2bcee788

Training Examples (4 examples):
  Example 1: PASS
  Example 2: PASS
  Example 3: PASS
  Example 4: PASS

Test Examples (1 examples):
  Example 1: PASS

 

## Overall Performance Summary

Final scores across all tasks.

In [10]:
print("\n" + "="*70)
print("OVERALL PERFORMANCE SUMMARY")
print("="*70)

# Per-task accuracies (averaged below) and running example/match counts
total_train_accuracy = []
total_test_accuracy = []
total_train_exact = 0
total_test_exact = 0
total_train_examples = 0
total_test_examples = 0

print(f"\n{'Task ID':<15} {'Train Acc':<15} {'Test Acc':<15} {'Train Match':<15} {'Test Match':<15}")
print("-" * 70)

for result in all_results:
    task_id = result['task_id']
    # .get with defaults: an entry recorded for a task that failed outright
    # may carry only the task id, an error and zero accuracies.
    train_acc = result.get('train_accuracy', 0.0)
    test_acc = result.get('test_accuracy', 0.0)
    train_match = result.get('train_exact_matches', 0)
    test_match = result.get('test_exact_matches', 0)
    train_total = len(result.get('train_examples', []))
    test_total = len(result.get('test_examples', []))

    total_train_accuracy.append(train_acc)
    total_test_accuracy.append(test_acc)
    total_train_exact += train_match
    total_test_exact += test_match
    total_train_examples += train_total
    total_test_examples += test_total

    print(f"{task_id:<15} {train_acc*100:>6.1f}%{'':<8} {test_acc*100:>6.1f}%{'':<8} "
          f"{train_match}/{train_total}{'':<10} {test_match}/{test_total}{'':<10}")

# Guard the aggregates so the summary still prints when nothing was
# evaluated: np.mean([]) is nan (with a warning) and the match percentages
# would otherwise divide by zero. Report 0.0 in those cases.
mean_train_accuracy = np.mean(total_train_accuracy) if total_train_accuracy else 0.0
mean_test_accuracy = np.mean(total_test_accuracy) if total_test_accuracy else 0.0
train_exact_pct = total_train_exact / total_train_examples * 100 if total_train_examples else 0.0
test_exact_pct = total_test_exact / total_test_examples * 100 if total_test_examples else 0.0

print("-" * 70)
print(f"\n{'OVERALL':<15} {mean_train_accuracy*100:>6.1f}%{'':<8} "
      f"{mean_test_accuracy*100:>6.1f}%{'':<8} "
      f"{total_train_exact}/{total_train_examples}{'':<10} "
      f"{total_test_exact}/{total_test_examples}{'':<10}")

print(f"\n> Summary:")
print(f"   • Average Training Accuracy: {mean_train_accuracy*100:.1f}%")
print(f"   • Average Test Accuracy: {mean_test_accuracy*100:.1f}%")
print(f"   • Total Training Exact Matches: {total_train_exact}/{total_train_examples} "
      f"({train_exact_pct:.1f}%)")
print(f"   • Total Test Exact Matches: {total_test_exact}/{total_test_examples} "
      f"({test_exact_pct:.1f}%)")
print(f"   • Tasks Tested: {len(all_results)}")

print("\n" + "="*70)


OVERALL PERFORMANCE SUMMARY

Task ID         Train Acc       Test Acc        Train Match     Test Match     
----------------------------------------------------------------------
05f2a901         100.0%          100.0%         3/3           1/1          
1cf80156         100.0%          100.0%         3/3           1/1          
1e0a9b12         100.0%          100.0%         3/3           1/1          
2bcee788         100.0%          100.0%         4/4           1/1          
7ddcd7ec         100.0%          100.0%         3/3           1/1          
----------------------------------------------------------------------

OVERALL          100.0%          100.0%         16/16           5/5          

> Summary:
   • Average Training Accuracy: 100.0%
   • Average Test Accuracy: 100.0%
   • Total Training Exact Matches: 16/16 (100.0%)
   • Total Test Exact Matches: 5/5 (100.0%)
   • Tasks Tested: 5



## Analysis and Insights

Key findings from the testing process.

In [11]:
print("Task-by-Task Analysis:\n")


def _report_split(label, examples):
    """Print whether every example of one split ('training' or 'test') matched exactly.

    Prints nothing for an empty split; otherwise prints either the OK line
    or the 1-based indices of the examples that were not exact matches.
    """
    if not examples:
        return
    failed = [ex['example_idx']+1 for ex in examples if not ex.get('is_exact', False)]
    if failed:
        print(f"{label.capitalize()} examples failed: {failed}")
    else:
        print(f"OK: All {label} examples solved correctly!")


for result in all_results:
    task_id = result['task_id']
    print(f"\n>Task {task_id}:")

    # An entry with an error and no per-example data means the whole task
    # could not be evaluated; there is nothing further to report for it.
    if 'error' in result and 'train_examples' not in result:
        print(f">>>>Critical Error: {result['error']}")
        continue

    train_examples = result.get('train_examples', [])
    test_examples = result.get('test_examples', [])

    _report_split("training", train_examples)
    _report_split("test", test_examples)

    # Per-example errors across both splits, training examples first
    errors = [ex for ex in [*train_examples, *test_examples] if 'error' in ex]
    if not errors:
        continue
    print(f"Encountered {len(errors)} error(s) during execution")
    for err in errors[:2]:  # only the first two, each cut to 80 characters
        print(f"      - {err.get('error', 'Unknown error')[:80]}")

print("\n" + "="*70)

Task-by-Task Analysis:


>Task 05f2a901:
OK: All training examples solved correctly!
OK: All test examples solved correctly!

>Task 1cf80156:
OK: All training examples solved correctly!
OK: All test examples solved correctly!

>Task 1e0a9b12:
OK: All training examples solved correctly!
OK: All test examples solved correctly!

>Task 2bcee788:
OK: All training examples solved correctly!
OK: All test examples solved correctly!

>Task 7ddcd7ec:
OK: All training examples solved correctly!
OK: All test examples solved correctly!

