In [1]:
from datasets import load_dataset

ds = load_dataset("HuggingFaceM4/RAVEN", "center_single", split="train")

In [2]:
ds

Dataset({
    features: ['panels', 'choices', 'structure', 'meta_matrix', 'meta_target', 'meta_structure', 'target', 'id', 'metadata'],
    num_rows: 6000
})

In [3]:
# Examine the structure of a single example
example = ds[0]
print("Keys:", list(example.keys()))
print("\nPanels shape:", len(example['panels']))
print("Choices shape:", len(example['choices']))
print("Target:", example['target'])
print("ID:", example['id'])

# Look at the first panel and choice to understand the image format
print("\nFirst panel type:", type(example['panels'][0]))
print("First choice type:", type(example['choices'][0]))


Keys: ['panels', 'choices', 'structure', 'meta_matrix', 'meta_target', 'meta_structure', 'target', 'id', 'metadata']

Panels shape: 8
Choices shape: 8
Target: 4
ID: 3023

First panel type: <class 'PIL.PngImagePlugin.PngImageFile'>
First choice type: <class 'PIL.PngImagePlugin.PngImageFile'>


In [4]:
# Test the RAVENRunner class
import sys
sys.path.append('..')  # Add parent directory to path

from preprocessing.load_raven import RAVENRunner
import os

# Check if we can import and initialize (without real credentials)
print("Testing RAVENRunner import...")
print("RAVENRunner class imported successfully!")


Testing RAVENRunner import...
RAVENRunner class imported successfully!


In [7]:
# Test RAVENRunner initialization (won't actually call API)
print("Testing RAVENRunner initialization...")

try:
    # This will load the datasets but won't make API calls
    runner = RAVENRunner(
        model_name="azure/gpt-4.1",
        reasoning_effort="high"
    )
    print("✓ RAVENRunner initialized successfully!")
    print(f"✓ Loaded datasets: {[(k, len(v)) for k, v in runner.datasets.items()]}")
    
    # Test dataset access
    val_sample = runner.datasets['validation'][0]
    print(f"✓ Validation sample keys: {list(val_sample.keys())}")
    print(f"✓ Target: {val_sample['target']} (zero-indexed), ID: {val_sample['id']}")
    print(f"✓ Target as 1-indexed choice: {val_sample['target'] + 1}")
    
except Exception as e:
    print(f"✗ Error: {e}")


INFO:preprocessing.load_raven:Loading RAVEN dataset...


Testing RAVENRunner initialization...


INFO:preprocessing.load_raven:Loaded datasets: [('train', 6000), ('validation', 2000), ('test', 2000)]


✓ RAVENRunner initialized successfully!
✓ Loaded datasets: [('train', 6000), ('validation', 2000), ('test', 2000)]
✓ Validation sample keys: ['panels', 'choices', 'structure', 'meta_matrix', 'meta_target', 'meta_structure', 'target', 'id', 'metadata']
✓ Target: 4 (zero-indexed), ID: 5047
✓ Target as 1-indexed choice: 5


In [15]:
# Set environment variables for your chosen model:

# Initialize runner
runner = RAVENRunner(
    model_name="azure/gpt4.1",  # or "vertex_ai/claude-3-7-sonnet@20250219"
    reasoning_effort="high"
)

# Run small test evaluation
print("Running evaluation on 5 validation samples...")
results = runner.evaluate_dataset("validation", max_samples=5)

print(f"Results:")
print(f"Accuracy: {results['accuracy']:.3f}")
print(f"Correct: {results['correct']}/{results['total']}")
print(f"Invalid responses: {results['invalid_responses']}")
print(f"Valid accuracy: {results['valid_accuracy']:.3f}")

# For larger evaluations, use batch processing
print("\\nFor larger evaluations:")
batch_results = runner.batch_evaluate("validation", batch_size=3, max_samples=10)

INFO:preprocessing.load_raven:Loading RAVEN dataset...
INFO:preprocessing.load_raven:Loaded datasets: [('train', 6000), ('validation', 2000), ('test', 2000)]
INFO:preprocessing.load_raven:Evaluating 5 samples from validation split...
INFO:llm_logger:PROMPT: This is a visual reasoning task. You are shown a 3x3 matrix of images with the bottom-right panel mi...
INFO:llm_logger:MODEL: azure/o4-mini, REASONING: high
[92m11:21:18 - LiteLLM:INFO[0m: utils.py:3043 - 
LiteLLM completion() model= o4-mini; provider = azure
INFO:LiteLLM:
LiteLLM completion() model= o4-mini; provider = azure


Running evaluation on 5 validation samples...


INFO:httpx:HTTP Request: POST https://dalle-declare.openai.azure.com/openai/deployments/o4-mini/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
[92m11:21:20 - LiteLLM:INFO[0m: utils.py:1215 - Wrapper: Completed Call, calling success_handler
INFO:LiteLLM:Wrapper: Completed Call, calling success_handler
[92m11:21:20 - LiteLLM:INFO[0m: cost_calculator.py:655 - selected model name for cost calculation: azure/o4-mini-2025-04-16
INFO:LiteLLM:selected model name for cost calculation: azure/o4-mini-2025-04-16
INFO:llm_logger:RESPONSE: 
INFO:llm_logger:Added to cache
INFO:llm_logger:PROMPT: This is a visual reasoning task. You are shown a 3x3 matrix of images with the bottom-right panel mi...
INFO:llm_logger:MODEL: azure/o4-mini, REASONING: high
INFO:llm_logger:Cache hit for prompt: This is a visual reasoning task. You are shown a 3...
INFO:llm_logger:PROMPT: This is a visual reasoning task. You are shown a 3x3 matrix of images with the bottom-right panel mi...
INFO:llm_lo

Results:
Accuracy: 0.000
Correct: 0/5
Invalid responses: 5
Valid accuracy: 0.000
\nFor larger evaluations:


INFO:preprocessing.load_raven:Processing batch 2: samples 3-5
INFO:llm_logger:PROMPT: This is a visual reasoning task. You are shown a 3x3 matrix of images with the bottom-right panel mi...
INFO:llm_logger:MODEL: azure/o4-mini, REASONING: high
INFO:llm_logger:Cache hit for prompt: This is a visual reasoning task. You are shown a 3...
INFO:llm_logger:PROMPT: This is a visual reasoning task. You are shown a 3x3 matrix of images with the bottom-right panel mi...
INFO:llm_logger:MODEL: azure/o4-mini, REASONING: high
INFO:llm_logger:Cache hit for prompt: This is a visual reasoning task. You are shown a 3...
INFO:llm_logger:PROMPT: This is a visual reasoning task. You are shown a 3x3 matrix of images with the bottom-right panel mi...
INFO:llm_logger:MODEL: azure/o4-mini, REASONING: high
INFO:llm_logger:Cache hit for prompt: This is a visual reasoning task. You are shown a 3...
INFO:preprocessing.load_raven:Pausing between batches...
INFO:preprocessing.load_raven:Processing batch 3: samples 6

In [None]:
# Test the RAVENRunner class
import sys
sys.path.append('..')  # Add parent directory to path

from preprocessing.load_raven import RAVENRunner
import os

# Check if we can import and initialize (without real credentials)
print("Testing RAVENRunner import...")
print("RAVENRunner class imported successfully!")

# Test the NEW combined image approach (single image instead of two)
print("Testing NEW combined image approach...")

# Initialize runner with updated code
runner = RAVENRunner(model_name="azure/gpt-4.1", reasoning_effort="high")

example = runner.datasets['validation'][0]
prompt, combined_images = runner.create_raven_prompt(example['panels'], example['choices'])

print(f"✓ Created prompt with {len(combined_images)} combined image(s) (should be 1)")
print(f"✓ Combined image size: {combined_images[0].size}")
print(f"✓ Ground truth target: {example['target']} (zero-indexed) = choice {example['target'] + 1}")

print("\n" + "="*60)
print("NEW COMBINED IMAGE PROMPT:")
print("="*60)
print(prompt)
print("="*60)

# Display and save the combined image for visual inspection
print(f"\nDisplaying the combined image...")

# Display the combined image (this will open in your default image viewer)
combined_images[0].show()

# Save the combined image for inspection
combined_images[0].save("test_combined_raven_image.png")
print("✓ Saved combined image as 'test_combined_raven_image.png'")

# Test image encoding for API readiness
print(f"\nTesting combined image encoding...")
encoded_image = runner.encode_image_to_base64(combined_images[0])
print(f"✓ Successfully encoded combined image: {len(encoded_image)} chars")
print(f"✓ Encoded starts with: {encoded_image[:30]}...")

# Show dimensions comparison
print(f"\nImage dimensions:")
print(f"✓ Combined image: {combined_images[0].size}")
print(f"✓ Width x Height: {combined_images[0].size[0]} x {combined_images[0].size[1]} pixels")


INFO:preprocessing.load_raven:Loading RAVEN dataset...


Testing RAVENRunner import...
RAVENRunner class imported successfully!
Testing NEW combined image approach...


INFO:preprocessing.load_raven:Loaded datasets: [('train', 6000), ('validation', 2000), ('test', 2000)]


✓ Created prompt with 1 combined image(s) (should be 1)
✓ Combined image size: (818, 1102)
✓ Ground truth target: 4 (zero-indexed) = choice 5

NEW COMBINED IMAGE PROMPT:
This is a visual reasoning task. You are shown a 3x3 matrix of images with the bottom-right panel missing (marked with "?"). Your task is to identify which of the 8 numbered choices (1-8) best completes the pattern.

The image shows:
- TOP: A 3x3 matrix with 8 panels arranged as:
  [Panel 1] [Panel 2] [Panel 3]
  [Panel 4] [Panel 5] [Panel 6] 
  [Panel 7] [Panel 8] [   ?   ]

- BOTTOM: 8 numbered choices (1 through 8) that could complete the matrix.

TASK:
Analyze the patterns in the matrix:
- Look for horizontal patterns (left to right across rows)
- Look for vertical patterns (top to bottom down columns)  
- Look for diagonal patterns
- Consider transformations like rotation, reflection, addition/removal of elements
- Consider relationships between shapes, colors, positions, and quantities

Select the choice (1-8) th

In [7]:
# Test a single API call with the combined image
print("Testing single API call with combined image...")

try:
    # Make a single API call to test the new combined image approach
    response = runner.query_azure_openai(prompt, combined_images)
    
    print(f"✓ API call successful!")
    print(f"✓ Raw response: '{response}'")
    
    # Parse the response
    parsed_response = runner.parse_response(response)
    print(f"✓ Parsed response: {parsed_response} (zero-indexed)")
    
    if parsed_response != -1:
        print(f"✓ Model chose option: {parsed_response + 1} (1-indexed)")
        print(f"✓ Ground truth: {example['target'] + 1} (1-indexed)")
        print(f"✓ Correct: {parsed_response == example['target']}")
    else:
        print("✗ Invalid response - could not parse")
        
except Exception as e:
    print(f"✗ API call failed: {e}")
    print("This might be due to missing API credentials or network issues")


INFO:llm_logger:PROMPT: This is a visual reasoning task. You are shown a 3x3 matrix of images with the bottom-right panel mi...
INFO:llm_logger:MODEL: azure/gpt-4.1, REASONING: high
[92m11:35:11 - LiteLLM:INFO[0m: utils.py:3043 - 
LiteLLM completion() model= gpt-4.1; provider = azure
INFO:LiteLLM:
LiteLLM completion() model= gpt-4.1; provider = azure


Testing single API call with combined image...


INFO:httpx:HTTP Request: POST https://dalle-declare.openai.azure.com/openai/deployments/gpt-4.1/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
[92m11:35:14 - LiteLLM:INFO[0m: utils.py:1215 - Wrapper: Completed Call, calling success_handler
INFO:LiteLLM:Wrapper: Completed Call, calling success_handler
[92m11:35:14 - LiteLLM:INFO[0m: cost_calculator.py:655 - selected model name for cost calculation: azure/gpt-4.1-2025-04-14
INFO:LiteLLM:selected model name for cost calculation: azure/gpt-4.1-2025-04-14
[92m11:35:14 - LiteLLM:INFO[0m: cost_calculator.py:655 - selected model name for cost calculation: azure/gpt-4.1-2025-04-14
INFO:LiteLLM:selected model name for cost calculation: azure/gpt-4.1-2025-04-14
[92m11:35:14 - LiteLLM:INFO[0m: cost_calculator.py:655 - selected model name for cost calculation: azure/gpt-4.1-2025-04-14
INFO:LiteLLM:selected model name for cost calculation: azure/gpt-4.1-2025-04-14
INFO:llm_logger:RESPONSE: The answer is **7**.

**Explanati

✓ API call successful!
✓ Raw response: 'The answer is **7**.

**Explanation:**

- Each row has a shape: circles in the first column, pentagons in the second, and triangles in the third.
- Each row’s shape decreases in size from top to bottom,'
✓ Parsed response: 6 (zero-indexed)
✓ Model chose option: 7 (1-indexed)
✓ Ground truth: 5 (1-indexed)
✓ Correct: False


In [None]:
# Compare old vs new approach (for debugging)
print("Comparing old vs new image approach...")

# Create individual composites (old way)
matrix_composite = runner.create_composite_matrix_image(example['panels'])
choices_composite = runner.create_choices_grid(example['choices'])

# Create combined image (new way)
combined_image = runner.create_combined_image(matrix_composite, choices_composite)

print(f"Old approach:")
print(f"  ✓ Matrix composite: {matrix_composite.size}")
print(f"  ✓ Choices composite: {choices_composite.size}")
print(f"  ✓ Total images: 2")

print(f"\nNew approach:")
print(f"  ✓ Combined image: {combined_image.size}")
print(f"  ✓ Total images: 1")

print(f"\nSize comparison:")
print(f"  ✓ Matrix height: {matrix_composite.size[1]}")
print(f"  ✓ Choices height: {choices_composite.size[1]}")
print(f"  ✓ Combined height: {combined_image.size[1]} (should be matrix + choices + padding)")
print(f"  ✓ Expected height: {matrix_composite.size[1] + choices_composite.size[1] + 20}")

# Save individual components for comparison
matrix_composite.save("test_matrix_only.png")
choices_composite.save("test_choices_only.png")
print(f"\n✓ Saved individual components: test_matrix_only.png, test_choices_only.png")
