In [1]:
from datasets import load_dataset

ds = load_dataset("HuggingFaceM4/RAVEN", "center_single", split="train")

In [2]:
ds

Dataset({
    features: ['panels', 'choices', 'structure', 'meta_matrix', 'meta_target', 'meta_structure', 'target', 'id', 'metadata'],
    num_rows: 6000
})

In [3]:
# Examine the structure of a single example
example = ds[0]
print("Keys:", list(example.keys()))
print("\nPanels shape:", len(example['panels']))
print("Choices shape:", len(example['choices']))
print("Target:", example['target'])
print("ID:", example['id'])

# Look at the first panel and choice to understand the image format
print("\nFirst panel type:", type(example['panels'][0]))
print("First choice type:", type(example['choices'][0]))


Keys: ['panels', 'choices', 'structure', 'meta_matrix', 'meta_target', 'meta_structure', 'target', 'id', 'metadata']

Panels shape: 8
Choices shape: 8
Target: 4
ID: 3023

First panel type: <class 'PIL.PngImagePlugin.PngImageFile'>
First choice type: <class 'PIL.PngImagePlugin.PngImageFile'>


In [4]:
# Test the RAVENRunner class
import sys
sys.path.append('..')  # Add parent directory to path

from preprocessing.load_raven import RAVENRunner
import os

# Check if we can import and initialize (without real credentials)
print("Testing RAVENRunner import...")
print("RAVENRunner class imported successfully!")


Testing RAVENRunner import...
RAVENRunner class imported successfully!


In [7]:
# Test RAVENRunner initialization (won't actually call API)
print("Testing RAVENRunner initialization...")

try:
    # This will load the datasets but won't make API calls
    runner = RAVENRunner(
        model_name="azure/gpt-4.1",
        reasoning_effort="high"
    )
    print("✓ RAVENRunner initialized successfully!")
    print(f"✓ Loaded datasets: {[(k, len(v)) for k, v in runner.datasets.items()]}")
    
    # Test dataset access
    val_sample = runner.datasets['validation'][0]
    print(f"✓ Validation sample keys: {list(val_sample.keys())}")
    print(f"✓ Target: {val_sample['target']} (zero-indexed), ID: {val_sample['id']}")
    print(f"✓ Target as 1-indexed choice: {val_sample['target'] + 1}")
    
except Exception as e:
    print(f"✗ Error: {e}")


INFO:preprocessing.load_raven:Loading RAVEN dataset...


Testing RAVENRunner initialization...


INFO:preprocessing.load_raven:Loaded datasets: [('train', 6000), ('validation', 2000), ('test', 2000)]


✓ RAVENRunner initialized successfully!
✓ Loaded datasets: [('train', 6000), ('validation', 2000), ('test', 2000)]
✓ Validation sample keys: ['panels', 'choices', 'structure', 'meta_matrix', 'meta_target', 'meta_structure', 'target', 'id', 'metadata']
✓ Target: 4 (zero-indexed), ID: 5047
✓ Target as 1-indexed choice: 5


In [8]:
import base64
from mimetypes import guess_type

# Function to encode a local image into data URL 
def local_image_to_data_url(image_path):
    # Guess the MIME type of the image based on the file extension
    mime_type, _ = guess_type(image_path)
    if mime_type is None:
        mime_type = 'application/octet-stream'  # Default MIME type if none is found

    # Read and encode the image file
    with open(image_path, "rb") as image_file:
        base64_encoded_data = base64.b64encode(image_file.read()).decode('utf-8')

    # Construct the data URL
    return f"data:{mime_type};base64,{base64_encoded_data}"

# Example usage
image_path = 'test_combined_raven_image.png'
data_url = local_image_to_data_url(image_path)
# print("Data URL:", data_url)

import os
from openai import AzureOpenAI

endpoint = "https://dalle-declare.openai.azure.com/"
model_name = "o4-mini"
deployment = "o4-mini"

api_version = "2025-01-01-preview"

client = AzureOpenAI(
    api_version=api_version,
    azure_endpoint=endpoint,
    api_key=os.getenv("AZURE_API_KEY"),
)

response = client.chat.completions.create(
    messages=[
        {
            "role": "system",
            "content": "You are a helpful assistant.",
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is the answer?"},
                {"type": "image_url", "image_url": {"url": data_url}}
            ]
        }
    ],
    max_completion_tokens=100000,
    model=deployment
)

print(response.choices[0].message.content)

INFO:httpx:HTTP Request: POST https://dalle-declare.openai.azure.com/openai/deployments/o4-mini/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"


The pattern is that each column keeps the same shape, each row the same size (big, medium, small), and the three fill‐levels (white, light‐gray, dark‐gray) rotate “down” each column.  In column 3 (the triangles) you have:

 Row 1: light‐gray  
 Row 2: dark‐gray  
 Row 3: must be white  

And in row 3 everything is the small version.  Therefore you need the small, unfilled (white) triangle – that is choice 6.


In [1]:
# Test the RAVENRunner class
import sys
sys.path.append('..')  # Add parent directory to path

from preprocessing.load_raven import RAVENRunner
import os

# Check if we can import and initialize (without real credentials)
print("Testing RAVENRunner import...")
print("RAVENRunner class imported successfully!")

# Test the NEW combined image approach (single image instead of two)
print("Testing NEW combined image approach...")

# Initialize runner with updated code
runner = RAVENRunner(model_name="azure/o4-mini", reasoning_effort="high")

example = runner.datasets['validation'][0]
prompt, combined_image = runner.create_raven_prompt(example['panels'], example['choices'])

print(f"✓ Created prompt with {combined_image} combined image(s) (should be 1)")
print(f"✓ Combined image size: {combined_image.size}")
print(f"✓ Ground truth target: {example['target']} (zero-indexed) = choice {example['target'] + 1}")

print("\n" + "="*60)
print("NEW COMBINED IMAGE PROMPT:")
print("="*60)
print(prompt)
print("="*60)

# Display and save the combined image for visual inspection
print(f"\nDisplaying the combined image...")

# Display the combined image (this will open in your default image viewer)
combined_image.show()

# Save the combined image for inspection
combined_image.save("test_combined_raven_image.png")
print("✓ Saved combined image as 'test_combined_raven_image.png'")

# Test image encoding for API readiness
print(f"\nTesting combined image encoding...")
encoded_image = runner.encode_image_to_base64(combined_image)
print(f"✓ Successfully encoded combined image: {len(encoded_image)} chars")
print(f"✓ Encoded starts with: {encoded_image[:30]}...")

# Show dimensions comparison
print(f"\nImage dimensions:")
print(f"✓ Combined image: {combined_image.size}")
print(f"✓ Width x Height: {combined_image.size[0]} x {combined_image.size[1]} pixels")


INFO:preprocessing.load_raven:Loading RAVEN dataset...


Testing RAVENRunner import...
RAVENRunner class imported successfully!
Testing NEW combined image approach...


train-00000-of-00001.parquet:   0%|          | 0.00/163M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/54.4M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/54.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

INFO:preprocessing.load_raven:Loaded datasets: [('train', 6000), ('validation', 2000), ('test', 2000)]


✓ Created prompt with <PIL.Image.Image image mode=RGB size=848x1102 at 0x7B95E2E70860> combined image(s) (should be 1)
✓ Combined image size: (848, 1102)
✓ Ground truth target: 0 (zero-indexed) = choice 1

NEW COMBINED IMAGE PROMPT:
You are shown a 3x3 Problem Matrix of images with the bottom-right panel missing (marked with "?"). Your task is to identify which of the 8 numbered choices (1-8) in the Answer Set best completes the pattern.

Select the choice (1-8) that best completes the pattern. Respond with only the number (1, 2, 3, 4, 5, 6, 7, or 8).

Displaying the combined image...
✓ Saved combined image as 'test_combined_raven_image.png'

Testing combined image encoding...
✓ Successfully encoded combined image: 49964 chars
✓ Encoded starts with: iVBORw0KGgoAAAANSUhEUgAAA1AAAA...

Image dimensions:
✓ Combined image: (848, 1102)
✓ Width x Height: 848 x 1102 pixels


display-im6.q16: unable to open X server `' @ error/display.c/DisplayImageCommand/412.


In [2]:
# Test a single API call with the combined image
print("Testing single API call with combined image...")

try:
    # Make a single API call to test the new combined image approach
    response = runner.query_azure_openai(prompt, combined_image)
    
    print(f"✓ API call successful!")
    print(f"✓ Raw response: '{response}'")
    
    # Parse the response
    parsed_response = runner.parse_response(response)
    print(f"✓ Parsed response: {parsed_response} (zero-indexed)")
    
    if parsed_response != -1:
        print(f"✓ Model chose option: {parsed_response + 1} (1-indexed)")
        print(f"✓ Ground truth: {example['target'] + 1} (1-indexed)")
        print(f"✓ Correct: {parsed_response == example['target']}")
    else:
        print("✗ Invalid response - could not parse")
        
except Exception as e:
    print(f"✗ API call failed: {e}")
    print("This might be due to missing API credentials or network issues")


INFO:llm_logger:PROMPT: You are shown a 3x3 Problem Matrix of images with the bottom-right panel missing (marked with "?"). ...
INFO:llm_logger:MODEL: azure/o4-mini, REASONING: high


Testing single API call with combined image...
Using AzureOpenAI client for o4 models


INFO:httpx:HTTP Request: POST https://dalle-declare.openai.azure.com/openai/deployments/o4-mini/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
INFO:llm_logger:RESPONSE: 1


✓ API call successful!
✓ Raw response: '1'
✓ Parsed response: 0 (zero-indexed)
✓ Model chose option: 1 (1-indexed)
✓ Ground truth: 1 (1-indexed)
✓ Correct: True


In [3]:
import sys
sys.path.append('..')  # Add parent directory to path
from preprocessing.load_raven import RAVENRunner

# Set environment variables for your chosen model:
runner = RAVENRunner(model_name="azure/gpt-4.1", reasoning_effort="high")


# Run small test evaluation
print("Running evaluation on 5 validation samples...")
results = runner.evaluate_dataset("validation", max_samples=20)

print(f"Results:")
print(f"Accuracy: {results['accuracy']:.3f}")
print(f"Correct: {results['correct']}/{results['total']}")
print(f"Invalid responses: {results['invalid_responses']}")
print(f"Valid accuracy: {results['valid_accuracy']:.3f}")

# # For larger evaluations, use batch processing
# print("\\nFor larger evaluations:")
# batch_results = runner.batch_evaluate("validation", batch_size=3, max_samples=20)

INFO:preprocessing.load_raven:Loading RAVEN dataset...
INFO:preprocessing.load_raven:Loaded datasets: [('train', 6000), ('validation', 2000), ('test', 2000)]
INFO:preprocessing.load_raven:Evaluating 20 samples from validation split...
INFO:llm_logger:PROMPT: You are shown a 3x3 Problem Matrix of images with the bottom-right panel missing (marked with "?"). ...
INFO:llm_logger:MODEL: azure/gpt-4.1, REASONING: high
[92m14:08:58 - LiteLLM:INFO[0m: utils.py:3043 - 
LiteLLM completion() model= gpt-4.1; provider = azure
INFO:LiteLLM:
LiteLLM completion() model= gpt-4.1; provider = azure


Running evaluation on 5 validation samples...


INFO:httpx:HTTP Request: POST https://dalle-declare.openai.azure.com/openai/deployments/gpt-4.1/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
[92m14:09:00 - LiteLLM:INFO[0m: utils.py:1215 - Wrapper: Completed Call, calling success_handler
INFO:LiteLLM:Wrapper: Completed Call, calling success_handler
[92m14:09:00 - LiteLLM:INFO[0m: cost_calculator.py:655 - selected model name for cost calculation: azure/gpt-4.1-2025-04-14
INFO:LiteLLM:selected model name for cost calculation: azure/gpt-4.1-2025-04-14
[92m14:09:00 - LiteLLM:INFO[0m: cost_calculator.py:655 - selected model name for cost calculation: azure/gpt-4.1-2025-04-14
INFO:LiteLLM:selected model name for cost calculation: azure/gpt-4.1-2025-04-14
[92m14:09:00 - LiteLLM:INFO[0m: cost_calculator.py:655 - selected model name for cost calculation: azure/gpt-4.1-2025-04-14
INFO:llm_logger:RESPONSE: 8
INFO:LiteLLM:selected model name for cost calculation: azure/gpt-4.1-2025-04-14
INFO:llm_logger:PROMPT: You are

Results:
Accuracy: 0.100
Correct: 2/20
Invalid responses: 0
Valid accuracy: 0.100


In [4]:
import sys
sys.path.append('..')  # Add parent directory to path
from preprocessing.load_raven import RAVENRunner

# Initialize runner
runner = RAVENRunner(
    model_name="azure/o4-mini",  # or "vertex_ai/claude-3-7-sonnet@20250219"
    reasoning_effort="high"
)

# Run small test evaluation
print("Running evaluation on 5 validation samples...")
results = runner.evaluate_dataset("validation", max_samples=10)

print(f"Results:")
print(f"Accuracy: {results['accuracy']:.3f}")
print(f"Correct: {results['correct']}/{results['total']}")
print(f"Invalid responses: {results['invalid_responses']}")
print(f"Valid accuracy: {results['valid_accuracy']:.3f}")

# # For larger evaluations, use batch processing
# print("\\nFor larger evaluations:")
# batch_results = runner.batch_evaluate("validation", batch_size=3, max_samples=20)

INFO:preprocessing.load_raven:Loading RAVEN dataset...
INFO:preprocessing.load_raven:Loaded datasets: [('train', 6000), ('validation', 2000), ('test', 2000)]
INFO:preprocessing.load_raven:Evaluating 10 samples from validation split...
INFO:llm_logger:PROMPT: You are shown a 3x3 Problem Matrix of images with the bottom-right panel missing (marked with "?"). ...
INFO:llm_logger:MODEL: azure/o4-mini, REASONING: high


Running evaluation on 5 validation samples...
Using AzureOpenAI client for o4 models


INFO:httpx:HTTP Request: POST https://dalle-declare.openai.azure.com/openai/deployments/o4-mini/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
INFO:llm_logger:RESPONSE: 6
INFO:llm_logger:PROMPT: You are shown a 3x3 Problem Matrix of images with the bottom-right panel missing (marked with "?"). ...
INFO:llm_logger:MODEL: azure/o4-mini, REASONING: high


Using AzureOpenAI client for o4 models


INFO:httpx:HTTP Request: POST https://dalle-declare.openai.azure.com/openai/deployments/o4-mini/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
INFO:llm_logger:RESPONSE: 7
INFO:llm_logger:PROMPT: You are shown a 3x3 Problem Matrix of images with the bottom-right panel missing (marked with "?"). ...
INFO:llm_logger:MODEL: azure/o4-mini, REASONING: high


Using AzureOpenAI client for o4 models


INFO:httpx:HTTP Request: POST https://dalle-declare.openai.azure.com/openai/deployments/o4-mini/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
INFO:llm_logger:RESPONSE: 2
INFO:llm_logger:PROMPT: You are shown a 3x3 Problem Matrix of images with the bottom-right panel missing (marked with "?"). ...
INFO:llm_logger:MODEL: azure/o4-mini, REASONING: high


Using AzureOpenAI client for o4 models


INFO:httpx:HTTP Request: POST https://dalle-declare.openai.azure.com/openai/deployments/o4-mini/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
INFO:llm_logger:RESPONSE: 6
INFO:llm_logger:PROMPT: You are shown a 3x3 Problem Matrix of images with the bottom-right panel missing (marked with "?"). ...
INFO:llm_logger:MODEL: azure/o4-mini, REASONING: high


Using AzureOpenAI client for o4 models


INFO:httpx:HTTP Request: POST https://dalle-declare.openai.azure.com/openai/deployments/o4-mini/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
INFO:llm_logger:RESPONSE: 3
INFO:llm_logger:PROMPT: You are shown a 3x3 Problem Matrix of images with the bottom-right panel missing (marked with "?"). ...
INFO:llm_logger:MODEL: azure/o4-mini, REASONING: high


Using AzureOpenAI client for o4 models


INFO:httpx:HTTP Request: POST https://dalle-declare.openai.azure.com/openai/deployments/o4-mini/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
INFO:llm_logger:RESPONSE: 1
INFO:llm_logger:PROMPT: You are shown a 3x3 Problem Matrix of images with the bottom-right panel missing (marked with "?"). ...
INFO:llm_logger:MODEL: azure/o4-mini, REASONING: high


Using AzureOpenAI client for o4 models


INFO:httpx:HTTP Request: POST https://dalle-declare.openai.azure.com/openai/deployments/o4-mini/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
INFO:llm_logger:RESPONSE: 7
INFO:llm_logger:PROMPT: You are shown a 3x3 Problem Matrix of images with the bottom-right panel missing (marked with "?"). ...
INFO:llm_logger:MODEL: azure/o4-mini, REASONING: high


Using AzureOpenAI client for o4 models


INFO:httpx:HTTP Request: POST https://dalle-declare.openai.azure.com/openai/deployments/o4-mini/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
INFO:llm_logger:RESPONSE: 8
INFO:llm_logger:PROMPT: You are shown a 3x3 Problem Matrix of images with the bottom-right panel missing (marked with "?"). ...
INFO:llm_logger:MODEL: azure/o4-mini, REASONING: high


Using AzureOpenAI client for o4 models


INFO:httpx:HTTP Request: POST https://dalle-declare.openai.azure.com/openai/deployments/o4-mini/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
INFO:llm_logger:RESPONSE: 5
INFO:llm_logger:PROMPT: You are shown a 3x3 Problem Matrix of images with the bottom-right panel missing (marked with "?"). ...
INFO:llm_logger:MODEL: azure/o4-mini, REASONING: high


Using AzureOpenAI client for o4 models


INFO:httpx:HTTP Request: POST https://dalle-declare.openai.azure.com/openai/deployments/o4-mini/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
INFO:llm_logger:RESPONSE: 4
INFO:preprocessing.load_raven:Processed 10/10 samples
INFO:preprocessing.load_raven:Detailed results saved to raven_results_validation_10samples.json


Results:
Accuracy: 0.400
Correct: 4/10
Invalid responses: 0
Valid accuracy: 0.400
