In [1]:
from datasets import load_dataset

ds = load_dataset("HuggingFaceM4/RAVEN", "center_single", split="train")

In [2]:
ds

Dataset({
    features: ['panels', 'choices', 'structure', 'meta_matrix', 'meta_target', 'meta_structure', 'target', 'id', 'metadata'],
    num_rows: 6000
})

In [3]:
# Examine the structure of a single example
example = ds[0]
print("Keys:", list(example.keys()))
print("\nPanels shape:", len(example['panels']))
print("Choices shape:", len(example['choices']))
print("Target:", example['target'])
print("ID:", example['id'])

# Look at the first panel and choice to understand the image format
print("\nFirst panel type:", type(example['panels'][0]))
print("First choice type:", type(example['choices'][0]))


Keys: ['panels', 'choices', 'structure', 'meta_matrix', 'meta_target', 'meta_structure', 'target', 'id', 'metadata']

Panels shape: 8
Choices shape: 8
Target: 4
ID: 3023

First panel type: <class 'PIL.PngImagePlugin.PngImageFile'>
First choice type: <class 'PIL.PngImagePlugin.PngImageFile'>


In [4]:
# Test the RAVENRunner class
import sys
sys.path.append('..')  # Add parent directory to path

from preprocessing.load_raven import RAVENRunner
import os

# Check if we can import and initialize (without real credentials)
print("Testing RAVENRunner import...")
print("RAVENRunner class imported successfully!")


Testing RAVENRunner import...
RAVENRunner class imported successfully!


In [7]:
# Test RAVENRunner initialization (won't actually call API)
print("Testing RAVENRunner initialization...")

try:
    # This will load the datasets but won't make API calls
    runner = RAVENRunner(
        model_name="azure/gpt-4.1",
        reasoning_effort="high"
    )
    print("✓ RAVENRunner initialized successfully!")
    print(f"✓ Loaded datasets: {[(k, len(v)) for k, v in runner.datasets.items()]}")
    
    # Test dataset access
    val_sample = runner.datasets['validation'][0]
    print(f"✓ Validation sample keys: {list(val_sample.keys())}")
    print(f"✓ Target: {val_sample['target']} (zero-indexed), ID: {val_sample['id']}")
    print(f"✓ Target as 1-indexed choice: {val_sample['target'] + 1}")
    
except Exception as e:
    print(f"✗ Error: {e}")


INFO:preprocessing.load_raven:Loading RAVEN dataset...


Testing RAVENRunner initialization...


INFO:preprocessing.load_raven:Loaded datasets: [('train', 6000), ('validation', 2000), ('test', 2000)]


✓ RAVENRunner initialized successfully!
✓ Loaded datasets: [('train', 6000), ('validation', 2000), ('test', 2000)]
✓ Validation sample keys: ['panels', 'choices', 'structure', 'meta_matrix', 'meta_target', 'meta_structure', 'target', 'id', 'metadata']
✓ Target: 4 (zero-indexed), ID: 5047
✓ Target as 1-indexed choice: 5


In [7]:
import os
from openai import AzureOpenAI

endpoint = "https://dalle-declare.openai.azure.com/"
model_name = "o4-mini"
deployment = "o4-mini"

api_version = "2025-01-01-preview"

client = AzureOpenAI(
    api_version=api_version,
    azure_endpoint=endpoint,
    api_key=os.getenv("AZURE_API_KEY"),
)

response = client.chat.completions.create(
    messages=[
        {
            "role": "system",
            "content": "You are a helpful assistant.",
        },
        {
            "role": "user",
            "content": "I am going to Paris, what should I see?",
        }
    ],
    max_completion_tokens=100000,
    model=deployment
)

print(response.choices[0].message.content)

INFO:httpx:HTTP Request: POST https://dalle-declare.openai.azure.com/openai/deployments/o4-mini/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"


Here’s a mix of Paris’s “must-sees,” plus a few off-the-beaten-track picks. Feel free to mix and match based on your interests and how many days you have:

1. Iconic Landmarks  
 • Eiffel Tower (consider booking a summit-access slot in advance)  
 • Arc de Triomphe & Champs-Élysées (climb to the top of the arch for great views)  
 • Notre-Dame Cathedral & Île de la Cité (though still under partial restoration, the exterior and nearby Sainte-Chapelle are stunning)  
 • Sacré-Cœur Basilica in Montmartre (sunset over Paris)

2. World-Class Museums  
 • Louvre (buy timed-entry tickets; don’t miss the Denon wing—“Mona Lisa,” “Winged Victory”)  
 • Musée d’Orsay (impressionist masterpieces in a converted railway station)  
 • Centre Pompidou (modern/contemporary art + its twisting exterior escalator)  
 • Musée de l’Orangerie (Monet’s Water Lilies in the Tuileries Gardens)

3. Charming Neighborhoods  
 • Le Marais (medieval streets, Place des Vosges, trendy boutiques & falafel stands)  
 • L

In [1]:
# Test the RAVENRunner class
import sys
sys.path.append('..')  # Add parent directory to path

from preprocessing.load_raven import RAVENRunner
import os

# Check if we can import and initialize (without real credentials)
print("Testing RAVENRunner import...")
print("RAVENRunner class imported successfully!")

# Test the NEW combined image approach (single image instead of two)
print("Testing NEW combined image approach...")

# Initialize runner with updated code
runner = RAVENRunner(model_name="azure/o4-mini", reasoning_effort="high")

example = runner.datasets['validation'][0]
prompt, combined_image = runner.create_raven_prompt(example['panels'], example['choices'])

print(f"✓ Created prompt with {combined_image} combined image(s) (should be 1)")
print(f"✓ Combined image size: {combined_image.size}")
print(f"✓ Ground truth target: {example['target']} (zero-indexed) = choice {example['target'] + 1}")

print("\n" + "="*60)
print("NEW COMBINED IMAGE PROMPT:")
print("="*60)
print(prompt)
print("="*60)

# Display and save the combined image for visual inspection
print(f"\nDisplaying the combined image...")

# Display the combined image (this will open in your default image viewer)
combined_image.show()

# Save the combined image for inspection
combined_image.save("test_combined_raven_image.png")
print("✓ Saved combined image as 'test_combined_raven_image.png'")

# Test image encoding for API readiness
print(f"\nTesting combined image encoding...")
encoded_image = runner.encode_image_to_base64(combined_image)
print(f"✓ Successfully encoded combined image: {len(encoded_image)} chars")
print(f"✓ Encoded starts with: {encoded_image[:30]}...")

# Show dimensions comparison
print(f"\nImage dimensions:")
print(f"✓ Combined image: {combined_image.size}")
print(f"✓ Width x Height: {combined_image.size[0]} x {combined_image.size[1]} pixels")


INFO:preprocessing.load_raven:Loading RAVEN dataset...


Testing RAVENRunner import...
RAVENRunner class imported successfully!
Testing NEW combined image approach...


INFO:preprocessing.load_raven:Loaded datasets: [('train', 6000), ('validation', 2000), ('test', 2000)]


✓ Created prompt with <PIL.Image.Image image mode=RGB size=848x1102 at 0x793174A6A990> combined image(s) (should be 1)
✓ Combined image size: (848, 1102)
✓ Ground truth target: 4 (zero-indexed) = choice 5

NEW COMBINED IMAGE PROMPT:
You are shown a 3x3 matrix of images with the bottom-right panel missing (marked with "?"). Your task is to identify which of the 8 numbered choices (1-8) best completes the pattern.

Select the choice (1-8) that best completes the pattern. Respond with only the number (1, 2, 3, 4, 5, 6, 7, or 8).

Displaying the combined image...
✓ Saved combined image as 'test_combined_raven_image.png'

Testing combined image encoding...
✓ Successfully encoded combined image: 79404 chars
✓ Encoded starts with: iVBORw0KGgoAAAANSUhEUgAAA1AAAA...

Image dimensions:
✓ Combined image: (848, 1102)
✓ Width x Height: 848 x 1102 pixels


display-im6.q16: unable to open X server `' @ error/display.c/DisplayImageCommand/412.


In [3]:
# Test a single API call with the combined image
print("Testing single API call with combined image...")

try:
    # Make a single API call to test the new combined image approach
    response = runner.query_azure_openai(prompt, combined_image)
    
    print(f"✓ API call successful!")
    print(f"✓ Raw response: '{response}'")
    
    # Parse the response
    parsed_response = runner.parse_response(response)
    print(f"✓ Parsed response: {parsed_response} (zero-indexed)")
    
    if parsed_response != -1:
        print(f"✓ Model chose option: {parsed_response + 1} (1-indexed)")
        print(f"✓ Ground truth: {example['target'] + 1} (1-indexed)")
        print(f"✓ Correct: {parsed_response == example['target']}")
    else:
        print("✗ Invalid response - could not parse")
        
except Exception as e:
    print(f"✗ API call failed: {e}")
    print("This might be due to missing API credentials or network issues")


INFO:llm_logger:PROMPT: You are shown a 3x3 matrix of images with the bottom-right panel missing (marked with "?"). Your tas...
INFO:llm_logger:MODEL: azure/o4-mini, REASONING: high


Testing single API call with combined image...


INFO:httpx:HTTP Request: POST https://dalle-declare.openai.azure.com/openai/deployments/o4-mini/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
INFO:llm_logger:RESPONSE: 
INFO:httpx:HTTP Request: POST https://dalle-declare.openai.azure.com/openai/deployments/o4-mini/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
INFO:llm_logger:RESPONSE: 
INFO:httpx:HTTP Request: POST https://dalle-declare.openai.azure.com/openai/deployments/o4-mini/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
INFO:llm_logger:RESPONSE: 
INFO:llm_logger:Added to cache


✓ API call successful!
✓ Raw response: ''
✓ Parsed response: -1 (zero-indexed)
✗ Invalid response - could not parse


In [10]:
# Set environment variables for your chosen model:
runner = RAVENRunner(model_name="azure/gpt-4.1", reasoning_effort="high")


# Run small test evaluation
print("Running evaluation on 5 validation samples...")
results = runner.evaluate_dataset("validation", max_samples=5)

print(f"Results:")
print(f"Accuracy: {results['accuracy']:.3f}")
print(f"Correct: {results['correct']}/{results['total']}")
print(f"Invalid responses: {results['invalid_responses']}")
print(f"Valid accuracy: {results['valid_accuracy']:.3f}")

# For larger evaluations, use batch processing
print("\\nFor larger evaluations:")
batch_results = runner.batch_evaluate("validation", batch_size=3, max_samples=10)

INFO:preprocessing.load_raven:Loading RAVEN dataset...
INFO:preprocessing.load_raven:Loaded datasets: [('train', 6000), ('validation', 2000), ('test', 2000)]
INFO:preprocessing.load_raven:Evaluating 5 samples from validation split...
INFO:llm_logger:PROMPT: You are shown a 3x3 matrix of images with the bottom-right panel missing (marked with "?"). Your tas...
INFO:llm_logger:MODEL: azure/gpt-4.1, REASONING: high
INFO:llm_logger:Cache hit for prompt: You are shown a 3x3 matrix of images with the bott...
INFO:llm_logger:PROMPT: You are shown a 3x3 matrix of images with the bottom-right panel missing (marked with "?"). Your tas...
INFO:llm_logger:MODEL: azure/gpt-4.1, REASONING: high
INFO:llm_logger:Cache hit for prompt: You are shown a 3x3 matrix of images with the bott...
INFO:llm_logger:PROMPT: You are shown a 3x3 matrix of images with the bottom-right panel missing (marked with "?"). Your tas...
INFO:llm_logger:MODEL: azure/gpt-4.1, REASONING: high
INFO:llm_logger:Cache hit for prompt

Running evaluation on 5 validation samples...
Results:
Accuracy: 0.200
Correct: 1/5
Invalid responses: 0
Valid accuracy: 0.200
\nFor larger evaluations:


INFO:llm_logger:PROMPT: You are shown a 3x3 matrix of images with the bottom-right panel missing (marked with "?"). Your tas...
INFO:llm_logger:MODEL: azure/gpt-4.1, REASONING: high
INFO:llm_logger:Cache hit for prompt: You are shown a 3x3 matrix of images with the bott...
INFO:llm_logger:PROMPT: You are shown a 3x3 matrix of images with the bottom-right panel missing (marked with "?"). Your tas...
INFO:llm_logger:MODEL: azure/gpt-4.1, REASONING: high
INFO:llm_logger:Cache hit for prompt: You are shown a 3x3 matrix of images with the bott...
INFO:preprocessing.load_raven:Pausing between batches...
INFO:preprocessing.load_raven:Processing batch 2: samples 3-5
INFO:llm_logger:PROMPT: You are shown a 3x3 matrix of images with the bottom-right panel missing (marked with "?"). Your tas...
INFO:llm_logger:MODEL: azure/gpt-4.1, REASONING: high
INFO:llm_logger:Cache hit for prompt: You are shown a 3x3 matrix of images with the bott...
INFO:llm_logger:PROMPT: You are shown a 3x3 matrix of image

In [12]:
# Set environment variables for your chosen model:

# Initialize runner
runner = RAVENRunner(
    model_name="azure/o_series/o4-mini",  # or "vertex_ai/claude-3-7-sonnet@20250219"
    reasoning_effort="high"
)

# Run small test evaluation
print("Running evaluation on 5 validation samples...")
results = runner.evaluate_dataset("validation", max_samples=5)

print(f"Results:")
print(f"Accuracy: {results['accuracy']:.3f}")
print(f"Correct: {results['correct']}/{results['total']}")
print(f"Invalid responses: {results['invalid_responses']}")
print(f"Valid accuracy: {results['valid_accuracy']:.3f}")

# # For larger evaluations, use batch processing
# print("\\nFor larger evaluations:")
# batch_results = runner.batch_evaluate("validation", batch_size=3, max_samples=10)

INFO:preprocessing.load_raven:Loading RAVEN dataset...
INFO:preprocessing.load_raven:Loaded datasets: [('train', 6000), ('validation', 2000), ('test', 2000)]
INFO:preprocessing.load_raven:Evaluating 5 samples from validation split...
INFO:llm_logger:PROMPT: You are shown a 3x3 matrix of images with the bottom-right panel missing (marked with "?"). Your tas...
INFO:llm_logger:MODEL: azure/o4-mini, REASONING: high
INFO:llm_logger:Cache hit for prompt: You are shown a 3x3 matrix of images with the bott...
INFO:llm_logger:PROMPT: You are shown a 3x3 matrix of images with the bottom-right panel missing (marked with "?"). Your tas...
INFO:llm_logger:MODEL: azure/o4-mini, REASONING: high
INFO:llm_logger:Cache hit for prompt: You are shown a 3x3 matrix of images with the bott...
INFO:llm_logger:PROMPT: You are shown a 3x3 matrix of images with the bottom-right panel missing (marked with "?"). Your tas...
INFO:llm_logger:MODEL: azure/o4-mini, REASONING: high
INFO:llm_logger:Cache hit for prompt

Running evaluation on 5 validation samples...
Results:
Accuracy: 0.000
Correct: 0/5
Invalid responses: 5
Valid accuracy: 0.000
