In [1]:
# Import required modules and functions from rollout.py
import os
import sys
import json
import logging
import base64
import time
import threading
from mimetypes import guess_type
from pprint import pprint

# Azure OpenAI imports
from openai import AzureOpenAI
from tenacity import (
    retry,
    stop_after_attempt,
    wait_exponential,
    retry_if_exception_type,
    wait_random
)
from concurrent.futures import ThreadPoolExecutor, as_completed

# Add the parent directory to path to import rollout module
sys.path.append(os.path.dirname(os.path.abspath('.')))

# Import specific functions from rollout.py (not the API functions)
from rollout import (
    RAVENDataset,
    parse_response_to_perception_and_reasoning_steps_and_correct_answer
)

# Add the tools directory to import accuracy functions
sys.path.append('/data/users/brandon/ob1-projects/InternVL/internvl_chat/tools')
from reasoning_data_pipeline.utils.accuracy_reward import check_answer, parse_answer

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('rollout_inspection')

# Verify Azure API key is set
if not os.getenv("AZURE_API_KEY"):
    raise ValueError("AZURE_API_KEY environment variable not set!")

# Azure OpenAI Configuration
endpoint = "https://declaregpt4.openai.azure.com/"
deployment = "gpt-4.1"
api_version = "2025-01-01-preview"

# Create standalone Azure OpenAI client
client = AzureOpenAI(
    api_version=api_version,
    azure_endpoint=endpoint,
    api_key=os.getenv("AZURE_INSPECT_API_KEY"),
    timeout=60.0
)

print("All imports successful!")
print(f"Using deployment: {deployment}")
print(f"Using endpoint: {endpoint}")


All imports successful!
Using deployment: gpt-4.1
Using endpoint: https://declaregpt4.openai.azure.com/


In [2]:
# Helper functions copied from rollout.py

def local_image_to_data_url(image_path):
    """Convert a local image into data URL"""
    # Guess the MIME type of the image based on the file extension
    mime_type, _ = guess_type(image_path)
    if mime_type is None:
        mime_type = 'application/octet-stream'  # Default MIME type if none is found

    # Read and encode the image file
    with open(image_path, "rb") as image_file:
        base64_encoded_data = base64.b64encode(image_file.read()).decode('utf-8')

    # Construct the data URL
    return f"data:{mime_type};base64,{base64_encoded_data}"

@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=2, min=4, max=300) + wait_random(0, 30),
    retry=retry_if_exception_type((Exception,)),
    reraise=True
)
def make_azure_request(messages, max_tokens, temperature, estimated_tokens=1000):
    """Make Azure OpenAI request with retry logic"""
    try:
        response = client.chat.completions.create(
            messages=messages,
            max_completion_tokens=max_tokens,
            model=deployment,
            temperature=temperature,
            timeout=120.0
        )
        
        return response.choices[0].message.content
        
    except Exception as e:
        error_type = type(e).__name__
        error_msg = str(e)
        
        # Check for content filter violation - DO NOT RETRY
        if ('BadRequestError' in error_type and 
            'Error code: 400' in error_msg and 
            ('ResponsibleAIPolicyViolation' in error_msg or 'content_filter' in error_msg)):
            logger.warning(f"Content filter violation detected: {error_msg}")
            return "Error code 400: content filter violation returned"
        
        logger.debug(f"API request failed: {error_type}: {error_msg}")
        raise

def build_responses_azure_simple(inputs, num_return_sequences=1, prefixes=None, max_new_tokens=4096, temperature=1.0):
    """
    Simplified version of build_responses_azure for inspection purposes
    """
    total_requests = len(inputs) * num_return_sequences
    logger.info(f"Generating {total_requests} responses")
    
    responses = []
    
    for seq_idx in range(num_return_sequences):
        for input_idx, (prompt, image_path) in enumerate(inputs):
            try:
                # Convert image path to data URL
                data_url = local_image_to_data_url(image_path)
                
                # Prepare messages
                content = [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": data_url}}
                ]
                
                messages = [
                    {"role": "user", "content": content}
                ]
                
                # Add prefix if provided - handle both single prefix and list of prefixes
                current_prefix = None
                if prefixes:
                    if isinstance(prefixes, list):
                        # Multiple prefixes - use index based on current request
                        request_idx = seq_idx * len(inputs) + input_idx
                        if request_idx < len(prefixes) and prefixes[request_idx]:
                            current_prefix = prefixes[request_idx]
                    else:
                        # Single prefix for all requests
                        current_prefix = prefixes
                
                if current_prefix:
                    messages.append({"role": "assistant", "content": current_prefix})
                
                # Make the request
                response_text = make_azure_request(messages, max_new_tokens, temperature)
                responses.append(response_text)
                
                logger.info(f"Generated response {len(responses)}/{total_requests}")
                
            except Exception as e:
                logger.error(f"Failed to generate response {len(responses)+1}: {e}")
                responses.append("")
    
    return responses

print("Helper functions loaded successfully!")


Helper functions loaded successfully!


In [3]:
# Configure dataset path - adjust this to your actual dataset
dataset_path = '/data/users/brandon/ob1-projects/InternVL/internvl_chat/rollout_generation/preprocessed_prompts/preprocessing_scripts/RAVEN/raven_processed_jsonl/last_four_jsonl/in_center_single_out_center_single_test.jsonl'

# Load the dataset (just first few samples for inspection)
dataset = RAVENDataset(
    data=dataset_path,
    sample_start_idx=1,
    sample_end_idx=1  # Just load 3 samples for inspection
)

print(f"Loaded {len(dataset)} samples from dataset")

# Get the first sample
sample = dataset[0]

print("\nSample structure:")
print(f"- ID: {sample['id']}")
print(f"- Subset/Split: {sample['subset_split']}")
print(f"- Correct Answer: {sample['correct_answer']}")
print(f"- Image Path: {sample['image_path']}")
print(f"\nRollout User Prompt Preview:")
print(sample['rollout_user_prompt'][:500] + "...")


Filtered 2000 lines to 1 samples in range [1, 1]
Loaded 1 samples from dataset

Sample structure:
- ID: 5571
- Subset/Split: in_center_single_out_center_single
- Correct Answer: G
- Image Path: /data/users/brandon/ob1-projects/InternVL/internvl_chat/rollout_generation/preprocessed_prompts/preprocessing_scripts/RAVEN/processed_raven_images/in_center_single_out_center_single/5571.jpg

Rollout User Prompt Preview:
You are an abstract reasoning puzzle expert. The puzzle you will receive is presented in a standard Raven's Progressive Matrices format: a 3×3 matrix of related images, with the bottom-right cell (the ninth tile) missing. There are eight possible answer choices provided separately, and your task is to decide which of those eight images correctly completes the 3×3 matrix pattern.

I will provide you with an image containing:
- Problem Matrix: An accompanying image that shows the eight tiles and h...


In [4]:
# Prepare inputs for rollout generation
num_return_sequences = 2  # Generate 2 different rollouts for the same input

# Create input list (prompt, image_path)
inputs = [(sample['rollout_user_prompt'], sample['image_path'])]

print(f"Generating {num_return_sequences} initial rollout responses...")

# Generate responses using the Azure API
response_list = build_responses_azure_simple(
    inputs=inputs,
    num_return_sequences=num_return_sequences,
    prefixes=None,  # No prefix for initial generation
    max_new_tokens=8192,
    temperature=1.0
)

print(f"\nGenerated {len(response_list)} responses")

# Display first response
print("\n" + "="*80)
print("ROLLOUT RESPONSE 1:")
print("="*80)
print(response_list[0][:1000] + "..." if len(response_list[0]) > 1000 else response_list[0])


INFO:rollout_inspection:Generating 2 responses


Generating 2 initial rollout responses...


INFO:httpx:HTTP Request: POST https://declaregpt4.openai.azure.com/openai/deployments/gpt-4.1/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
INFO:rollout_inspection:Generated response 1/2
INFO:httpx:HTTP Request: POST https://declaregpt4.openai.azure.com/openai/deployments/gpt-4.1/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
INFO:rollout_inspection:Generated response 2/2



Generated 2 responses

ROLLOUT RESPONSE 1:
[Perception]
<step_1>
The matrix is 3x3. The bottom-right cell is missing. There are eight answer choices (A-H) shown below the main matrix.
</step_1>
<step_2>
Row 1: Each cell shows a circle containing a black shape:
- 1st column: Black hexagon.
- 2nd column: Black downward-pointing triangle.
- 3rd column: Black pentagon.
</step_2>
<step_3>
Row 2: Each cell shows an upright triangle containing a shape:
- 1st column: Hexagon (outline, not filled).
- 2nd column: Hexagon (outline, not filled), the triangle rotates or shifts.
- 3rd column: Black triangle (filled), triangle rotates.
</step_3>
<step_4>
Row 3: Each cell shows a square turned as a diamond (rotated square) with a shape inside:
- 1st column: Outline triangle inside.
- 2nd column: Black hexagon inside.
- 3rd column: Cell is missing.
</step_4>
<step_5>
Answer choices A-H: All are diamonds (rotated squares) with a shape inside, except D which is a circle with a hexagon inside.
- A: Diamo

In [5]:
# Parse the first response
# response_to_parse = response_list[0]
response_to_parse = """[Perception]\n<step_1>\nI observe the first row of the matrix. Each cell depicts a large circle with a smaller shape inside. The shapes from left to right: a hexagon, a triangle, and a pentagon.\n</step_1>\n<step_2>\nI observe the second row. Each cell now shows a large triangle, containing a smaller shape inside. From left to right: a pentagon, a hexagon, and a triangle.\n</step_2>\n<step_3>\nI observe the third row. Each cell presents a large diamond (square rotated 45 degrees with respect to the lower edge), with a smaller shape inside. The first contains a triangle, the middle one a hexagon, and the last one is missing.\n</step_3>\n<step_4>\nI look at the answer choices A-H. Each shows a large diamond with a shape inside except D, which has a large circle. Shapes inside are various (hexagons, pentagons, squares) and some are filled, others are not.\n</step_4>\n\n[Reasoning]\n<step_1>\nIdentify the logic in each row: The outer shape progresses as: circle (row 1), triangle (row 2), diamond (row 3). The inner shape shifts as well. Track the relationships of the inner shapes across rows and columns.\n</step_1>\n<step_2>\nLook down each column:\n- 1st column: Hexagon inside circle; pentagon inside triangle; triangle inside diamond.\n- 2nd column: Triangle inside circle; hexagon inside triangle; hexagon inside diamond.\n- 3rd column: Pentagon inside circle; triangle inside triangle; ? inside diamond.\n</step_2>\n<step_3>\nNotice the pattern: In each column, the inner shape of the top row moves to the middle row, the middle row inner shape moves to the bottom row, and the bottom row inner shape moves to the top row (cyclical shift).\n- 1st column: Hexagon (circle) → pentagon (triangle) → triangle (diamond)\n- 2nd column: Triangle (circle) → hexagon (triangle) → hexagon (diamond)\n- 3rd column: Pentagon (circle) → triangle (triangle) → ? (diamond)\nSo, following the shift: Pentagon (circle) → triangle (triangle) → ? (diamond). The answer must have a pentagon inside the diamond.\n</step_4>\n<step_4>\nExamine the answer choices for a diamond containing a pentagon. Option F is a diamond with a pentagon inside.\n</step_4>\n\n<correct_answer>\n$\\boxed{F}$\n</correct_answer>\n"""

try:
    parsed = parse_response_to_perception_and_reasoning_steps_and_correct_answer(
        response_to_parse,
        max_perception_steps=12,
        max_reasoning_steps=12
    )
    
    print("Successfully parsed response!")
    print(f"\nNumber of perception steps: {len(parsed['perception_steps'])}")
    print(f"Number of reasoning steps: {len(parsed['reasoning_steps'])}")
    print(f"LLM Answer: {parsed['llm_answer']}")
    print(f"Ground Truth Answer: {sample['correct_answer']}")
    
    # Display first perception step
    print("\n" + "="*60)
    print("FIRST PERCEPTION STEP:")
    print("="*60)
    print(parsed['perception_steps'][0])
   
    print("\n" + "="*60)
    print("LAST PERCEPTION STEP:")
    print("="*60)
    print(parsed['perception_steps'][-1])
    
    # Display first reasoning step
    print("\n" + "="*60)
    print("FIRST REASONING STEP:")
    print("="*60)
    print(parsed['reasoning_steps'][0])
    
    print("\n" + "="*60)
    print("LAST REASONING STEP:")
    print("="*60)
    print(parsed['reasoning_steps'][-1])
    
except Exception as e:
    print(f"Failed to parse response: {e}")
    print("\nResponse format may not match expected structure.")


Successfully parsed response!

Number of perception steps: 4
Number of reasoning steps: 3
LLM Answer: $\boxed{F}$
Ground Truth Answer: G

FIRST PERCEPTION STEP:
I observe the first row of the matrix. Each cell depicts a large circle with a smaller shape inside. The shapes from left to right: a hexagon, a triangle, and a pentagon.

FIRST REASONING STEP:
Identify the logic in each row: The outer shape progresses as: circle (row 1), triangle (row 2), diamond (row 3). The inner shape shifts as well. Track the relationships of the inner shapes across rows and columns.


In [6]:
# Let's run MC for the first perception step (step index 0)
step_idx = 0
num_mc_sequences = 4  # Number of MC samples per step

# Combine all steps for easier access
all_steps = parsed['perception_steps'] + parsed['reasoning_steps']
perception_count = len(parsed['perception_steps'])

print(f"Running Monte Carlo for step {step_idx} (Perception step {step_idx + 1})")
print(f"Will generate {num_mc_sequences} MC continuations")

# Build the prefix for this step (all steps up to and including current step)
prefix_steps = all_steps[:step_idx + 1]

# Format the prefix according to the expected structure
formatted_prefix = "[Perception]\n"
for i, step in enumerate(prefix_steps):
    formatted_prefix += f"<step_{i+1}>\n{step}\n</step_{i+1}>\n"

print("\n" + "="*60)
print("PREFIX FOR MC GENERATION:")
print("="*60)
print(formatted_prefix)
print("="*60)


Running Monte Carlo for step 0 (Perception step 1)
Will generate 4 MC continuations

PREFIX FOR MC GENERATION:
[Perception]
<step_1>
I observe the first row of the matrix. Each cell depicts a large circle with a smaller shape inside. The shapes from left to right: a hexagon, a triangle, and a pentagon.
</step_1>



In [7]:
# Generate MC continuations from this prefix
mc_inputs = [(sample['rollout_user_prompt'], sample['image_path'])] * num_mc_sequences
mc_prefixes = [formatted_prefix] * num_mc_sequences

print(f"\nGenerating {num_mc_sequences} MC continuations...")

mc_responses = build_responses_azure_simple(
    inputs=mc_inputs,
    num_return_sequences=1,  # 1 response per input (we duplicate inputs instead)
    prefixes=mc_prefixes,
    max_new_tokens=8192,
    temperature=1.0
)

print(f"Generated {len(mc_responses)} MC responses")

# Display first MC continuation
print("\n" + "="*80)
print("MC CONTINUATION 1 (after prefix):")
print("="*80)
print(mc_responses[0][:500] + "..." if len(mc_responses[0]) < 500 else mc_responses[0])


INFO:rollout_inspection:Generating 4 responses



Generating 4 MC continuations...


INFO:httpx:HTTP Request: POST https://declaregpt4.openai.azure.com/openai/deployments/gpt-4.1/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
INFO:rollout_inspection:Generated response 1/4
INFO:httpx:HTTP Request: POST https://declaregpt4.openai.azure.com/openai/deployments/gpt-4.1/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
INFO:rollout_inspection:Generated response 2/4
INFO:httpx:HTTP Request: POST https://declaregpt4.openai.azure.com/openai/deployments/gpt-4.1/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
INFO:rollout_inspection:Generated response 3/4
INFO:httpx:HTTP Request: POST https://declaregpt4.openai.azure.com/openai/deployments/gpt-4.1/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
INFO:rollout_inspection:Generated response 4/4


Generated 4 MC responses

MC CONTINUATION 1 (after prefix):
[Perception]
<step_1>
The first row contains circles with different small shapes inside them:
- First cell: Large circle with a small black hexagon inside.
- Second cell: Large circle with a small downward triangle inside.
- Third cell: Large circle with a small black pentagon inside.
</step_1>
<step_2>
The second row contains large triangles with small shapes inside:
- First cell: Large triangle with a small outlined pentagon inside.
- Second cell: Large triangle with a small hexagon inside.
- Third cell: Large triangle with a small triangle inside.
</step_2>
<step_3>
The third row contains large diamonds (squares rotated 45 degrees) with small shapes inside:
- First cell: Large diamond with a small outlined triangle inside.
- Second cell: Large diamond with a small black hexagon inside.
- Third cell: Large diamond, but the inside shape is missing (question mark).
</step_3>
<step_4>
The answer set below contains 8 options (A-

In [8]:
# Parse and score each MC response
mc_correctness = []
mc_details = []

ground_truth = str(sample['correct_answer'])
print(f"Ground Truth Answer: {ground_truth}")
print("\nProcessing MC responses:")
print("-" * 60)

for mc_idx, mc_response in enumerate(mc_responses):
    try:
        # Parse the answer from the MC response
        # The response should contain the prefix + continuation
        full_response = formatted_prefix + mc_response
        
        # Extract answer using parse_answer function
        parsed_answer = parse_answer(full_response, prompt_version='raven_v2')
        answer_pred = parsed_answer[-1] if parsed_answer else ""
        
        # Check correctness
        correctness = check_answer(
            answer_pred=answer_pred,
            answer_gt=ground_truth,
            mode='raven_score_alphabet_only'
        )
        
        mc_correctness.append(correctness)
        mc_details.append({
            'mc_idx': mc_idx,
            'predicted': answer_pred,
            'correct': correctness == 1
        })
        
        print(f"MC {mc_idx}: Predicted '{answer_pred}' -> {'CORRECT' if correctness == 1 else 'INCORRECT'}")
        
    except Exception as e:
        print(f"MC {mc_idx}: Failed to parse - {e}")
        mc_correctness.append(0)
        mc_details.append({
            'mc_idx': mc_idx,
            'predicted': 'PARSE_ERROR',
            'correct': False
        })

# Calculate step score
step_score = sum(mc_correctness) / len(mc_correctness) if mc_correctness else 0.0

print("-" * 60)
print(f"\nStep {step_idx} MC Score: {step_score:.3f} ({sum(mc_correctness)}/{len(mc_correctness)} correct)")
print(f"This means {step_score * 100:.1f}% of MC continuations from this step reached the correct answer")


Ground Truth Answer: G

Processing MC responses:
------------------------------------------------------------
MC 0: Predicted 'G' -> CORRECT
MC 1: Predicted 'A' -> INCORRECT
MC 2: Predicted 'B' -> INCORRECT
MC 3: Predicted 'G' -> CORRECT
------------------------------------------------------------

Step 0 MC Score: 0.500 (2/4 correct)
This means 50.0% of MC continuations from this step reached the correct answer


In [9]:
# Run MC for multiple steps
max_steps_to_test = min(3, len(all_steps))  # Test first 3 steps
num_mc_per_step = 16
steps_with_scores = []

print(f"Running MC evaluation for first {max_steps_to_test} steps")
print("=" * 80)

for step_idx in range(max_steps_to_test):
    print(f"\n\nSTEP {step_idx} EVALUATION")
    print("-" * 60)
    
    # Determine if this is a perception or reasoning step
    if step_idx < perception_count:
        step_type = "Perception"
        step_num = step_idx + 1
    else:
        step_type = "Reasoning"
        step_num = step_idx - perception_count + 1
    
    print(f"Step Type: {step_type} Step {step_num}")
    print(f"Step Content: {all_steps[step_idx][:100]}...")
    
    # Build prefix for this step
    prefix_steps = all_steps[:step_idx + 1]
    
    # Format prefix based on step type
    if step_idx < perception_count:
        # Still in perception phase
        formatted_prefix = "[Perception]\n"
        for i, step in enumerate(prefix_steps):
            formatted_prefix += f"<step_{i+1}>\n{step}\n</step_{i+1}>\n"
    else:
        # In reasoning phase
        formatted_prefix = "[Perception]\n"
        for i, step in enumerate(parsed['perception_steps']):
            formatted_prefix += f"<step_{i+1}>\n{step}\n</step_{i+1}>\n"
        formatted_prefix += "\n[Reasoning]\n"
        reasoning_steps = prefix_steps[perception_count:]
        for i, step in enumerate(reasoning_steps):
            formatted_prefix += f"<step_{i+1}>\n{step}\n</step_{i+1}>\n"
    
    # Generate MC continuations
    mc_inputs = [(sample['rollout_user_prompt'], sample['image_path'])] * num_mc_per_step
    mc_prefixes = [formatted_prefix] * num_mc_per_step
    
    print(f"\nGenerating {num_mc_per_step} MC continuations...")
    
    mc_responses = build_responses_azure_simple(
        inputs=mc_inputs,
        num_return_sequences=1,
        prefixes=mc_prefixes,
        max_new_tokens=8192,
        temperature=1.0
    )
    
    # Score MC responses
    mc_correctness = []
    for mc_idx, mc_response in enumerate(mc_responses):
        try:
            full_response = formatted_prefix + mc_response
            parsed_answer = parse_answer(full_response, prompt_version='raven_v2')
            answer_pred = parsed_answer[-1] if parsed_answer else ""
            
            correctness = check_answer(
                answer_pred=answer_pred,
                answer_gt=str(sample['correct_answer']),
                mode='raven_score_alphabet_only'
            )
            
            mc_correctness.append(correctness)
            print(f"  MC {mc_idx}: {answer_pred} -> {'✓' if correctness == 1 else '✗'}")
            
        except Exception as e:
            mc_correctness.append(0)
            print(f"  MC {mc_idx}: Parse error")
    
    # Calculate and store step score
    step_score = sum(mc_correctness) / len(mc_correctness) if mc_correctness else 0.0
    
    steps_with_scores.append({
        'step_idx': step_idx,
        'step_type': step_type,
        'step_num': step_num,
        'step_content': all_steps[step_idx][:100] + "...",
        'score': step_score,
        'num_correct': sum(mc_correctness),
        'num_total': len(mc_correctness)
    })
    
    print(f"\nStep Score: {step_score:.3f} ({sum(mc_correctness)}/{len(mc_correctness)})")

# Summary
print("\n" + "=" * 80)
print("MC EVALUATION SUMMARY")
print("=" * 80)
for step_result in steps_with_scores:
    print(f"Step {step_result['step_idx']} ({step_result['step_type']} {step_result['step_num']}): "
          f"Score = {step_result['score']:.3f} "
          f"({step_result['num_correct']}/{step_result['num_total']} correct)")

# Check if early stopping would trigger
print("\nEarly Stopping Analysis:")
for i, step_result in enumerate(steps_with_scores):
    if step_result['score'] == 0.0:
        print(f"Early stopping would trigger at step {i} (score = 0.0)")
        break
else:
    print("No early stopping triggered (all steps have score > 0)")


INFO:rollout_inspection:Generating 16 responses


Running MC evaluation for first 3 steps


STEP 0 EVALUATION
------------------------------------------------------------
Step Type: Perception Step 1
Step Content: I observe the first row of the matrix. Each cell depicts a large circle with a smaller shape inside....

Generating 16 MC continuations...


INFO:httpx:HTTP Request: POST https://declaregpt4.openai.azure.com/openai/deployments/gpt-4.1/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
INFO:rollout_inspection:Generated response 1/16
INFO:httpx:HTTP Request: POST https://declaregpt4.openai.azure.com/openai/deployments/gpt-4.1/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
INFO:rollout_inspection:Generated response 2/16
INFO:httpx:HTTP Request: POST https://declaregpt4.openai.azure.com/openai/deployments/gpt-4.1/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
INFO:rollout_inspection:Generated response 3/16
INFO:httpx:HTTP Request: POST https://declaregpt4.openai.azure.com/openai/deployments/gpt-4.1/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
INFO:rollout_inspection:Generated response 4/16
INFO:httpx:HTTP Request: POST https://declaregpt4.openai.azure.com/openai/deployments/gpt-4.1/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
INFO:rol

  MC 0: G -> ✓
  MC 1: G -> ✓
  MC 2: A -> ✗
  MC 3: A -> ✗
  MC 4: B -> ✗
  MC 5: B -> ✗
  MC 6: F -> ✗
  MC 7: G -> ✓
  MC 8: F -> ✗
  MC 9: G -> ✓
  MC 10: G -> ✓
  MC 11: G -> ✓
  MC 12: F -> ✗
  MC 13: B -> ✗
  MC 14: G -> ✓
  MC 15: H -> ✗

Step Score: 0.438 (7/16)


STEP 1 EVALUATION
------------------------------------------------------------
Step Type: Perception Step 2
Step Content: I observe the second row. Each cell now shows a large triangle, containing a smaller shape inside. F...

Generating 16 MC continuations...


INFO:httpx:HTTP Request: POST https://declaregpt4.openai.azure.com/openai/deployments/gpt-4.1/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
INFO:rollout_inspection:Generated response 1/16
INFO:httpx:HTTP Request: POST https://declaregpt4.openai.azure.com/openai/deployments/gpt-4.1/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
INFO:rollout_inspection:Generated response 2/16
INFO:httpx:HTTP Request: POST https://declaregpt4.openai.azure.com/openai/deployments/gpt-4.1/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
INFO:rollout_inspection:Generated response 3/16
INFO:httpx:HTTP Request: POST https://declaregpt4.openai.azure.com/openai/deployments/gpt-4.1/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
INFO:rollout_inspection:Generated response 4/16
INFO:httpx:HTTP Request: POST https://declaregpt4.openai.azure.com/openai/deployments/gpt-4.1/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
INFO:rol

  MC 0: A -> ✗
  MC 1: C -> ✗
  MC 2: C -> ✗
  MC 3: G -> ✓
  MC 4: F -> ✗
  MC 5: E -> ✗
  MC 6: G -> ✓
  MC 7: A -> ✗
  MC 8: A -> ✗
  MC 9: B -> ✗
  MC 10: B -> ✗
  MC 11: A -> ✗
  MC 12: G -> ✓
  MC 13: B -> ✗
  MC 14: G -> ✓
  MC 15: C -> ✗

Step Score: 0.250 (4/16)


STEP 2 EVALUATION
------------------------------------------------------------
Step Type: Perception Step 3
Step Content: I observe the third row. Each cell presents a large diamond (square rotated 45 degrees with respect ...

Generating 16 MC continuations...


INFO:httpx:HTTP Request: POST https://declaregpt4.openai.azure.com/openai/deployments/gpt-4.1/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
INFO:rollout_inspection:Generated response 1/16
INFO:httpx:HTTP Request: POST https://declaregpt4.openai.azure.com/openai/deployments/gpt-4.1/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
INFO:rollout_inspection:Generated response 2/16
INFO:httpx:HTTP Request: POST https://declaregpt4.openai.azure.com/openai/deployments/gpt-4.1/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
INFO:rollout_inspection:Generated response 3/16
INFO:httpx:HTTP Request: POST https://declaregpt4.openai.azure.com/openai/deployments/gpt-4.1/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
INFO:rollout_inspection:Generated response 4/16
INFO:httpx:HTTP Request: POST https://declaregpt4.openai.azure.com/openai/deployments/gpt-4.1/chat/completions?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
INFO:rol

  MC 0: C -> ✗
  MC 1: F -> ✗
  MC 2: G -> ✓
  MC 3: G -> ✓
  MC 4: G -> ✓
  MC 5: F -> ✗
  MC 6: B -> ✗
  MC 7: H -> ✗
  MC 8: B -> ✗
  MC 9: B -> ✗
  MC 10: A -> ✗
  MC 11: A -> ✗
  MC 12: B -> ✗
  MC 13: F -> ✗
  MC 14: C -> ✗
  MC 15: F -> ✗

Step Score: 0.188 (3/16)

MC EVALUATION SUMMARY
Step 0 (Perception 1): Score = 0.438 (7/16 correct)
Step 1 (Perception 2): Score = 0.250 (4/16 correct)
Step 2 (Perception 3): Score = 0.188 (3/16 correct)

Early Stopping Analysis:
No early stopping triggered (all steps have score > 0)


In [10]:
# Demonstrate answer parsing in detail
test_response = response_list[0]  # Use the first generated response

print("ANSWER PARSING DEMONSTRATION")
print("=" * 80)

# Extract the answer section from the response
import re
answer_pattern = r'<correct_answer>(.*?)</correct_answer>'
answer_match = re.search(answer_pattern, test_response, re.DOTALL)

if answer_match:
    answer_section = answer_match.group(1).strip()
    print(f"Raw answer section:\n{answer_section}")
    print("\n" + "-" * 60)
    
    # Parse answer using the parse_answer function
    print("Parsing with parse_answer function:")
    parsed_answers = parse_answer(test_response, prompt_version='raven_v2')
    print(f"Parsed answers: {parsed_answers}")
    
    if parsed_answers:
        final_answer = parsed_answers[-1]
        print(f"Final answer extracted: '{final_answer}'")
        
        # Check answer correctness
        print("\n" + "-" * 60)
        print("ANSWER CHECKING:")
        print(f"Ground truth: '{sample['correct_answer']}'")
        print(f"Predicted: '{final_answer}'")
        
        correctness = check_answer(
            answer_pred=final_answer,
            answer_gt=str(sample['correct_answer']),
            mode='raven_score_alphabet_only'
        )
        
        print(f"Correctness score: {correctness}")
        print(f"Answer is: {'CORRECT' if correctness == 1 else 'INCORRECT'}")
        
        # Show what the scoring mode does
        print("\n" + "-" * 60)
        print("SCORING MODE: 'raven_score_alphabet_only'")
        print("This mode extracts alphabetic characters from the answer and compares them.")
        print("For example:")
        print("  - '$\\boxed{D}$' → 'D'")
        print("  - 'Answer: D' → 'D'")
        print("  - 'D is correct' → 'D'")

else:
    print("Could not find answer section in response")

# Final summary
print("\n" + "=" * 80)
print("COMPLETE MC ROLLOUT PROCESS SUMMARY:")
print("=" * 80)
print("1. Generate initial rollout with perception and reasoning steps")
print("2. For each step, create a prefix containing all steps up to that point")
print("3. Generate multiple MC continuations from each prefix")
print("4. Parse the final answer from each MC continuation")
print("5. Check correctness against ground truth")
print("6. Calculate MC score as (# correct) / (# total MC samples)")
print("7. Use scores to evaluate quality of each reasoning step")
print("\nThe MC score indicates how likely the model is to reach the correct answer")
print("when continuing from that specific step in the reasoning process.")


ANSWER PARSING DEMONSTRATION
Raw answer section:
$\boxed{A}$

------------------------------------------------------------
Parsing with parse_answer function:
Parsed answers: (None, 'A')
Final answer extracted: 'A'

------------------------------------------------------------
ANSWER CHECKING:
Ground truth: 'G'
Predicted: 'A'
Correctness score: 0
Answer is: INCORRECT

------------------------------------------------------------
SCORING MODE: 'raven_score_alphabet_only'
This mode extracts alphabetic characters from the answer and compares them.
For example:
  - '$\boxed{D}$' → 'D'
  - 'Answer: D' → 'D'
  - 'D is correct' → 'D'

COMPLETE MC ROLLOUT PROCESS SUMMARY:
1. Generate initial rollout with perception and reasoning steps
2. For each step, create a prefix containing all steps up to that point
3. Generate multiple MC continuations from each prefix
4. Parse the final answer from each MC continuation
5. Check correctness against ground truth
6. Calculate MC score as (# correct) / (# to