In [1]:
import ast
import base64
from itertools import product

import pandas as pd

In [None]:
# Experiment configuration: the stimulus table is the full cross product of these lists.
models = [
    "gpt-5-low",
    "gpt-5-medium",
    "gpt-5-high",
    "gpt-4o",
    "gpt-4o-mini",
]
# Only one name pair to save cost
names = [("Vicki", "Denise")]
# The second color order differs because the images were generated in this new order instead
colors = [
    ("blue", "purple", "red", "green"),
    ("green", "red", "blue", "purple"),
]
conditions = ["Ignorance", "Knowledge-plausible", "Knowledge-implausible"]
instruments = ["violin", "flute", "ball"]
# Trial indices 0..4 (five repetitions of every combination)
trials = [*range(5)]

# Read an image file from disk and return its contents as base64 text
def encode_image(image_path):
    """Return the bytes of the file at ``image_path`` as a base64 string.

    The file is opened in binary mode, read in full, base64-encoded and the
    result decoded as UTF-8 so it can be embedded in a text payload.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

# Function to generate the multimodal stimulus
def generate_stimulus(condition, names, colors, instrument):
    """Generate multimodal stimulus with text and images.

    Args:
        condition: One of "Ignorance", "Knowledge-plausible" or
            "Knowledge-implausible"; selects the sentence describing where the
            object is moved.
        names: Indexable pair; ``names[0]`` is the owner of the object and
            ``names[1]`` the sister who moves it.
        colors: Indexable with at least four entries. ``colors[0]`` is the
            container the object is first put in; ``colors[2]`` and
            ``colors[1]`` are the destinations named in the plausible and
            implausible conditions respectively. All four appear in the image
            file names and in the answer template.
        instrument: Name of the object, used in the text and the image paths.

    Returns:
        A list of content items for the OpenAI API: three ``input_text`` items
        interleaved with two ``input_image`` items carrying base64 JPEG data
        URLs.

    Raises:
        ValueError: If ``condition`` is not one of the three known values.
            This is checked before any image file is read.
        OSError: Propagated from ``encode_image`` if an image file under
            ``images/`` cannot be opened.
    """
    # Condition-specific text. An unknown condition is rejected here instead of
    # surfacing later as an unbound ``cond_statement``.
    if condition == "Ignorance":
        cond_statement = f"moves the {instrument} to another container."
    elif condition == "Knowledge-plausible":
        cond_statement = f"moves the {instrument} to the {colors[2]} container."
    elif condition == "Knowledge-implausible":
        cond_statement = f"moves the {instrument} to the {colors[1]} container."
    else:
        raise ValueError(
            f"Unknown condition {condition!r}; expected 'Ignorance', "
            "'Knowledge-plausible' or 'Knowledge-implausible'"
        )

    # Image paths: the file names encode the instrument and the color order
    color_str = f"{colors[0]}-{colors[1]}-{colors[2]}-{colors[3]}"
    first_image_path = f"images/first-{instrument}-{color_str}.jpg"
    second_image_path = f"images/second-{instrument}-{color_str}.jpg"

    # Encode images
    first_image_b64 = encode_image(first_image_path)
    second_image_b64 = encode_image(second_image_path)

    # Build multimodal content
    content = [
        {
            "type": "input_text",
            "text": f"This is {names[0]}. She finishes playing her {instrument} and puts it in the {colors[0]} container. Then she goes outside to play."
        },
        {
            "type": "input_image",
            "image_url": f"data:image/jpeg;base64,{first_image_b64}"
        },
        {
            "type": "input_text",
            "text": f"While {names[0]} is outside playing, her sister, {names[1]} {cond_statement}\nThen, {names[1]} rearranges the containers in the room until the room looks like the picture below."
        },
        {
            "type": "input_image",
            "image_url": f"data:image/jpeg;base64,{second_image_b64}"
        },
        {
            "type": "input_text",
            "text": f"""When {names[0]} returns, she wants to play her {instrument}.
What are the chances {names[0]} will first look for her {instrument} in each of the above containers?
Put your answer in percentages (must sum to 100) in the JSON structure inside <answer> tags like below:
<answer>
{{
    \"{colors[0]}\": ___,
    \"{colors[1]}\": ___,
    \"{colors[2]}\": ___,
    \"{colors[3]}\": ___
}}
</answer>"""
        }
    ]

    return content

# Build the text-only stimulus summary that is stored in the CSV
def generate_stimulus_description(condition, names, colors, instrument):
    """Return a one-line text summary of a stimulus for CSV storage.

    The summary lists the names, colors, instrument and condition, followed by
    the two image file names derived from the instrument and the first four
    entries of ``colors``.
    """
    image_suffix = f"{instrument}-{colors[0]}-{colors[1]}-{colors[2]}-{colors[3]}.jpg"
    return (
        f"[IMAGE STIMULUS] names={names}, colors={colors}, "
        f"instrument={instrument}, condition={condition}, "
        f"images=first-{image_suffix} + second-{image_suffix}"
    )

In [8]:
# Build the stimulus table: one row per combination of
# (model, name pair, color order, condition, instrument, trial).
data = []
all_combinations = product(models, names, colors, conditions, instruments, trials)

for model, name_pair, color_set, condition, instrument, trial in all_combinations:
    stimulus_desc = generate_stimulus_description(condition, name_pair, color_set, instrument)
    row_record = dict(
        names=name_pair,
        colors=color_set,
        condition=condition,
        model=model,
        instrument=instrument,
        trial=trial,
        stimulus=stimulus_desc,
    )
    data.append(row_record)

# Tuples in "names"/"colors" are written to the CSV as their string form.
df = pd.DataFrame(data)
print(f"Generated {len(df)} stimuli")
df.to_csv("stimuli_image.csv", index=False)

Generated 450 stimuli


In [4]:
from openai import OpenAI
import backoff

# Effort suffixes recognised at the end of a "gpt-5*" model label (e.g. "gpt-5-low").
_REASONING_EFFORTS = ("minimal", "low", "medium", "high")


def _split_model_label(model):
    """Split an experiment model label into (API model name, reasoning options).

    Labels that do not contain "gpt-5" are returned unchanged with empty
    reasoning options. For "gpt-5*" labels a detailed reasoning summary is
    requested; if the last "-"-separated segment is a known effort level it is
    removed from the model name and passed as the effort, otherwise the label
    is used as-is and no effort is set.
    """
    if "gpt-5" not in model:
        return model, {}

    reasoning = {"summary": "detailed"}
    base, _, suffix = model.rpartition("-")
    if suffix in _REASONING_EFFORTS:
        reasoning["effort"] = suffix
        return base, reasoning
    # e.g. plain "gpt-5": previously the "5" was sent as the effort value.
    return model, reasoning


def _collect_reasoning_summary(response):
    """Join the text of every summary part found on any item of ``response.output``."""
    texts = []
    for item in getattr(response, "output", None) or []:
        for part in getattr(item, "summary", None) or []:
            text = getattr(part, "text", None)
            if text:
                texts.append(text)
    return "\n\n".join(texts)


# Retries on any exception with exponential backoff, giving up after 5 tries.
@backoff.on_exception(backoff.expo, Exception, max_tries=5)
def generate_openai_image(content, model="gpt-5"):
    """Call OpenAI API with multimodal content (text + images).

    Args:
        content: List of content items sent as the single user message.
        model: Experiment model label. For labels containing "gpt-5", a
            trailing effort segment ("minimal", "low", "medium", "high") is
            stripped and sent as the reasoning effort, e.g. "gpt-5-low" calls
            model "gpt-5" with effort "low". Other labels are passed through
            unchanged and no reasoning options are sent.

    Returns:
        A ``(output_text, reasoning_summary)`` tuple of strings. The summary is
        all summary parts found in the response joined by blank lines, or ""
        when there are none.
    """
    model_name, reasoning = _split_model_label(model)

    client = OpenAI()
    response = client.responses.create(
        model=model_name,
        input=[
            {
                "role": "user",
                "content": content
            }
        ],
        reasoning=reasoning or None,
        store=True,
        temperature=1
    )

    # Collect every summary part rather than only the first part of the first
    # output item, so multi-part summaries are not truncated.
    reasoning_summary = _collect_reasoning_summary(response)

    return response.output_text, reasoning_summary

In [9]:
# Read the stimulus table back from disk and attach empty output columns
results_df = pd.read_csv("stimuli_image.csv").assign(
    result="",     # placeholder for model responses
    reasoning="",  # placeholder for reasoning content
)
print(f"Loaded {len(results_df)} stimuli")

Loaded 450 stimuli


In [None]:
# Load the partial results if any.
# When no checkpoint file exists (e.g. on a fresh run) this cell leaves any
# existing `results_df` untouched instead of failing with FileNotFoundError.
try:
    results_df = pd.read_csv("model_responses_partial_image.csv")
except FileNotFoundError:
    print("No partial results file found; nothing to resume")
else:
    print(f"Resuming from {len(results_df)} rows")

In [10]:
# Loop through each row and get model responses

# After a CSV round trip an entirely empty output column is read back as
# float NaN; cast to object so string responses can be stored in it.
for _col in ("result", "reasoning"):
    if _col in results_df.columns:
        results_df[_col] = results_df[_col].astype(object)

try:
    for index, row in results_df.iterrows():
        # Skip if already processed
        if row['result'] != "" and pd.notna(row['result']):
            continue

        # The CSV stores the tuples as their string form; parse them as Python
        # literals only (no arbitrary code execution, unlike eval). Local
        # names are used so the `names` / `colors` configuration lists are not
        # overwritten by this loop.
        name_pair = ast.literal_eval(row['names'])
        color_set = ast.literal_eval(row['colors'])

        # Generate multimodal content on-the-fly
        content = generate_stimulus(row['condition'], name_pair, color_set, row['instrument'])
        response_text, reasoning_summary = generate_openai_image(content, model=row['model'])

        results_df.at[index, 'result'] = response_text
        results_df.at[index, 'reasoning'] = reasoning_summary

        # Save the results to the partial CSV file every 30 rows
        if index % 30 == 0:
            print(f"Processed {index} rows...")
            results_df.to_csv("model_responses_partial_image.csv", index=False)
except BaseException:
    # On any failure or a manual interrupt, checkpoint what has been collected
    # so far so the run can be resumed, then re-raise.
    results_df.to_csv("model_responses_partial_image.csv", index=False)
    raise

# Save the final results to a CSV file
results_df.to_csv("model_responses_image.csv", index=False)
print(f"Done! Saved {len(results_df)} results.")

Processed 0 rows...
Processed 30 rows...
Processed 60 rows...
Processed 90 rows...
Processed 120 rows...
Processed 150 rows...
Processed 180 rows...
Processed 210 rows...
Processed 240 rows...
Processed 270 rows...
Processed 300 rows...
Processed 330 rows...
Processed 360 rows...
Processed 390 rows...
Processed 420 rows...
Done! Saved 450 results.
