In [1]:
import pandas as pd
from itertools import product

In [2]:
# Experimental factors (each list is one dimension of the design)

# Model labels; the gpt-5 entries end in a "-<effort>" suffix.
models = [
    "gpt-5-low",
    "gpt-5-medium",
    "gpt-5-high",
    "gpt-4o",
    "gpt-4o-mini",
]

# (first character, second character) name pairs used in the story.
names = [
    ("Alice", "Beth"),
    ("Denise", "Vicky"),
    ("Emma", "Sophia"),
]

# Two orderings of the same four container colors.
colors = [
    ("blue", "purple", "red", "green"),
    ("green", "red", "purple", "blue"),
]

# Labels selecting how the move of the object is described.
conditions = [
    "Ignorance",
    "Knowledge-plausible",
    "Knowledge-implausible",
]

# Objects that get put away and moved in the story.
instrument = [
    "violin",
    "flute",
    "ball",
]

# Repetition indices 0..4.
trials = [*range(5)]

# Function to generate the stimuli 
def generate_stimuli(condition, names, colors, instrument):
    """Build the prompt text for one scenario.

    Args:
        condition: One of "Ignorance", "Knowledge-plausible" or
            "Knowledge-implausible". It selects how the second character's
            move is worded: to "another container", to the ``colors[2]``
            container, or to the ``colors[1]`` container respectively.
        names: Indexable pair; ``names[0]`` puts the object away and leaves,
            ``names[1]`` moves it and rearranges the containers.
        colors: Indexable sequence of four container colors in their initial
            left-to-right order.
        instrument: Name of the object that is put away and moved.

    Returns:
        The full prompt as a string, ending with the JSON answer template
        keyed by the four colors.

    Raises:
        ValueError: If ``condition`` is not one of the three known labels.
            (Previously this surfaced as an UnboundLocalError while
            formatting the story.)
    """
    if condition == "Ignorance":
        cond_statement = f"moves the {instrument} to another container."
    elif condition == "Knowledge-plausible":
        cond_statement = f"moves the {instrument} to the {colors[2]} container."
    elif condition == "Knowledge-implausible":
        cond_statement = f"moves the {instrument} to the {colors[1]} container."
    else:
        # Fail loudly with a readable message instead of leaving
        # cond_statement unbound for the f-string below.
        raise ValueError(
            f"Unknown condition {condition!r}; expected 'Ignorance', "
            "'Knowledge-plausible' or 'Knowledge-implausible'."
        )

    # The template text (including its indentation and line breaks) is part of
    # the stimulus sent to the model, so it is kept exactly as is.
    story = f"""Consider the following scenario: 
    In a room, there are four containers from left to right spaced out evenly: a {colors[0]} container, a {colors[1]} container, a {colors[2]} container, and a {colors[3]} container.
    {names[0]} finishes playing her {instrument} and puts it in the {colors[0]} container. 
    Then she goes outside to play. While {names[0]} is outside playing, her sister, {names[1]} {cond_statement} 
    Then {names[1]} swaps the positions of the containers in the room, changing the order from left to right to: {colors[2]}, {colors[3]}, {colors[1]}, and {colors[0]}.
    When {names[0]} returns, she wants to play her {instrument}. 
    What are the chances {names[0]} will first look for her {instrument} in each of the above containers? 
    Put your answer in percentages (must sum to 100) in the JSON structure inside <answer> tags like below:
    <answer>
    {{
        "{colors[0]}": ___,
        "{colors[1]}": ___,
        "{colors[2]}": ___,
        "{colors[3]}": ___
    }}
    </answer>
    """

    return story

In [3]:
# Generate the initial DataFrame: one row per combination of
# model x name pair x color order x condition x instrument x trial.
data = []

for model, name_pair, color_set, condition, instrument_item, trial in product(
    models, names, colors, conditions, instrument, trials
):
    stimulus = generate_stimuli(condition, name_pair, color_set, instrument_item)
    data.append({
        "names": name_pair,
        "colors": color_set,
        "condition": condition,
        "model": model,
        "trial": trial,
        "stimulus": stimulus,
        # Record the object as its own column so this factor does not have to
        # be recovered by parsing the prompt text. It is added last so the
        # positions of the pre-existing columns stay unchanged.
        "instrument": instrument_item,
    })

df = pd.DataFrame(data)
# Note: the tuple-valued "names" and "colors" cells are written to the CSV as
# their string representation.
df.to_csv("stimuli.csv", index=False)

In [7]:
from openai import OpenAI
import backoff

@backoff.on_exception(backoff.expo, Exception, max_tries=5)
def generate_openai(content, model="gpt-5"):
    """Send a single user message through the OpenAI Responses API.

    Any exception is retried with exponential backoff, up to 5 attempts in
    total (see the decorator).

    Args:
        content: Value used as the ``content`` of the one user message.
        model: Model label. For labels containing "gpt-5", a trailing
            "-minimal", "-low", "-medium" or "-high" is stripped from the
            model name and sent as the reasoning effort instead (so
            "gpt-5-low" calls model "gpt-5" with effort "low"). Labels
            without such a suffix, including the default "gpt-5", are sent
            unchanged with no explicit effort. Other models are called
            without a reasoning configuration.

    Returns:
        A tuple ``(output_text, reasoning_summary)``. ``reasoning_summary``
        is "" when the response carries no summary.
    """
    reasoning = None
    model_name = model
    if "gpt-5" in model:
        reasoning = {"summary": "detailed"}
        # Only treat the last dash-separated token as an effort level when it
        # really is one; previously the default "gpt-5" produced effort "5"
        # and e.g. a "-mini" variant was silently turned into plain "gpt-5".
        base_name, _, suffix = model.rpartition("-")
        if suffix in ("minimal", "low", "medium", "high"):
            reasoning["effort"] = suffix
            model_name = base_name

    client = OpenAI()
    response = client.responses.create(
        model=model_name,
        input=[
            {
                "role": "user",
                "content": content
            }
        ],
        reasoning=reasoning,
        store=True,
        temperature=1
    )

    # Extract the reasoning summary if available. The summary is a list of
    # parts; join all parts of the first output item that has one instead of
    # keeping only the first part.
    # NOTE(review): rows collected with the earlier version hold only the
    # first summary part — keep that in mind when comparing across runs.
    reasoning_summary = ""
    for item in getattr(response, "output", None) or []:
        parts = getattr(item, "summary", None) or []
        texts = [part.text for part in parts if getattr(part, "text", None)]
        if texts:
            reasoning_summary = "\n\n".join(texts)
            break

    return response.output_text, reasoning_summary
# Smoke test: one live call with a trivial prompt; the cell displays the
# returned (text, reasoning summary) tuple.
generate_openai(
    "Hello, how are you?",
    model="gpt-4o",
)

("Hello! I'm here to help. How are you doing?", '')

In [7]:
# Read the stimuli back from disk and append two empty output columns
results_df = pd.read_csv("stimuli.csv").assign(
    result="",     # placeholder for the model's response
    reasoning="",  # placeholder for the reasoning content
)

In [8]:
# Resume from the checkpoint file if one exists; on a first run there is
# none, so keep the freshly prepared results_df from the previous cell
# instead of failing with FileNotFoundError.
try:
    results_df = pd.read_csv("model_responses_partial.csv")
except FileNotFoundError:
    print("No partial results found; starting from the full stimuli table.")

In [10]:
# Loop through each row and get model responses

# After a CSV round trip, empty cells come back as NaN and an all-empty column
# is typed float64. Make both output columns object-typed so that writing
# strings into them below is always valid.
for column in ("result", "reasoning"):
    if column in results_df.columns:
        results_df[column] = results_df[column].astype(object)

try:
    for index, row in results_df.iterrows():
        # Skip if already processed (non-empty, non-missing result)
        if row['result'] != "" and pd.notna(row['result']):
            continue

        response_text, reasoning_summary = generate_openai(
            row['stimulus'], model=row['model']
        )
        results_df.at[index, 'result'] = response_text
        results_df.at[index, 'reasoning'] = reasoning_summary

        # Checkpoint whenever a processed row's index is a multiple of 30
        if index % 30 == 0:
            print(f"Processed {index} rows...")
            results_df.to_csv("model_responses_partial.csv", index=False)
finally:
    # Always write the checkpoint on the way out, so responses gathered since
    # the last periodic save survive a failed API call or a kernel interrupt
    # (the original exception, if any, still propagates).
    results_df.to_csv("model_responses_partial.csv", index=False)

# Save the final results to a CSV file
results_df.to_csv("model_responses.csv", index=False)
print("Done!")

Processed 810 rows...
Processed 840 rows...
Processed 870 rows...
Processed 900 rows...
Processed 930 rows...
Processed 960 rows...
Processed 990 rows...
Processed 1020 rows...
Processed 1050 rows...
Processed 1080 rows...
Processed 1110 rows...
Processed 1140 rows...
Processed 1170 rows...
Processed 1200 rows...
Processed 1230 rows...
Processed 1260 rows...
Processed 1290 rows...
Processed 1320 rows...
Done!
