In [1]:
%pip install openai python-dotenv


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
from openai import OpenAI
import os
import json
import time
from dotenv import load_dotenv

load_dotenv()
client = OpenAI()

test_path = os.path.abspath(os.getcwd()).split('gpt5_completion_scripts')[0] + 'processed_data/gpt5/processed_test.jsonl'
output_dir = os.path.abspath(os.getcwd()).split('gpt5_completion_scripts')[0] + 'generated_data_gpt5'
os.makedirs(output_dir, exist_ok=True)

test_data = []
with open(test_path, 'r') as f:
    for line in f:
        test_data.append(json.loads(line))

print(f"Loaded {len(test_data)} test items")

Loaded 950 test items


In [3]:
def check_batch_status():
    batches = client.batches.list(limit=20)
    print("Current batches:")
    for batch in batches.data:
        metadata = batch.metadata or {}
        model = metadata.get('model', 'unknown')
        print(f"ID: {batch.id}, Status: {batch.status}, Model: {model}")
    return batches.data

current_batches = check_batch_status()

Current batches:
ID: batch_68b46faa0d888190b788c206f1f2320b, Status: completed, Model: unknown
ID: batch_68b2938a95308190a245e6c4350e900d, Status: completed, Model: unknown
ID: batch_68b264320da08190b0e7e7f79d18fce0, Status: completed, Model: unknown
ID: batch_68b249509ec081908c54e5b3c2133f4c, Status: completed, Model: unknown
ID: batch_68af7dab05208190b24f46e801a5b2cd, Status: completed, Model: unknown
ID: batch_68aeb599b4a08190bd034d80599f2c91, Status: completed, Model: unknown
ID: batch_68aeb405dc84819099117c079886f1b2, Status: completed, Model: unknown
ID: batch_68aeaff743a88190b76ae9fdd9c6ac68, Status: cancelled, Model: unknown
ID: batch_68aeaf8861448190b6bacdc7f85a601d, Status: cancelled, Model: unknown
ID: batch_68abbb6cb42481909b20f970a06033db, Status: failed, Model: unknown
ID: batch_68abbb5219388190917e9002abe23b86, Status: failed, Model: unknown
ID: batch_68abb9948e3c8190b0acb2820d737d2d, Status: failed, Model: unknown
ID: batch_68ab9e3b39c0819092c6fd3ec55dcff7, Status: comp

In [5]:
def load_prompt(filename):
    with open(filename, 'r') as f:
        return f.read()

generator_system = load_prompt('generator_system_prompt.md')
evaluator_system = load_prompt('evaluator_system_prompt.md')

def extract_passage(prompt_text):
    return prompt_text.split('###')[0].strip()

def wait_for_batch(batch_id, description="batch"):
    print(f"Waiting for {description} to complete...")
    while True:
        batch = client.batches.retrieve(batch_id)
        print(f"Status: {batch.status}")
        if batch.status == "completed":
            return batch
        elif batch.status == "failed":
            print(f"Batch failed: {batch}")
            return None
        time.sleep(30)

def process_completed_batch(batch_id, scenario_name, output_filename):
    try:
        batch = client.batches.retrieve(batch_id)
        
        if batch.status != "completed":
            print(f"Batch {batch_id} status: {batch.status}")
            return None
        
        result_file_id = batch.output_file_id
        result = client.files.content(result_file_id)
        
        results = {}
        for line in result.text.strip().split('\n'):
            response = json.loads(line)
            custom_id = response['custom_id']
            
            if 'error' in response and response['error']:
                print(f"Error in {custom_id}: {response['error']}")
                continue
                
            if response['response']['status_code'] != 200:
                print(f"API Error in {custom_id}: {response['response']['body']}")
                continue
                
            quiz_content = response['response']['body']['choices'][0]['message']['content']
            
            results[custom_id] = {
                "item_id": custom_id,
                "variant": scenario_name,
                "round": 1,
                "quiz": quiz_content
            }
        
        with open(f"{output_dir}/{output_filename}", 'w') as f:
            json.dump(results, f, indent=2)
        
        print(f"{scenario_name} completed: {len(results)} items saved to {output_filename}")
        return results
        
    except Exception as e:
        print(f"Error processing batch {batch_id}: {e}")
        return None
        
print("Setup complete")

Setup complete


In [6]:
def download_and_transform_gpt5_batch(batch_ids, file_name):
    # Handle both single batch_id and list of batch_ids
    if isinstance(batch_ids, str):
        batch_ids = [batch_ids]
    
    combined_results = {}
    
    for idx, batch_id in enumerate(batch_ids):
        if batch_id is None:
            print(f"Skipping batch {idx+1} (failed to create)")
            continue
            
        print(f"Downloading batch {idx+1}/{len(batch_ids)}: {batch_id}")
        
        try:
            batch = client.batches.retrieve(batch_id)
            output_file_id = batch.output_file_id
            print(f"Output file ID: {output_file_id}")
            file_response = client.files.content(output_file_id)
            
            for line in file_response.text.strip().split('\n'):
                row = json.loads(line)
                try:
                    quiz_text = row["response"]["body"]["output"][1]["content"][0]["text"]
                    # Use the custom_id as the key to maintain original indexing
                    custom_id = row["custom_id"]
                    combined_results[custom_id] = quiz_text
                except Exception as e:
                    print(f"Error processing row in batch {idx+1}: {e}")
                    continue
                    
        except Exception as e:
            print(f"Error downloading batch {idx+1}: {e}")
            continue
    
    # Save combined results
    output_path = os.path.join(os.path.dirname(os.getcwd()), "generated_data_gpt5", file_name)
    with open(output_path, "w") as f:
        json.dump(combined_results, f, indent=2)
    
    print(f"Results saved to {file_name}: {len(combined_results)} items")
    return combined_results

In [14]:
# Unified Batch File Creation Function
def create_batch_files(num_items, request_body_template, custom_id_prefix, batch_name_prefix, batch_suffix=""):
    print(f"Creating {batch_name_prefix} batch files ({num_items} items)")
    
    batch_size = 475
    file_ids = []
    
    # Calculate number of batches needed
    num_batches = (num_items + batch_size - 1) // batch_size
    
    for batch_idx in range(num_batches):
        start_idx = batch_idx * batch_size
        end_idx = min(start_idx + batch_size, num_items, len(test_data))
        
        requests = []
        for i in range(start_idx, end_idx):
            passage = extract_passage(test_data[i]['prompt'])
            user_prompt = f"Generate a quiz for this passage:\n\n{passage}"
            
            # Create request body from template
            request_body = request_body_template.copy()
            request_body["input"] = user_prompt
            
            requests.append({
                "custom_id": f"{custom_id_prefix}{batch_suffix}_{i+1}",
                "method": "POST",
                "url": "/v1/responses",
                "body": request_body
            })
        
        # Create batch file with batch index suffix
        batch_file_suffix = f"{batch_suffix}_batch{batch_idx+1}" if num_batches > 1 else batch_suffix
        batch_file = f"{output_dir}/batch_{batch_name_prefix.lower()}{batch_file_suffix}.jsonl"
        
        with open(batch_file, 'w') as f:
            for request in requests:
                f.write(json.dumps(request) + '\n')
        
        print(f"Batch {batch_idx+1}/{num_batches}: {len(requests)} items, estimated tokens: {len(requests) * 200}")
        
        # Upload file
        with open(batch_file, 'rb') as f:
            file_response = client.files.create(file=f, purpose="batch")
        
        print(f"File uploaded: {file_response.id}")
        file_ids.append(file_response.id)
    
    print(f"Total files created: {len(file_ids)}")
    return file_ids

In [7]:
# Unified batch runner function
def run_batch(file_id, scenario_name, batch_suffix=""):
    print(f"Running {scenario_name} batch with file: {file_id}")
    
    try:
        batch = client.batches.create(
            input_file_id=file_id,
            endpoint="/v1/responses",
            completion_window="24h",
            metadata={"scenario": f"{scenario_name}{batch_suffix}"}
        )
        
        print(f"{scenario_name}{batch_suffix} batch created: {batch.id}")
        return batch.id
        
    except Exception as e:
        print(f"Error: {e}")
        return None

In [8]:
# Helper functions for specific scenarios
def run_a1(file_id, batch_suffix=""):
    """A1: Single-agent, standard model"""
    return run_batch(file_id, "A1", batch_suffix)

def create_a1_batch_files(num_items, batch_suffix=""):
    """A1: Single-agent, standard model"""
    request_body_template = {
        "model": "gpt-5-2025-08-07",
        "instructions": generator_system,
        "reasoning": {"effort": "minimal"},
        "max_output_tokens": 200
    }
    return create_batch_files(num_items, request_body_template, "A1", "A1", batch_suffix)

In [None]:
# Upload A1 test file (5 items)
a1_test_file_id = create_a1_batch_files(5, "_test")

In [None]:
# A1 TEST run - manual invocation for each batch
# Run first batch
a1_test_batch1 = run_a1(a1_test_file_id[0], "_test_batch1")

Running A1 batch with file: file-Wks1SdEwPgiYQRF7zYReDN
A1_test_batch1 batch created: batch_68aeaf8861448190b6bacdc7f85a601d


In [None]:
# Download and process the batch files (combines multiple batches into single file)
# Combine batch IDs when both are complete
#a1_test_batches = [a1_test_batch1]  # Add a1_test_batch2 when ready
download_and_transform_gpt5_batch(a1_test_batch1, "a1_test.json")

file-LVXb5Pchu7eS32XEazVoh5


In [16]:
# Upload A1 full file (950 items) - commented to avoid accidental upload
a1_full_file_id = create_a1_batch_files(950)

Creating A1 batch files (950 items)
Batch 1/2: 475 items, estimated tokens: 95000
File uploaded: file-JbuWvVgqSaNAqsxhTSTZJn
Batch 2/2: 475 items, estimated tokens: 95000
File uploaded: file-1t3jZ2CyjzWJuz1FydFosx
Total files created: 2


In [17]:
# A1 FULL run - manual invocation for each batch
# Run first batch
a1_full_batch1 = run_a1(a1_full_file_id[0], "_batch1")


Running A1 batch with file: file-JbuWvVgqSaNAqsxhTSTZJn
A1_batch1 batch created: batch_68b2938a95308190a245e6c4350e900d


In [18]:
# Run second batch (uncomment when first batch is done)
a1_full_batch2 = run_a1(a1_full_file_id[1], "_batch2")

Running A1 batch with file: file-1t3jZ2CyjzWJuz1FydFosx
A1_batch2 batch created: batch_68b46faa0d888190b788c206f1f2320b


In [None]:
# Combine batch IDs when both are complete
a1_full_batches = [a1_full_batch1, a1_full_batch2]  # Add a1_full_batch2 when ready
download_and_transform_gpt5_batch(a1_full_batches, "a1.json")

Downloading batch 1/2: batch_68b2938a95308190a245e6c4350e900d
Output file ID: file-BKgkHFYcHM5H7n1sheg96q
Downloading batch 2/2: batch_68b46faa0d888190b788c206f1f2320b
Output file ID: file-BQRwHxUPEUfJdnwbcqsQF5
Results saved to a1.json: 950 items


{'A1_1': 'Question: Why did the narrator buy two dolls in 1982?\nTrue answer: To give one to Katie and one to her old mother.\nFalse answer: To donate both dolls to a local charity.\nFalse answer: To replace a broken doll Katie had lost.\nFalse answer: To surprise Santa Claus during his visit.',
 'A1_2': 'Question: Why did the narrator buy a doll for Alice?\nTrue answer: She never received a doll as a child.\nFalse answer: She collected dolls as a hobby.\nFalse answer: She asked for a new doll at Christmas.\nFalse answer: She wanted to match Katie’s favorite toy.',
 'A1_3': 'Question: Why was Alice moved to tears on Christmas Day?\nTrue answer: She received the doll she had waited fifty-seven years to receive.\nFalse answer: She finally learned who played Santa Claus in her area.\nFalse answer: She was surprised that Katie did not like her present.\nFalse answer: She remembered that her family couldn’t afford a Christmas tree.',
 'A1_4': 'Question: Why did January 23 make sense to Joel

In [11]:
def create_a2_batch_files(num_items, batch_suffix=""):
    """A2: Single-agent, reasoning model"""
    request_body_template = {
        "model": "gpt-5-2025-08-07",
        "instructions": generator_system,
        "reasoning": {"effort": "high"},
        #"max_output_tokens": 200
    }
    return create_batch_files(num_items, request_body_template, "A2", "A2", batch_suffix)

def run_a2(file_id, batch_suffix=""):
    """A2: Single-agent, reasoning model"""
    return run_batch(file_id, "A2", batch_suffix)

In [13]:
# Upload A2 test file (3 items)
a2_test_file_id = create_a2_batch_files(3, "_test")

Creating A2 batch files (3 items)
Batch 1/1: 3 items, estimated tokens: 600
File uploaded: file-WkWwXERxhKdMjcmoJDMNxg
Total files created: 1


In [12]:
# A2 TEST run
a2_test_batch = run_a2(a2_test_file_id[0], "_test")

NameError: name 'a2_test_file_id' is not defined

In [15]:
# Upload A2 full file (150 items) - to limit cost
a2_full_file_id = create_a2_batch_files(150)

Creating A2 batch files (150 items)
Batch 1/1: 150 items, estimated tokens: 30000
File uploaded: file-WzZ8cyYya1Di9FAhzTkYpF
Total files created: 1


In [16]:
# A2 FULL run - manual invocation for each batch
# Run first batch
a2_full_batch1 = run_a2(a2_full_file_id[0], "_batch1")

Running A2 batch with file: file-WzZ8cyYya1Di9FAhzTkYpF
A2_batch1 batch created: batch_68b78785b28c819099fbfcd76851d54c


In [None]:
# Run second batch
# a2_full_batch2 = run_a2(a2_full_file_id[1], "_batch2")

Running A2 batch with file: file-LrJB6xgXhUEkAEJ2dgrPG6
A2_batch2 batch created: batch_68af7dab05208190b24f46e801a5b2cd


In [None]:
# Combine batch IDs when both are complete
a2_full_batches = [a2_full_batch1]
download_and_transform_gpt5_batch(a2_full_batches, "a2_full.json")

Downloading batch 1/2: batch_68aeb599b4a08190bd034d80599f2c91
Output file ID: file-Q9s5hpCjDjQ9aaQj28TfyA
Error processing row in batch 1: list index out of range
Error processing row in batch 1: list index out of range
Error processing row in batch 1: list index out of range
Downloading batch 2/2: batch_68af7dab05208190b24f46e801a5b2cd
Output file ID: file-WW5SJv181vu2i7YmhVdsZ8
Results saved to a2_full.json: 939 items


{'A2_1': "```\nQuestion: Why did Alice cry when she received the doll from Santa?\nTrue answer: Because she finally received the doll she had waited 57 years for but couldn't have as a child.\nFalse answer: Because the note said the doll was from her late mother.\nFalse answer: Because Katie insisted that Alice take Katie's own doll.\nFalse answer: Because her brothers and sisters had arranged the surprise.\n```",
 'A2_2': "```\nQuestion: Why did Alice become so emotional when Santa handed her the extra gift?\nTrue answer: It fulfilled her childhood wish for a doll she never received because her family couldn't afford one, a gift she'd waited 57 years for.\nFalse answer: She recognized it as a doll handmade by her late mother.\nFalse answer: She had asked for a doll that Christmas and didn't expect anyone to remember.\nFalse answer: It was meant to celebrate her birth in 1925, making it a birthday surprise.\n```",
 'A2_3': '```\nQuestion: Why did Alice cry on Christmas Day?\nTrue answe