In [1]:
%pip install openai python-dotenv


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
from openai import OpenAI
import os
import json
import time
from dotenv import load_dotenv

load_dotenv()
client = OpenAI()

test_path = os.path.abspath(os.getcwd()).split('gpt5_completion_scripts')[0] + 'processed_data/gpt5/processed_test.jsonl'
output_dir = os.path.abspath(os.getcwd()).split('gpt5_completion_scripts')[0] + 'generated_data_gpt5'
os.makedirs(output_dir, exist_ok=True)

test_data = []
with open(test_path, 'r') as f:
    for line in f:
        test_data.append(json.loads(line))

print(f"Loaded {len(test_data)} test items")

Loaded 950 test items


In [3]:
def check_batch_status():
    batches = client.batches.list(limit=20)
    print("Current batches:")
    for batch in batches.data:
        metadata = batch.metadata or {}
        model = metadata.get('model', 'unknown')
        print(f"ID: {batch.id}, Status: {batch.status}, Model: {model}")
    return batches.data

current_batches = check_batch_status()

Current batches:
ID: batch_68aaae854b1c8190be07351316efc4c8, Status: completed, Model: unknown
ID: batch_68aaae764f2c8190a32adadff2d5dde7, Status: failed, Model: unknown
ID: batch_68aaa8c22cc48190a6b7c117e435ba8e, Status: completed, Model: unknown
ID: batch_68aa98a33710819092b6c1b71dd24e29, Status: completed, Model: unknown
ID: batch_68aa802fcae08190a9d356a269a351a9, Status: failed, Model: unknown
ID: batch_68aa7e56f9808190894187ac1b946b08, Status: completed, Model: unknown
ID: batch_68aa4f0cb8148190b5e393bbe9f5b4c3, Status: failed, Model: unknown
ID: batch_68aa4d03f9248190b08868fd781a7c45, Status: failed, Model: unknown
ID: batch_68aa4d0399288190939118243c146b47, Status: cancelled, Model: unknown
ID: batch_68aa4d03001c819094dd995f8797d8fb, Status: cancelled, Model: unknown
ID: batch_68aa4d0269548190b88457117f50de47, Status: cancelled, Model: unknown
ID: batch_68aa4ca7c3d88190881a8691e25c2822, Status: completed, Model: unknown
ID: batch_68aa49b9da98819095b61ef4bdd1b5ec, Status: failed,

In [28]:
def load_prompt(filename):
    with open(filename, 'r') as f:
        return f.read()

generator_system = load_prompt('generator_system_prompt.md')
evaluator_system = load_prompt('evaluator_system_prompt.md')

def extract_passage(prompt_text):
    return prompt_text.split('###')[0].strip()

def wait_for_batch(batch_id, description="batch"):
    print(f"Waiting for {description} to complete...")
    while True:
        batch = client.batches.retrieve(batch_id)
        print(f"Status: {batch.status}")
        if batch.status == "completed":
            return batch
        elif batch.status == "failed":
            print(f"Batch failed: {batch}")
            return None
        time.sleep(30)

def process_completed_batch(batch_id, scenario_name, output_filename):
    try:
        batch = client.batches.retrieve(batch_id)
        
        if batch.status != "completed":
            print(f"Batch {batch_id} status: {batch.status}")
            return None
        
        result_file_id = batch.output_file_id
        result = client.files.content(result_file_id)
        
        results = {}
        for line in result.text.strip().split('\n'):
            response = json.loads(line)
            custom_id = response['custom_id']
            
            if 'error' in response and response['error']:
                print(f"Error in {custom_id}: {response['error']}")
                continue
                
            if response['response']['status_code'] != 200:
                print(f"API Error in {custom_id}: {response['response']['body']}")
                continue
                
            quiz_content = response['response']['body']['choices'][0]['message']['content']
            
            results[custom_id] = {
                "item_id": custom_id,
                "variant": scenario_name,
                "round": 1,
                "quiz": quiz_content
            }
        
        with open(f"{output_dir}/{output_filename}", 'w') as f:
            json.dump(results, f, indent=2)
        
        print(f"{scenario_name} completed: {len(results)} items saved to {output_filename}")
        return results
        
    except Exception as e:
        print(f"Error processing batch {batch_id}: {e}")
        return None
        
print("Setup complete")

Setup complete


In [6]:
def download_and_transform_gpt5_batch(batch_ids, file_name):
    # Handle both single batch_id and list of batch_ids
    if isinstance(batch_ids, str):
        batch_ids = [batch_ids]
    
    combined_results = {}
    
    for idx, batch_id in enumerate(batch_ids):
        if batch_id is None:
            print(f"Skipping batch {idx+1} (failed to create)")
            continue
            
        print(f"Downloading batch {idx+1}/{len(batch_ids)}: {batch_id}")
        
        try:
            batch = client.batches.retrieve(batch_id)
            output_file_id = batch.output_file_id
            print(f"Output file ID: {output_file_id}")
            file_response = client.files.content(output_file_id)
            
            for line in file_response.text.strip().split('\n'):
                row = json.loads(line)
                try:
                    quiz_text = row["response"]["body"]["output"][1]["content"][0]["text"]
                    # Use the custom_id as the key to maintain original indexing
                    custom_id = row["custom_id"]
                    combined_results[custom_id] = quiz_text
                except Exception as e:
                    print(f"Error processing row in batch {idx+1}: {e}")
                    continue
                    
        except Exception as e:
            print(f"Error downloading batch {idx+1}: {e}")
            continue
    
    # Save combined results
    output_path = os.path.join(os.path.dirname(os.getcwd()), "generated_data_gpt5", file_name)
    with open(output_path, "w") as f:
        json.dump(combined_results, f, indent=2)
    
    print(f"Results saved to {file_name}: {len(combined_results)} items")
    return combined_results

In [43]:
# Unified Batch File Creation Function
def create_batch_files(num_items, request_body_template, custom_id_prefix, batch_name_prefix, batch_suffix=""):
    print(f"Creating {batch_name_prefix} batch files ({num_items} items)")
    
    batch_size = 475
    file_ids = []
    
    # Calculate number of batches needed
    num_batches = (num_items + batch_size - 1) // batch_size
    
    for batch_idx in range(num_batches):
        start_idx = batch_idx * batch_size
        end_idx = min(start_idx + batch_size, num_items, len(test_data))
        
        requests = []
        for i in range(start_idx, end_idx):
            passage = extract_passage(test_data[i]['prompt'])
            user_prompt = f"Generate a quiz for this passage:\n\n{passage}"
            
            # Create request body from template
            request_body = request_body_template.copy()
            request_body["input"] = user_prompt
            
            requests.append({
                "custom_id": f"{custom_id_prefix}{batch_suffix}_{i+1}",
                "method": "POST",
                "url": "/v1/responses",
                "body": request_body
            })
        
        # Create batch file with batch index suffix
        batch_file_suffix = f"{batch_suffix}_batch{batch_idx+1}" if num_batches > 1 else batch_suffix
        batch_file = f"{output_dir}/batch_{batch_name_prefix.lower()}{batch_file_suffix}.jsonl"
        
        with open(batch_file, 'w') as f:
            for request in requests:
                f.write(json.dumps(request) + '\n')
        
        print(f"Batch {batch_idx+1}/{num_batches}: {len(requests)} items, estimated tokens: {len(requests) * 200}")
        
        # Upload file
        with open(batch_file, 'rb') as f:
            file_response = client.files.create(file=f, purpose="batch")
        
        print(f"File uploaded: {file_response.id}")
        file_ids.append(file_response.id)
    
    print(f"Total files created: {len(file_ids)}")
    return file_ids

In [23]:
# Unified batch runner function
def run_batch(file_id, scenario_name, batch_suffix=""):
    print(f"Running {scenario_name} batch with file: {file_id}")
    
    try:
        batch = client.batches.create(
            input_file_id=file_id,
            endpoint="/v1/responses",
            completion_window="24h",
            metadata={"scenario": f"{scenario_name}{batch_suffix}"}
        )
        
        print(f"{scenario_name}{batch_suffix} batch created: {batch.id}")
        return batch.id
        
    except Exception as e:
        print(f"Error: {e}")
        return None

In [44]:
# Helper functions for specific scenarios
def run_a1(file_id, batch_suffix=""):
    """A1: Single-agent, standard model"""
    return run_batch(file_id, "A1", batch_suffix)

def create_a1_batch_files(num_items, batch_suffix=""):
    """A1: Single-agent, standard model"""
    request_body_template = {
        "model": "gpt-5-2025-08-07",
        "instructions": generator_system,
        "reasoning": {"effort": "minimal"},
        "max_output_tokens": 200
    }
    return create_batch_files(num_items, request_body_template, "A1", "A1", batch_suffix)

In [None]:
# Upload A1 test file (5 items)
a1_test_file_id = create_a1_batch_files(5, "_test")

In [None]:
# A1 TEST run - manual invocation for each batch
# Run first batch
a1_test_batch1 = run_a1(a1_test_file_id[0], "_test_batch1")

Running A1 batch with file: file-1BFrhUPT4pA5fBw5vzE8qi
A1_test batch created: batch_68aa4ca7c3d88190881a8691e25c2822
['batch_68aa4ca7c3d88190881a8691e25c2822']


In [None]:
# Download and process the batch files (combines multiple batches into single file)
# Combine batch IDs when both are complete
a1_test_batches = [a1_test_batch1]  # Add a1_test_batch2 when ready
download_and_transform_gpt5_batch(a1_test_batches, "a1_test.json")

file-LVXb5Pchu7eS32XEazVoh5


In [None]:
# Upload A1 full file (950 items) - commented to avoid accidental upload
a1_full_file_id = create_a1_batch_files(950)

Creating A1 batch files (950 items)
Batch 1/2: 500 items, estimated tokens: 100000
File uploaded: file-4L3qxiiSy2pXUYitYYS821
Batch 2/2: 450 items, estimated tokens: 90000
File uploaded: file-XN7adbt3DAKtsH5FfCAaio
Total files created: 2


In [11]:
# A1 FULL run - manual invocation for each batch
# Run first batch
a1_full_batch1 = run_a1(a1_full_file_id[0], "_batch1")


Running A1 batch with file: file-4L3qxiiSy2pXUYitYYS821
A1_batch1 batch created: batch_68aaf2e95b9881908f527e323bcd1e99


In [15]:
# Run second batch (uncomment when first batch is done)
a1_full_batch2 = run_a1(a1_full_file_id[1], "_batch2")

Running A1 batch with file: file-XN7adbt3DAKtsH5FfCAaio
A1_batch2 batch created: batch_68aaf7e2ec9c819096391d0e9261e2cb


In [16]:
# Combine batch IDs when both are complete
a1_full_batches = [a1_full_batch1, a1_full_batch2]  # Add a1_full_batch2 when ready
download_and_transform_gpt5_batch(a1_full_batches, "a1.json")

Downloading batch 1/2: batch_68aaf2e95b9881908f527e323bcd1e99
Output file ID: file-6bvcH7pYTbyJcExbyR9unb
Downloading batch 2/2: batch_68aaf7e2ec9c819096391d0e9261e2cb
Output file ID: file-UZPhkAXqJXsQoMu7wHhsf2
Results saved to a1.json: 950 items


{'A1_1': 'Question: Why did the narrator arrange for Santa to bring an extra gift addressed to Alice on Christmas Day?\nTrue answer: To fulfill the childhood wish her mother Alice never had met—receiving a doll she couldn’t afford in 1925.\nFalse answer: To surprise Katie with a second present after she lost interest in her first gift.\nFalse answer: To apologize to his mother for forgetting her birthday earlier that year.\nFalse answer: Because Santa accidentally brought an extra package and needed someone to claim it.',
 'A1_2': 'Question: Why did the narrator arrange for Santa to bring an extra gift for Alice on Christmas Day?\nTrue answer: To finally give her the doll she couldn’t afford as a child and fulfill a long-delayed wish.\nFalse answer: To surprise her with news that her long-lost mother was found.\nFalse answer: To encourage her to dress up as Santa for her granddaughter.\nFalse answer: To celebrate her recent promotion at work.',
 'A1_3': 'Question: Why did the narrator 

In [57]:
def create_a2_batch_files(num_items, batch_suffix=""):
    """A2: Single-agent, reasoning model"""
    request_body_template = {
        "model": "gpt-5-2025-08-07",
        "instructions": generator_system,
        "reasoning": {"effort": "high"},
        "max_output_tokens": 200
    }
    return create_batch_files(num_items, request_body_template, "A2", "A2", batch_suffix)

def run_a2(file_id, batch_suffix=""):
    """A2: Single-agent, reasoning model"""
    return run_batch(file_id, "A2", batch_suffix)

In [58]:
# Upload A2 test file (3 items)
a2_test_file_id = create_a2_batch_files(3, "_test")

Creating A2 batch files (3 items)
Batch 1/1: 3 items, estimated tokens: 600
File uploaded: file-Wks1SdEwPgiYQRF7zYReDN
Total files created: 1


In [60]:
# A2 TEST run
a2_test_batch = run_a2(a2_test_file_id[0], "_test")

Running A2 batch with file: file-Wks1SdEwPgiYQRF7zYReDN
A2_test batch created: batch_68abb9948e3c8190b0acb2820d737d2d


In [51]:
# Upload A2 full file (950 items) - should create 2 files for 950 items
a2_full_file_id = create_a2_batch_files(950)

Creating A2 batch files (950 items)
Batch 1/2: 475 items, estimated tokens: 95000
File uploaded: file-Wuy9SxmwxCWFdpfSwqW8xi
Batch 2/2: 475 items, estimated tokens: 95000
File uploaded: file-Khd19DtLGff1CMixcDUEF9
Total files created: 2


In [52]:
# A2 FULL run - manual invocation for each batch
# Run first batch
a2_full_batch1 = run_a2(a2_full_file_id[0], "_batch1")

Running A2 batch with file: file-Wuy9SxmwxCWFdpfSwqW8xi
A2_batch1 batch created: batch_68ab122f1ad88190bae5b696df136562


In [54]:
# Run second batch
a2_full_batch2 = run_a2(a2_full_file_id[1], "_batch2")

Running A2 batch with file: file-Khd19DtLGff1CMixcDUEF9
A2_batch2 batch created: batch_68ab975b4d0c8190bf3e079e7a386d92


In [56]:
# Combine batch IDs when both are complete
a2_full_batches = ["batch_68ab122f1ad88190bae5b696df136562", "batch_68ab9e3b39c0819092c6fd3ec55dcff7"]
download_and_transform_gpt5_batch(a2_full_batches, "a2_full.json")

Downloading batch 1/2: batch_68ab122f1ad88190bae5b696df136562
Output file ID: file-J886WEm4dPJzz93qbSsBKo
Error processing row in batch 1: list index out of range
Error processing row in batch 1: list index out of range
Error processing row in batch 1: list index out of range
Error processing row in batch 1: list index out of range
Error processing row in batch 1: list index out of range
Error processing row in batch 1: list index out of range
Error processing row in batch 1: list index out of range
Error processing row in batch 1: list index out of range
Error processing row in batch 1: list index out of range
Error processing row in batch 1: list index out of range
Error processing row in batch 1: list index out of range
Error processing row in batch 1: list index out of range
Error processing row in batch 1: list index out of range
Error processing row in batch 1: list index out of range
Error processing row in batch 1: list index out of range
Error processing row in batch 1: list i

{}