In [2]:
%pip install openai python-dotenv


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [52]:
from openai import OpenAI
import os
import json
import time
from dotenv import load_dotenv

# Load variables from a .env file before the API client is constructed.
# NOTE(review): presumably this is where the OpenAI API key comes from — confirm.
load_dotenv()
client = OpenAI()

# Both paths are built by cutting the working directory at the first
# 'gpt5_completion_scripts' and appending a folder name to what precedes it.
# NOTE(review): if that marker is not part of the cwd, split()[0] is the whole
# cwd without a trailing separator, so the concatenated path is malformed.
test_path = os.path.abspath(os.getcwd()).split('gpt5_completion_scripts')[0] + 'processed_data/gpt5/processed_test.jsonl'
output_dir = os.path.abspath(os.getcwd()).split('gpt5_completion_scripts')[0] + 'generated_data_gpt5'
os.makedirs(output_dir, exist_ok=True)

# The test set is JSONL: one JSON object per line. Every line is parsed,
# so a blank line in the file would raise a JSONDecodeError here.
test_data = []
with open(test_path, 'r') as f:
    for line in f:
        test_data.append(json.loads(line))

print(f"Loaded {len(test_data)} test items")

Loaded 950 test items


In [6]:
def check_batch_status():
    """Print one summary line per listed batch and return the listed batches.

    Requests up to 20 batches from the module-level ``client``. The model
    shown is read from each batch's metadata and falls back to 'unknown'
    when the metadata is missing or has no 'model' key.

    Returns:
        The ``data`` attribute of the list response.
    """
    listing = client.batches.list(limit=20)
    print("Current batches:")
    for entry in listing.data:
        # metadata may be None, so substitute an empty dict before the lookup
        model_name = (entry.metadata or {}).get('model', 'unknown')
        print(f"ID: {entry.id}, Status: {entry.status}, Model: {model_name}")
    return listing.data

# Print the batch summary and keep the listed batches (up to 20) for inspection.
current_batches = check_batch_status()

Current batches:
ID: batch_68a9af26794881909cf83cc02a5e8d2b, Status: completed, Model: unknown
ID: batch_68a9ae66123c81909049bacee537926e, Status: completed, Model: unknown
ID: batch_68a9ae2e0be08190b0b94df1938451b5, Status: completed, Model: unknown
ID: batch_68a9ad067d908190bdae45deee49a144, Status: completed, Model: unknown
ID: batch_68a9ac41d25c8190b855e25f04e8c5a3, Status: completed, Model: unknown
ID: batch_68a9863304b48190850c54a6a68810c7, Status: completed, Model: unknown
ID: batch_68a983d16f348190b7839e1868b5eb51, Status: completed, Model: unknown
ID: batch_68a97d6448f08190b59a76459800562d, Status: completed, Model: gpt-4
ID: batch_68a9783b1ed881908350fdd57609f3e8, Status: completed, Model: gpt-4
ID: batch_68a975ebd06c8190815d559b5037c31c, Status: failed, Model: unknown


In [46]:
def load_prompt(filename, encoding="utf-8"):
    """Return the full text of a prompt file.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's default encoding.

    Returns:
        The file contents as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    with open(filename, 'r', encoding=encoding) as f:
        return f.read()

# System prompts are read from files named relative to the working directory.
# evaluator_system is not referenced by any other cell shown in this notebook.
generator_system = load_prompt('generator_system_prompt.md')
evaluator_system = load_prompt('evaluator_system_prompt.md')

def extract_passage(prompt_text):
    """Return the text before the first '###' marker, stripped of surrounding whitespace.

    When the marker is absent the whole (stripped) string is returned.
    """
    head, _marker, _tail = prompt_text.partition('###')
    return head.strip()

import random
# Re-seed the global generator whenever this cell runs, so the sample drawn by
# create_k_shot() further down in the same cell is the same every time.
random.seed(42)
def create_k_shot(data, k=3):
    """Format up to ``k`` randomly sampled items as few-shot examples.

    Items are drawn without replacement using the global ``random`` state.
    Each item must provide 'prompt' and 'completion' keys; the passage is the
    part of the prompt returned by ``extract_passage``.

    Args:
        data: Sequence of dict items to sample from.
        k: Number of examples wanted; capped at ``len(data)``.

    Returns:
        The examples joined by a '---' separator line, or '' when nothing
        is sampled.
    """
    chosen = random.sample(data, min(k, len(data)))
    blocks = [
        f"Passage: {extract_passage(entry['prompt'])}\n\nOutput: {entry['completion'].strip()}"
        for entry in chosen
    ]
    return "\n\n---\n\n".join(blocks)

def wait_for_batch(batch_id, description="batch", poll_interval=30, timeout=None):
    """Poll a batch until it reaches a terminal state.

    Args:
        batch_id: ID of the batch to poll through the module-level ``client``.
        description: Label used in the progress messages.
        poll_interval: Seconds to sleep between polls (default 30).
        timeout: Optional maximum number of seconds to wait; ``None`` waits
            until a terminal state is reached.

    Returns:
        The batch object once its status is "completed"; ``None`` if the batch
        ends as "failed", "expired" or "cancelled", or if ``timeout`` elapses.
    """
    # Terminal states other than "completed". Checking only "failed" would
    # leave the loop spinning forever on an expired or cancelled batch.
    terminal_failures = {"failed", "expired", "cancelled"}

    print(f"Waiting for {description} to complete...")
    deadline = None if timeout is None else time.monotonic() + timeout
    while True:
        batch = client.batches.retrieve(batch_id)
        print(f"Status: {batch.status}")
        if batch.status == "completed":
            return batch
        if batch.status in terminal_failures:
            # Reads "Batch failed: ..." for the failed case, as before
            print(f"Batch {batch.status}: {batch}")
            return None
        if deadline is not None and time.monotonic() >= deadline:
            print(f"Timed out after {timeout}s waiting for {description}")
            return None
        time.sleep(poll_interval)

def _quiz_text_from_body(body):
    """Return the generated text from a Chat Completions or Responses body, or None."""
    if 'choices' in body:
        return body['choices'][0]['message']['content']
    # Responses-style body: find the message item rather than assuming its
    # position, since other item types (e.g. reasoning) can precede it.
    for item in body.get('output') or []:
        if item.get('type') != 'message':
            continue
        for part in item.get('content') or []:
            if part.get('type') == 'output_text':
                return part.get('text')
    return None


def process_completed_batch(batch_id, scenario_name, output_filename):
    """Download a completed batch's output and save it as a JSON file of quizzes.

    Each output line is parsed as JSON. Lines with an ``error`` value or a
    non-200 ``status_code`` are reported and skipped, as are lines from which
    no text can be extracted. Both Chat Completions bodies (``choices``) and
    Responses bodies (``output``) are understood.

    Args:
        batch_id: ID of the batch to fetch through the module-level ``client``.
        scenario_name: Stored as ``variant`` on every result and used in logs.
        output_filename: File name written inside the module-level ``output_dir``.

    Returns:
        Dict mapping ``custom_id`` to a result record (item_id, variant, round,
        quiz), or ``None`` if the batch is not completed, has no output file,
        or any exception occurs (the exception is printed, not raised).
    """
    try:
        batch = client.batches.retrieve(batch_id)

        if batch.status != "completed":
            print(f"Batch {batch_id} status: {batch.status}")
            return None

        result_file_id = batch.output_file_id
        if not result_file_id:
            # Nothing to download; avoid an opaque failure in files.content()
            print(f"Batch {batch_id} has no output file")
            return None
        result = client.files.content(result_file_id)

        results = {}
        # Split on '\n' only: str.splitlines() would also break on Unicode
        # line separators that may appear unescaped inside JSON strings.
        for line in result.text.split('\n'):
            if not line.strip():
                continue  # tolerate blank lines instead of failing json.loads('')
            response = json.loads(line)
            custom_id = response['custom_id']

            if 'error' in response and response['error']:
                print(f"Error in {custom_id}: {response['error']}")
                continue

            if response['response']['status_code'] != 200:
                print(f"API Error in {custom_id}: {response['response']['body']}")
                continue

            quiz_content = _quiz_text_from_body(response['response']['body'])
            if quiz_content is None:
                print(f"No text output in {custom_id}")
                continue

            results[custom_id] = {
                "item_id": custom_id,
                "variant": scenario_name,
                "round": 1,
                "quiz": quiz_content
            }

        with open(f"{output_dir}/{output_filename}", 'w') as f:
            json.dump(results, f, indent=2)

        print(f"{scenario_name} completed: {len(results)} items saved to {output_filename}")
        return results

    except Exception as e:
        # Deliberate best-effort: report the problem and signal failure with None
        print(f"Error processing batch {batch_id}: {e}")
        return None
        
# Build the few-shot block once; the batch-file builders embed it in every prompt.
# NOTE(review): the examples are sampled from test_data, the same list the
# batch-file builders iterate over — confirm that this overlap is intended.
k_shot_examples = create_k_shot(test_data)
print("Setup complete")
print(k_shot_examples)

Setup complete
Passage: 108 Wensan Road London, 85 A 100  England
March 1st, 2013
Dear Lin Tao,
I am writing to you in English. I hope you can understand  it.
I love studying in London and I have many new friends. Most of them are my classmates. From Monday to Friday, we have English, math, physics and P. E. in the morning. I like English and physics, because they're interesting. I don't like math. It's too boring. At noon, I have to have lunch at school because my home is far from my school. We usually have two classes in the afternoon--art and politics. We finish our classes at 3:30 p. m. After school, my friends and I always play football on the playground. And then we go home by bus.
On weekends, we have no classes. We often go to the park and sometimes we go to the movies in the evening. We see movies twice a month. I like some famous  actors like Jackie Chan.
Oh, I have no time to write more. Please write back soon.
Best wishes,
Wang Gang
,.

Output: Question: Where is Wang Gang 

In [55]:
def _gpt5_output_text(body):
    """Return the first output_text of the first message item in a Responses body.

    Raises:
        KeyError: If the body has no 'output' list or no message text in it.
    """
    for item in body["output"]:
        if item.get("type") != "message":
            continue
        for part in item.get("content") or []:
            if part.get("type") == "output_text":
                return part["text"]
    raise KeyError("no output_text message in response body")


def download_and_transform_gpt5_batch(batch_ids, file_name):
    """Download one or more batch outputs and merge them into a single JSON file.

    Args:
        batch_ids: A batch ID string, ``None``, or a list of such values.
            ``None`` entries (batches that failed to create) are skipped.
        file_name: Name of the JSON file written to the 'generated_data_gpt5'
            folder next to the current working directory.

    Returns:
        Dict mapping each row's ``custom_id`` to its generated text. Batches
        that cannot be downloaded and rows that cannot be parsed are reported
        on stdout and left out.
    """
    # Handle a single batch_id (or a single None from a failed creation) as
    # well as a list of batch_ids
    if batch_ids is None or isinstance(batch_ids, str):
        batch_ids = [batch_ids]

    combined_results = {}

    for idx, batch_id in enumerate(batch_ids):
        if batch_id is None:
            print(f"Skipping batch {idx+1} (failed to create)")
            continue

        print(f"Downloading batch {idx+1}/{len(batch_ids)}: {batch_id}")

        try:
            batch = client.batches.retrieve(batch_id)
            output_file_id = batch.output_file_id
            print(f"Output file ID: {output_file_id}")
            if not output_file_id:
                # Typically the batch has not finished, or produced no output
                print(f"Batch {idx+1} has no output file (status: {batch.status})")
                continue
            file_response = client.files.content(output_file_id)
        except Exception as e:
            print(f"Error downloading batch {idx+1}: {e}")
            continue

        # Split on '\n' only: str.splitlines() would also break on Unicode
        # line separators that may appear unescaped inside JSON strings.
        for line in file_response.text.split('\n'):
            if not line.strip():
                continue
            try:
                row = json.loads(line)
                # Use the custom_id as the key to maintain original indexing
                custom_id = row["custom_id"]
                # Look the message item up by type instead of assuming it is
                # at output[1] (i.e. that a reasoning item always precedes it)
                combined_results[custom_id] = _gpt5_output_text(row["response"]["body"])
            except Exception as e:
                # One bad row must not discard the rest of the batch
                print(f"Error processing row in batch {idx+1}: {e}")
                continue

    # Save combined results
    output_folder = os.path.join(os.path.dirname(os.getcwd()), "generated_data_gpt5")
    os.makedirs(output_folder, exist_ok=True)
    output_path = os.path.join(output_folder, file_name)
    with open(output_path, "w") as f:
        json.dump(combined_results, f, indent=2)

    print(f"Results saved to {file_name}: {len(combined_results)} items")
    return combined_results

In [61]:
# A1 File Upload - Create and upload batch files (supports multiple files for large datasets)
def create_a1_batch_files(num_items, batch_suffix="", batch_size=500):
    """Write and upload the A1 request files for the first ``num_items`` test items.

    Requests are split into files of at most ``batch_size`` lines. Each line is
    a POST to /v1/responses carrying the few-shot prompt for one passage. Files
    are written to the module-level ``output_dir`` and uploaded with purpose
    "batch".

    Args:
        num_items: Number of test items wanted; capped at ``len(test_data)``.
        batch_suffix: Text inserted into each custom_id and file name.
        batch_size: Maximum number of requests per uploaded file (default 500).

    Returns:
        List of uploaded file IDs, one per file, in batch order. Empty when
        there is nothing to upload.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Clamp before computing the batch count; otherwise asking for more items
    # than exist would create and upload empty trailing files.
    num_items = max(0, min(num_items, len(test_data)))
    print(f"Creating A1 batch files ({num_items} items)")

    file_ids = []

    # Calculate number of batches needed (ceiling division)
    num_batches = (num_items + batch_size - 1) // batch_size

    for batch_idx in range(num_batches):
        start_idx = batch_idx * batch_size
        end_idx = min(start_idx + batch_size, num_items)

        requests = []
        for i in range(start_idx, end_idx):
            passage = extract_passage(test_data[i]['prompt'])
            user_prompt = f"Here are some examples:\n\n{k_shot_examples}\n\n---\n\nGenerate a quiz for this passage:\n\n{passage}"

            requests.append({
                # 1-based position in test_data, so IDs stay unique across files
                "custom_id": f"A1{batch_suffix}_{i+1}",
                "method": "POST",
                "url": "/v1/responses",
                "body": {
                    "model": "gpt-5-2025-08-07",
                    "input": user_prompt,
                    "instructions": generator_system,
                    "reasoning":{"effort": "minimal"},
                    "max_output_tokens": 200
                }
            })

        # Create batch file with batch index suffix (only when there are several)
        batch_file_suffix = f"{batch_suffix}_batch{batch_idx+1}" if num_batches > 1 else batch_suffix
        batch_file = f"{output_dir}/batch_a1{batch_file_suffix}.jsonl"

        with open(batch_file, 'w') as f:
            for request in requests:
                f.write(json.dumps(request) + '\n')

        # The estimate is simply requests x the 200-token output cap
        print(f"Batch {batch_idx+1}/{num_batches}: {len(requests)} items, estimated tokens: {len(requests) * 200}")

        # Upload file
        with open(batch_file, 'rb') as f:
            file_response = client.files.create(file=f, purpose="batch")

        print(f"File uploaded: {file_response.id}")
        file_ids.append(file_response.id)

    print(f"Total files created: {len(file_ids)}")
    return file_ids

In [49]:
# Upload A1 test file (5 items); create_a1_batch_files returns a list of file IDs
a1_test_file_id = create_a1_batch_files(5, "_test")

Creating A1 batch file (5 items)
Estimated tokens: 1000
File uploaded: file-1BFrhUPT4pA5fBw5vzE8qi


In [57]:
# A1: Single-agent, standard model
def run_a1(file_id, batch_suffix=""):
    """Submit an uploaded A1 request file as a batch job on /v1/responses.

    Args:
        file_id: ID of an uploaded batch input file.
        batch_suffix: Text appended to "A1" in the batch metadata and log lines.

    Returns:
        The ID of the created batch, or ``None`` if any exception was raised
        (the exception is printed instead of propagated).
    """
    print(f"Running A1 batch with file: {file_id}")

    try:
        scenario = f"A1{batch_suffix}"
        created = client.batches.create(
            input_file_id=file_id,
            endpoint="/v1/responses",
            completion_window="24h",
            metadata={"scenario": scenario},
        )
        new_id = created.id
        print(f"{scenario} batch created: {new_id}")
    except Exception as err:
        print(f"Error: {err}")
        return None
    return new_id

In [None]:
# A1 TEST run - manual invocation for each batch
# a1_test_file_id is a list of file IDs; the 5-item test fits in one file,
# so only index 0 is submitted.
a1_test_batch1 = run_a1(a1_test_file_id[0], "_test_batch1")

Running A1 batch with file: file-1BFrhUPT4pA5fBw5vzE8qi
A1_test batch created: batch_68aa4ca7c3d88190881a8691e25c2822
['batch_68aa4ca7c3d88190881a8691e25c2822']


In [None]:
# Download the test batch output and save the extracted quiz text as one JSON file.
# The function accepts a list so several batch IDs can be merged into the same file.
a1_test_batches = [a1_test_batch1]  # single batch for the test run; append more IDs if needed
download_and_transform_gpt5_batch(a1_test_batches, "a1_test.json")

file-LVXb5Pchu7eS32XEazVoh5


In [62]:
# Upload A1 full files (950 items, split into files of at most 500 requests).
# Running this cell uploads immediately: the call is live, not commented out.
a1_full_file_id = create_a1_batch_files(950)

Creating A1 batch files (950 items)
Batch 1/2: 500 items, estimated tokens: 100000
File uploaded: file-GhDX5ueQ4h3vt4obqQDaEH
Batch 2/2: 450 items, estimated tokens: 90000
File uploaded: file-E6fXFYKsQUENgSvkp9NENp
Total files created: 2


In [64]:
# A1 FULL run - manual invocation for each batch
# Run first batch (first uploaded file)
a1_full_batch1 = run_a1(a1_full_file_id[0], "_batch1")


Running A1 batch with file: file-GhDX5ueQ4h3vt4obqQDaEH
A1_batch1 batch created: batch_68aa7e56f9808190894187ac1b946b08


In [66]:
# Run second batch (second uploaded file). The call is live: run this cell
# only once the first batch is done, if the batches should not overlap.
a1_full_batch2 = run_a1(a1_full_file_id[1], "_batch2")

Running A1 batch with file: file-E6fXFYKsQUENgSvkp9NENp
A1_batch2 batch created: batch_68aa98a33710819092b6c1b71dd24e29


In [67]:
# Combine both batch IDs once both batches are complete, then merge their
# outputs into a single JSON file.
a1_full_batches = [a1_full_batch1, a1_full_batch2]  # both halves of the 950-item run
download_and_transform_gpt5_batch(a1_full_batches, "a1.json")

Downloading batch 1/2: batch_68aa7e56f9808190894187ac1b946b08
Output file ID: file-5r3Nf85miXQiEvDPeufPoV
Downloading batch 2/2: batch_68aa98a33710819092b6c1b71dd24e29
Output file ID: file-Bszjg5Knt4Tuc8CbTJvbo7
Results saved to a1.json: 950 items


{'A1_1': 'Question: Why did the gift to Alice include a note apologizing for lateness?\nTrue answer: It pretended the doll was meant for her in 1925 but arrived decades late.\nFalse answer: It explained the store had run out of dolls that year.\nFalse answer: It said the package was mistakenly sent to the wrong country.\nFalse answer: It claimed Alice had refused the gift when she was young.',
 'A1_2': 'Question: Why did the gift to Alice from “Santa” include a note apologizing for lateness?\nTrue answer: It pretended to be a doll meant for Alice in 1925 that was delivered decades late.\nFalse answer: It explained the post office lost the package that week.\nFalse answer: It said the store was out of dolls until after Christmas 1982.\nFalse answer: It blamed bad weather on Christmas morning for the delay.',
 'A1_3': 'Question: Why did the doll make Alice so emotional on Christmas Day?\nTrue answer: It fulfilled her childhood wish for a doll she couldn’t afford in 1925.\nFalse answer: I

In [None]:
# A2 File Upload - Create and upload batch files
def create_a2_batch_files(num_items, batch_suffix="", max_output_tokens=200):
    """Write and upload the A2 (high reasoning effort) request file.

    One request line is produced for each of the first ``num_items`` test
    items, capped at ``len(test_data)``. Each line is a POST to /v1/responses.
    The file is written to the module-level ``output_dir`` and uploaded with
    purpose "batch".

    Args:
        num_items: Number of test items to include.
        batch_suffix: Text inserted into each custom_id and the file name.
        max_output_tokens: Value sent as ``max_output_tokens`` in every request
            (default 200, the value previously hard-coded).

    Returns:
        The ID of the uploaded file (a single string, not a list).
    """
    print(f"Creating A2 batch file ({num_items} items)")

    requests = []
    for i in range(min(num_items, len(test_data))):
        passage = extract_passage(test_data[i]['prompt'])
        user_prompt = f"Here are some examples:\n\n{k_shot_examples}\n\n---\n\nGenerate a quiz for this passage:\n\n{passage}"

        requests.append({
            "custom_id": f"A2{batch_suffix}_{i+1}",
            "method": "POST",
            "url": "/v1/responses",
            "body": {
                "model": "gpt-5-2025-08-07",
                "input": user_prompt,
                # The Responses API parameter is "instructions" (plural); the
                # previous "instruction" key did not carry the system prompt.
                "instructions": generator_system,
                "reasoning":{"effort": "high", "summary": "detailed"},
                # NOTE(review): per the Responses API docs this cap also counts
                # reasoning tokens; 200 may be too low with effort "high" — confirm.
                "max_output_tokens": max_output_tokens
            }
        })

    batch_file = f"{output_dir}/batch_a2{batch_suffix}.jsonl"
    with open(batch_file, 'w') as f:
        for request in requests:
            f.write(json.dumps(request) + '\n')

    # Upload file
    with open(batch_file, 'rb') as f:
        file_response = client.files.create(file=f, purpose="batch")

    print(f"File uploaded: {file_response.id}")
    return file_response.id

# Upload A2 test file (3 items); unlike the A1 builder this returns a single file ID
a2_test_file_id = create_a2_batch_files(3, "_test")

In [None]:
# Upload A2 full file (first 150 test items).
# Running this cell uploads immediately: the call is live, not commented out.
a2_full_file_id = create_a2_batch_files(150)

In [None]:
# A2: Single-agent, reasoning model
def run_a2(file_id, batch_suffix=""):
    """Submit an uploaded A2 request file as a batch job on /v1/responses.

    Args:
        file_id: ID of an uploaded batch input file.
        batch_suffix: Text appended to "A2" in the batch metadata and log lines.

    Returns:
        The ID of the created batch, or ``None`` if any exception was raised
        (the exception is printed instead of propagated).
    """
    print(f"Running A2 batch with file: {file_id}")

    try:
        batch = client.batches.create(
            input_file_id=file_id,
            # Must match the "url" of the request lines in the uploaded file,
            # which target /v1/responses (not /v1/chat/completions).
            endpoint="/v1/responses",
            completion_window="24h",
            metadata={"scenario": f"A2{batch_suffix}", "model": "gpt-5-2025-08-07"}
        )

        print(f"A2{batch_suffix} batch created: {batch.id}")
        return batch.id

    except Exception as e:
        print(f"Error: {e}")
        print("Note: GPT-5 model name might be incorrect")
        return None

In [None]:
# A2 TEST run: submit the test file; the result is a batch ID, or None on error
a2_test_batch = run_a2(a2_test_file_id, "_test")

In [None]:
# A2 FULL run: submit the full file; the result is a batch ID, or None on error
a2_full_batch = run_a2(a2_full_file_id)

In [None]:
# A single batch ID string is accepted here; the function wraps it in a list.
download_and_transform_gpt5_batch(a2_test_batch, "a2_test.json")

In [None]:
# Save the full A2 run's extracted quiz text to a2.json.
download_and_transform_gpt5_batch(a2_full_batch, "a2.json")

In [None]:
# Process results when batches complete
# NOTE(review): the cells above fetch results with
# download_and_transform_gpt5_batch(); verify that process_completed_batch()
# handles /v1/responses output before relying on the examples below.
# Example usage:
# a1_test_results = process_completed_batch(a1_test_batch1, "A1_test", "generated_quiz_A1_test.json")
# a1_full_results = process_completed_batch("batch_id_here", "A1", "generated_quiz_A1.json")
# a2_test_results = process_completed_batch(a2_test_batch, "A2_test", "generated_quiz_A2_test.json")
# a2_full_results = process_completed_batch("batch_id_here", "A2", "generated_quiz_A2.json")

print("Separated file upload notebook ready!")
print("")
print("WORKFLOW:")
print("1. File upload cells run once (create + upload batch files)")
print("2. Run scenario cells multiple times without re-uploading")
print("3. Files are reused, no duplicates created")
print("4. Use process_completed_batch() to get results")