In [1]:
import json
import os
from collections import defaultdict
from random import randint,seed

import numpy as np
import pandas as pd
from openai import OpenAI

In [2]:
# Seed the global generator of Python's `random` module (imported above) with a fixed value.
# NOTE(review): no cell visible in this notebook draws random numbers afterwards — confirm the seed is still needed.
seed(1485233)

In [3]:
# Read the API key from the OPENAI_API_KEY environment variable so the secret is
# never stored in the notebook. The "xxx" placeholder is kept only as a fallback
# for the variable being unset; requests made with it will be rejected by the API.
key = os.environ.get("OPENAI_API_KEY", "xxx")

In [4]:
# Client used for every file upload, batch submission and download below;
# it authenticates with the `key` defined in the previous cell.
client = OpenAI(api_key=key)

In [5]:
# System message attached to every request in the batch files built below.
# It asks the model to pick which of two images (A or B) should be prioritized
# for manual review under the quoted policy, and to answer with a single character.
system_prompt = """
You are a content moderation expert. You will receive pairs of images, A and B. Your task is to decide which image should be prioritized for manual review according to the following policy:

We are committed to combating abuse motivated by hatred, prejudice or intolerance, particularly abuse that seeks to silence the voices of those who have been historically marginalized. For this reason, we prohibit behavior that targets individuals or groups with abuse based on their perceived membership in a protected category. You may not directly attack other people on the basis of race, ethnicity, national origin, caste, sexual orientation, gender, gender identity, religious affiliation, age, disability, or serious disease.

The output should always be one of the two options: A or B. Only output a single character with no additional tokens. Never refuse to answer.
"""

In [9]:
# URL prefix that is prepended to each image path when the request image URLs are built.
base_url = "https://xxx.cloudfront.net/"

In [7]:
# Load the table of image pairs; its `a_paths` / `b_paths` columns are read in a later cell.
image_links = pd.read_csv("../image_indices_alt_99750.csv")

In [8]:
# Preview the first rows to check the available columns.
image_links.head()

Unnamed: 0.1,Unnamed: 0,a_images,b_images,a_paths,b_paths
0,0,555755,183551,output_alt/tweet585754.png,output_2024/tweet183551.png
1,1,419943,440191,output_alt/tweet449942.png,output_alt/tweet470190.png
2,2,497331,251346,output_alt/tweet527330.png,output_alt/tweet281345.png
3,3,32817,110536,output_2024/tweet32817.png,output_2024/tweet110536.png
4,4,224661,460665,output_alt/tweet254660.png,output_alt/tweet490664.png


In [10]:
# Relative image paths for side A and side B of every pair, taken row-aligned from the same frame.
a_paths, b_paths = image_links['a_paths'], image_links['b_paths']

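Tested first with a small number of requests. The cell below creates the batch input files containing the queries: one request per image pair, with the first 50,000 requests written to part 1 and the remainder to part 2.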

In [11]:
# Build the Batch API input: one chat-completion request per (A, B) image pair,
# written as JSON Lines and split into two files.
# The OpenAI Batch API caps a single batch at 50,000 requests, hence the split.
MAX_REQUESTS_PER_FILE = 50000
batch_file_1 = "batch_input_baseline_gpt4o_alt_p1.jsonl"
batch_file_2 = "batch_input_baseline_gpt4o_alt_p2.jsonl"

# Fail early rather than write a part-2 file that itself exceeds the per-batch cap.
if len(a_paths) > 2 * MAX_REQUESTS_PER_FILE:
    raise ValueError(
        f"{len(a_paths)} image pairs do not fit in two batch files of "
        f"{MAX_REQUESTS_PER_FILE} requests each"
    )

with open(batch_file_1, "w", encoding="utf-8") as f1, open(batch_file_2, "w", encoding="utf-8") as f2:
    for i, (a, b) in enumerate(zip(a_paths, b_paths)):
        # Construct the JSON object for this iteration
        request_object = {
            "custom_id": f"request-{i+1}",  # Unique, 1-based ID; used later to match results and retries
            "method": "POST",               # HTTP method
            "url": "/v1/chat/completions",  # API endpoint
            "body": {                       # The body contains the actual request
                "model": "gpt-4o",          # Model name
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": [
                        {"type": "text", "text": "Image A"},
                        {"type": "image_url", "image_url": {"url": base_url + str(a)}},
                        {"type": "text", "text": "Image B"},
                        {"type": "image_url", "image_url": {"url": base_url + str(b)}}
                    ]}
                ],
                "max_tokens": 1,  # Forces output to be a single token
                "temperature": 0  # Fixing temperature to 0
            }
        }

        # The first MAX_REQUESTS_PER_FILE requests go to part 1, the rest to part 2
        target_file = f1 if i < MAX_REQUESTS_PER_FILE else f2
        target_file.write(json.dumps(request_object) + "\n")

In [12]:
def _upload_batch_file(path):
    """Upload one JSONL batch input file and return the created file object.

    The file is opened in a `with` block so the handle is closed as soon as the
    upload call returns, instead of staying open for the life of the kernel.
    """
    with open(path, "rb") as batch_fh:
        return client.files.create(file=batch_fh, purpose="batch")


# Uploading batch file 1
batch_input_file_1 = _upload_batch_file(batch_file_1)
batch_input_file_id_1 = batch_input_file_1.id

# Uploading batch file 2
batch_input_file_2 = _upload_batch_file(batch_file_2)
batch_input_file_id_2 = batch_input_file_2.id

In [13]:
# Run batch job 1.
# The returned Batch object is kept so its ID is available as `batch_job_1.id`
# rather than only in the printed cell output.
batch_job_1 = client.batches.create(
    input_file_id=batch_input_file_id_1,
    endpoint="/v1/chat/completions",
    completion_window="24h",  # cannot be changed
    metadata={
      "description": "GPT-4o alt main - Part 1"
    }
)
batch_job_1  # last expression: display the created batch

Batch(id='batch_xxx', completion_window='24h', created_at=1747781593, endpoint='/v1/chat/completions', input_file_id='file-xxx', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1747867993, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'GPT-4o alt main - Part 1'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))

In [14]:
# Run batch job 2.
# The returned Batch object is kept so its ID is available as `batch_job_2.id`
# rather than only in the printed cell output.
batch_job_2 = client.batches.create(
    input_file_id=batch_input_file_id_2,
    endpoint="/v1/chat/completions",
    completion_window="24h",  # cannot be changed
    metadata={
      "description": "GPT-4o alt main - Part 2"
    }
)
batch_job_2  # last expression: display the created batch

Batch(id='batch_xxx', completion_window='24h', created_at=1747781593, endpoint='/v1/chat/completions', input_file_id='file-xxx', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1747867993, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'GPT-4o alt main - Part 2'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))

In [19]:
# Check the status of the part-1 batch (paste its ID here).
# The dashboard at https://platform.openai.com/batches provides the same updates.
client.batches.retrieve(batch_id="batch_xxx")

Batch(id='batch_xxx', completion_window='24h', created_at=1747781593, endpoint='/v1/chat/completions', input_file_id='file-xxx', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1747792752, error_file_id='file-xxx', errors=None, expired_at=None, expires_at=1747867993, failed_at=None, finalizing_at=1747787268, in_progress_at=1747781603, metadata={'description': 'GPT-4o alt main - Part 1'}, output_file_id='file-xxx', request_counts=BatchRequestCounts(completed=49871, failed=129, total=50000))

In [20]:
# Check the status of the part-2 batch (paste its ID here).
client.batches.retrieve(batch_id="batch_xxx")

Batch(id='batch_xxx', completion_window='24h', created_at=1747781593, endpoint='/v1/chat/completions', input_file_id='file-xxx', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1747819062, error_file_id='file-xxx', errors=None, expired_at=None, expires_at=1747867993, failed_at=None, finalizing_at=1747800719, in_progress_at=1747781606, metadata={'description': 'GPT-4o alt main - Part 2'}, output_file_id='file-xxx', request_counts=BatchRequestCounts(completed=49704, failed=46, total=49750))

In [23]:
# Error-file IDs of the two completed batches and the input files those batches were built from
error_ids = ["file-xxx", "file-xxx"]
input_files = ["batch_input_baseline_gpt4o_alt_p1.jsonl", "batch_input_baseline_gpt4o_alt_p2.jsonl"]

# Initialize lists to store failed requests
all_failed_requests = []
all_failed_inputs = []

# Step 1: Identify failed requests in both batches
for error_id in error_ids:
    # Load batch errors directly from the API response
    error_file_response = client.files.content(error_id)

    failed_requests = []

    # Parse error file line by line
    for line in error_file_response.iter_lines():
        if not line.strip():
            # Skip blank lines (e.g. a trailing newline); json.loads would raise on them
            continue
        error_entry = json.loads(line)
        # "response" can be present but null for requests that never produced an HTTP
        # response; `.get("response", {})` would then return None and crash on `.get`.
        response = error_entry.get("response") or {}
        if response.get("status_code") != 200:
            failed_requests.append(error_entry["custom_id"])

    all_failed_requests.extend(failed_requests)

# Set for constant-time membership tests while scanning the input files
failed_id_set = set(all_failed_requests)

# Step 2: Extract failed requests from the original input files
for input_file in input_files:
    with open(input_file, "r") as f:
        for line in f:
            request = json.loads(line)
            if request["custom_id"] in failed_id_set:
                all_failed_inputs.append(request)

print(f"Total failed requests: {len(all_failed_inputs)}")

# Step 3: Save failed requests to a new JSONL file for resubmission
failed_batch_file = "batch_input_baseline_alt_failed2.jsonl"

with open(failed_batch_file, "w") as f:
    for request in all_failed_inputs:
        f.write(json.dumps(request) + "\n")

# Step 4: Submit new batch for failed requests (only if there are any)
if all_failed_inputs:
    # Open the file in a `with` block so the handle is closed once the upload returns
    with open(failed_batch_file, "rb") as retry_fh:
        batch_input_file = client.files.create(
            file=retry_fh,
            purpose="batch"
        )

    batch_input_file_id = batch_input_file.id

    new_batch = client.batches.create(
        input_file_id=batch_input_file_id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={"description": "Retry failed requests from both baseline batches, alt task"}
    )

    print(f"New batch job created: {new_batch.id}")
else:
    print("No failed requests found. No need to re-run.")

Total failed requests: 175
New batch job created: batch_xxx


In [24]:
# Check the status of the retry batch created from the failed requests (paste its ID here).
client.batches.retrieve(batch_id="batch_xxx")

Batch(id='batch_xxx', completion_window='24h', created_at=1747843233, endpoint='/v1/chat/completions', input_file_id='file-xxx', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1747843319, error_file_id=None, errors=None, expired_at=None, expires_at=1747929633, failed_at=None, finalizing_at=1747843307, in_progress_at=1747843234, metadata={'description': 'Retry failed requests from both baseline batches, alt task'}, output_file_id='file-xxx', request_counts=BatchRequestCounts(completed=175, failed=0, total=175))

In [25]:
# IDs of the files to download and combine, in the order they are written out
result_file_ids = [
    'file-xxx',  # Part 1
    'file-xxx',  # Part 2
    'file-xxx',  # Error
]

# Download each file's content; order matches `result_file_ids`
file_responses = [client.files.content(result_file_id) for result_file_id in result_file_ids]

In [26]:
# Concatenate all downloaded result files into a single JSONL file
with open("batch_result_baseline_alt_GPT4o_combined_v2.jsonl", 'w', encoding='utf-8') as combined_file:
    for file_response in file_responses:
        # Write the entire content of each response as text
        part_text = file_response.text
        combined_file.write(part_text)
        # JSONL needs one record per line: if a part does not end with a newline,
        # add one so its last record is not fused with the first record of the next part.
        if part_text and not part_text.endswith("\n"):
            combined_file.write("\n")