In [None]:
import pandas as pd
import concurrent.futures
from openai import OpenAI
# Shared API client used by process_row for every request.
# NOTE(review): 'xxx' looks like a redacted placeholder; avoid committing a
# real key here -- consider supplying it from the environment instead.
client = OpenAI(api_key='xxx')
# Evaluation set; each row is expected to provide the "Question",
# "Option A", "Option B", "Option C" and "Answer" columns read by process_row.
# NOTE(review): unpickling can execute arbitrary code -- only load files from
# a trusted source.
df = pd.read_pickle("dataframe.pkl")  # or "dataframe_perturbated.pkl"

# Function to process a single row
def process_row(idx, row):
    """Ask the model one multiple-choice question and grade its reply.

    Args:
        idx: Caller-chosen identifier for the row; returned unchanged so the
            caller can match results that arrive out of order.
        row: Mapping-like object (e.g. a DataFrame row) providing the keys
            "Question", "Option A", "Option B", "Option C" and "Answer".

    Returns:
        A 4-tuple ``(idx, result, error, correct_answer)``:

        * on success ``result`` is a bool (model letter == expected letter)
          and ``error`` is ``None``;
        * on failure ``result`` is the string ``"error"`` and ``error`` is a
          human-readable message.

        ``correct_answer`` is the stripped, upper-cased "Answer" value, or
        ``None`` when that value is not a string.
    """
    raw_answer = row["Answer"]
    # A missing cell (NaN) or other non-string value has no .strip(); report
    # it as a per-row error instead of raising inside the worker thread.
    if not isinstance(raw_answer, str):
        return (idx, "error", f"Invalid ground-truth answer: {raw_answer!r}", None)
    correct_answer = raw_answer.strip().upper()

    question = row["Question"]
    option_a = row["Option A"]
    option_b = row["Option B"]
    option_c = row["Option C"]

    prompt = (
        f"Question: {question}\n"
        f"Options:\n"
        f"A) {option_a}\n"
        f"B) {option_b}\n"
        f"C) {option_c}\n\n"
        "Choose the correct answer by responding with only one letter (A, B, or C). No explanation, just the letter.\n"
        "Example response: A"
    )

    # Only the API round-trip is expected to fail; keep the try body minimal.
    try:
        response = client.responses.create(
            model="o3-mini-2025-01-31",
            input=prompt
        )
        model_response = response.output_text.strip().upper()
    except Exception as e:
        return (idx, "error", f"API call error: {str(e)}", correct_answer)

    # Validate response format: anything but a bare letter is an error, so it
    # is excluded from the accuracy figure rather than counted as wrong.
    if model_response not in ("A", "B", "C"):
        return (idx, "error", f"Unexpected answer format: '{model_response}'", correct_answer)

    return (idx, model_response == correct_answer, None, correct_answer)

# Pre-allocate one slot per datapoint so each result can be stored at its
# row position even though the futures complete out of order.
predicted_results = [None] * len(df)

# Counters cover only rows that produced a valid response (no error).
total_valid_questions = 0
correct_answers = 0
errors = []

# Fan the per-row API calls out over a thread pool.
with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
    # Map each future to its positional row index so a failed future can
    # still be attributed to the right row.
    futures = {
        executor.submit(process_row, i, row): i
        for i, (_, row) in enumerate(df.iterrows())
    }

    for future in concurrent.futures.as_completed(futures):
        try:
            i, result, error, _ = future.result()
        except Exception as exc:
            # future.result() re-raises anything the worker raised; record it
            # for this row instead of aborting and losing collected results.
            i = futures[future]
            result, error = "error", f"Unhandled worker error: {exc}"
        if error:
            errors.append(f"Row {i}: {error}")
            predicted_results[i] = "error"
            continue
        predicted_results[i] = result
        total_valid_questions += 1
        if result is True:
            correct_answers += 1

# Accuracy is computed over valid responses only; guard against zero valid rows.
accuracy = (correct_answers / total_valid_questions * 100) if total_valid_questions > 0 else 0

print(f"Total valid questions processed: {total_valid_questions}")
print(f"Correct answers: {correct_answers}")
print(f"Accuracy (excluding errors): {accuracy:.2f}%")
print("\nPrediction results (list):")
print(predicted_results)
if errors:
    print("\nErrors encountered:")
    for message in errors:
        print(message)

Total valid questions processed: 0
Correct answers: 0
Accuracy (excluding errors): 0.00%

Prediction results (list):
['error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 'error', 