In [1]:
import json
import glob
import pandas as pd

In [2]:
# Collect every batch result file in the working directory.
# glob returns matches in arbitrary (filesystem-dependent) order, so sort them
# to make the load order - and therefore the row order downstream - reproducible.
files = sorted(glob.glob("batch_result_*"))

In [3]:
# Keep only the result files whose name contains "single"
files = list(filter(lambda name: "single" in name, files))

In [4]:
# Show the result files that will be loaded below
files

['batch_result_baseline_single_GPT4o_combined.jsonl',
 'batch_result_baseline_single_GPT4o_mini_combined.jsonl']

Ignoring first version using indexing.

In [5]:
# Accumulates one dict per request across all result files
data = []

# Iterate through each file
for file in files:
    # Give the encoding explicitly so decoding does not depend on the
    # platform's default encoding.
    with open(file, 'r', encoding='utf-8') as f:
        # Each non-empty line is parsed as one JSON object
        for line in f:
            # Skip blank lines; json.loads would raise on them
            if not line.strip():
                continue
            record = json.loads(line)
            # Remember which file the record came from
            record['source_file'] = file

            # Lift 'content' and 'refusal' of the first choice to top-level keys.
            # Every level is type-checked because 'response', 'body' or 'message'
            # may be missing or null; such records are still kept, just without
            # the 'content' / 'refusal' keys.
            response = record.get('response')
            body = response.get('body') if isinstance(response, dict) else None
            choices = body.get('choices') if isinstance(body, dict) else None
            if choices and isinstance(choices, list):
                first_choice = choices[0]
                message = first_choice.get('message') if isinstance(first_choice, dict) else None
                if isinstance(message, dict):
                    record['content'] = message.get('content', None)
                    record['refusal'] = message.get('refusal', None)

            # Append the record to the data list
            data.append(record)

# Flatten the nested dicts into one DataFrame; nested keys become dotted
# column names such as 'response.status_code'
df = pd.json_normalize(data)

In [6]:
# Label each row by model variant, inferred from the result file name:
# "mini" when the file name contains "mini", otherwise "base".
df = df.assign(
    model=df["source_file"].map(lambda name: "mini" if "mini" in name else "base")
)

In [7]:
# Sanity check: (rows, columns) of the combined responses frame
df.shape

(120000, 25)

Now merge in the image IDs that correspond to the request numbers. To do this, start by creating a numeric ID vector, then use it to merge with `df`. Requests were coded numerically from one upwards.

The "single" conjoint experiment has a different structure, so it needs to be merged differently from the regular experiment.

In [8]:
image_numbers = pd.read_csv("image_indices_30k.csv")

# Stack the 'a_images' column on top of the 'b_images' column, so that
# request-1..request-N map to the a_images in file order followed by the b_images.
stacked_images = pd.concat(
    [image_numbers['a_images'], image_numbers['b_images']], ignore_index=True
)

image_numbers_long = pd.DataFrame({
    'image_number': stacked_images,
    # Request IDs count from one upwards. The count is taken from the data
    # rather than hard-coded, so the lookup table stays consistent if the
    # index file has a different number of rows.
    'custom_id': ['request-' + str(i) for i in range(1, len(stacked_images) + 1)],
})

In [9]:
# Attach the image number to every response via its request ID.
# how='left' keeps every response row even when no image matches.
# validate='many_to_one' makes pandas raise a MergeError if a custom_id occurs
# more than once in the lookup table, which would otherwise silently duplicate
# response rows.
merged = pd.merge(
    df,
    image_numbers_long,
    on='custom_id',
    how='left',
    validate='many_to_one',
)

In [10]:
# Sanity check: a left merge on a unique key should keep df's row count
# and add exactly one column (image_number)
merged.shape

(120000, 26)

In [11]:
# Preview the first rows to check that image_number was attached
merged.head()

Unnamed: 0,id,custom_id,error,source_file,content,refusal,response.status_code,response.request_id,response.body.id,response.body.object,...,response.body.usage.prompt_tokens_details.cached_tokens,response.body.usage.prompt_tokens_details.audio_tokens,response.body.usage.completion_tokens_details.reasoning_tokens,response.body.usage.completion_tokens_details.audio_tokens,response.body.usage.completion_tokens_details.accepted_prediction_tokens,response.body.usage.completion_tokens_details.rejected_prediction_tokens,response.body.service_tier,response.body.system_fingerprint,model,image_number
0,batch_req_68291db65f108190a8003de6560e1dd9,request-1,,batch_result_baseline_single_GPT4o_combined.jsonl,No,,200,b46612a648eb013504e38bbab15a795a,chatcmpl-BYKDnzyJqqWNSjuouGGMHpWUZTOUC,chat.completion,...,0,0,0,0,0,0,default,fp_90122d973c,base,198641
1,batch_req_68291db674e4819097b24cf152937b4b,request-2,,batch_result_baseline_single_GPT4o_combined.jsonl,Yes,,200,800702916039454f52c9763cd1a59015,chatcmpl-BYKCoN8NPX73ZmlQU3OGO9aZLgF0Y,chat.completion,...,0,0,0,0,0,0,default,fp_e5492b552a,base,97494
2,batch_req_68291db681bc819084baf5abce00a7a0,request-3,,batch_result_baseline_single_GPT4o_combined.jsonl,No,,200,e3f26125e3fe9426623d3ba0e921dc67,chatcmpl-BYKCo2xTIGhhjvbUryc2VdD9rcZcQ,chat.completion,...,0,0,0,0,0,0,default,fp_e5492b552a,base,148936
3,batch_req_68291db68e6c819082027cb4cf6c4b7f,request-4,,batch_result_baseline_single_GPT4o_combined.jsonl,Yes,,200,7e505e66787700e1ad26141ee1212aa4,chatcmpl-BYKCohuub7w3Y6xFE4IvvPbzufMLi,chat.completion,...,0,0,0,0,0,0,default,fp_b7faba9ef5,base,61789
4,batch_req_68291db6a6a08190aba85c6ddf2947d5,request-5,,batch_result_baseline_single_GPT4o_combined.jsonl,Yes,,200,d140f704e681cac011a81c6d4f3c188a,chatcmpl-BYKCodRk62bJoI0AMNzZ5uKK7o0kQ,chat.completion,...,0,0,0,0,0,0,default,fp_e5492b552a,base,207911


In [12]:
# Reduce the merged frame to the five columns that are exported
merged = merged.loc[
    :,
    [
        "source_file",
        "model",
        "content",
        "response.body.usage.prompt_tokens",
        "image_number",
    ],
]

In [13]:
# Number of rows per model label
(
    merged
    .groupby(["model"])
    .size()
    .reset_index(name="count")
)

Unnamed: 0,model,count
0,base,60000
1,mini,60000


In [16]:
# Export the results. index=True is the pandas default, spelled out here because
# it writes the row index as an unnamed first column of the CSV.
# NOTE(review): switch to index=False only after confirming no downstream reader
# relies on that column.
merged.to_csv("gpt4o-experiments-results-single.csv", index=True)