### openai-batch notebook for gpt-4o-2024-08-06 results

In [None]:
import json
import os

In [None]:
from dotenv import load_dotenv

# Load variables from the local .env file, then read the OpenAI key from the
# process environment (None when the variable is not set).
load_dotenv(".env")
api_key = os.environ.get("OPENAI_API_KEY")

In [None]:
"""
Check which models the API key has access to:

curl https://api.openai.com/v1/models \
  -H "Authorization: Bearer $OPENAI_API_KEY"

"""

In [None]:
# Paths for the full question set (CSV in, JSONL out).
input_file = "../csvs/questions.csv"
output_file = "../csvs/questions.jsonl"

In [None]:
def define_general_prompt():
    """Return the system prompt text sent with every request.

    The text is returned verbatim, including its leading/trailing whitespace,
    the line continuations inside the first paragraph, and the unmatched
    leading single quote.

    NOTE(review): the opening ' before "Only use" has no closing partner and
    looks accidental; it is kept because editing the prompt would change the
    model input.
    """
    return """
    
    'Only use results from the genomic data commons in your response and provide frequencies \
     as a percentage in the result. Report the result in the following output JSON format, strictly using \
     the structure "The final answer is: <frequency %>", followed by top references to publications from which you \
     obtained your response:

    {
        result: The final answer is: <frequency %>
        references: <list of references>
    }

    """

In [None]:
# Build the prompt once; get_jsonl reads this module-level name for the system message.
general_prompt = define_general_prompt()

In [None]:
# Echo the prompt text as the cell output.
general_prompt

In [None]:
def get_jsonl(input_file, output_file, *, system_prompt=None, model="gpt-4o-2024-08-06"):
    """Write one Batch API chat-completion request per question line.

    The first line of ``input_file`` is treated as a header and skipped. Every
    following non-blank line is stripped of surrounding whitespace and of
    leading/trailing quote characters, then used as the user message of one
    request. Requests are numbered ``request-1``, ``request-2``, ... in the
    order they are written.

    Args:
        input_file: Path of the file holding a header line followed by one
            question per line.
        output_file: Path of the JSONL file to create (overwritten if present).
        system_prompt: System message for every request. When ``None`` the
            module-level ``general_prompt`` is used.
        model: Model name placed in every request body.

    NOTE(review): lines are read verbatim rather than parsed as CSV, so a
    quoted field spanning several lines or containing doubled quotes is not
    unescaped, and a multi-column row is sent whole. Confirm the input never
    contains such rows.
    """
    if system_prompt is None:
        # Resolved at call time so existing callers keep using the notebook global.
        system_prompt = general_prompt

    with open(input_file, "r", encoding="utf-8") as infile, \
            open(output_file, "w", encoding="utf-8") as outfile:
        # Skip the header; the default keeps an empty file from raising StopIteration.
        next(infile, None)

        # Blank lines carry no question. pandas.read_csv skips them by default
        # too, so numbering only non-blank lines keeps request-N aligned with
        # the N-th row of the same file when it is loaded with pandas.
        stripped_lines = (line.strip() for line in infile)
        non_blank = (text for text in stripped_lines if text)

        for i, text in enumerate(non_blank, 1):
            question = text.strip("'\"")  # drop surrounding quote characters
            request = {
                "custom_id": f"request-{i}",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": model,
                    "temperature": 0,
                    "seed": 2000,
                    "response_format": {"type": "json_object"},
                    "messages": [
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": question},
                    ],
                },
            }
            outfile.write(json.dumps(request) + "\n")


### process results from batch

#### any errors from batch, reprocess them
- look at error.jsonl and process the failed queries separately
- errors sometimes occur because the suffix "-batch" is appended to the model name, producing a model name that doesn't exist
- OpenAI probably adds this suffix to apply the batch API discount
- rerunning the failed queries fixes the problem, and the model name doesn't change

In [None]:
import pandas as pd
# Show full cell text instead of truncating long column values.
pd.set_option('display.max_colwidth', None)
questions = pd.read_csv('../csvs/questions.csv')
# Select the rows to re-run by 0-based position.
# NOTE(review): presumably taken from the failed entries in error.jsonl; get_jsonl numbers
# requests from 1, so confirm these values were converted to 0-based positions.
error_questions = questions.iloc[[5, 6, 24, 80, 141, 169, 238, 246, 247, 265, 272, 287, 289, 309, 311, 407, 408, 416, 458, 485, 488]]

In [None]:
# Save the rows to re-run, without writing the DataFrame index as a column.
error_questions.to_csv('../csvs/error_questions_gpt4.csv', index=False)

In [None]:
# Build batch requests for the re-run rows only. get_jsonl numbers requests from 1 for
# every file it writes, so the custom_ids here refer to positions within
# error_questions_gpt4.csv, not to rows of questions.csv.
get_jsonl(input_file='../csvs/error_questions_gpt4.csv', output_file='../csvs/error_questions_gpt4.jsonl')

### process results

In [None]:
# Collect the model's answer and reference list from each batch output record.
# Every non-blank line yields exactly one entry in both lists: the parsed
# values, or (None, []) when the record cannot be parsed.
results = []
references_list = []

# NOTE(review): entries are collected in file order and the record's custom_id is
# not consulted here. Confirm the file is ordered to match the rows it is later
# joined to; batch output order is not necessarily the input order.
with open("/opt/gpudata/aartiv/qag/gpt-4o-2024-08-06/batch_gpt4o_latest.jsonl", "r", encoding="utf-8") as file:
    for line in file:
        if not line.strip():
            # a blank line (e.g. a trailing newline) is not a record
            continue

        try:
            # step 1: parse the record and get the assistant message content
            # (which is itself a JSON string)
            record = json.loads(line)
            content_str = record["response"]["body"]["choices"][0]["message"]["content"]

            # step 2: parse that content string as JSON
            content_json = json.loads(content_str)

            # step 3: extract result and references
            result = content_json.get("result")
            references = content_json.get("references", [])

        except (KeyError, IndexError, TypeError, AttributeError, json.JSONDecodeError) as e:
            # KeyError/IndexError: expected fields or choices are missing.
            # TypeError: "response" or "content" is null.
            # AttributeError: the content parsed to something other than an object.
            # JSONDecodeError: the line or the content is not valid JSON.
            print("Error parsing line:", e)
            result, references = None, []

        # appended outside the try so both lists always grow together
        results.append(result)
        references_list.append(references)

# Output examples
print("Results:\n", results)
print("\nReferences:\n", references_list)


In [None]:
import pandas as pd
# Reload the full question table and attach the parsed outputs as new columns.
questions = pd.read_csv(
    '../csvs/questions.csv'
)
# Both lists are assigned positionally: entry i goes to row i. pandas raises a
# ValueError here when a list's length differs from the number of rows.
questions['gpt-4o-2024-08-06_base_output'] = results
questions['gpt-4o-2024-08-06_references'] = references_list

In [None]:
# Display the first two rows, including the newly added columns.
questions.head(n=2)

In [None]:
# Display (row count, column count) of the combined table.
questions.shape

In [None]:
# Save the questions together with the model outputs, without the index column.
questions.to_csv('../csvs/gpt4o_base_results.csv', index=False)