# Benchmark GPT-3.5 and GPT-4-turbo on Kaggle LLM Science Exam

In [1]:
# !pip install -q openai kaggle

In [2]:
import requests, json
import openai
import pandas as pd
import warnings

from IPython.display import display, HTML, Markdown

In [3]:
from openai.types.chat.chat_completion import ChatCompletion

In [4]:
from dotenv import load_dotenv
load_dotenv()

True

In [5]:
import os
import subprocess
from pathlib import Path
import zipfile

COMPETITION='kaggle-llm-science-exam'

# fix as needed
data_dir = Path(os.getcwd()) / 'data'
data_dir.mkdir(exist_ok=True)

file_path = data_dir / f'{COMPETITION}.zip'

if not os.path.exists(file_path):
    # download dataset
    subprocess.run(['kaggle', 'competitions', 'download', '-p', data_dir, '-c', COMPETITION], check=True)
    # subprocess.run(['unzip', 'kaggle-llm-science-exam.zip'], check=True)
    with zipfile.ZipFile(file_path, 'r') as zip_ref:
        zip_ref.extractall(data_dir)


In [6]:
train = pd.read_csv(data_dir / 'train.csv')
train.head()

Unnamed: 0,id,prompt,A,B,C,D,E,answer
0,0,Which of the following statements accurately d...,MOND is a theory that reduces the observed mis...,MOND is a theory that increases the discrepanc...,MOND is a theory that explains the missing bar...,MOND is a theory that reduces the discrepancy ...,MOND is a theory that eliminates the observed ...,D
1,1,Which of the following is an accurate definiti...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,A
2,2,Which of the following statements accurately d...,The triskeles symbol was reconstructed as a fe...,The triskeles symbol is a representation of th...,The triskeles symbol is a representation of a ...,The triskeles symbol represents three interloc...,The triskeles symbol is a representation of th...,A
3,3,What is the significance of regularization in ...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,C
4,4,Which of the following statements accurately d...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,D


In [7]:
row = train.iloc[0]
row

id                                                        0
prompt    Which of the following statements accurately d...
A         MOND is a theory that reduces the observed mis...
B         MOND is a theory that increases the discrepanc...
C         MOND is a theory that explains the missing bar...
D         MOND is a theory that reduces the discrepancy ...
E         MOND is a theory that eliminates the observed ...
answer                                                    D
Name: 0, dtype: object

Experiment with a row and GPT-3.5

In [8]:
row = train.iloc[0]

system_message = 'Answer the following multiple-choice question by providing the three most likely answers, in order or likelihood, specified by their letter followed by a space, e.g. A B C'
user_message = f'Question: {row.prompt}\n'
user_message += '\n'.join([f'{l}) {row[l]}' for l in 'ABCDE'])
user_message += '\n\nProvide the answer in the form of the three letter choices only, without explanation, e.g. A D B\n'
print(system_message)
print(user_message)


Answer the following multiple-choice question by providing the three most likely answers, in order or likelihood, specified by their letter followed by a space, e.g. A B C
Question: Which of the following statements accurately describes the impact of Modified Newtonian Dynamics (MOND) on the observed "missing baryonic mass" discrepancy in galaxy clusters?
A) MOND is a theory that reduces the observed missing baryonic mass in galaxy clusters by postulating the existence of a new form of matter called "fuzzy dark matter."
B) MOND is a theory that increases the discrepancy between the observed missing baryonic mass in galaxy clusters and the measured velocity dispersions from a factor of around 10 to a factor of about 20.
C) MOND is a theory that explains the missing baryonic mass in galaxy clusters that was previously considered dark matter by demonstrating that the mass is in the form of neutrinos and axions.
D) MOND is a theory that reduces the discrepancy between the observed missing 

## Get functions needed for API calling setup

In [42]:
# GPT_MODEL='gpt-3.5-turbo-1106'
# cost_prompt_tokens = 0.001 / 1000
# cost_completion_tokens = 0.002 / 1000
GPT_MODEL='gpt-4-1106-preview' # gpt-4 turbo preview
cost_prompt_tokens = 0.01 / 1000
cost_completion_tokens = 0.03 / 1000

In [9]:
# prepare the messages array
def create_messages_input(system_message, user_message, existing_messages=None) -> list:
    """
    Create a list of messages for the OpenAI Chat Completions API.

    :param system_message: A string containing the system message.
    :param user_message: A string containing the user's message.
    :param existing_messages: An optional list of existing messages to append to.
    :return: A list of messages including the system message, any existing messages, and the new user message.
    """
    messages = [None] if existing_messages is None else existing_messages
    # Always start with the system message
    messages[0] = {"role": "system", "content": system_message}
    # Append the new user message
    messages.append({"role": "user", "content": user_message})

    return messages

In [11]:
client = openai.OpenAI()
# client = openai.AsyncClient()

In [43]:
def get_chat_response(system_message, user_message, existing_messages=None, temperature=0.7, seed=None, verbose=False) -> ChatCompletion | None:
    messages = create_messages_input(system_message, user_message, existing_messages)
    response = None
    try:
        response = client.chat.completions.create(
            model=GPT_MODEL,
            messages=messages,
            seed=seed,
            max_tokens=1024,
            temperature=temperature,
        )

        if response.choices[0].finish_reason == 'length':
            warnings.warn('Warning: Reached max tokens')

        if verbose:
            response_content = response.choices[0].message.content
            system_fingerprint = response.system_fingerprint
            prompt_tokens = response.usage.prompt_tokens if response.usage else None
            completion_tokens = response.usage.completion_tokens if response.usage else None

            table = f"""
            <table>
            <tr><th>Response</th><td>{response_content}</td></tr>
            <tr><th>System Fingerprint</th><td>{system_fingerprint}</td></tr>
            <tr><th>Number of prompt tokens</th><td>{prompt_tokens}</td></tr>
            <tr><th>Number of completion tokens</th><td>{completion_tokens}</td></tr>
            </table>
            """
            display(HTML(table))

    except Exception as e:
        print(f'An error occurred: {e}')
        # return None
        return response
    return response

In [52]:
# TODO use JSON return in a useful format https://platform.openai.com/docs/guides/text-generation/json-mode
system_messaage = 'You are a helpful assistant that generates short stories about science fiction.'
user_message = 'Generate a story about a bladerunner'
response = get_chat_response(system_messaage, user_message)
response

0,1
Response,"In the year 2150, Earth had become a place where technology and humanity intertwined in ways previously unimaginable. The rise of artificial intelligence had led to the creation of lifelike androids, known as replicants, that were virtually indistinguishable from humans. However, as their presence grew, so did the fear of their potential to rebel against their creators. Enter Jax, a seasoned bladerunner, tasked with tracking down and ""retiring"" rogue replicants. Jax had seen it all, from the most cunning and deceptive models to the most violent and ruthless. His job was dangerous, but he was the best at what he did. One day, Jax was given a new assignment: to track down a group of replicants who had escaped from a high-security facility. These replicants were different, though. They were designed to be stronger, faster, and more intelligent than any before them. As Jax delved deeper into the case, he began to question the morality of his mission. Were these replicants truly dangerous, or were they simply fighting for their own freedom? As Jax pursued the fugitive replicants across the city, he found himself forming an unexpected bond with them. They were not the mindless machines he had been led to believe. They had hopes, dreams, and desires just like humans. Jax began to see them as more than just targets to be eliminated. In a dramatic showdown, Jax confronted the replicants, but instead of carrying out his mission, he chose to stand with them. Together, they worked to expose the corrupt practices of the corporation that had created them, and in doing so, they sparked a revolution that would change the course of history for both humans and replicants alike. Jax became a symbol of hope for a new era of coexistence between humans and their creations, and his actions paved the way for a future where understanding and compassion triumphed over fear and prejudice. And as the sun set on the city, Jax knew that he had played a pivotal role in shaping the destiny of his world."
System Fingerprint,fp_eeff13170a
Number of prompt tokens,32
Number of completion tokens,422


ChatCompletion(id='chatcmpl-8ONkPVqp0ellKDoekaHro8rBQ4YM4', choices=[Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='In the year 2150, Earth had become a place where technology and humanity intertwined in ways previously unimaginable. The rise of artificial intelligence had led to the creation of lifelike androids, known as replicants, that were virtually indistinguishable from humans. However, as their presence grew, so did the fear of their potential to rebel against their creators.\n\nEnter Jax, a seasoned bladerunner, tasked with tracking down and "retiring" rogue replicants. Jax had seen it all, from the most cunning and deceptive models to the most violent and ruthless. His job was dangerous, but he was the best at what he did.\n\nOne day, Jax was given a new assignment: to track down a group of replicants who had escaped from a high-security facility. These replicants were different, though. They were designed to be stronger, faster, and more intelli

Test individually some questions to get a feel for the results

In [13]:
row = train.iloc[0]

system_message = 'Answer the following multiple-choice question by providing the three most likely answers, in order or likelihood, specified by their letter followed by a space, e.g. A B C'
user_message = f'Question: {row.prompt}\n'
user_message += '\n'.join([f'{l}) {row[l]}' for l in 'ABCDE'])
user_message += '\n\nProvide the answer in the form of the three letter choices only, without explanation, e.g. A D B\n'
print(system_message)
print(user_message)


Answer the following multiple-choice question by providing the three most likely answers, in order or likelihood, specified by their letter followed by a space, e.g. A B C
Question: Which of the following statements accurately describes the impact of Modified Newtonian Dynamics (MOND) on the observed "missing baryonic mass" discrepancy in galaxy clusters?
A) MOND is a theory that reduces the observed missing baryonic mass in galaxy clusters by postulating the existence of a new form of matter called "fuzzy dark matter."
B) MOND is a theory that increases the discrepancy between the observed missing baryonic mass in galaxy clusters and the measured velocity dispersions from a factor of around 10 to a factor of about 20.
C) MOND is a theory that explains the missing baryonic mass in galaxy clusters that was previously considered dark matter by demonstrating that the mass is in the form of neutrinos and axions.
D) MOND is a theory that reduces the discrepancy between the observed missing 

In [32]:
response = get_chat_response(system_message, user_message)

0,1
Response,B D E
System Fingerprint,fp_eeff13170a
Number of prompt tokens,303
Number of completion tokens,3


In [36]:
print(user_message)
print(response.choices[0].message.content)
print('Answer: ', row.answer)

Question: Which of the following statements accurately describes the impact of Modified Newtonian Dynamics (MOND) on the observed "missing baryonic mass" discrepancy in galaxy clusters?
A) MOND is a theory that reduces the observed missing baryonic mass in galaxy clusters by postulating the existence of a new form of matter called "fuzzy dark matter."
B) MOND is a theory that increases the discrepancy between the observed missing baryonic mass in galaxy clusters and the measured velocity dispersions from a factor of around 10 to a factor of about 20.
C) MOND is a theory that explains the missing baryonic mass in galaxy clusters that was previously considered dark matter by demonstrating that the mass is in the form of neutrinos and axions.
D) MOND is a theory that reduces the discrepancy between the observed missing baryonic mass in galaxy clusters and the measured velocity dispersions from a factor of around 10 to a factor of about 2.
E) MOND is a theory that eliminates the observed m

Get result over questions

In [44]:
system_message = 'Answer the following multiple-choice question by providing the three most likely answers, in order or likelihood, specified by their letter followed by a space, e.g. A B C'
verbose = True
score = 0
n_questions = 200
prompt_tokens = completion_tokens = 0
temperature = 0.0

for index, row in train.head(n_questions).iterrows():
    user_message = f'Question: {row.prompt}\n'
    user_message += '\n'.join([f'{l}) {row[l]}' for l in 'ABCDE'])
    user_message += '\n\nProvide the answer in the form of the three letter choices only, without explanation, e.g. A D B\n'

    response = get_chat_response(system_message, user_message, verbose=verbose, temperature=temperature)

    assistant_message = response.choices[0].message.content

    if verbose:
        print(f'User Message: {user_message}')
        print(f'Response: {assistant_message}')
        print(f'Answer: {row.answer}')
        print('---')

    # Score
    answers = assistant_message.split(' ')[:3]
    correct_answer = row.answer
    for i, pred in enumerate(answers):
        if pred == correct_answer:
            score += 1 / (i + 1)  # 1 for first being correct, 1/2 for second, 1/3 for third

    prompt_tokens += response.usage.prompt_tokens
    completion_tokens += response.usage.completion_tokens

    if index >= 3:
        verbose = False


total_cost = prompt_tokens * cost_prompt_tokens + completion_tokens * cost_completion_tokens
print(f'Average score: {score / n_questions}')
print(f'Cost: ${total_cost}, prompt_tokens={prompt_tokens}, completion_tokens={completion_tokens}')

0,1
Response,D E B
System Fingerprint,fp_a24b4d720c
Number of prompt tokens,303
Number of completion tokens,3


User Message: Question: Which of the following statements accurately describes the impact of Modified Newtonian Dynamics (MOND) on the observed "missing baryonic mass" discrepancy in galaxy clusters?
A) MOND is a theory that reduces the observed missing baryonic mass in galaxy clusters by postulating the existence of a new form of matter called "fuzzy dark matter."
B) MOND is a theory that increases the discrepancy between the observed missing baryonic mass in galaxy clusters and the measured velocity dispersions from a factor of around 10 to a factor of about 20.
C) MOND is a theory that explains the missing baryonic mass in galaxy clusters that was previously considered dark matter by demonstrating that the mass is in the form of neutrinos and axions.
D) MOND is a theory that reduces the discrepancy between the observed missing baryonic mass in galaxy clusters and the measured velocity dispersions from a factor of around 10 to a factor of about 2.
E) MOND is a theory that eliminates 

0,1
Response,A B E
System Fingerprint,fp_a24b4d720c
Number of prompt tokens,352
Number of completion tokens,3


User Message: Question: Which of the following is an accurate definition of dynamic scaling in self-similar systems?
A) Dynamic scaling refers to the evolution of self-similar systems, where data obtained from snapshots at fixed times exhibits similarity to the respective data taken from snapshots of any earlier or later time. This similarity is tested by a certain time-dependent stochastic variable x.
B) Dynamic scaling refers to the non-evolution of self-similar systems, where data obtained from snapshots at fixed times is similar to the respective data taken from snapshots of any earlier or later time. This similarity is tested by a certain time-dependent stochastic variable x.
C) Dynamic scaling refers to the evolution of self-similar systems, where data obtained from snapshots at fixed times is dissimilar to the respective data taken from snapshots of any earlier or later time. This dissimilarity is tested by a certain time-independent stochastic variable y.
D) Dynamic scaling ref

0,1
Response,A B D
System Fingerprint,fp_a24b4d720c
Number of prompt tokens,476
Number of completion tokens,3


User Message: Question: Which of the following statements accurately describes the origin and significance of the triskeles symbol?
A) The triskeles symbol was reconstructed as a feminine divine triad by the rulers of Syracuse, and later adopted as an emblem. Its usage may also be related to the Greek name of Sicily, Trinacria, which means "having three headlands." The head of Medusa at the center of the Sicilian triskeles represents the three headlands.
B) The triskeles symbol is a representation of three interlinked spirals, which was adopted as an emblem by the rulers of Syracuse. Its usage in modern flags of Sicily has its origins in the ancient Greek name for the island, Trinacria, which means "Sicily with three corners." The head of Medusa at the center is a representation of the island's rich cultural heritage.
C) The triskeles symbol is a representation of a triple goddess, reconstructed by the rulers of Syracuse, who adopted it as an emblem. Its significance lies in the fact t

0,1
Response,A C D
System Fingerprint,fp_a24b4d720c
Number of prompt tokens,302
Number of completion tokens,3


User Message: Question: What is the significance of regularization in terms of renormalization problems in physics?
A) Regularizing the mass-energy of an electron with a finite radius can theoretically simplify calculations involving infinities or singularities, thereby providing explanations that would otherwise be impossible to achieve.
B) Regularizing the mass-energy of an electron with an infinite radius allows for the breakdown of a theory that is valid under one set of conditions. This approach can be applied to other renormalization problems as well.
C) Regularizing the mass-energy of an electron with a finite radius is a means of demonstrating that a system below a certain size can be explained without the need for further calculations. This approach can be applied to other renormalization problems as well.
D) Regularizing the mass-energy of an electron with an infinite radius can be used to provide a highly accurate description of a system under specific conditions. This appro

### Use function calling interface to get more structured output
(could also just use the new response_format='json')

In [86]:
from pydantic import BaseModel, Field

class Answer(BaseModel):
    answer: list[str] = Field(str, min_length=3, max_length=3)

class AnswerQuestion(BaseModel):
    question: str = Field(str, description='The question to answer')
    choices: list[str] = Field(str, description='The choices to choose from')
    answer: Answer = Field(Answer, description='The answer to the question')

class AnswerQuestionResponse(BaseModel):
    answer: Answer = Field(Answer, description='The answer to the question')
    reasoning: str = Field(str, description='The reasoning for the answer')

aqr = AnswerQuestionResponse(answer=Answer(answer=['A', 'B', 'C']), reasoning='Because I said so')
print(aqr.model_dump_json())
AnswerQuestionResponse.model_validate_json(aqr.model_dump_json())
AnswerQuestionResponse.model_validate_json('{"answer_question_response": {"answer": ["A", "B", "C"]}, "reasoning": "Because I said so"}')

{"answer":{"answer":["A","B","C"]},"reasoning":"Because I said so"}


AnswerQuestionResponse(answer=<class '__main__.Answer'>, reasoning='Because I said so')

In [48]:
def answer_question(answer: str, reasoning: str):
    table = f"""
    <table>
    <tr><th>Answer</th><td>{answer}</td></tr>
    <tr><th>Reasoning</th><td>{reasoning}</td></tr>
    </table>
    """
    display(HTML(table))
    return answer, reasoning

def get_function_call_arguments(system_messaage, user_message, verbose=False, temperature=0.7, seed=None):
    messages = create_messages_input(system_messaage, user_message)

    tools = [
        {
            "type": "function",
            "function": {
                "name": "answer_question",
                "description": "Answers the provided question",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "reasoning": {
                            "type": "string",
                            "description": "Reasining for what the answer could be. Keep it short."
                        },
                        "answers": {
                            "type": "array",
                            "items": {
                                "type": "string",
                                "enum": ["A", "B", "C", "D", "E"],
                            },
                            "description": "Your top 3 guesses, from most to least likely. e.g. ['A', 'D', 'C']"
                        }
                    },
                    "required": ["reasoning", "answers"],
                },
            }
        }
    ]

    answers_reasoning = []
    response = None
    try:
        response = client.chat.completions.create(
            model=GPT_MODEL,
            messages=messages,
            seed=seed,
            max_tokens=1024,
            temperature=temperature,
            tools=tools,
            tool_choice={"type": "function", "function": {"name": "answer_question"}})
        response_message = response.choices[0].message
        tool_calls = response_message.tool_calls

        if not tool_calls or len(tool_calls) > 1:
            raise Exception(f'Expected exactly one tool call but got {0 if not tools_calls else len(tool_calls)}')

        for tool_call in tool_calls:
            if tool_call.function.name == 'answer_question':
                function_args = json.loads(tool_call.function.arguments)
                answers_reasoning.append(function_args)

        if response.choices[0].finish_reason == 'length':
            warnings.warn('Warning: Reached max tokens')

        if verbose:
            response_content = response.choices[0].message.content
            system_fingerprint = response.system_fingerprint
            prompt_tokens = response.usage.prompt_tokens if response.usage else None
            completion_tokens = response.usage.completion_tokens if response.usage else None

            table = f"""
            <table>
            <tr><th>Response</th><td>{response_content}</td></tr>
            <tr><th>System Fingerprint</th><td>{system_fingerprint}</td></tr>
            <tr><th>Number of prompt tokens</th><td>{prompt_tokens}</td></tr>
            <tr><th>Number of completion tokens</th><td>{completion_tokens}</td></tr>
            </table>
            """
            display(HTML(table))
    except Exception as e:
        print(f'An error occurred: {e}')
        print(messages)
        print(response)
        # return None
    return answers_reasoning

In [51]:
row = train.iloc[20]
system_message = 'Answer the following multiple-choice question by providing the three most likely answers, in order or likelihood, specified by their letter followed by a space, e.g. A B C'
user_message = f'Question: {row.prompt}\n'
user_message += '\n'.join([f'{l}) {row[l]}' for l in 'ABCDE'])
user_message += '\n\nProvide the answer in the form of the three letter choices only, without explanation, e.g. A D B\n'

answers_reasoning = get_function_call_arguments(system_message, user_message, temperature=0.7, seed=42, verbose=True)

print('Predicted answer: ', answers_reasoning[0]['answers'])
print('Predicted reasoning: ', answers_reasoning[0]['reasoning'])
print('Answer: ', row.answer)

0,1
Response,
System Fingerprint,fp_a24b4d720c
Number of prompt tokens,313
Number of completion tokens,117


Predicted answer:  ['D', 'E', 'C']
Predicted reasoning:  Planetary systems are often categorized based on their orbital dynamics to understand the interactions between the planets and the stability of the system. Categories such as resonant, non-resonant-interacting, and hierarchical are common, and a system may exhibit characteristics of more than one category. The categorization is not based solely on the number of planets, nor are the categories limited to just resonant or hierarchical. The statement that planetary systems cannot be categorized based on their orbital dynamics is false.
Answer:  D


Test again on just the first fows of the training dataset

In [47]:
score = 0
n_questions = 200
temperature = 0.0
all_answers_reasoning = []

system_message = 'Answer the following multiple-choice question by providing the three most likely answers, in order or likelihood, specified by their letter followed by a space, e.g. A B C'

for index, row in train.head(n_questions).iterrows():
    user_message = f'Question: {row.prompt}\n'
    user_message += '\n'.join([f'{l}) {row[l]}' for l in 'ABCDE'])
    user_message += '\n\nProvide the answer in the form of the three letter choices only, without explanation, e.g. A D B\n'

    # Returns a list of potential function calls for each message, so we're just expecting the 0th one
    answers_reasoning = get_function_call_arguments(system_message, user_message, temperature=temperature)[0]
    all_answers_reasoning.append(answers_reasoning)

    correct_answer = row.answer
    for i, pred in enumerate(answers_reasoning['answers']):
        if pred == correct_answer:
            score += 1 / (i + 1)  # 1 for first being correct, 1/2 for second, 1/3 for third

    print('\n'.join(str(x) for x in [index, row.prompt, correct_answer, answers_reasoning['answers'], answers_reasoning['reasoning']]))
    print('---')

print(f'Average score: {score / n_questions:.4f}')

0
Which of the following statements accurately describes the impact of Modified Newtonian Dynamics (MOND) on the observed "missing baryonic mass" discrepancy in galaxy clusters?
D
['E', 'D', 'B']
MOND (Modified Newtonian Dynamics) is a theory that proposes a modification to Newton's laws of gravity to explain the rotation curves of galaxies without invoking dark matter. It does not postulate new forms of matter like 'fuzzy dark matter' or neutrinos and axions as the missing mass. Instead, it modifies the gravitational force law at low accelerations to account for the observed dynamics. Therefore, it aims to reduce or eliminate the need for dark matter in explaining galactic rotation curves and the missing baryonic mass discrepancy.
---
1
Which of the following is an accurate definition of dynamic scaling in self-similar systems?
A
['A', 'E', 'B']
Dynamic scaling in self-similar systems refers to the property where the system evolves over time, but the statistical properties at differen

IndexError: list index out of range

## Results

**gpt-3.5-turbo-1106**

Chat messages (no function calling):
  - Average score 100 questions: 0.8033
  - Average score 200 questions: 0.7992

Function calling:
  - Average score 100 questions: 0.7710
  - Average score 200 questions: 0.8000

### gpt-4-1106-preview' (gpt-4 turbo preview)
Chat messages (no function calling):
  - Average score 200 questions: 0.8875
Function calling:
  - Average score 200 questions: 


In [38]:
train.describe()

Unnamed: 0,id
count,200.0
mean,99.5
std,57.879185
min,0.0
25%,49.75
50%,99.5
75%,149.25
max,199.0
