In [1]:
!pip install python-dotenv --quiet
!pip install -U datasets --quiet
!pip install datasets>=2.15.0 --quiet

Note:

* When exploring the dataset, I noticed that 4 questions had incosistent choice number and such questions were **discarded** as they do not fit our prompt structure. I Kept 295 out of 299 questions with exactly 4 choices.




In [4]:
from dotenv import load_dotenv
import os
import openai
from google.colab import userdata


class OpenAIConfig:
    def __init__(self, model_id: str, temperature: float = 0.0, api_key: str = None):
        self.api_key = api_key
        self.client = openai.OpenAI(api_key=self.api_key)
        self.model_id = model_id
        self.temperature = temperature

    def ask(self, prompt: str) -> str:
        response = self.client.chat.completions.create(
            model=self.model_id,
            messages=[{"role": "user", "content": prompt}],
            temperature=self.temperature
        )
        return response.choices[0].message.content.strip().upper()

In [6]:
config = OpenAIConfig(
    model_id="id",
    temperature=0,
    api_key="key"
)

In [7]:
  #  Question: <question>
  #   A. <choice A>
  #   B. <choice B>
  #   C. <choice C>
  #   D. <choice D>
  #   Answer:

def format_prompt(example):
    choices = example['choices']['text']
    prompt = (
        f"Question: {example['question'].strip()}\n"
        f"A. {choices[0]}\n"
        f"B. {choices[1]}\n"
        f"C. {choices[2]}\n"
        f"D. {choices[3]}\n"
        f"Answer:"
    )
    return prompt

In [8]:
from datasets import load_dataset
ds = load_dataset("ai2_arc", "ARC-Challenge")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [9]:
val_split = ds["validation"]
print(val_split[0])
print(val_split.shape)

{'id': 'Mercury_SC_407695', 'question': 'Juan and LaKeisha roll a few objects down a ramp. They want to see which object rolls the farthest. What should they do so they can repeat their investigation?', 'choices': {'text': ['Put the objects in groups.', 'Change the height of the ramp.', 'Choose different objects to roll.', 'Record the details of the investigation.'], 'label': ['A', 'B', 'C', 'D']}, 'answerKey': 'D'}
(299, 4)


Below I investigate the validation dataset a bit.


In [10]:
# Check that all examples have answer key
has_all_keys = all([example["answerKey"] is not None for example in val_split])
print("All have ground truth:", has_all_keys)
missing = [i for i, example in enumerate(val_split) if example["answerKey"] is None]
print("Missing indexes:", missing)


All have ground truth: True
Missing indexes: []


In [19]:
# Check that all examples have 4 choices for the answer to fit our prompt structure
for example in val_split:
    num_choices = len(example['choices']['text'])
    if num_choices != 4:
        print(f"Question ID: {example['id']}")
        print(f"Question: {example['question'].strip()}")
        print(f"# Choices: {num_choices}")
        print(f"Choices: {example['choices']['text']}")
        print(f"Answer Key: {example['answerKey']}")
        print("-" * 50)

Question ID: NYSEDREGENTS_2014_4_4
Question: Which state of matter has no definite volume and no definite shape?
# Choices: 3
Choices: ['gas', 'liquid', 'solid']
Answer Key: A
--------------------------------------------------
Question ID: NYSEDREGENTS_2014_4_19
Question: As kittens grow into cats, their body weight usually
# Choices: 3
Choices: ['decreases', 'increases', 'remains the same']
Answer Key: B
--------------------------------------------------
Question ID: TIMSS_2003_8_pg29
Question: Which of the following organs is NOT situated in the abdomen?
# Choices: 5
Choices: ['liver', 'kidney', 'stomach', 'bladder', 'heart']
Answer Key: E
--------------------------------------------------
Question ID: NYSEDREGENTS_2014_4_28
Question: Large birds have been eating small animals in an area. If all of the large birds died from a disease, the number of small animals in the area would probably
# Choices: 3
Choices: ['decrease', 'increase', 'remain the same']
Answer Key: B
----------------



Note: I excluded 4 questions that did not contain exactly 4 answer choices to maintain a consistent prompt format for the fine-tuned model.

In [12]:
clean_val_split = [example for example in val_split if len(example['choices']['text']) == 4]
print(f"Kept {len(clean_val_split)} out of {len(val_split)} questions with exactly 4 choices.")

Kept 295 out of 299 questions with exactly 4 choices.


In [14]:
prompts = [format_prompt(q) for q in clean_val_split]
ground_truth = [q['answerKey'] for q in clean_val_split]

In [15]:
from tqdm import tqdm

predictions = []
for prompt in tqdm(prompts, desc="Querying model"):
    answer = config.ask(prompt)
    predictions.append(answer)

Querying model: 100%|██████████| 295/295 [02:09<00:00,  2.27it/s]


In [17]:
correct = sum([pred == gt for pred, gt in zip(predictions, ground_truth)])
accuracy = correct / len(ground_truth) * 100

print(f"Final accuracy on 100 samples: {accuracy:.1f}%")

Final accuracy on 100 samples: 88.5%


In [18]:
import re
from collections import Counter

# Get incorrectly answered questions
incorrect_questions = [
    item["question"]
    for item, pred, gt in zip(clean_val_split, predictions, ground_truth)
    if pred != gt
]

print(f"Number of incorrectly answered questions: {len(incorrect_questions)}")

# Split into words, lowercase and count
all_words = []

for question in incorrect_questions:
    words = re.findall(r'\b\w+\b', question.lower())
    all_words.extend(words)

counter = Counter(all_words)
top5 = counter.most_common(5)

formatted = ", ".join([f"{word}: {count}" for word, count in top5])
print(f"Top 5 most frequent words in incorrectly answered questions:\n{formatted}")

Number of incorrectly answered questions: 34
Top 5 most frequent words in incorrectly answered questions:
the: 68, of: 42, a: 31, is: 24, to: 21


Top 5 most frequent words in incorrectly answered questions:

**the: 67, of: 41, a: 30, is: 24, to: 21**





Final note: just word frequency might not be very informative, and another measure could be TF-IDF which downweights common words and highlights terms that are relatively special to the questions the model got wrong. For TF-IDF I got these top words and their respective scores:
likely: 1.179, best: 0.980, stream: 0.881, water: 0.881, s: 0.794