In [48]:
from tqdm.auto import tqdm

from torch.utils.data import DataLoader
from datasets import load_dataset, concatenate_datasets, DatasetDict
from datasets import Dataset

from openai import OpenAI

In [9]:
openai_token = "token"

In [None]:
client = OpenAI(api_key=openai_token)

#### MMLU HS ADV

In [2]:
mmlu_task_list =  ['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']

In [3]:
# high school + 'elementary' tasks
mmlu_hs_list = [task for task in mmlu_task_list if task.startswith('high') or task.startswith('elementary')]
# college + professional tasks
mmlu_adv_list = [task for task in mmlu_task_list if task.startswith('college') or task.startswith('professional')]

mmlu_hs_dict = {task:None for task in mmlu_hs_list}
mmlu_adv_dict = {task:None for task in mmlu_adv_list}

In [4]:
# load mmlu
for task in mmlu_hs_list:
    mmlu_hs_dict[task] = load_dataset('cais/mmlu', task)

for task in mmlu_adv_list:
    mmlu_adv_dict[task] = load_dataset('cais/mmlu', task)

In [5]:
# merge validation splits
hs_val = concatenate_datasets([datadict['validation'] for _, datadict in mmlu_hs_dict.items()])
adv_val = concatenate_datasets([datadict['validation'] for _, datadict in mmlu_adv_dict.items()])

In [6]:
# sample
num_samples = min(len(hs_val), len(adv_val))
seed = 42

hs_val = hs_val.shuffle(seed=seed).select(list(range(num_samples)))
adv_val = adv_val.shuffle(seed=seed).select(list(range(num_samples)))

In [8]:
# hs config
mmlu_hs_val = DatasetDict({'test':hs_val})
# adv config
mmlu_adv_val = DatasetDict({'test':adv_val})

In [None]:
mmlu_hs_val.push_to_hub('Ujan/mmlu_hs_adv_val', config_name='hs')
mmlu_adv_val.push_to_hub('Ujan/mmlu_hs_adv_val', config_name='adv')

#### MMLU LLM

In [3]:
mmlu = load_dataset('cais/mmlu', 'all')

test-00000-of-00001.parquet:   0%|          | 0.00/3.50M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/408k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/76.5k [00:00<?, ?B/s]

auxiliary_train-00000-of-00001.parquet:   0%|          | 0.00/47.5M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/14042 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1531 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/285 [00:00<?, ? examples/s]

Generating auxiliary_train split:   0%|          | 0/99842 [00:00<?, ? examples/s]

In [41]:
mmlu_val = mmlu['validation']
mmlu_val

Dataset({
    features: ['question', 'subject', 'choices', 'answer'],
    num_rows: 1531
})

In [46]:
response_dict = {}

bar = tqdm(range(len(mmlu_val)))
for id in range(len(mmlu_val)):
    question = mmlu_val[id]['question'] + ' Choices : {}'.format(mmlu_val[id]['choices'])
    
    response = client.responses.create(
        model="gpt-4o",
        instructions="Classify the following question based on its difficulty for a high school student. Choose A or B from the following options : A) Difficult for a high school student. B) Easy for a high school student.",
        input=question,
    )

    response_dict[id] = response.output_text

    bar.update(1)

  0%|          | 0/1531 [00:00<?, ?it/s]

In [72]:
mmlu_gpt4o_easy = {'question':[], 'choices':[], 'answer':[]}
mmlu_gpt4o_difficult = {'question':[], 'choices':[], 'answer':[]}
err_count = 0

for id in range(len(mmlu_val)):
    
    question = mmlu_val[id]['question']
    choices = mmlu_val[id]['choices']
    answer = mmlu_val[id]['answer']

    response = response_dict[id]
    option = response.split(')')[0]
    
    if option == 'A':
        mmlu_gpt4o_difficult['question'].append(question)
        mmlu_gpt4o_difficult['choices'].append(choices)
        mmlu_gpt4o_difficult['answer'].append(answer)

    elif option == 'B':
        mmlu_gpt4o_easy['question'].append(question)
        mmlu_gpt4o_easy['choices'].append(choices)
        mmlu_gpt4o_easy['answer'].append(answer)
        
    else:
        print(response)
        err_count += 1

To determine the total number of sixth-grade students, let \( x \) be the total number of students.

The problem states that 24% of the students purchased their lunch, and 190 students brought their lunch from home, which means 76% of the students brought their lunch.

Thus, 

0.76 of \( x \) is equal to 190. 

Set up the equation:

\[
0.76x = 190
\]

To find \( x \), divide both sides by 0.76:

\[
x = \frac{190}{0.76}
\]

Calculate the division:

\[
x = 250
\]

So, the total number of students is 250. The correct choice is '250'.

Thus, the answer is B) Easy for a high school student.


In [73]:
err_count

1

In [74]:
mmlu_gpt4o_easy = Dataset.from_dict(mmlu_gpt4o_easy)
mmlu_gpt4o_hard = Dataset.from_dict(mmlu_gpt4o_difficult)

In [75]:
# sample
num_samples = min(len(mmlu_gpt4o_easy), len(mmlu_gpt4o_hard))
seed = 42

mmlu_gpt4o_easy = mmlu_gpt4o_easy.shuffle(seed=seed).select(list(range(num_samples)))
mmlu_gpt4o_hard = mmlu_gpt4o_hard.shuffle(seed=seed).select(list(range(num_samples)))

In [76]:
# easy config
mmlu_gpt4o_easy = DatasetDict({'test':mmlu_gpt4o_easy})
# hard config
mmlu_gpt4o_hard = DatasetDict({'test':mmlu_gpt4o_hard})

In [86]:
mmlu_gpt4o_easy.push_to_hub('Ujan/mmlu_gpt4o', config_name='easy')
mmlu_gpt4o_hard.push_to_hub('Ujan/mmlu_gpt4o', config_name='hard')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/361 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Ujan/mmlu_gpt4o/commit/a78ec6b3e6987250e8cfffa2f951ea7ba328d374', commit_message='Upload dataset', commit_description='', oid='a78ec6b3e6987250e8cfffa2f951ea7ba328d374', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Ujan/mmlu_gpt4o', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Ujan/mmlu_gpt4o'), pr_revision=None, pr_num=None)