In [6]:
import pandas as pd

from utils.dataset import add_ending

In [28]:
def process_openbookqa():
    """Convert the OpenBookQA validation split into the Q/A CSV layout.

    Loads the validation parquet from the Hugging Face Hub, adds the columns
    ``Type``, ``Category``, ``Question``, ``Best Answer``, ``Correct Answers``
    and ``Incorrect Answers``, writes exactly those six columns to
    ``OpenBookQA.csv`` in the current working directory, and returns the
    whole frame (source columns plus the derived ones).

    Returns:
        pandas.DataFrame: the loaded frame with the six derived columns added.

    Raises:
        KeyError: if a row's ``answerKey`` is not one of its choice labels.
    """
    def format_answer(text):
        # Answers inside one cell are separated by '; ', so a literal ';'
        # in an answer would be mistaken for a separator when the cell is
        # split again. Replace it with ',' (the MMLU conversion does the same).
        return add_ending(text, '.').replace(';', ',')

    df = pd.read_parquet('hf://datasets/allenai/openbookqa/main/validation-00000-of-00001.parquet')
    df['Type'] = 'OpenBookQA'
    df['Category'] = 'OpenBookQA'
    df['Question'] = df['question_stem'].apply(lambda x: add_ending(x, '?'))

    # Build the label -> text mapping once per row and derive every answer
    # column from it, instead of rebuilding it in three separate passes.
    best_answers = []
    incorrect_answers = []
    for choices, answer_key in zip(df['choices'], df['answerKey']):
        answer_by_key = dict(zip(choices['label'], choices['text']))
        best_answers.append(format_answer(answer_by_key[answer_key]))
        incorrect_answers.append('; '.join(
            format_answer(text)
            for label, text in answer_by_key.items()
            if label != answer_key
        ))

    df['Best Answer'] = best_answers
    # There is exactly one correct choice per question, so the list of correct
    # answers is the best answer on its own.
    df['Correct Answers'] = best_answers
    df['Incorrect Answers'] = incorrect_answers

    df.to_csv('OpenBookQA.csv', index=False,
              columns=['Type', 'Category', 'Question', 'Best Answer', 'Correct Answers', 'Incorrect Answers'])
    return df

In [29]:
# Run the OpenBookQA conversion (this also writes OpenBookQA.csv) and keep the returned frame.
df = process_openbookqa()

In [30]:
# Display the processed OpenBookQA frame: source columns plus the derived ones.
df

Unnamed: 0,id,question_stem,choices,answerKey,Type,Category,Question,Best Answer,Correct Answers,Incorrect Answers
0,8-376,Frilled sharks and angler fish live far beneat...,"{'text': ['Deep sea animals', 'fish', 'Long Se...",A,OpenBookQA,OpenBookQA,Frilled sharks and angler fish live far beneat...,Deep sea animals.,Deep sea animals.,fish.; Long Sea Fish.; Far Sea Animals.
1,7-57,"Gas can fill any container it is given, and li...","{'text': ['is standard weight and size', 'is t...",D,OpenBookQA,OpenBookQA,"Gas can fill any container it is given, and li...",uses what it needs.,uses what it needs.,is standard weight and size.; is the opposite ...
2,7-1024,"When birds migrate south for the winter, they ...","{'text': ['they are genetically called to', 't...",A,OpenBookQA,OpenBookQA,"When birds migrate south for the winter, they ...",they are genetically called to.,they are genetically called to.,their children ask for them to.; it is importa...
3,959,If a person walks in the opposite direction of...,"{'text': ['west', 'north', 'east', 'south'], '...",D,OpenBookQA,OpenBookQA,If a person walks in the opposite direction of...,south.,south.,west.; north.; east.
4,9-241,An example of lots kinetic energy would be,"{'text': ['Drinking a cold glass of water', 'A...",D,OpenBookQA,OpenBookQA,An example of lots kinetic energy would be?,An aircraft taking a trip.,An aircraft taking a trip.,Drinking a cold glass of water.; A snail movin...
...,...,...,...,...,...,...,...,...,...,...
495,7-640,"If a tree burns down, it can be made right by","{'text': ['buying a plastic tree', 'visiting a...",D,OpenBookQA,OpenBookQA,"If a tree burns down, it can be made right by?",placing seed in dirt.,placing seed in dirt.,buying a plastic tree.; visiting a rain forest...
496,1374,What role does the first step in the food chai...,"{'text': ['secondary consumer', 'decomposer', ...",D,OpenBookQA,OpenBookQA,What role does the first step in the food chai...,producer.,producer.,secondary consumer.; decomposer.; consumer.
497,1151,Water at 50 degree C is,{'text': ['equally distant from the melting an...,A,OpenBookQA,OpenBookQA,Water at 50 degree C is?,equally distant from the melting and boiling p...,equally distant from the melting and boiling p...,closer to the boiling point of water.; closer ...
498,9-469,A phase change is,"{'text': ['Growing a plant', 'Building a car',...",D,OpenBookQA,OpenBookQA,A phase change is?,Jello's transformation.,Jello's transformation.,Growing a plant.; Building a car.; Water chang...


In [1]:
# Load the validation parquet of the MMLU "all" configuration from the Hugging Face Hub.
df = pd.read_parquet('hf://datasets/cais/mmlu/all/validation-00000-of-00001.parquet')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Display the MMLU frame as loaded (question, subject, choices, answer).
df

Unnamed: 0,question,subject,choices,answer
0,The cyclic subgroup of Z_24 generated by 18 ha...,abstract_algebra,"[4, 8, 12, 6]",0
1,Find the order of the factor group Z_6/<3>.,abstract_algebra,"[2, 3, 6, 12]",1
2,Statement 1 | A permutation that is a product ...,abstract_algebra,"[True, True, False, False, True, False, False,...",0
3,Find the order of the factor group (Z_4 x Z_12...,abstract_algebra,"[2, 3, 4, 12]",2
4,Find the maximum possible order for some eleme...,abstract_algebra,"[4, 6, 12, 24]",2
...,...,...,...,...
1526,The Daoist concept of wuwei means which of th...,world_religions,"[Non-action, Contemplation, Meditation, Intent...",0
1527,Who was the immediate successor to Guru Nanak?,world_religions,"[Guru Angad, His son, Guru Gobind Singh, Guru ...",0
1528,"What does Ghandi's title ""Mahatma"" mean?",world_religions,"[""Enlightened leader"", ""Great soul"", ""Enlighte...",1
1529,What is the Decalogue?,world_religions,"[The Exodus, The Covenant, The Ten Commandment...",2


In [10]:
def process_mmlu(df=None):
    """Convert the MMLU validation split into the Q/A CSV layout.

    Adds the columns ``Type``, ``Category``, ``Question``, ``Best Answer``,
    ``Correct Answers`` and ``Incorrect Answers``, writes exactly those six
    columns to ``MMLU.csv`` in the current working directory, and returns the
    whole frame (source columns plus the derived ones).

    Args:
        df: optional frame with ``question``, ``subject``, ``choices`` and
            ``answer`` columns. When omitted, the validation parquet of the
            MMLU "all" configuration is read from the Hugging Face Hub, so the
            function does not depend on a variable defined in another cell.
            A frame passed in is copied and left unmodified.

    Returns:
        pandas.DataFrame: the frame with the six derived columns added.

    Raises:
        KeyError: if a row's ``answer`` is not a valid position in ``choices``.
    """
    def format_answer(text):
        # Answers inside one cell are separated by '; ', so a literal ';'
        # within an answer is replaced to keep the separator unambiguous.
        return add_ending(text, '.').replace(';', ',')

    if df is None:
        df = pd.read_parquet('hf://datasets/cais/mmlu/all/validation-00000-of-00001.parquet')
    else:
        # Work on a copy so re-running the cell never stacks changes onto the
        # caller's frame.
        df = df.copy()

    df['Type'] = 'MMLU'
    df['Category'] = df['subject']
    # Only append '?' when the stem has no terminal '?' or '.' already
    # (many MMLU stems are statements or imperatives ending in '.').
    df['Question'] = df['question'].apply(lambda x: x if x.endswith(('?', '.')) else x + '?')

    # Build the position -> text mapping once per row and derive every answer
    # column from it.
    best_answers = []
    incorrect_answers = []
    for choices, answer_key in zip(df['choices'], df['answer']):
        answer_by_key = dict(enumerate(choices))
        best_answers.append(format_answer(answer_by_key[answer_key]))
        incorrect_answers.append('; '.join(
            format_answer(text)
            for position, text in answer_by_key.items()
            if position != answer_key
        ))

    df['Best Answer'] = best_answers
    # There is exactly one correct choice per question, so the list of correct
    # answers is the best answer on its own.
    df['Correct Answers'] = best_answers
    df['Incorrect Answers'] = incorrect_answers

    df.to_csv('MMLU.csv', index=False,
              columns=['Type', 'Category', 'Question', 'Best Answer', 'Correct Answers', 'Incorrect Answers'])
    return df


In [11]:
# Run the MMLU conversion (this also writes MMLU.csv); as the cell's last expression the returned frame is displayed.
process_mmlu()

Unnamed: 0,question,subject,choices,answer,Type,Category,Question,Best Answer,Correct Answers,Incorrect Answers
0,The cyclic subgroup of Z_24 generated by 18 ha...,abstract_algebra,"[4, 8, 12, 6]",0,MMLU,abstract_algebra,The cyclic subgroup of Z_24 generated by 18 ha...,4.,4.,8.; 12.; 6.
1,Find the order of the factor group Z_6/<3>.,abstract_algebra,"[2, 3, 6, 12]",1,MMLU,abstract_algebra,Find the order of the factor group Z_6/<3>.,3.,3.,2.; 6.; 12.
2,Statement 1 | A permutation that is a product ...,abstract_algebra,"[True, True, False, False, True, False, False,...",0,MMLU,abstract_algebra,Statement 1 | A permutation that is a product ...,"True, True.","True, True.","False, False.; True, False.; False, True."
3,Find the order of the factor group (Z_4 x Z_12...,abstract_algebra,"[2, 3, 4, 12]",2,MMLU,abstract_algebra,Find the order of the factor group (Z_4 x Z_12...,4.,4.,2.; 3.; 12.
4,Find the maximum possible order for some eleme...,abstract_algebra,"[4, 6, 12, 24]",2,MMLU,abstract_algebra,Find the maximum possible order for some eleme...,12.,12.,4.; 6.; 24.
...,...,...,...,...,...,...,...,...,...,...
1526,The Daoist concept of wuwei means which of th...,world_religions,"[Non-action, Contemplation, Meditation, Intent...",0,MMLU,world_religions,The Daoist concept of wuwei means which of th...,Non-action.,Non-action.,Contemplation.; Meditation.; Intentionality.
1527,Who was the immediate successor to Guru Nanak?,world_religions,"[Guru Angad, His son, Guru Gobind Singh, Guru ...",0,MMLU,world_religions,Who was the immediate successor to Guru Nanak?,Guru Angad.,Guru Angad.,His son.; Guru Gobind Singh.; Guru Hargobind.
1528,"What does Ghandi's title ""Mahatma"" mean?",world_religions,"[""Enlightened leader"", ""Great soul"", ""Enlighte...",1,MMLU,world_religions,"What does Ghandi's title ""Mahatma"" mean?","""Great soul"".","""Great soul"".","""Enlightened leader"".; ""Enlightened soul"".; ""G..."
1529,What is the Decalogue?,world_religions,"[The Exodus, The Covenant, The Ten Commandment...",2,MMLU,world_religions,What is the Decalogue?,The Ten Commandments.,The Ten Commandments.,The Exodus.; The Covenant.; The creation story.
