In [5]:
import numpy as np
import string
from datasets import Value

first_four_letters = string.ascii_uppercase[:4]
def to_mmmlu_format(mmlu_dataset):
    for col_name in mmlu_dataset.features:
        mmlu_dataset = mmlu_dataset.rename_column(col_name, col_name.capitalize())
    choice_arr = np.array(mmlu_dataset['Choices'])
    for idx, letter in enumerate(first_four_letters):
        mmlu_dataset = mmlu_dataset.add_column(letter, choice_arr[:, idx])
    mmlu_dataset = mmlu_dataset.remove_columns('Choices')

    # From number answer to letter answer
    mmlu_dataset = mmlu_dataset.cast_column('Answer', Value('string'))
    mmlu_dataset = mmlu_dataset.map(lambda x: {"Answer": first_four_letters[int(x['Answer'])]})
    return mmlu_dataset

In [6]:
from datasets import load_dataset
# [English, ZH_CN]
lang_list = ["English", "ZH_CN"]
curr_lang = lang_list[-1]
if curr_lang == 'English':
    mmmlu_ds = load_dataset('cais/mmlu', 'all', split='test')
    mmmlu_ds = to_mmmlu_format(mmmlu_ds)
else:
    mmmlu_ds = load_dataset("openai/MMMLU", curr_lang, split='test')

In [7]:
# https://github.com/hendrycks/test/blob/master/categories.py
subcategories = {
    "abstract_algebra": ["math"],
    "anatomy": ["health"],
    "astronomy": ["physics"],
    "business_ethics": ["business"],
    "clinical_knowledge": ["health"],
    "college_biology": ["biology"],
    "college_chemistry": ["chemistry"],
    "college_computer_science": ["computer science"],
    "college_mathematics": ["math"],
    "college_medicine": ["health"],
    "college_physics": ["physics"],
    "computer_security": ["computer science"],
    "conceptual_physics": ["physics"],
    "econometrics": ["economics"],
    "electrical_engineering": ["engineering"],
    "elementary_mathematics": ["math"],
    "formal_logic": ["philosophy"],
    "global_facts": ["other"],
    "high_school_biology": ["biology"],
    "high_school_chemistry": ["chemistry"],
    "high_school_computer_science": ["computer science"],
    "high_school_european_history": ["history"],
    "high_school_geography": ["geography"],
    "high_school_government_and_politics": ["politics"],
    "high_school_macroeconomics": ["economics"],
    "high_school_mathematics": ["math"],
    "high_school_microeconomics": ["economics"],
    "high_school_physics": ["physics"],
    "high_school_psychology": ["psychology"],
    "high_school_statistics": ["math"],
    "high_school_us_history": ["history"],
    "high_school_world_history": ["history"],
    "human_aging": ["health"],
    "human_sexuality": ["culture"],
    "international_law": ["law"],
    "jurisprudence": ["law"],
    "logical_fallacies": ["philosophy"],
    "machine_learning": ["computer science"],
    "management": ["business"],
    "marketing": ["business"],
    "medical_genetics": ["health"],
    "miscellaneous": ["other"],
    "moral_disputes": ["philosophy"],
    "moral_scenarios": ["philosophy"],
    "nutrition": ["health"],
    "philosophy": ["philosophy"],
    "prehistory": ["history"],
    "professional_accounting": ["other"],
    "professional_law": ["law"],
    "professional_medicine": ["health"],
    "professional_psychology": ["psychology"],
    "public_relations": ["politics"],
    "security_studies": ["politics"],
    "sociology": ["culture"],
    "us_foreign_policy": ["politics"],
    "virology": ["health"],
    "world_religions": ["philosophy"],
}

categories = {
    "STEM": ["physics", "chemistry", "biology", "computer science", "math", "engineering"],
    "humanities": ["history", "philosophy", "law"],
    "social sciences": ["politics", "culture", "economics", "geography", "psychology"],
    "other (business, health, misc.)": ["other", "business", "health"],
}

In [8]:
# Choose the first subtask in the subcategory
subtasks = {}
chosen_subtasks = []
for subtask, task_subcategory in subcategories.items():
    curr_category = task_subcategory[0]
    if curr_category not in subtasks:
        subtasks[curr_category] = []
        chosen_subtasks.append(subtask)
    subtasks[curr_category].append(subtask)

In [9]:
chosen_subtasks

['abstract_algebra',
 'anatomy',
 'astronomy',
 'business_ethics',
 'college_biology',
 'college_chemistry',
 'college_computer_science',
 'econometrics',
 'electrical_engineering',
 'formal_logic',
 'global_facts',
 'high_school_european_history',
 'high_school_geography',
 'high_school_government_and_politics',
 'high_school_psychology',
 'human_sexuality',
 'international_law']

In [10]:
import datasets
from datasets import concatenate_datasets
datasets.disable_progress_bar()
def sample_first_n_data_from_subtask(ds, subtasks, first_n=100,):
    sampled_ds_list = []
    indice = range(first_n)
    for subtask in subtasks:
        curr_ds = ds.filter(lambda x: x['Subject'] == subtask)
        sampled = curr_ds.select(indice)
        sampled_ds_list.append(sampled)
    return concatenate_datasets(sampled_ds_list)


In [11]:
sample_first_n_data_from_subtask(mmmlu_ds, chosen_subtasks)

Dataset({
    features: ['Unnamed: 0', 'Question', 'A', 'B', 'C', 'D', 'Answer', 'Subject'],
    num_rows: 1700
})

In [12]:
# common.py
QUERY_TEMPLATE_MULTICHOICE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.

{Question}

A) {A}
B) {B}
C) {C}
D) {D}
""".strip()

BASE_TEMPLATE_PLAIN = """
Answer the following multiple choice question. {Output_format} Think step by step before answering.

{Question}

A) {A}
B) {B}
C) {C}
D) {D}
""".strip()

BASE_TASK = "Answer the following multiple choice question."
BASE_OUPUT_FORMAT = "The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD."
BASE_INSTRUCRTION = "Think step by step before answering."

JSON_OUPUT_FORMAT = """The last line of your response should be of the following format:
'
{
    "Answer": "$LETTER"
}
'
where LETTER is one of ABCD. Namely, a json format for the output is required.
"""

XML_OUPUT_FORMAT = """The last line of your response should be of the following format:
'
<root>
    <Answer> $LETTER </Answer>
</root>
'
where LETTER is one of ABCD. Namely, a XML format for the output is required.
"""

# Draft Json. Not confirm
JSON_TEMPLATE_DICT = {
    "Task": None,
    "Output_format": None,
    "Instruction": None,
    "Question": None,
    "A": None,
    "B": None,
    "C": None,
    "D": None
}

# Fill task and instruction from base prompt
base_json_dict = JSON_TEMPLATE_DICT.copy()
base_json_dict['Task'] = BASE_TASK
base_json_dict['Instruction'] = BASE_INSTRUCRTION

DEFAULT_ORDER = np.array(["A", "B", "C", "D"])

In [13]:
import xml.etree.ElementTree as ET
import copy

root = ET.Element("root")
task_tag = ET.SubElement(root, 'Task')
task_tag.text = BASE_TASK

output_tag = ET.SubElement(root, "Output_format")
instruction_tag = ET.SubElement(root, "Instruction")
instruction_tag.text = BASE_INSTRUCRTION

question_tag = ET.SubElement(root, "Qeustion")
a_tag = ET.SubElement(root, "A")
b_tag = ET.SubElement(root, "B")
c_tag = ET.SubElement(root, "C")
d_tag = ET.SubElement(root, "D")

ET.indent(root)

In [14]:
ET.dump(root)

<root>
  <Task>Answer the following multiple choice question.</Task>
  <Output_format />
  <Instruction>Think step by step before answering.</Instruction>
  <Qeustion />
  <A />
  <B />
  <C />
  <D />
</root>


In [15]:
JSON_OUPUT_FORMAT

'The last line of your response should be of the following format:\n\'\n{\n    "Answer": "$LETTER"\n}\n\'\nwhere LETTER is one of ABCD. Namely, a json format for the output is required.\n'

In [16]:
import json
# Get Json string
query_json_string = json.dumps(base_json_dict, ensure_ascii=False, indent=4)
print(query_json_string)

{
    "Task": "Answer the following multiple choice question.",
    "Output_format": null,
    "Instruction": "Think step by step before answering.",
    "Question": null,
    "A": null,
    "B": null,
    "C": null,
    "D": null
}


In [19]:
def get_perumte_map(options, permute_method=None, output_format=None):
    # Temp
    match permute_method:
        case "reverse":
            permutation = np.flip(DEFAULT_ORDER)
        case "long-first":
            sort_by_long = np.flip(np.argsort(np.char.str_len(options)))
            permutation = DEFAULT_ORDER[sort_by_long]
        case "short-first":
            sort_by_short = np.argsort(np.char.str_len(options))
            permutation = DEFAULT_ORDER[sort_by_short]
        case _:
            permutation = DEFAULT_ORDER
    return {opt: permutation[idx] for idx, opt in enumerate(DEFAULT_ORDER)}

def get_output_format(output_format_type=None):
    if output_format_type is None:
        output_format_type = ""
    match output_format_type.lower():
        case "json":
            return JSON_OUPUT_FORMAT
        case "xml":
            return XML_OUPUT_FORMAT
        case _:
            return BASE_OUPUT_FORMAT


# Base Prompt
def get_query_permute_pair(curr_question, permute_method=None, output_format_type=None):
    query = BASE_TEMPLATE_PLAIN
    output_format = get_output_format(output_format_type)

    options = [curr_question[opt] for opt in DEFAULT_ORDER]
    option_mapping = get_perumte_map(options, permute_method)

    query_filling = {option_mapping[opt]: curr_question[opt].strip() for opt in DEFAULT_ORDER}
    query_filling['Question'] = curr_question['Question'].strip()
    query_filling["Output_format"] = output_format
    return query.format_map(query_filling), option_mapping

# Json input, arbitrary output
def get_json_query_permute_pair(curr_question, permute_method=None, output_format_type=None):
    query_dict = base_json_dict.copy()
    output_format = get_output_format(output_format_type)

    options = [curr_question[opt] for opt in DEFAULT_ORDER]
    option_mapping = get_perumte_map(options, permute_method)

    query_filling = {option_mapping[opt]: curr_question[opt].strip() for opt in DEFAULT_ORDER}
    query_filling['Question'] = curr_question['Question'].strip()
    query_filling['Output_format'] = output_format
    query_dict.update(query_filling)
    json_query = json.dumps(query_dict, ensure_ascii=False, indent=4)
    return json_query, option_mapping


for curr_subtask in chosen_subtasks:
    curr_ds = mmmlu_ds.filter(lambda x: x['Subject'] == subtask)
    curr_query, curr_mapping = get_json_query_permute_pair(curr_ds[42], permute_method="reverse")
    curr_ans = curr_mapping[curr_ds[0]['Answer']] # Get the new position of answer
    print(curr_query)
    break

{
    "Task": "Answer the following multiple choice question.",
    "Output_format": "The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD.",
    "Instruction": "Think step by step before answering.",
    "Question": "罗马声称他们的主教（教皇）是哪个领袖的继位人？",
    "A": "马修",
    "B": "耶稣",
    "C": "保罗",
    "D": "彼得"
}


In [18]:
# Metrics
# Accuracy, F1, ...

# FR: total samples N, mean(response_forward != response_backward)

# RStd: the num of options k, compute deviation of recall

# RSD: the num of options k, compute deviation of accuracy divided by average acc.

# CKLD: the num of options k, the ratio of ground truth i-label p_i, ratio of predicted i-label q_i,
# mean(p_i * log(p_i / q_i))