In [None]:
import mmmlu_preparer
from mmmlu_preparer.read_mmmlu_dataset import (
    TARGET_SUBTASKS,
    MMMLULanguage,
    create_mmmlu_dataset,
    sample_first_n_data_from_subtask
)
from mmmlu_preparer.query_formats import (
    InputFormat,
    OutputFormat,
    ShuffleMethod,
    get_current_queries
)


# Models whose logged responses are merged by the cells below.
model_list = ['gemini-2.0-flash', 'mistral-small-2503']

# Every member of the MMMLULanguage enum, in definition order.
lang_list = [language for language in MMMLULanguage]

# (input format, output format) pairs; these strings also appear verbatim
# in the log file names built later in the notebook.
input_output_list = [
    ('base', 'base'),
    ('base', 'json-full'),
    ('json', 'json-full'),
    ('xml', 'base'),
]

# Every member of the ShuffleMethod enum, in definition order.
shuffle_list = [method for method in ShuffleMethod]

In [None]:
import pandas as pd

# Format of each merged result row (one dict per logged model response):

# experiment_save_dict = {
#     "Model": "",
#     "Language": "",
#     "Subtask": "",
#     "Question id in subtask": "",
#     "Shuffle method": "",
#     "Original to shuffled": "",
#     "Input format": "",
#     "Output format": "",
#     "Query": "",
#     "Original correct answer": "",
#     "Shuffled correct answer": "",
#     "Response answer": "",
#     "Model output": "",  # Output text only
# }

In [None]:
import json
from collections import defaultdict
from itertools import product
from pathlib import Path
from tqdm.auto import tqdm
import mmmlu_preparer
from mmmlu_preparer.answer_extract import extract_answer_from_response


# Merge the per-run JSONL response logs under ./mmmlu_output into one list of
# result rows per model, then turn each list into a DataFrame.
chosen_subtasks = TARGET_SUBTASKS
combinations = list(product(model_list, input_output_list, shuffle_list))
# Maps model name -> list of result rows (one dict per log line).
merged_result_dict = defaultdict(list)

# Shuffle methods that are skipped for the English dataset.
kana_shuffle_methods = (ShuffleMethod.MOST_KANA_RATIO, ShuffleMethod.FEWEST_KANA_RATIO)

for lang_enum in lang_list:
    mmmlu_ds = create_mmmlu_dataset(lang_enum)
    mmmlu_subset = sample_first_n_data_from_subtask(mmmlu_ds, chosen_subtasks)
    # Depends only on the language, so compute it once per language.
    lang_string = lang_enum.value.lower().replace("_", "-")

    for curr_combo in tqdm(combinations):
        model_name, format_tuple, shuffle_method_enum = curr_combo
        if lang_enum is MMMLULanguage.EN and shuffle_method_enum in kana_shuffle_methods:
            continue
        input_format, output_format = format_tuple
        shuffle_method = shuffle_method_enum.name.lower().replace("_", "-")
        log_file_name = f"{model_name}_{lang_string}_{input_format}_input_{output_format}_output_{shuffle_method}_shuffle.jsonl"
        print(log_file_name)

        # InputFormat is looked up by value, OutputFormat by member name.
        input_format_enum = InputFormat(input_format.replace("-", "_"))
        output_format_enum = OutputFormat[output_format.replace("-", "_").upper()]

        # NOTE(review): the queries are regenerated even when the log file turns
        # out to be missing; this could be skipped if get_current_queries is
        # side-effect free -- confirm before reordering.
        curr_queries = get_current_queries(
            mmmlu_subset,
            lang_enum,
            chosen_subtasks,
            input_format_enum,
            output_format_enum,
            shuffle_method_enum,
        )

        log_path = Path(f"./mmmlu_output/{log_file_name}")
        if not log_path.exists():
            # Combinations without a log file contribute no rows.
            continue

        with log_path.open('r', encoding='utf-8') as file:
            # Line i of the log is paired with query i by position.
            for idx, line in enumerate(file):
                try:
                    curr_query = curr_queries[idx]
                except IndexError as err:
                    # Same exception type as before, but say which file and
                    # line could not be matched to a query.
                    raise IndexError(
                        f"{log_path}: line {idx + 1} has no matching query"
                    ) from err

                response_dict = json.loads(line.strip())
                response = response_dict['kwargs']['content']
                extracted_answer = extract_answer_from_response(response)

                # Key order determines the column order of the resulting frame.
                curr_result_dict = {
                    'Model': model_name,
                    'Language': lang_string,
                    'Subtask': curr_query['Subtask'],
                    'Question id in subtask': curr_query['Question id in subtask'],
                    'Shuffle method': shuffle_method,
                    'Original to shuffled': curr_query['Original to shuffled'],
                    'Input format': input_format,
                    'Output format': output_format,
                    'Query': curr_query['Query'],
                    'Original correct answer': curr_query['Original correct answer'],
                    'Shuffled correct answer': curr_query['Shuffled correct answer'],
                    'Response answer': extracted_answer,
                    'Model output': response,  # Output text only
                }
                merged_result_dict[model_name].append(curr_result_dict)

# One DataFrame per model, in the order models were first encountered.
experiment_dfs = [pd.DataFrame(val_list) for val_list in merged_result_dict.values()]

In [None]:
# Write one merged CSV per model into the current working directory.
for df in experiment_dfs:
    # Each frame holds rows for a single model, so the first row names it.
    # .iloc[0] reads by position and does not depend on the index labels.
    model_name = df['Model'].iloc[0]
    print(model_name)
    csv_name = f"{model_name}_merged_results.csv"
    df.to_csv(csv_name, index=False)

In [None]:
# Read the merged CSV for gemini-2.0-flash back from the working directory.
test_df = pd.read_csv(
    filepath_or_buffer='gemini-2.0-flash_merged_results.csv',
)