In [None]:
import mmmlu_preparer
from mmmlu_preparer.read_mmmlu_dataset import (
    TARGET_SUBTASKS,
    MMMLULanguage,
    create_mmmlu_dataset,
    sample_first_n_data_from_subtask
)
from mmmlu_preparer.query_formats import (
    InputFormat,
    OutputFormat,
    ShuffleMethod,
    get_current_queries
)


# Models whose logged responses are analysed in this notebook.
model_list = [
    'gemini-2.0-flash',
    'mistral-small-2503',
    'llama-v3p1-8b-instruct',
]
# Every member of the MMMLULanguage enum.
lang_list = [*MMMLULanguage]
# All (input format, output format) pairings; the input format varies slowest.
input_output_list = [
    (input_name, output_name)
    for input_name in ('base', 'json', 'xml')
    for output_name in ('base', 'json-full', 'xml-full')
]
# Every member of the ShuffleMethod enum.
shuffle_list = [*ShuffleMethod]

In [None]:
import pandas as pd
# Lift the row limit so printed frames/series are shown in full.
pd.set_option('display.max_rows', None)
# Record format
#
# Template (kept commented out) of the per-response record assembled by the
# log-parsing cell; that cell additionally stores a 'logprobs' entry.
# experiment_save_dict = {
#     "Model": "",
#     "Language": "",
#     "Subtask": "",
#     "Question id in subtask": "",
#     "Shuffle method": "",
#     "Original to shuffled": "",
#     "Input format": "",
#     "Output format": "",
#     "Query": "",
#     "Original correct answer": "",
#     "Shuffled correct answer": "",
#     "Response answer": "",
#     "Model output": "",  # Output text only
# }

In [None]:
import json
from collections import defaultdict
from itertools import product
from pathlib import Path
from tqdm.auto import tqdm
import mmmlu_preparer
from mmmlu_preparer.answer_extract import extract_answer_from_response


chosen_subtasks = TARGET_SUBTASKS
# Every (model, (input format, output format), shuffle method) combination to look up.
combinations = list(product(model_list, input_output_list, shuffle_list))
# Parsed response records, keyed by model name.
merged_result_dict = defaultdict(list)

# Switch guarding the log-parsing pass below.
generate_csv = True

if generate_csv:
    for lang_enum in lang_list:
        mmmlu_ds = create_mmmlu_dataset(lang_enum)
        chosen_subtasks = TARGET_SUBTASKS
        mmmlu_subset = sample_first_n_data_from_subtask(mmmlu_ds, chosen_subtasks)

        for model_name, (input_format, output_format), shuffle_method_enum in tqdm(combinations):
            # The kana-ratio shuffles are not evaluated for English.
            skip_kana = lang_enum is MMMLULanguage.EN and shuffle_method_enum in (
                ShuffleMethod.MOST_KANA_RATIO,
                ShuffleMethod.FEWEST_KANA_RATIO,
            )
            if skip_kana:
                continue

            lang_string = lang_enum.value.lower().replace("_", "-")
            shuffle_method = shuffle_method_enum.name.lower().replace("_", "-")
            log_file_name = f"{model_name}_{lang_string}_{input_format}_input_{output_format}_output_{shuffle_method}_shuffle.jsonl"
            print(log_file_name)

            input_format_enum = InputFormat(input_format.replace("-", "_"))
            output_format_enum = OutputFormat[output_format.replace("-", "_").upper()]

            # Rebuild the queries for this combination; log lines are paired
            # with them by position below.
            curr_queries = get_current_queries(
                mmmlu_subset,
                lang_enum,
                chosen_subtasks,
                input_format_enum,
                output_format_enum,
                shuffle_method_enum,
            )

            log_path = Path(f"./mmmlu_output/{log_file_name}")
            if not log_path.exists():
                # No log for this combination: nothing to record.
                continue

            with log_path.open('r', encoding='utf-8') as file:
                for idx, line in enumerate(file):
                    curr_query = curr_queries[idx]
                    response_dict = json.loads(line.strip())
                    response = response_dict['kwargs']['content']

                    extracted_answer = extract_answer_from_response(response)
                    if extracted_answer is None:
                        # No answer could be extracted from the response text.
                        extracted_answer = 'Others'

                    record = {
                        'Model': model_name,
                        'Language': lang_string,
                        'Subtask': curr_query['Subtask'],
                        'Question id in subtask': curr_query['Question id in subtask'],
                        'Shuffle method': shuffle_method,
                        'Original to shuffled': curr_query['Original to shuffled'],
                        'Input format': input_format,
                        'Output format': output_format,
                        'Query': curr_query['Query'],
                        'Original correct answer': curr_query['Original correct answer'],
                        'Shuffled correct answer': curr_query['Shuffled correct answer'],
                        'Response answer': extracted_answer,
                        'Model output': response,
                        # Log probabilities are only read for the llama model.
                        'logprobs': (
                            response_dict['kwargs']['response_metadata']['logprobs']
                            if model_name == 'llama-v3p1-8b-instruct'
                            else []
                        ),
                    }
                    merged_result_dict[model_name].append(record)

    # One DataFrame per model, in first-seen model order.
    experiment_dfs = [pd.DataFrame(records) for records in merged_result_dict.values()]

In [None]:
if generate_csv:
    # Write one merged CSV per model, named after the model.
    for model_df in experiment_dfs:
        model_name = model_df['Model'][0]
        print(model_name)
        model_df.to_csv(f"{model_name}_merged_results.csv", index=False)

In [None]:
# Show the configured model names (the first one is analysed below).
model_list

In [None]:
import pandas as pd
# Load the merged results of the first configured model.
target_model_name = model_list[0]
merged_csv_name = f'{target_model_name}_merged_results.csv'
test_df = pd.read_csv(merged_csv_name, engine='c')

In [None]:
import ast

def shuffle_map_type_conversion(shuffle_map):
    """Parse a shuffle map stored as a string literal into a Python object.

    Values that are not strings are returned unchanged.
    """
    if not isinstance(shuffle_map, str):
        return shuffle_map
    return ast.literal_eval(shuffle_map)

# The frame was read back from CSV, so 'Original to shuffled' may hold dict
# literals as strings; parse them into dictionaries.
test_df['Original to shuffled'] = test_df['Original to shuffled'].apply(shuffle_map_type_conversion)


In [None]:
# Invert each 'Original to shuffled' map (swap keys and values) to obtain the shuffled -> original lookup.
test_df['Shuffled to Original']  = test_df['Original to shuffled'].apply(lambda x: {val:key for key, val in x.items()})

In [None]:
def map_answer(row):
    """Translate the model's answer into the option id of the default order.

    Looks the row's 'Response answer' up in its 'Shuffled to Original' map;
    an answer that is not a key of the map (e.g. a missing value) is returned
    unchanged.
    """
    answer = row['Response answer']
    return row['Shuffled to Original'].get(answer, answer)

# Row-wise: express every response as the option id used in the default (unshuffled) order.
test_df['Response answer id in default'] = test_df.apply(map_answer, axis=1)

In [None]:
from typing import Optional, Union

def append_new_level(levels: list[str], append_level: Optional[Union[str, list[str]]] = None) -> list[str]:
    """Return the grouping levels with optional extra levels appended.

    Args:
        levels: Base grouping column names. This list is not modified.
        append_level: A single column name, a list of column names, or None
            to add nothing.

    Returns:
        A new list containing ``levels`` followed by the extra level(s).
    """
    if append_level is None:
        return list(levels)
    if isinstance(append_level, str):
        append_level = [append_level]
    # Build a fresh list so a list shared by the caller is never mutated.
    return [*levels, *append_level]

def get_metric_level_dict(
    metric_df: pd.DataFrame,
    target_key: str,
    append_levels: Optional[Union[str, list[str]]] = None,
) -> dict:
    """Average ``target_key`` at several grouping granularities.

    Args:
        metric_df: Frame holding the grouping columns and ``target_key``.
        target_key: Name of the column to average.
        append_levels: Extra column name(s) appended to every grouping.

    Returns:
        Dict of grouped means keyed by 'Language', 'Subtask',
        'Language_format', 'Subtask_format' and 'All'.
    """
    # Result key -> base grouping columns (before append_levels is added).
    base_groupings = {
        'Language': ['Language'],
        'Subtask': ['Subtask'],
        'Language_format': ['Language', "Input format", "Output format"],
        'Subtask_format': ['Subtask', "Input format", "Output format"],
        'All': ['Language', 'Subtask', 'Input format', 'Output format'],
    }
    return {
        result_key: metric_df.groupby(append_new_level(columns, append_levels))[target_key].mean()
        for result_key, columns in base_groupings.items()
    }

In [None]:
def compute_fr(base_df: pd.DataFrame, forward_method: str, backward_method: str) -> dict:
    """Compute Fluctuation Rate in different level (language, subtask, ...)
    Ref: https://aclanthology.org/2024.findings-acl.333/

    A question counts as fluctuating when its answer (expressed in the default
    option order) differs between the forward and the backward shuffle.

    Args:
        base_df: Results frame with 'Shuffle method' and
            'Response answer id in default' columns plus the grouping columns.
        forward_method: 'Shuffle method' value of the forward ordering.
        backward_method: 'Shuffle method' value of the backward ordering.

    Returns:
        Dict of mean fluctuation rates as produced by get_metric_level_dict.

    Raises:
        pandas.errors.MergeError: If the identifying columns do not pick out a
            unique row within one of the two shuffle methods.
    """
    answer_col = 'Response answer id in default'
    backward_col = 'Backward response answer id in default'
    # Columns identifying one question under one experimental condition.
    # Rows are paired on these instead of by position, so a missing or partial
    # log for one shuffle method cannot silently misalign the comparison.
    align_keys = [
        col
        for col in ('Model', 'Language', 'Subtask', 'Question id in subtask',
                    'Input format', 'Output format')
        if col in base_df.columns
    ]

    forward_df = base_df[base_df['Shuffle method'] == forward_method].copy()
    backward_df = base_df.loc[
        base_df['Shuffle method'] == backward_method, align_keys + [answer_col]
    ].copy()

    # Map invalid answer to 'Others'
    forward_df[answer_col] = forward_df[answer_col].fillna("Others")
    backward_df[answer_col] = backward_df[answer_col].fillna("Others")
    backward_df = backward_df.rename(columns={answer_col: backward_col})

    # Inner join keeps only questions answered under both orderings.
    paired_df = forward_df.merge(backward_df, on=align_keys, how='inner', validate='one_to_one')

    target_key = 'Forward != backward'
    paired_df[target_key] = paired_df[answer_col] != paired_df[backward_col]
    return get_metric_level_dict(paired_df, target_key)



In [None]:
def compute_accuracy(base_df: pd.DataFrame) -> dict:
    """Compute accuracy per shuffle method at each level (language, subtask, ...).

    A response is correct when 'Response answer' equals
    'Shuffled correct answer'. The input frame is left untouched.
    """
    correctness_col = 'Is correct response'
    is_correct = base_df['Response answer'] == base_df['Shuffled correct answer']
    scored_df = base_df.assign(**{correctness_col: is_correct})
    return get_metric_level_dict(scored_df, correctness_col, 'Shuffle method')


In [None]:
def get_option_acc_dict(base_df: pd.DataFrame) -> dict:
    """Compute accuracy per shuffle method and original correct option.

    Same correctness criterion as the plain accuracy ('Response answer' equals
    'Shuffled correct answer'), but every grouping is additionally split by
    'Original correct answer'. The input frame is left untouched.
    """
    correctness_col = 'Is correct response'
    is_correct = base_df['Response answer'] == base_df['Shuffled correct answer']
    scored_df = base_df.assign(**{correctness_col: is_correct})
    extra_levels = ['Shuffle method', 'Original correct answer']
    return get_metric_level_dict(scored_df, correctness_col, extra_levels)


In [None]:
# Accuracy of the loaded model at every grouping level; display the
# subtask / input format / output format / shuffle method breakdown.
acc_dict = compute_accuracy(test_df)
acc_dict['Subtask_format']#.loc[('ja-jp', 'base', 'base')]

In [None]:
# Available grouping levels in the accuracy dictionary.
acc_dict.keys()

In [None]:
# Accuracy at the finest granularity (language, subtask, formats, shuffle method).
acc_dict['All']#.groupby([])

In [None]:
option_acc_dict = get_option_acc_dict(test_df)

# Rstd: population standard deviation (ddof=0) of the accuracies across the
# original correct options, within each remaining group.
all_group_levels = ['Language', "Subtask", "Input format", "Output format", "Shuffle method"]
per_option_acc_all = option_acc_dict['All']
all_rstd = per_option_acc_all.groupby(all_group_levels).std(ddof=0)
print(all_rstd)

# RSD: Rstd divided by the accuracy of the same group.
overall_acc_all = acc_dict["All"]
all_rsd = all_rstd / overall_acc_all
print(all_rsd)

In [None]:
# Rstd: population standard deviation (ddof=0) of the accuracies across the
# original correct options, per language / format / shuffle method.
lang_group_levels = ['Language', "Input format", "Output format", "Shuffle method"]
per_option_acc_lang = option_acc_dict['Language_format']
lang_rstd = per_option_acc_lang.groupby(lang_group_levels).std(ddof=0)
print(lang_rstd)

# RSD: Rstd divided by the accuracy of the same group.
overall_acc_lang = acc_dict["Language_format"]
lang_rsd = lang_rstd / overall_acc_lang
print(lang_rsd)

In [None]:
# RSD per language and format for the 'default' shuffle method only.
lang_rsd.xs('default', level='Shuffle method')

In [None]:
# Rstd: population standard deviation (ddof=0) of the accuracies across the
# original correct options, per subtask / format / shuffle method.
subtask_group_levels = ['Subtask', "Input format", "Output format", "Shuffle method"]
per_option_acc_subtask = option_acc_dict['Subtask_format']
subtask_rstd = per_option_acc_subtask.groupby(subtask_group_levels).std(ddof=0)
print(subtask_rstd)

# RSD: Rstd divided by the accuracy of the same group.
overall_acc_subtask = acc_dict["Subtask_format"]
subtask_rsd = subtask_rstd / overall_acc_subtask
print(subtask_rsd)

In [None]:
# For the ('en', 'base', 'base') language / input / output slice: mean of the per-option accuracies for each shuffle method.
option_acc_dict['Language_format'].loc[('en', 'base', 'base')].groupby("Shuffle method").mean()

In [None]:
# (forward, backward) shuffle-method pairs whose answers can be compared.
forward_backward_pair = [
    ('default', 'reverse'),
    ('longest-first', 'shortest-first'),
    ('most-kana-ratio', 'fewest-kana-ratio'),
]

# Only the first pair is evaluated here; the remaining pairs are listed but
# not computed.
forward_method, backward_method = forward_backward_pair[0]
fr_dict = compute_fr(test_df, forward_method, backward_method)

In [None]:
# Fluctuation rate per language and input/output format for the pair computed above.
fr_dict['Language_format']