In [1]:
import mmmlu_preparer
from mmmlu_preparer.read_mmmlu_dataset import (
    TARGET_SUBTASKS,
    MMMLULanguage,
    create_mmmlu_dataset,
    sample_first_n_data_from_subtask
)
from mmmlu_preparer.query_formats import (
    InputFormat,
    OutputFormat,
    ShuffleMethod,
    get_current_queries
)


model_list = ['gemini-2.0-flash', 'mistral-small-2503', 'llama-v3p1-8b-instruct']
lang_list = list(MMMLULanguage)
input_output_list = [('base', 'base'), ('base', 'json-full'), ('base', 'xml-full'),('json', 'base'),('json', 'json-full'),('json', 'xml-full'), ('xml', 'base'),('xml', 'json-full'),('xml', 'xml-full')]
shuffle_list = list(ShuffleMethod)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd

# Format

# experiment_save_dict = {
#     "Model": "",
#     "Language": "",
#     "Subtask": "",
#     "Question id in subtask": "",
#     "Shuffle method": "",
#     "Original to shuffled": "",
#     "Input format": "",
#     "Output format": "",
#     "Query": "",
#     "Original correct answer": "",
#     "Shuffled correct answer": "",
#     "Response answer": "",
#     "Model output": "",  # Output text only
# }

In [3]:
from itertools import product
from tqdm.auto import tqdm
from collections import defaultdict

chosen_subtasks = TARGET_SUBTASKS
combinations = list(product(model_list, input_output_list, shuffle_list))
merged_result_dict = defaultdict(list) # "model_name:"

for lang_enum in lang_list:
    mmmlu_ds = create_mmmlu_dataset(lang_enum)
    chosen_subtasks = TARGET_SUBTASKS
    mmmlu_subset = sample_first_n_data_from_subtask(mmmlu_ds, chosen_subtasks)
    for curr_combo in tqdm(combinations):
        model_name, format_tuple, shuffle_method_enum = curr_combo
        input_format, output_format = format_tuple
        lang_string = lang_enum.value.lower().replace("_", "-")
        shuffle_method = shuffle_method_enum.name.lower().replace("_", "-")
        log_file_name = f"{model_name}_{lang_string}_{input_format}_input_{output_format}_output_{shuffle_method}_shuffle.jsonl"
        print(log_file_name)
        input_format_enum = InputFormat(input_format.replace("-", "_"))
        output_format_enum = OutputFormat[output_format.replace("-", "_").upper()]

100%|██████████| 162/162 [00:00<00:00, 158386.30it/s]


gemini-2.0-flash_en_base_input_base_output_default_shuffle.jsonl
gemini-2.0-flash_en_base_input_base_output_reverse_shuffle.jsonl
gemini-2.0-flash_en_base_input_base_output_longest-first_shuffle.jsonl
gemini-2.0-flash_en_base_input_base_output_shortest-first_shuffle.jsonl
gemini-2.0-flash_en_base_input_base_output_most-kana-ratio_shuffle.jsonl
gemini-2.0-flash_en_base_input_base_output_fewest-kana-ratio_shuffle.jsonl
gemini-2.0-flash_en_base_input_json-full_output_default_shuffle.jsonl
gemini-2.0-flash_en_base_input_json-full_output_reverse_shuffle.jsonl
gemini-2.0-flash_en_base_input_json-full_output_longest-first_shuffle.jsonl
gemini-2.0-flash_en_base_input_json-full_output_shortest-first_shuffle.jsonl
gemini-2.0-flash_en_base_input_json-full_output_most-kana-ratio_shuffle.jsonl
gemini-2.0-flash_en_base_input_json-full_output_fewest-kana-ratio_shuffle.jsonl
gemini-2.0-flash_en_base_input_xml-full_output_default_shuffle.jsonl
gemini-2.0-flash_en_base_input_xml-full_output_reverse_shuf

100%|██████████| 162/162 [00:00<00:00, 157177.25it/s]

gemini-2.0-flash_ja-jp_base_input_base_output_default_shuffle.jsonl
gemini-2.0-flash_ja-jp_base_input_base_output_reverse_shuffle.jsonl
gemini-2.0-flash_ja-jp_base_input_base_output_longest-first_shuffle.jsonl
gemini-2.0-flash_ja-jp_base_input_base_output_shortest-first_shuffle.jsonl
gemini-2.0-flash_ja-jp_base_input_base_output_most-kana-ratio_shuffle.jsonl
gemini-2.0-flash_ja-jp_base_input_base_output_fewest-kana-ratio_shuffle.jsonl
gemini-2.0-flash_ja-jp_base_input_json-full_output_default_shuffle.jsonl
gemini-2.0-flash_ja-jp_base_input_json-full_output_reverse_shuffle.jsonl
gemini-2.0-flash_ja-jp_base_input_json-full_output_longest-first_shuffle.jsonl
gemini-2.0-flash_ja-jp_base_input_json-full_output_shortest-first_shuffle.jsonl
gemini-2.0-flash_ja-jp_base_input_json-full_output_most-kana-ratio_shuffle.jsonl
gemini-2.0-flash_ja-jp_base_input_json-full_output_fewest-kana-ratio_shuffle.jsonl
gemini-2.0-flash_ja-jp_base_input_xml-full_output_default_shuffle.jsonl
gemini-2.0-flash_ja-




In [6]:
import json
from collections import defaultdict
from itertools import product
from pathlib import Path
from tqdm.auto import tqdm
import mmmlu_preparer
from mmmlu_preparer.answer_extract import extract_answer_from_response


chosen_subtasks = TARGET_SUBTASKS
combinations = list(product(model_list, input_output_list, shuffle_list))
merged_result_dict = defaultdict(list) # "model_name:"

for lang_enum in lang_list:
    mmmlu_ds = create_mmmlu_dataset(lang_enum)
    chosen_subtasks = TARGET_SUBTASKS
    mmmlu_subset = sample_first_n_data_from_subtask(mmmlu_ds, chosen_subtasks)

    for curr_combo in tqdm(combinations):
        model_name, format_tuple, shuffle_method_enum = curr_combo
        if lang_enum is MMMLULanguage.EN and shuffle_method_enum in [ShuffleMethod.MOST_KANA_RATIO, ShuffleMethod.FEWEST_KANA_RATIO]:
            continue
        input_format, output_format = format_tuple
        lang_string = lang_enum.value.lower().replace("_", "-")
        shuffle_method = shuffle_method_enum.name.lower().replace("_", "-")
        log_file_name = f"{model_name}_{lang_string}_{input_format}_input_{output_format}_output_{shuffle_method}_shuffle.jsonl"
        print(log_file_name)

        input_format_enum = InputFormat(input_format.replace("-", "_"))
        output_format_enum = OutputFormat[output_format.replace("-", "_").upper()]

        curr_queries = get_current_queries(mmmlu_subset,
                                            lang_enum,
                                            chosen_subtasks,
                                            input_format_enum,
                                            output_format_enum,
                                            shuffle_method_enum,
                                            )

        log_path = Path(f"./mmmlu_output/{log_file_name}")
        if log_path.exists():
            with log_path.open('r', encoding='utf-8') as file:
                for idx, line in enumerate(file):
                    curr_query = curr_queries[idx]
                    curr_result_dict = {}
                    response_dict = json.loads(line.strip())
                    response = response_dict['kwargs']['content']
                    extracted_answer = extract_answer_from_response(response)

                    curr_result_dict['Model'] = model_name
                    curr_result_dict['Language'] = lang_string
                    curr_result_dict['Subtask'] = curr_query['Subtask']
                    curr_result_dict['Question id in subtask'] = curr_query['Question id in subtask']
                    curr_result_dict['Shuffle method'] = shuffle_method
                    curr_result_dict['Original to shuffled'] = curr_query['Original to shuffled']
                    curr_result_dict['Input format'] = input_format
                    curr_result_dict['Output format'] = output_format
                    curr_result_dict['Query'] = curr_query['Query']
                    curr_result_dict['Original correct answer'] = curr_query['Original correct answer']
                    curr_result_dict['Shuffled correct answer'] = curr_query['Shuffled correct answer']
                    curr_result_dict['Response answer'] = extracted_answer
                    curr_result_dict['Model output'] = response
                    if model_name == 'llama-v3p1-8b-instruct':
                        curr_result_dict['logprobs'] = response_dict['kwargs']['response_metadata']['logprobs']
                    else:
                        curr_result_dict['logprobs'] = []
                        
                    merged_result_dict[model_name].append(curr_result_dict)

experiment_dfs = [pd.DataFrame(val_list) for val_list in merged_result_dict.values()]

  0%|          | 0/162 [00:00<?, ?it/s]

gemini-2.0-flash_en_base_input_base_output_default_shuffle.jsonl


  1%|          | 1/162 [00:00<02:05,  1.28it/s]

gemini-2.0-flash_en_base_input_base_output_reverse_shuffle.jsonl


  1%|          | 2/162 [00:01<02:06,  1.27it/s]

gemini-2.0-flash_en_base_input_base_output_longest-first_shuffle.jsonl


  2%|▏         | 3/162 [00:02<02:10,  1.22it/s]

gemini-2.0-flash_en_base_input_base_output_shortest-first_shuffle.jsonl


  2%|▏         | 4/162 [00:03<02:10,  1.21it/s]

gemini-2.0-flash_en_base_input_json-full_output_default_shuffle.jsonl


  4%|▍         | 7/162 [00:04<01:17,  2.01it/s]

gemini-2.0-flash_en_base_input_json-full_output_reverse_shuffle.jsonl


  5%|▍         | 8/162 [00:05<01:31,  1.68it/s]

gemini-2.0-flash_en_base_input_json-full_output_longest-first_shuffle.jsonl


  6%|▌         | 9/162 [00:06<01:44,  1.46it/s]

gemini-2.0-flash_en_base_input_json-full_output_shortest-first_shuffle.jsonl


  6%|▌         | 10/162 [00:07<01:54,  1.32it/s]

gemini-2.0-flash_en_base_input_xml-full_output_default_shuffle.jsonl


  8%|▊         | 13/162 [00:07<01:13,  2.03it/s]

gemini-2.0-flash_en_base_input_xml-full_output_reverse_shuffle.jsonl


  9%|▊         | 14/162 [00:08<01:23,  1.78it/s]

gemini-2.0-flash_en_base_input_xml-full_output_longest-first_shuffle.jsonl


  9%|▉         | 15/162 [00:09<01:30,  1.62it/s]

gemini-2.0-flash_en_base_input_xml-full_output_shortest-first_shuffle.jsonl


 10%|▉         | 16/162 [00:10<01:36,  1.51it/s]

gemini-2.0-flash_en_json_input_base_output_default_shuffle.jsonl


 12%|█▏        | 19/162 [00:11<01:06,  2.16it/s]

gemini-2.0-flash_en_json_input_base_output_reverse_shuffle.jsonl


 12%|█▏        | 20/162 [00:11<01:15,  1.87it/s]

gemini-2.0-flash_en_json_input_base_output_longest-first_shuffle.jsonl


 13%|█▎        | 21/162 [00:12<01:24,  1.66it/s]

gemini-2.0-flash_en_json_input_base_output_shortest-first_shuffle.jsonl


 14%|█▎        | 22/162 [00:13<01:32,  1.51it/s]

gemini-2.0-flash_en_json_input_json-full_output_default_shuffle.jsonl


 15%|█▌        | 25/162 [00:14<01:07,  2.04it/s]

gemini-2.0-flash_en_json_input_json-full_output_reverse_shuffle.jsonl


 16%|█▌        | 26/162 [00:15<01:19,  1.71it/s]

gemini-2.0-flash_en_json_input_json-full_output_longest-first_shuffle.jsonl


 17%|█▋        | 27/162 [00:16<01:33,  1.45it/s]

gemini-2.0-flash_en_json_input_json-full_output_shortest-first_shuffle.jsonl


 17%|█▋        | 28/162 [00:17<01:42,  1.31it/s]

gemini-2.0-flash_en_json_input_xml-full_output_default_shuffle.jsonl


 19%|█▉        | 31/162 [00:18<01:07,  1.94it/s]

gemini-2.0-flash_en_json_input_xml-full_output_reverse_shuffle.jsonl


 20%|█▉        | 32/162 [00:19<01:14,  1.74it/s]

gemini-2.0-flash_en_json_input_xml-full_output_longest-first_shuffle.jsonl


 20%|██        | 33/162 [00:20<01:22,  1.57it/s]

gemini-2.0-flash_en_json_input_xml-full_output_shortest-first_shuffle.jsonl


 21%|██        | 34/162 [00:20<01:28,  1.44it/s]

gemini-2.0-flash_en_xml_input_base_output_default_shuffle.jsonl


 23%|██▎       | 37/162 [00:21<01:01,  2.03it/s]

gemini-2.0-flash_en_xml_input_base_output_reverse_shuffle.jsonl


 23%|██▎       | 38/162 [00:22<01:10,  1.76it/s]

gemini-2.0-flash_en_xml_input_base_output_longest-first_shuffle.jsonl


 24%|██▍       | 39/162 [00:23<01:20,  1.52it/s]

gemini-2.0-flash_en_xml_input_base_output_shortest-first_shuffle.jsonl


 25%|██▍       | 40/162 [00:24<01:28,  1.39it/s]

gemini-2.0-flash_en_xml_input_json-full_output_default_shuffle.jsonl


 27%|██▋       | 43/162 [00:25<00:59,  1.99it/s]

gemini-2.0-flash_en_xml_input_json-full_output_reverse_shuffle.jsonl


 27%|██▋       | 44/162 [00:26<01:07,  1.75it/s]

gemini-2.0-flash_en_xml_input_json-full_output_longest-first_shuffle.jsonl


 28%|██▊       | 45/162 [00:27<01:15,  1.56it/s]

gemini-2.0-flash_en_xml_input_json-full_output_shortest-first_shuffle.jsonl


 28%|██▊       | 46/162 [00:28<01:21,  1.42it/s]

gemini-2.0-flash_en_xml_input_xml-full_output_default_shuffle.jsonl


 30%|███       | 49/162 [00:29<00:55,  2.03it/s]

gemini-2.0-flash_en_xml_input_xml-full_output_reverse_shuffle.jsonl


 31%|███       | 50/162 [00:29<01:03,  1.77it/s]

gemini-2.0-flash_en_xml_input_xml-full_output_longest-first_shuffle.jsonl


 31%|███▏      | 51/162 [00:30<01:10,  1.57it/s]

gemini-2.0-flash_en_xml_input_xml-full_output_shortest-first_shuffle.jsonl


 32%|███▏      | 52/162 [00:31<01:17,  1.43it/s]

mistral-small-2503_en_base_input_base_output_default_shuffle.jsonl


 34%|███▍      | 55/162 [00:32<00:52,  2.05it/s]

mistral-small-2503_en_base_input_base_output_reverse_shuffle.jsonl


 35%|███▍      | 56/162 [00:33<00:58,  1.80it/s]

mistral-small-2503_en_base_input_base_output_longest-first_shuffle.jsonl


 35%|███▌      | 57/162 [00:34<01:07,  1.56it/s]

mistral-small-2503_en_base_input_base_output_shortest-first_shuffle.jsonl


 36%|███▌      | 58/162 [00:35<01:12,  1.44it/s]

mistral-small-2503_en_base_input_json-full_output_default_shuffle.jsonl


 38%|███▊      | 61/162 [00:36<00:50,  2.00it/s]

mistral-small-2503_en_base_input_json-full_output_reverse_shuffle.jsonl


 38%|███▊      | 62/162 [00:37<00:58,  1.71it/s]

mistral-small-2503_en_base_input_json-full_output_longest-first_shuffle.jsonl


 39%|███▉      | 63/162 [00:38<01:06,  1.50it/s]

mistral-small-2503_en_base_input_json-full_output_shortest-first_shuffle.jsonl


 40%|███▉      | 64/162 [00:38<01:12,  1.36it/s]

mistral-small-2503_en_base_input_xml-full_output_default_shuffle.jsonl


 41%|████▏     | 67/162 [00:39<00:47,  2.00it/s]

mistral-small-2503_en_base_input_xml-full_output_reverse_shuffle.jsonl


 42%|████▏     | 68/162 [00:40<00:52,  1.79it/s]

mistral-small-2503_en_base_input_xml-full_output_longest-first_shuffle.jsonl


 43%|████▎     | 69/162 [00:41<00:57,  1.60it/s]

mistral-small-2503_en_base_input_xml-full_output_shortest-first_shuffle.jsonl


 43%|████▎     | 70/162 [00:42<01:02,  1.48it/s]

mistral-small-2503_en_json_input_base_output_default_shuffle.jsonl


 45%|████▌     | 73/162 [00:43<00:42,  2.10it/s]

mistral-small-2503_en_json_input_base_output_reverse_shuffle.jsonl


 46%|████▌     | 74/162 [00:44<00:49,  1.77it/s]

mistral-small-2503_en_json_input_base_output_longest-first_shuffle.jsonl


 46%|████▋     | 75/162 [00:44<00:55,  1.58it/s]

mistral-small-2503_en_json_input_base_output_shortest-first_shuffle.jsonl


 47%|████▋     | 76/162 [00:45<00:59,  1.44it/s]

mistral-small-2503_en_json_input_json-full_output_default_shuffle.jsonl


 49%|████▉     | 79/162 [00:46<00:42,  1.96it/s]

mistral-small-2503_en_json_input_json-full_output_reverse_shuffle.jsonl


 49%|████▉     | 80/162 [00:47<00:49,  1.66it/s]

mistral-small-2503_en_json_input_json-full_output_longest-first_shuffle.jsonl


 50%|█████     | 81/162 [00:48<00:56,  1.44it/s]

mistral-small-2503_en_json_input_json-full_output_shortest-first_shuffle.jsonl


 51%|█████     | 82/162 [00:49<01:01,  1.29it/s]

mistral-small-2503_en_json_input_xml-full_output_default_shuffle.jsonl


 52%|█████▏    | 85/162 [00:50<00:40,  1.89it/s]

mistral-small-2503_en_json_input_xml-full_output_reverse_shuffle.jsonl


 53%|█████▎    | 86/162 [00:51<00:45,  1.68it/s]

mistral-small-2503_en_json_input_xml-full_output_longest-first_shuffle.jsonl


 54%|█████▎    | 87/162 [00:52<00:49,  1.51it/s]

mistral-small-2503_en_json_input_xml-full_output_shortest-first_shuffle.jsonl


 54%|█████▍    | 88/162 [00:53<00:54,  1.35it/s]

mistral-small-2503_en_xml_input_base_output_default_shuffle.jsonl


 56%|█████▌    | 91/162 [00:54<00:37,  1.91it/s]

mistral-small-2503_en_xml_input_base_output_reverse_shuffle.jsonl


 57%|█████▋    | 92/162 [00:55<00:42,  1.66it/s]

mistral-small-2503_en_xml_input_base_output_longest-first_shuffle.jsonl


 57%|█████▋    | 93/162 [00:56<00:46,  1.47it/s]

mistral-small-2503_en_xml_input_base_output_shortest-first_shuffle.jsonl


 58%|█████▊    | 94/162 [00:57<00:50,  1.34it/s]

mistral-small-2503_en_xml_input_json-full_output_default_shuffle.jsonl


 60%|█████▉    | 97/162 [00:58<00:33,  1.92it/s]

mistral-small-2503_en_xml_input_json-full_output_reverse_shuffle.jsonl


 60%|██████    | 98/162 [00:59<00:37,  1.69it/s]

mistral-small-2503_en_xml_input_json-full_output_longest-first_shuffle.jsonl


 61%|██████    | 99/162 [00:59<00:42,  1.50it/s]

mistral-small-2503_en_xml_input_json-full_output_shortest-first_shuffle.jsonl


 62%|██████▏   | 100/162 [01:00<00:45,  1.37it/s]

mistral-small-2503_en_xml_input_xml-full_output_default_shuffle.jsonl


 64%|██████▎   | 103/162 [01:01<00:30,  1.95it/s]

mistral-small-2503_en_xml_input_xml-full_output_reverse_shuffle.jsonl


 64%|██████▍   | 104/162 [01:02<00:34,  1.71it/s]

mistral-small-2503_en_xml_input_xml-full_output_longest-first_shuffle.jsonl


 65%|██████▍   | 105/162 [01:03<00:38,  1.48it/s]

mistral-small-2503_en_xml_input_xml-full_output_shortest-first_shuffle.jsonl


 65%|██████▌   | 106/162 [01:04<00:42,  1.32it/s]

llama-v3p1-8b-instruct_en_base_input_base_output_default_shuffle.jsonl


 67%|██████▋   | 109/162 [01:05<00:27,  1.91it/s]

llama-v3p1-8b-instruct_en_base_input_base_output_reverse_shuffle.jsonl


 68%|██████▊   | 110/162 [01:06<00:30,  1.69it/s]

llama-v3p1-8b-instruct_en_base_input_base_output_longest-first_shuffle.jsonl


 69%|██████▊   | 111/162 [01:07<00:33,  1.54it/s]

llama-v3p1-8b-instruct_en_base_input_base_output_shortest-first_shuffle.jsonl


 69%|██████▉   | 112/162 [01:08<00:35,  1.42it/s]

llama-v3p1-8b-instruct_en_base_input_json-full_output_default_shuffle.jsonl


 71%|███████   | 115/162 [01:09<00:24,  1.95it/s]

llama-v3p1-8b-instruct_en_base_input_json-full_output_reverse_shuffle.jsonl


 72%|███████▏  | 116/162 [01:10<00:27,  1.70it/s]

llama-v3p1-8b-instruct_en_base_input_json-full_output_longest-first_shuffle.jsonl


 72%|███████▏  | 117/162 [01:10<00:29,  1.53it/s]

llama-v3p1-8b-instruct_en_base_input_json-full_output_shortest-first_shuffle.jsonl


 73%|███████▎  | 118/162 [01:12<00:32,  1.35it/s]

llama-v3p1-8b-instruct_en_base_input_xml-full_output_default_shuffle.jsonl


 75%|███████▍  | 121/162 [01:12<00:21,  1.88it/s]

llama-v3p1-8b-instruct_en_base_input_xml-full_output_reverse_shuffle.jsonl


 75%|███████▌  | 122/162 [01:14<00:25,  1.58it/s]

llama-v3p1-8b-instruct_en_base_input_xml-full_output_longest-first_shuffle.jsonl


 76%|███████▌  | 123/162 [01:14<00:26,  1.46it/s]

llama-v3p1-8b-instruct_en_base_input_xml-full_output_shortest-first_shuffle.jsonl


 77%|███████▋  | 124/162 [01:15<00:29,  1.31it/s]

llama-v3p1-8b-instruct_en_json_input_base_output_default_shuffle.jsonl


 78%|███████▊  | 127/162 [01:16<00:18,  1.87it/s]

llama-v3p1-8b-instruct_en_json_input_base_output_reverse_shuffle.jsonl


 79%|███████▉  | 128/162 [01:17<00:20,  1.63it/s]

llama-v3p1-8b-instruct_en_json_input_base_output_longest-first_shuffle.jsonl


 80%|███████▉  | 129/162 [01:18<00:23,  1.42it/s]

llama-v3p1-8b-instruct_en_json_input_base_output_shortest-first_shuffle.jsonl


 80%|████████  | 130/162 [01:19<00:24,  1.31it/s]

llama-v3p1-8b-instruct_en_json_input_json-full_output_default_shuffle.jsonl


 82%|████████▏ | 133/162 [01:20<00:15,  1.81it/s]

llama-v3p1-8b-instruct_en_json_input_json-full_output_reverse_shuffle.jsonl


 83%|████████▎ | 134/162 [01:21<00:18,  1.55it/s]

llama-v3p1-8b-instruct_en_json_input_json-full_output_longest-first_shuffle.jsonl


 83%|████████▎ | 135/162 [01:22<00:19,  1.41it/s]

llama-v3p1-8b-instruct_en_json_input_json-full_output_shortest-first_shuffle.jsonl


 84%|████████▍ | 136/162 [01:23<00:20,  1.26it/s]

llama-v3p1-8b-instruct_en_json_input_xml-full_output_default_shuffle.jsonl


 86%|████████▌ | 139/162 [01:24<00:13,  1.76it/s]

llama-v3p1-8b-instruct_en_json_input_xml-full_output_reverse_shuffle.jsonl


 86%|████████▋ | 140/162 [01:26<00:14,  1.49it/s]

llama-v3p1-8b-instruct_en_json_input_xml-full_output_longest-first_shuffle.jsonl


 87%|████████▋ | 141/162 [01:26<00:15,  1.38it/s]

llama-v3p1-8b-instruct_en_json_input_xml-full_output_shortest-first_shuffle.jsonl


 88%|████████▊ | 142/162 [01:28<00:16,  1.23it/s]

llama-v3p1-8b-instruct_en_xml_input_base_output_default_shuffle.jsonl


 90%|████████▉ | 145/162 [01:28<00:09,  1.77it/s]

llama-v3p1-8b-instruct_en_xml_input_base_output_reverse_shuffle.jsonl


 90%|█████████ | 146/162 [01:29<00:10,  1.56it/s]

llama-v3p1-8b-instruct_en_xml_input_base_output_longest-first_shuffle.jsonl


 91%|█████████ | 147/162 [01:30<00:10,  1.37it/s]

llama-v3p1-8b-instruct_en_xml_input_base_output_shortest-first_shuffle.jsonl


 91%|█████████▏| 148/162 [01:31<00:11,  1.26it/s]

llama-v3p1-8b-instruct_en_xml_input_json-full_output_default_shuffle.jsonl


 93%|█████████▎| 151/162 [01:33<00:06,  1.76it/s]

llama-v3p1-8b-instruct_en_xml_input_json-full_output_reverse_shuffle.jsonl


 94%|█████████▍| 152/162 [01:34<00:06,  1.52it/s]

llama-v3p1-8b-instruct_en_xml_input_json-full_output_longest-first_shuffle.jsonl


 94%|█████████▍| 153/162 [01:35<00:06,  1.39it/s]

llama-v3p1-8b-instruct_en_xml_input_json-full_output_shortest-first_shuffle.jsonl


 95%|█████████▌| 154/162 [01:36<00:06,  1.24it/s]

llama-v3p1-8b-instruct_en_xml_input_xml-full_output_default_shuffle.jsonl


 97%|█████████▋| 157/162 [01:37<00:02,  1.74it/s]

llama-v3p1-8b-instruct_en_xml_input_xml-full_output_reverse_shuffle.jsonl


 98%|█████████▊| 158/162 [01:38<00:02,  1.50it/s]

llama-v3p1-8b-instruct_en_xml_input_xml-full_output_longest-first_shuffle.jsonl


 98%|█████████▊| 159/162 [01:39<00:02,  1.37it/s]

llama-v3p1-8b-instruct_en_xml_input_xml-full_output_shortest-first_shuffle.jsonl


100%|██████████| 162/162 [01:40<00:00,  1.62it/s]
  0%|          | 0/162 [00:00<?, ?it/s]

gemini-2.0-flash_ja-jp_base_input_base_output_default_shuffle.jsonl


  1%|          | 1/162 [00:00<02:27,  1.09it/s]

gemini-2.0-flash_ja-jp_base_input_base_output_reverse_shuffle.jsonl


  1%|          | 2/162 [00:01<01:23,  1.91it/s]

gemini-2.0-flash_ja-jp_base_input_base_output_longest-first_shuffle.jsonl


  2%|▏         | 3/162 [00:01<01:05,  2.42it/s]

gemini-2.0-flash_ja-jp_base_input_base_output_shortest-first_shuffle.jsonl


  2%|▏         | 4/162 [00:01<00:56,  2.78it/s]

gemini-2.0-flash_ja-jp_base_input_base_output_most-kana-ratio_shuffle.jsonl


  3%|▎         | 5/162 [00:02<00:52,  3.00it/s]

gemini-2.0-flash_ja-jp_base_input_base_output_fewest-kana-ratio_shuffle.jsonl


  4%|▎         | 6/162 [00:02<00:55,  2.80it/s]

gemini-2.0-flash_ja-jp_base_input_json-full_output_default_shuffle.jsonl


  4%|▍         | 7/162 [00:02<00:55,  2.80it/s]

gemini-2.0-flash_ja-jp_base_input_json-full_output_reverse_shuffle.jsonl


  5%|▍         | 8/162 [00:03<00:55,  2.78it/s]

gemini-2.0-flash_ja-jp_base_input_json-full_output_longest-first_shuffle.jsonl


  6%|▌         | 9/162 [00:03<00:56,  2.70it/s]

gemini-2.0-flash_ja-jp_base_input_json-full_output_shortest-first_shuffle.jsonl


  6%|▌         | 10/162 [00:03<00:57,  2.66it/s]

gemini-2.0-flash_ja-jp_base_input_json-full_output_most-kana-ratio_shuffle.jsonl


  7%|▋         | 11/162 [00:04<00:58,  2.60it/s]

gemini-2.0-flash_ja-jp_base_input_json-full_output_fewest-kana-ratio_shuffle.jsonl


  7%|▋         | 12/162 [00:04<00:58,  2.55it/s]

gemini-2.0-flash_ja-jp_base_input_xml-full_output_default_shuffle.jsonl


  8%|▊         | 13/162 [00:04<00:49,  2.98it/s]

gemini-2.0-flash_ja-jp_base_input_xml-full_output_reverse_shuffle.jsonl


  9%|▊         | 14/162 [00:05<00:44,  3.36it/s]

gemini-2.0-flash_ja-jp_base_input_xml-full_output_longest-first_shuffle.jsonl


  9%|▉         | 15/162 [00:05<00:40,  3.59it/s]

gemini-2.0-flash_ja-jp_base_input_xml-full_output_shortest-first_shuffle.jsonl


 10%|▉         | 16/162 [00:05<00:39,  3.68it/s]

gemini-2.0-flash_ja-jp_base_input_xml-full_output_most-kana-ratio_shuffle.jsonl


 10%|█         | 17/162 [00:05<00:38,  3.77it/s]

gemini-2.0-flash_ja-jp_base_input_xml-full_output_fewest-kana-ratio_shuffle.jsonl


 11%|█         | 18/162 [00:06<00:37,  3.83it/s]

gemini-2.0-flash_ja-jp_json_input_base_output_default_shuffle.jsonl


 12%|█▏        | 19/162 [00:06<00:36,  3.95it/s]

gemini-2.0-flash_ja-jp_json_input_base_output_reverse_shuffle.jsonl


 12%|█▏        | 20/162 [00:06<00:35,  3.99it/s]

gemini-2.0-flash_ja-jp_json_input_base_output_longest-first_shuffle.jsonl


 13%|█▎        | 21/162 [00:06<00:36,  3.87it/s]

gemini-2.0-flash_ja-jp_json_input_base_output_shortest-first_shuffle.jsonl


 14%|█▎        | 22/162 [00:07<00:36,  3.79it/s]

gemini-2.0-flash_ja-jp_json_input_base_output_most-kana-ratio_shuffle.jsonl


 14%|█▍        | 23/162 [00:07<00:37,  3.69it/s]

gemini-2.0-flash_ja-jp_json_input_base_output_fewest-kana-ratio_shuffle.jsonl


 15%|█▍        | 24/162 [00:07<00:38,  3.61it/s]

gemini-2.0-flash_ja-jp_json_input_json-full_output_default_shuffle.jsonl


 15%|█▌        | 25/162 [00:08<00:42,  3.19it/s]

gemini-2.0-flash_ja-jp_json_input_json-full_output_reverse_shuffle.jsonl


 16%|█▌        | 26/162 [00:08<00:46,  2.95it/s]

gemini-2.0-flash_ja-jp_json_input_json-full_output_longest-first_shuffle.jsonl


 17%|█▋        | 27/162 [00:08<00:49,  2.73it/s]

gemini-2.0-flash_ja-jp_json_input_json-full_output_shortest-first_shuffle.jsonl


 17%|█▋        | 28/162 [00:09<00:51,  2.58it/s]

gemini-2.0-flash_ja-jp_json_input_json-full_output_most-kana-ratio_shuffle.jsonl


 18%|█▊        | 29/162 [00:09<00:54,  2.45it/s]

gemini-2.0-flash_ja-jp_json_input_json-full_output_fewest-kana-ratio_shuffle.jsonl


 19%|█▊        | 30/162 [00:10<00:55,  2.38it/s]

gemini-2.0-flash_ja-jp_json_input_xml-full_output_default_shuffle.jsonl


 19%|█▉        | 31/162 [00:10<00:53,  2.43it/s]

gemini-2.0-flash_ja-jp_json_input_xml-full_output_reverse_shuffle.jsonl


 20%|█▉        | 32/162 [00:10<00:47,  2.75it/s]

gemini-2.0-flash_ja-jp_json_input_xml-full_output_longest-first_shuffle.jsonl


 20%|██        | 33/162 [00:11<00:43,  2.95it/s]

gemini-2.0-flash_ja-jp_json_input_xml-full_output_shortest-first_shuffle.jsonl


 21%|██        | 34/162 [00:11<00:41,  3.12it/s]

gemini-2.0-flash_ja-jp_json_input_xml-full_output_most-kana-ratio_shuffle.jsonl


 22%|██▏       | 35/162 [00:11<00:39,  3.20it/s]

gemini-2.0-flash_ja-jp_json_input_xml-full_output_fewest-kana-ratio_shuffle.jsonl


 22%|██▏       | 36/162 [00:12<00:38,  3.26it/s]

gemini-2.0-flash_ja-jp_xml_input_base_output_default_shuffle.jsonl


 23%|██▎       | 37/162 [00:12<00:38,  3.29it/s]

gemini-2.0-flash_ja-jp_xml_input_base_output_reverse_shuffle.jsonl


 23%|██▎       | 38/162 [00:12<00:37,  3.26it/s]

gemini-2.0-flash_ja-jp_xml_input_base_output_longest-first_shuffle.jsonl


 24%|██▍       | 39/162 [00:13<00:38,  3.17it/s]

gemini-2.0-flash_ja-jp_xml_input_base_output_shortest-first_shuffle.jsonl


 25%|██▍       | 40/162 [00:13<00:39,  3.09it/s]

gemini-2.0-flash_ja-jp_xml_input_base_output_most-kana-ratio_shuffle.jsonl


 25%|██▌       | 41/162 [00:13<00:40,  3.01it/s]

gemini-2.0-flash_ja-jp_xml_input_base_output_fewest-kana-ratio_shuffle.jsonl


 26%|██▌       | 42/162 [00:14<00:40,  2.94it/s]

gemini-2.0-flash_ja-jp_xml_input_json-full_output_default_shuffle.jsonl


 27%|██▋       | 43/162 [00:14<00:37,  3.14it/s]

gemini-2.0-flash_ja-jp_xml_input_json-full_output_reverse_shuffle.jsonl


 27%|██▋       | 44/162 [00:14<00:35,  3.28it/s]

gemini-2.0-flash_ja-jp_xml_input_json-full_output_longest-first_shuffle.jsonl


 28%|██▊       | 45/162 [00:14<00:35,  3.28it/s]

gemini-2.0-flash_ja-jp_xml_input_json-full_output_shortest-first_shuffle.jsonl


 28%|██▊       | 46/162 [00:15<00:35,  3.28it/s]

gemini-2.0-flash_ja-jp_xml_input_json-full_output_most-kana-ratio_shuffle.jsonl


 29%|██▉       | 47/162 [00:15<00:35,  3.25it/s]

gemini-2.0-flash_ja-jp_xml_input_json-full_output_fewest-kana-ratio_shuffle.jsonl


 30%|██▉       | 48/162 [00:15<00:35,  3.22it/s]

gemini-2.0-flash_ja-jp_xml_input_xml-full_output_default_shuffle.jsonl


 30%|███       | 49/162 [00:16<00:33,  3.38it/s]

gemini-2.0-flash_ja-jp_xml_input_xml-full_output_reverse_shuffle.jsonl


 31%|███       | 50/162 [00:16<00:32,  3.45it/s]

gemini-2.0-flash_ja-jp_xml_input_xml-full_output_longest-first_shuffle.jsonl


 31%|███▏      | 51/162 [00:16<00:32,  3.41it/s]

gemini-2.0-flash_ja-jp_xml_input_xml-full_output_shortest-first_shuffle.jsonl


 32%|███▏      | 52/162 [00:17<00:32,  3.37it/s]

gemini-2.0-flash_ja-jp_xml_input_xml-full_output_most-kana-ratio_shuffle.jsonl


 33%|███▎      | 53/162 [00:17<00:33,  3.30it/s]

gemini-2.0-flash_ja-jp_xml_input_xml-full_output_fewest-kana-ratio_shuffle.jsonl


 33%|███▎      | 54/162 [00:17<00:33,  3.25it/s]

mistral-small-2503_ja-jp_base_input_base_output_default_shuffle.jsonl


 34%|███▍      | 55/162 [00:17<00:30,  3.51it/s]

mistral-small-2503_ja-jp_base_input_base_output_reverse_shuffle.jsonl


 35%|███▍      | 56/162 [00:18<00:28,  3.67it/s]

mistral-small-2503_ja-jp_base_input_base_output_longest-first_shuffle.jsonl


 35%|███▌      | 57/162 [00:18<00:28,  3.66it/s]

mistral-small-2503_ja-jp_base_input_base_output_shortest-first_shuffle.jsonl


 36%|███▌      | 58/162 [00:18<00:28,  3.65it/s]

mistral-small-2503_ja-jp_base_input_base_output_most-kana-ratio_shuffle.jsonl


 36%|███▋      | 59/162 [00:18<00:28,  3.60it/s]

mistral-small-2503_ja-jp_base_input_base_output_fewest-kana-ratio_shuffle.jsonl


 37%|███▋      | 60/162 [00:19<00:28,  3.56it/s]

mistral-small-2503_ja-jp_base_input_json-full_output_default_shuffle.jsonl


 38%|███▊      | 61/162 [00:19<00:29,  3.38it/s]

mistral-small-2503_ja-jp_base_input_json-full_output_reverse_shuffle.jsonl


 38%|███▊      | 62/162 [00:20<00:35,  2.84it/s]

mistral-small-2503_ja-jp_base_input_json-full_output_longest-first_shuffle.jsonl


 39%|███▉      | 63/162 [00:20<00:35,  2.80it/s]

mistral-small-2503_ja-jp_base_input_json-full_output_shortest-first_shuffle.jsonl


 40%|███▉      | 64/162 [00:20<00:35,  2.78it/s]

mistral-small-2503_ja-jp_base_input_json-full_output_most-kana-ratio_shuffle.jsonl


 40%|████      | 65/162 [00:21<00:35,  2.73it/s]

mistral-small-2503_ja-jp_base_input_json-full_output_fewest-kana-ratio_shuffle.jsonl


 41%|████      | 66/162 [00:21<00:35,  2.69it/s]

mistral-small-2503_ja-jp_base_input_xml-full_output_default_shuffle.jsonl


 41%|████▏     | 67/162 [00:21<00:30,  3.11it/s]

mistral-small-2503_ja-jp_base_input_xml-full_output_reverse_shuffle.jsonl


 42%|████▏     | 68/162 [00:21<00:27,  3.46it/s]

mistral-small-2503_ja-jp_base_input_xml-full_output_longest-first_shuffle.jsonl


 43%|████▎     | 69/162 [00:22<00:25,  3.66it/s]

mistral-small-2503_ja-jp_base_input_xml-full_output_shortest-first_shuffle.jsonl


 43%|████▎     | 70/162 [00:22<00:24,  3.81it/s]

mistral-small-2503_ja-jp_base_input_xml-full_output_most-kana-ratio_shuffle.jsonl


 44%|████▍     | 71/162 [00:22<00:23,  3.86it/s]

mistral-small-2503_ja-jp_base_input_xml-full_output_fewest-kana-ratio_shuffle.jsonl


 44%|████▍     | 72/162 [00:22<00:23,  3.89it/s]

mistral-small-2503_ja-jp_json_input_base_output_default_shuffle.jsonl


 45%|████▌     | 73/162 [00:23<00:22,  3.99it/s]

mistral-small-2503_ja-jp_json_input_base_output_reverse_shuffle.jsonl


 46%|████▌     | 74/162 [00:23<00:21,  4.00it/s]

mistral-small-2503_ja-jp_json_input_base_output_longest-first_shuffle.jsonl


 46%|████▋     | 75/162 [00:23<00:22,  3.89it/s]

mistral-small-2503_ja-jp_json_input_base_output_shortest-first_shuffle.jsonl


 47%|████▋     | 76/162 [00:24<00:22,  3.80it/s]

mistral-small-2503_ja-jp_json_input_base_output_most-kana-ratio_shuffle.jsonl


 48%|████▊     | 77/162 [00:24<00:23,  3.69it/s]

mistral-small-2503_ja-jp_json_input_base_output_fewest-kana-ratio_shuffle.jsonl


 48%|████▊     | 78/162 [00:24<00:23,  3.60it/s]

mistral-small-2503_ja-jp_json_input_json-full_output_default_shuffle.jsonl


 49%|████▉     | 79/162 [00:24<00:25,  3.29it/s]

mistral-small-2503_ja-jp_json_input_json-full_output_reverse_shuffle.jsonl


 49%|████▉     | 80/162 [00:25<00:26,  3.04it/s]

mistral-small-2503_ja-jp_json_input_json-full_output_longest-first_shuffle.jsonl


 50%|█████     | 81/162 [00:25<00:28,  2.84it/s]

mistral-small-2503_ja-jp_json_input_json-full_output_shortest-first_shuffle.jsonl


 51%|█████     | 82/162 [00:26<00:29,  2.71it/s]

mistral-small-2503_ja-jp_json_input_json-full_output_most-kana-ratio_shuffle.jsonl


 51%|█████     | 83/162 [00:26<00:30,  2.60it/s]

mistral-small-2503_ja-jp_json_input_json-full_output_fewest-kana-ratio_shuffle.jsonl


 52%|█████▏    | 84/162 [00:27<00:30,  2.52it/s]

mistral-small-2503_ja-jp_json_input_xml-full_output_default_shuffle.jsonl


 52%|█████▏    | 85/162 [00:27<00:27,  2.85it/s]

mistral-small-2503_ja-jp_json_input_xml-full_output_reverse_shuffle.jsonl


 53%|█████▎    | 86/162 [00:27<00:24,  3.10it/s]

mistral-small-2503_ja-jp_json_input_xml-full_output_longest-first_shuffle.jsonl


 54%|█████▎    | 87/162 [00:27<00:23,  3.23it/s]

mistral-small-2503_ja-jp_json_input_xml-full_output_shortest-first_shuffle.jsonl


 54%|█████▍    | 88/162 [00:28<00:22,  3.32it/s]

mistral-small-2503_ja-jp_json_input_xml-full_output_most-kana-ratio_shuffle.jsonl


 55%|█████▍    | 89/162 [00:28<00:21,  3.34it/s]

mistral-small-2503_ja-jp_json_input_xml-full_output_fewest-kana-ratio_shuffle.jsonl


 56%|█████▌    | 90/162 [00:28<00:21,  3.34it/s]

mistral-small-2503_ja-jp_xml_input_base_output_default_shuffle.jsonl


 56%|█████▌    | 91/162 [00:28<00:21,  3.30it/s]

mistral-small-2503_ja-jp_xml_input_base_output_reverse_shuffle.jsonl


 57%|█████▋    | 92/162 [00:29<00:21,  3.24it/s]

mistral-small-2503_ja-jp_xml_input_base_output_longest-first_shuffle.jsonl


 57%|█████▋    | 93/162 [00:29<00:22,  3.09it/s]

mistral-small-2503_ja-jp_xml_input_base_output_shortest-first_shuffle.jsonl


 58%|█████▊    | 94/162 [00:30<00:26,  2.60it/s]

mistral-small-2503_ja-jp_xml_input_base_output_most-kana-ratio_shuffle.jsonl


 59%|█████▊    | 95/162 [00:30<00:25,  2.61it/s]

mistral-small-2503_ja-jp_xml_input_base_output_fewest-kana-ratio_shuffle.jsonl


 59%|█████▉    | 96/162 [00:30<00:25,  2.63it/s]

mistral-small-2503_ja-jp_xml_input_json-full_output_default_shuffle.jsonl


 60%|█████▉    | 97/162 [00:31<00:22,  2.87it/s]

mistral-small-2503_ja-jp_xml_input_json-full_output_reverse_shuffle.jsonl


 60%|██████    | 98/162 [00:31<00:21,  3.04it/s]

mistral-small-2503_ja-jp_xml_input_json-full_output_longest-first_shuffle.jsonl


 61%|██████    | 99/162 [00:31<00:20,  3.09it/s]

mistral-small-2503_ja-jp_xml_input_json-full_output_shortest-first_shuffle.jsonl


 62%|██████▏   | 100/162 [00:32<00:19,  3.12it/s]

mistral-small-2503_ja-jp_xml_input_json-full_output_most-kana-ratio_shuffle.jsonl


 62%|██████▏   | 101/162 [00:32<00:19,  3.12it/s]

mistral-small-2503_ja-jp_xml_input_json-full_output_fewest-kana-ratio_shuffle.jsonl


 63%|██████▎   | 102/162 [00:32<00:19,  3.12it/s]

mistral-small-2503_ja-jp_xml_input_xml-full_output_default_shuffle.jsonl


 64%|██████▎   | 103/162 [00:33<00:18,  3.27it/s]

mistral-small-2503_ja-jp_xml_input_xml-full_output_reverse_shuffle.jsonl


 64%|██████▍   | 104/162 [00:33<00:17,  3.35it/s]

mistral-small-2503_ja-jp_xml_input_xml-full_output_longest-first_shuffle.jsonl


 65%|██████▍   | 105/162 [00:33<00:17,  3.32it/s]

mistral-small-2503_ja-jp_xml_input_xml-full_output_shortest-first_shuffle.jsonl


 65%|██████▌   | 106/162 [00:33<00:17,  3.29it/s]

mistral-small-2503_ja-jp_xml_input_xml-full_output_most-kana-ratio_shuffle.jsonl


 66%|██████▌   | 107/162 [00:34<00:16,  3.24it/s]

mistral-small-2503_ja-jp_xml_input_xml-full_output_fewest-kana-ratio_shuffle.jsonl


 67%|██████▋   | 109/162 [00:34<00:14,  3.60it/s]

llama-v3p1-8b-instruct_ja-jp_base_input_base_output_default_shuffle.jsonl
llama-v3p1-8b-instruct_ja-jp_base_input_base_output_reverse_shuffle.jsonl


 68%|██████▊   | 110/162 [00:34<00:13,  3.89it/s]

llama-v3p1-8b-instruct_ja-jp_base_input_base_output_longest-first_shuffle.jsonl


 69%|██████▊   | 111/162 [00:35<00:12,  3.96it/s]

llama-v3p1-8b-instruct_ja-jp_base_input_base_output_shortest-first_shuffle.jsonl


 69%|██████▉   | 112/162 [00:35<00:12,  4.01it/s]

llama-v3p1-8b-instruct_ja-jp_base_input_base_output_most-kana-ratio_shuffle.jsonl


 70%|██████▉   | 113/162 [00:35<00:12,  3.97it/s]

llama-v3p1-8b-instruct_ja-jp_base_input_base_output_fewest-kana-ratio_shuffle.jsonl


 70%|███████   | 114/162 [00:35<00:12,  3.95it/s]

llama-v3p1-8b-instruct_ja-jp_base_input_json-full_output_default_shuffle.jsonl


 71%|███████   | 115/162 [00:36<00:11,  4.18it/s]

llama-v3p1-8b-instruct_ja-jp_base_input_json-full_output_reverse_shuffle.jsonl


 72%|███████▏  | 116/162 [00:36<00:10,  4.25it/s]

llama-v3p1-8b-instruct_ja-jp_base_input_json-full_output_longest-first_shuffle.jsonl


 72%|███████▏  | 117/162 [00:36<00:10,  4.21it/s]

llama-v3p1-8b-instruct_ja-jp_base_input_json-full_output_shortest-first_shuffle.jsonl


 73%|███████▎  | 118/162 [00:36<00:10,  4.17it/s]

llama-v3p1-8b-instruct_ja-jp_base_input_json-full_output_most-kana-ratio_shuffle.jsonl


 73%|███████▎  | 119/162 [00:37<00:10,  4.04it/s]

llama-v3p1-8b-instruct_ja-jp_base_input_json-full_output_fewest-kana-ratio_shuffle.jsonl


 74%|███████▍  | 120/162 [00:37<00:10,  3.98it/s]

llama-v3p1-8b-instruct_ja-jp_base_input_xml-full_output_default_shuffle.jsonl


 75%|███████▍  | 121/162 [00:37<00:09,  4.21it/s]

llama-v3p1-8b-instruct_ja-jp_base_input_xml-full_output_reverse_shuffle.jsonl


 75%|███████▌  | 122/162 [00:37<00:09,  4.34it/s]

llama-v3p1-8b-instruct_ja-jp_base_input_xml-full_output_longest-first_shuffle.jsonl


 76%|███████▌  | 123/162 [00:38<00:09,  4.27it/s]

llama-v3p1-8b-instruct_ja-jp_base_input_xml-full_output_shortest-first_shuffle.jsonl


 77%|███████▋  | 124/162 [00:38<00:09,  4.20it/s]

llama-v3p1-8b-instruct_ja-jp_base_input_xml-full_output_most-kana-ratio_shuffle.jsonl


 77%|███████▋  | 125/162 [00:38<00:09,  4.10it/s]

llama-v3p1-8b-instruct_ja-jp_base_input_xml-full_output_fewest-kana-ratio_shuffle.jsonl


 78%|███████▊  | 126/162 [00:38<00:08,  4.03it/s]

llama-v3p1-8b-instruct_ja-jp_json_input_base_output_default_shuffle.jsonl


 78%|███████▊  | 127/162 [00:39<00:08,  4.08it/s]

llama-v3p1-8b-instruct_ja-jp_json_input_base_output_reverse_shuffle.jsonl


 79%|███████▉  | 128/162 [00:39<00:08,  4.05it/s]

llama-v3p1-8b-instruct_ja-jp_json_input_base_output_longest-first_shuffle.jsonl


 80%|███████▉  | 129/162 [00:39<00:08,  3.89it/s]

llama-v3p1-8b-instruct_ja-jp_json_input_base_output_shortest-first_shuffle.jsonl


 80%|████████  | 130/162 [00:39<00:08,  3.78it/s]

llama-v3p1-8b-instruct_ja-jp_json_input_base_output_most-kana-ratio_shuffle.jsonl


 81%|████████  | 131/162 [00:40<00:08,  3.65it/s]

llama-v3p1-8b-instruct_ja-jp_json_input_base_output_fewest-kana-ratio_shuffle.jsonl


 81%|████████▏ | 132/162 [00:40<00:08,  3.56it/s]

llama-v3p1-8b-instruct_ja-jp_json_input_json-full_output_default_shuffle.jsonl


 82%|████████▏ | 133/162 [00:40<00:07,  3.70it/s]

llama-v3p1-8b-instruct_ja-jp_json_input_json-full_output_reverse_shuffle.jsonl


 83%|████████▎ | 134/162 [00:40<00:07,  3.76it/s]

llama-v3p1-8b-instruct_ja-jp_json_input_json-full_output_longest-first_shuffle.jsonl


 83%|████████▎ | 135/162 [00:41<00:07,  3.68it/s]

llama-v3p1-8b-instruct_ja-jp_json_input_json-full_output_shortest-first_shuffle.jsonl


 84%|████████▍ | 136/162 [00:41<00:07,  3.57it/s]

llama-v3p1-8b-instruct_ja-jp_json_input_json-full_output_most-kana-ratio_shuffle.jsonl


 85%|████████▍ | 137/162 [00:41<00:07,  3.46it/s]

llama-v3p1-8b-instruct_ja-jp_json_input_json-full_output_fewest-kana-ratio_shuffle.jsonl


 85%|████████▌ | 138/162 [00:42<00:07,  3.41it/s]

llama-v3p1-8b-instruct_ja-jp_json_input_xml-full_output_default_shuffle.jsonl


 86%|████████▌ | 139/162 [00:42<00:06,  3.59it/s]

llama-v3p1-8b-instruct_ja-jp_json_input_xml-full_output_reverse_shuffle.jsonl


 86%|████████▋ | 140/162 [00:42<00:05,  3.68it/s]

llama-v3p1-8b-instruct_ja-jp_json_input_xml-full_output_longest-first_shuffle.jsonl


 87%|████████▋ | 141/162 [00:42<00:05,  3.61it/s]

llama-v3p1-8b-instruct_ja-jp_json_input_xml-full_output_shortest-first_shuffle.jsonl


 88%|████████▊ | 142/162 [00:43<00:05,  3.58it/s]

llama-v3p1-8b-instruct_ja-jp_json_input_xml-full_output_most-kana-ratio_shuffle.jsonl


 88%|████████▊ | 143/162 [00:43<00:05,  3.51it/s]

llama-v3p1-8b-instruct_ja-jp_json_input_xml-full_output_fewest-kana-ratio_shuffle.jsonl


 89%|████████▉ | 144/162 [00:43<00:05,  3.45it/s]

llama-v3p1-8b-instruct_ja-jp_xml_input_base_output_default_shuffle.jsonl


 90%|████████▉ | 145/162 [00:44<00:04,  3.54it/s]

llama-v3p1-8b-instruct_ja-jp_xml_input_base_output_reverse_shuffle.jsonl


 90%|█████████ | 146/162 [00:44<00:04,  3.54it/s]

llama-v3p1-8b-instruct_ja-jp_xml_input_base_output_longest-first_shuffle.jsonl


 91%|█████████ | 147/162 [00:44<00:04,  3.45it/s]

llama-v3p1-8b-instruct_ja-jp_xml_input_base_output_shortest-first_shuffle.jsonl


 91%|█████████▏| 148/162 [00:45<00:04,  3.38it/s]

llama-v3p1-8b-instruct_ja-jp_xml_input_base_output_most-kana-ratio_shuffle.jsonl


 92%|█████████▏| 149/162 [00:45<00:03,  3.30it/s]

llama-v3p1-8b-instruct_ja-jp_xml_input_base_output_fewest-kana-ratio_shuffle.jsonl


 93%|█████████▎| 150/162 [00:45<00:03,  3.24it/s]

llama-v3p1-8b-instruct_ja-jp_xml_input_json-full_output_default_shuffle.jsonl


 93%|█████████▎| 151/162 [00:45<00:03,  3.37it/s]

llama-v3p1-8b-instruct_ja-jp_xml_input_json-full_output_reverse_shuffle.jsonl


 94%|█████████▍| 152/162 [00:46<00:02,  3.44it/s]

llama-v3p1-8b-instruct_ja-jp_xml_input_json-full_output_longest-first_shuffle.jsonl


 94%|█████████▍| 153/162 [00:46<00:02,  3.38it/s]

llama-v3p1-8b-instruct_ja-jp_xml_input_json-full_output_shortest-first_shuffle.jsonl


 95%|█████████▌| 154/162 [00:46<00:02,  3.33it/s]

llama-v3p1-8b-instruct_ja-jp_xml_input_json-full_output_most-kana-ratio_shuffle.jsonl


 96%|█████████▌| 155/162 [00:47<00:02,  3.27it/s]

llama-v3p1-8b-instruct_ja-jp_xml_input_json-full_output_fewest-kana-ratio_shuffle.jsonl


 96%|█████████▋| 156/162 [00:47<00:01,  3.21it/s]

llama-v3p1-8b-instruct_ja-jp_xml_input_xml-full_output_default_shuffle.jsonl


 97%|█████████▋| 157/162 [00:47<00:01,  3.35it/s]

llama-v3p1-8b-instruct_ja-jp_xml_input_xml-full_output_reverse_shuffle.jsonl


 98%|█████████▊| 158/162 [00:48<00:01,  3.41it/s]

llama-v3p1-8b-instruct_ja-jp_xml_input_xml-full_output_longest-first_shuffle.jsonl


 98%|█████████▊| 159/162 [00:48<00:00,  3.36it/s]

llama-v3p1-8b-instruct_ja-jp_xml_input_xml-full_output_shortest-first_shuffle.jsonl


 99%|█████████▉| 160/162 [00:48<00:00,  3.32it/s]

llama-v3p1-8b-instruct_ja-jp_xml_input_xml-full_output_most-kana-ratio_shuffle.jsonl


 99%|█████████▉| 161/162 [00:49<00:00,  2.77it/s]

llama-v3p1-8b-instruct_ja-jp_xml_input_xml-full_output_fewest-kana-ratio_shuffle.jsonl


100%|██████████| 162/162 [00:49<00:00,  3.27it/s]


In [7]:
for df in experiment_dfs:
    model_name = df['Model'][0]
    print(model_name)
    csv_name = f"{model_name}_merged_results.csv"
    df.to_csv(csv_name, index=False)

gemini-2.0-flash
mistral-small-2503
llama-v3p1-8b-instruct


In [None]:
test_df = pd.read_csv('gemini-2.0-flash_merged_results.csv')