In [1]:
import os
import csv

# Base directory searched for "<model>_<template>_<n>-shot" result folders.
ROOT = "./results"

# Models to report on, grouped by family; the order is the reporting order.
model_names = (
    ["llm-jp-3-1.8b", "llm-jp-3-3.7b", "llm-jp-3-7.2b", "llm-jp-3-13b"]
    + ["qwen2.5-3b", "qwen2.5-7b"]
    + ["sarashina3b"]
    + ["llama3.2-1b", "llama3.2-3b"]
)

# Prompt templates to report on ("instructed" is currently disabled).
templates = ["standard"]

# Few-shot settings to report on.
num_shots = [0]

# One "<task>.csv" file is looked up per entry inside each result folder.
tasks = [
    "mcqa",
    "mcqa-en",
    "pubmedqa",
    "mt_en2ja",
    "mt_ja2en",
    "ner",
    "dc",
    "sts",
]

In [2]:
def report_accs(task2accs):
    """Print the mean score of every task category present in ``task2accs``.

    Args:
        task2accs: Mapping from sub-task name (for example ``"igakuqa"``) to
            its numeric score. Names that belong to no category are ignored.

    Returns:
        None. One line per category is written to stdout in the form
        ``<label>: <mean>`` with three decimals.

    A category that has no score in ``task2accs`` is reported as ``N/A``
    rather than raising ``ZeroDivisionError``, so a single missing result
    file no longer hides the categories that were actually evaluated.
    """
    # (label, member sub-tasks) in the order the lines are printed.
    categories = [
        ("QA (ja)", {
            "medmcqa_jp", "usmleqa_jp", "medqa_jp", "igakuqa",
            "mmlu_medical_jp", "jmmlu_medical", "pubmedqa_jp"}),
        ("QA (en)", {
            "pubmedqa", "medmcqa", "usmleqa", "medqa", "mmlu_medical"}),
        ("MT (en2ja)", {"ejmmt-en2ja"}),
        ("MT (ja2en)", {"ejmmt-ja2en"}),
        ("Ner", {
            "mrner_disease", "mrner_medicine", "nrner", "bc2gm_jp",
            "bc5chem_jp", "bc5disease_jp", "jnlpba_jp", "ncbi_disease_jp"}),
        ("DC", {"crade", "rrtnm", "smdis"}),
        ("STS", {"jcsts"}),
    ]

    lines = []
    for label, members in categories:
        # Walk task2accs (not members) so scores are summed in the mapping's
        # own order, exactly as before.
        scores = [acc for task, acc in task2accs.items() if task in members]
        mean = f"{sum(scores) / len(scores):.3f}" if scores else "N/A"
        lines.append(f"{label}: {mean}")
    print("\n".join(lines))

In [3]:
import itertools

# For every (model, template, shot) combination, collect the per-sub-task
# scores from that run's CSV files and print the per-category means.
for model_name, template, num_shot in itertools.product(model_names, templates, num_shots):
    result_dir = os.path.join(ROOT, f"{model_name}_{template}_{num_shot}-shot")
    if not os.path.isdir(result_dir):
        print(f"Directory {result_dir} does not exist.")
        continue

    # Scores of this run, keyed by the sub-task name read from each CSV row.
    task2accs = {}
    for task in tasks:
        csv_file_path = os.path.join(result_dir, f"{task}.csv")
        if not os.path.isfile(csv_file_path):
            print(f"Task result {csv_file_path} does not exist.")
            continue
        # newline="" lets the csv module handle line endings itself.
        with open(csv_file_path, "r", newline="") as csvfile:
            reader = csv.reader(csvfile)
            # Skip the header row; the default keeps an empty file from
            # raising StopIteration and aborting the whole loop.
            next(reader, None)
            for row in reader:
                if not row:
                    continue  # blank line
                if len(row) < 3:
                    print(f"Skipping malformed row {row!r} in {csv_file_path}")
                    continue
                # Column 0 is the sub-task name and column 2 its score.
                # A separate name avoids clobbering the outer `task`.
                subtask, acc = row[0], row[2]
                try:
                    task2accs[subtask] = float(acc)
                except ValueError:
                    print(f"Skipping non-numeric score {acc!r} for {subtask} in {csv_file_path}")

    print(f"Model: {model_name}, Template: {template}, NUM_SHOT: {num_shot}")
    try:
        report_accs(task2accs)
    except Exception as e:
        # Best effort: a failure for one run must not stop the remaining runs.
        print(f"Error processing {model_name}, {template}, {num_shot}: {e}")

Model: llm-jp-3-1.8b, Template: standard, NUM_SHOT: 0
QA (ja): 0.307
QA (en): 0.326
MT (en2ja): 3.299
MT (ja2en): 8.232
Ner: 0.147
DC: 0.374
STS: -0.034
Model: llm-jp-3-3.7b, Template: standard, NUM_SHOT: 0
QA (ja): 0.308
QA (en): 0.321
MT (en2ja): 7.080
MT (ja2en): 12.602
Ner: 0.099
DC: 0.370
STS: -0.132
Model: llm-jp-3-7.2b, Template: standard, NUM_SHOT: 0
QA (ja): 0.329
QA (en): 0.368
MT (en2ja): 16.041
MT (ja2en): 16.198
Ner: 0.165
DC: 0.374
STS: -0.066
Model: llm-jp-3-13b, Template: standard, NUM_SHOT: 0
QA (ja): 0.371
QA (en): 0.398
MT (en2ja): 19.474
MT (ja2en): 16.139
Ner: 0.153
DC: 0.370
STS: -0.132
Model: qwen2.5-3b, Template: standard, NUM_SHOT: 0
QA (ja): 0.387
QA (en): 0.519
MT (en2ja): 2.679
MT (ja2en): 0.641
Ner: 0.041
DC: 0.467
STS: 0.284
Model: qwen2.5-7b, Template: standard, NUM_SHOT: 0
QA (ja): 0.462
QA (en): 0.574
MT (en2ja): 16.276
MT (ja2en): 10.876
Ner: 0.212
DC: 0.548
STS: 0.272
Model: sarashina3b, Template: standard, NUM_SHOT: 0
QA (ja): 0.396
QA (en): 0.419
MT