In [1]:
import os
import csv

# Root directory that holds one sub-directory of evaluation results per
# (model, template, shot-count) combination.
ROOT = "./results"

# Model checkpoints to compare; each name forms the leading component of a
# result directory name ("<model>_<template>_<n>-shot").
model_names = [
    "llm-jp-3-13b",
    "llm-jp-3-13b-exp1",
    "llm-jp-3-13b-exp1-en",
    "llm-jp-3-13b-exp3-science",
]

# Prompt templates to load; the template name is the middle component of a
# result directory name.
templates = ["standard", "english"]
# Alternative setting kept for reference:
# templates = ["minimal"]

# Few-shot settings to load; each value forms the "<n>-shot" directory suffix.
num_shots = [0]

In [2]:
def report_accs(task2accs):
    """Print one ``<task>: <accuracy>`` line per prompt task.

    The tasks are reported in the fixed order listed below, and each accuracy
    is formatted with three decimal places.

    Args:
        task2accs: Mapping from task name to its accuracy (a number).

    Raises:
        KeyError: If any of the listed tasks is absent from ``task2accs``.
            Lines for the tasks preceding it have already been printed.
    """
    # Fixed reporting order; two prompt languages per line.
    task_order = (
        "nii_en5_mono_prompt-en", "nii_en5_mono_prompt-ja",
        "nii_en5_bi_prompt-en", "nii_en5_bi_prompt-ja",
        "nii_en5_tri_prompt-en", "nii_en5_tri_prompt-ja",
        "nii_ja5_mono_prompt-en", "nii_ja5_mono_prompt-ja",
        "nii_ja5_bi_prompt-en", "nii_ja5_bi_prompt-ja",
        "nii_ja5_tri_prompt-en", "nii_ja5_tri_prompt-ja",
    )

    for task in task_order:
        print(f"{task}: {task2accs[task]:.3f}")

In [3]:
import itertools
import pandas as pd
from tabulate import tabulate

# Load "<ROOT>/<model>_<template>_<n>-shot/prompt.csv" for every configured
# (model, template, shot) combination and collect one record per kept row.
#
# Results:
#   accs      - list of per-task record dicts (see ACC_COLUMNS).
#   task2accs - task -> accuracy for the last combination whose result
#               directory exists (reset to {} for each such directory).
#   df        - DataFrame built from ``accs`` once, after the loop.

# Training language per checkpoint. Checkpoints missing from this table
# (e.g. "llm-jp-3-13b-exp3-science") are still parsed into ``task2accs`` but
# contribute no rows to ``accs`` / ``df``.
TRAIN_LANG_BY_MODEL = {
    "llm-jp-3-13b": "none",
    "llm-jp-3-13b-exp1": "ja",
    "llm-jp-3-13b-exp1-en": "en",
}

# Column order of ``df``. Passing it explicitly keeps the columns present even
# when no result file was found, so later column filters do not raise KeyError.
ACC_COLUMNS = [
    "model_name",
    "template-lang",
    "train-lang",
    "src-eval-lang",
    "eval-lang",
    "prompt-type",
    "task",
    "task-prefix",
    "acc",
]

accs = []
for model_name, template, num_shot in itertools.product(model_names, templates, num_shots):
    result_dir = os.path.join(ROOT, f"{model_name}_{template}_{num_shot}-shot")
    if not os.path.exists(result_dir):
        print(f"Directory {result_dir} does not exist.")
        continue
    task2accs = {}
    csv_file_path = os.path.join(result_dir, "prompt.csv")
    if not os.path.isfile(csv_file_path):
        print(f"Task result {csv_file_path} does not exist.")
        continue

    # Constant for the whole file, so look it up once instead of per row.
    train_lang = TRAIN_LANG_BY_MODEL.get(model_name)

    # newline="" is what the csv module expects of files handed to csv.reader.
    with open(csv_file_path, "r", newline="") as csvfile:
        reader = csv.reader(csvfile)
        # Skip the header line; the default avoids StopIteration on an empty file.
        header = next(reader, None)
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; nothing to parse.
                continue
            # Column 0 is used as the task name and column 3 as the accuracy
            # (the CSV header itself is not inspected here).
            task, acc = row[0], float(row[3])
            task2accs[task] = acc

            # Keep only rows whose task suffix matches the template:
            # "-ja" tasks with the "standard" template, "-en" tasks with "english".
            if (task.endswith("ja") and template == "english") or (
                task.endswith("en") and template == "standard"
            ):
                continue
            if train_lang is None:
                continue

            accs.append({
                "model_name": model_name,
                "template-lang": "ja" if template == "standard" else "en",
                "train-lang": train_lang,
                "src-eval-lang": "ja" if task.startswith("nii_ja") else "en",
                "eval-lang": "ja" if task.endswith("ja") else "en",
                # Third "_"-separated field of the task name, e.g. "mono" -> "mono-sentence".
                "prompt-type": task.split("_")[2] + "-sentence",
                "task": task,
                # Task name up to the first "-", i.e. without the "-en"/"-ja" suffix.
                "task-prefix": task.split("-")[0],
                "acc": round(acc, 4),
            })

# Build the frame once, after the loop. Previously this statement sat inside
# the loop body, so ``df`` was rebuilt on every iteration and stayed undefined
# whenever every result directory or file was missing.
df = pd.DataFrame(accs, columns=ACC_COLUMNS)

In [4]:

## 1. Does continual pre-training work well to inject new knowledge?
#
# Compare each trained model against the untrained baseline ("train-lang" ==
# "none") on tasks where training, evaluation and source-evaluation language
# all coincide. One table is printed per task.

# Masks are named and combined explicitly instead of relying on `&` binding
# tighter than `|` in a single long expression.
same_language = (df["eval-lang"] == df["train-lang"]) & (df["eval-lang"] == df["src-eval-lang"])
is_baseline = df["train-lang"] == "none"
_df = df[same_language | is_baseline]

for task, sdf in _df.groupby('task'):
    if len(sdf) == 1:
        # Only one row for this task (e.g. just the baseline): nothing to compare.
        continue
    # Rebind rather than drop in place: an in-place drop mutates a groupby
    # sub-frame, which can trigger pandas' SettingWithCopyWarning.
    sdf = sdf.drop(columns=["template-lang", "model_name", "task", "task-prefix"])
    print(tabulate(sdf, headers='keys', tablefmt='pretty'))

+----+------------+---------------+-----------+-------------+--------+
|    | train-lang | src-eval-lang | eval-lang | prompt-type |  acc   |
+----+------------+---------------+-----------+-------------+--------+
| 7  |    none    |      en       |    en     | bi-sentence | 0.4202 |
| 31 |     en     |      en       |    en     | bi-sentence | 0.4267 |
+----+------------+---------------+-----------+-------------+--------+
+----+------------+---------------+-----------+---------------+--------+
|    | train-lang | src-eval-lang | eval-lang |  prompt-type  |  acc   |
+----+------------+---------------+-----------+---------------+--------+
| 6  |    none    |      en       |    en     | mono-sentence | 0.4785 |
| 30 |     en     |      en       |    en     | mono-sentence | 0.4915 |
+----+------------+---------------+-----------+---------------+--------+
+----+------------+---------------+-----------+--------------+--------+
|    | train-lang | src-eval-lang | eval-lang | prompt-type  |  

In [5]:

## 2. Does the injected knowledge carry over to the other evaluation language?
#
# NOTE(review): this cell's heading was a verbatim copy of the section 1
# heading ("Does the continual-pretraining work well to inject new
# knowledge?"). The filter below selects a different comparison, so the
# heading was reworded; presumably this is the cross-lingual check, so confirm
# the intended title.
#
# Compare each trained model against the untrained baseline ("train-lang" ==
# "none") on tasks whose source-evaluation language equals the training
# language but whose evaluation language differs from it. One table is printed
# per task.

# Masks are named and combined explicitly instead of relying on `&` binding
# tighter than `|` in a single long expression.
cross_language = (df["eval-lang"] != df["train-lang"]) & (df["train-lang"] == df["src-eval-lang"])
is_baseline = df["train-lang"] == "none"
_df = df[cross_language | is_baseline]

for task, sdf in _df.groupby('task'):
    if len(sdf) == 1:
        # Only one row for this task (e.g. just the baseline): nothing to compare.
        continue
    # Rebind rather than drop in place: an in-place drop mutates a groupby
    # sub-frame, which can trigger pandas' SettingWithCopyWarning.
    sdf = sdf.drop(columns=["template-lang", "model_name", "task", "task-prefix"])
    print(tabulate(sdf, headers='keys', tablefmt='pretty'))

+----+------------+---------------+-----------+-------------+--------+
|    | train-lang | src-eval-lang | eval-lang | prompt-type |  acc   |
+----+------------+---------------+-----------+-------------+--------+
| 1  |    none    |      en       |    ja     | bi-sentence | 0.4398 |
| 25 |     en     |      en       |    ja     | bi-sentence | 0.4387 |
+----+------------+---------------+-----------+-------------+--------+
+----+------------+---------------+-----------+---------------+--------+
|    | train-lang | src-eval-lang | eval-lang |  prompt-type  |  acc   |
+----+------------+---------------+-----------+---------------+--------+
| 0  |    none    |      en       |    ja     | mono-sentence | 0.4755 |
| 24 |     en     |      en       |    ja     | mono-sentence | 0.4717 |
+----+------------+---------------+-----------+---------------+--------+
+----+------------+---------------+-----------+--------------+--------+
|    | train-lang | src-eval-lang | eval-lang | prompt-type  |  