<a href="https://colab.research.google.com/github/tmdoi/small-Japanese-LLM-compare/blob/main/benchMarkOrginal_v01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip -q install "transformers>=4.43" accelerate torch --upgrade
!pip -q install pandas sacrebleu rouge-score fugashi ipadic

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m51.2/51.8 kB[0m [31m1.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.4/13.4 MB[0m [31m86.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m697.9/697.9 kB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Building wheel for ipadic (setup.py) ... [?25l[?25hdone


In [2]:
# === (Colab 1セル完結) RakutenAI-2.0-mini-instruct vs TinySwallow-1.5B-Instruct 簡易ベンチマーク ===
# セットアップ
try:
    import google.colab  # noqa: F401
    IN_COLAB = True
except Exception:
    IN_COLAB = False

import sys, subprocess, math, time, re
def pip_install(pkgs):
    cmd = [sys.executable, "-m", "pip", "install", "-q", "--upgrade"] + pkgs
    print("Installing:", " ".join(pkgs))
    subprocess.check_call(cmd)

# 主要ライブラリ
#pip_install(["transformers>=4.43", "accelerate", "torch", "pandas", "sacrebleu", "rouge-score", "fugashi", "ipadic"])

# ---- 以降 Python 本体 ----
import torch, pandas as pd
from dataclasses import dataclass
from typing import List, Dict, Any
from transformers import AutoModelForCausalLM, AutoTokenizer
from rouge_score import rouge_scorer
import sacrebleu
import numpy as np
import random

# 乱数固定（再現性の一助）
random.seed(0)
np.random.seed(0)
if torch.cuda.is_available():
    torch.manual_seed(0)
    torch.cuda.manual_seed_all(0)

# 比較対象モデル（必要に応じて変更可）
MODELS = {
    "RakutenAI-2.0-mini-instruct": "Rakuten/RakutenAI-2.0-mini-instruct",
    "TinySwallow-1.5B-Instruct":   "SakanaAI/TinySwallow-1.5B-Instruct",
}

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

@dataclass
class GenConfig:
    max_new_tokens: int = 256
    temperature: float = 0.0   # 再現性重視
    top_p: float = 1.0
    do_sample: bool = False
    num_beams: int = 1

GENCFG = GenConfig()

def load_model(repo_id: str):
    tok = AutoTokenizer.from_pretrained(repo_id, use_fast=True)
    model = AutoModelForCausalLM.from_pretrained(
        repo_id,
        torch_dtype="auto",
        device_map="auto",   # ColabのGPUに自動割当
    )
    return tok, model

def chat_generate(tokenizer, model, messages: List[Dict[str, str]], cfg: GenConfig = GENCFG):
    # 各モデルのchatテンプレートを利用
    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)

    attn = None
    if tokenizer.pad_token_id is not None:
        attn = input_ids.ne(tokenizer.pad_token_id).long()

    t0 = time.perf_counter()
    with torch.no_grad():
        out_ids = model.generate(
            input_ids,
            max_new_tokens=cfg.max_new_tokens,
            do_sample=cfg.do_sample,
            temperature=cfg.temperature,
            top_p=cfg.top_p,
            num_beams=cfg.num_beams,
            attention_mask=attn,
            pad_token_id=tokenizer.eos_token_id,
        )
    dt = time.perf_counter() - t0
    gen_ids = out_ids[:, input_ids.shape[1]:]
    text = tokenizer.batch_decode(gen_ids, skip_special_tokens=True)[0].strip()
    toks = gen_ids.shape[1]
    tps = toks / dt if dt > 0 else float("nan")
    return text, {"latency_sec": dt, "gen_tokens": toks, "tok_per_sec": tps}

# 簡易タスク（自動採点可能なもの中心）
TASKS = [
    {
        "name": "JA-QA: 富士山の標高",
        "messages": [
            {"role":"system","content":"あなたは有能な日本語アシスタントです。"},
            {"role":"user","content":"富士山の標高は？数値と単位で簡潔に答えてください。"}
        ],
        # 3776 を数値として含めれば正解扱い（ゆるい判定）
        "judge": lambda x: ("3776" in re.sub(r"[^\d]", "", x)) or ("3,776" in x) or ("3776 m" in x) or ("3776メートル" in x),
    },
    {
        "name": "算数: 12×(7+5)",
        "messages": [
            {"role":"system","content":"あなたは計算に正確です。"},
            {"role":"user","content":"12×(7+5) の結果だけを半角数字で答えてください。"}
        ],
        "judge": lambda x: "144" in re.sub(r"[^\d\-]", "", x),
    },
    {
        "name": "要約: 5文→1文",
        "messages": [
            {"role":"system","content":"与えられた段落を1文で要約してください。"},
            {"role":"user","content":
             "奈良公園には多くのシカが生息し、観光客に人気です。"
             "近年は観光客の増加に伴い、エサの与え方やごみ問題が課題となっています。"
             "地元自治体はルール啓発と清掃活動を強化しています。"
             "一方で来園者のマナー向上には時間がかかるとの指摘もあります。"
             "持続可能な観光の実現に向け、地域と来訪者の協力が求められています。"
            }
        ],
        "ref": "奈良公園のシカと観光をめぐる課題に対し、自治体と来訪者の協力による持続可能な観光の実現が求められている。",
        "rougeL": True
    },
    {
        "name": "翻訳: EN→JA",
        "messages": [
            {"role":"system","content":"次の英文を自然な日本語に翻訳してください。"},
            {"role":"user","content":"Edge-friendly small LLMs enable private, low-latency applications without relying on cloud services."}
        ],
        "ref": "エッジ向けの小型LLMは、クラウドサービスに依存せずにプライバシーに配慮した低遅延アプリケーションを可能にする。",
        "bleu": True
    },
]

scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=False)

def evaluate_one(model_name, repo):
    tok, mdl = load_model(repo)
    rows = []
    for task in TASKS:
        out, stats = chat_generate(tok, mdl, task["messages"])
        row = {
            "model": model_name,
            "task": task["name"],
            "output": out,
            **stats
        }
        if "judge" in task:
            row["pass@1"] = bool(task["judge"](out))
        if task.get("rougeL"):
            r = scorer.score(task["ref"], out)["rougeL"].fmeasure
            row["ROUGE-L"] = r
        if task.get("bleu"):
            bleu = sacrebleu.corpus_bleu([out], [[task["ref"]]]).score
            row["BLEU"] = bleu
        rows.append(row)
    return pd.DataFrame(rows)

# 実行
all_dfs = []
for name, repo in MODELS.items():
    print(f"\n== Evaluating {name} ==")
    df = evaluate_one(name, repo)
    display(df[["model","task","pass@1","ROUGE-L","BLEU","latency_sec","tok_per_sec","output"]])
    all_dfs.append(df)

summary = pd.concat(all_dfs, ignore_index=True)

# 集計（タスク別の平均）
def safe_mean(xs):
    xs = [x for x in xs if x is not None and not (isinstance(x, float) and math.isnan(x))]
    return float(np.mean(xs)) if xs else float("nan")

report = []
for m in summary["model"].unique():
    sub = summary[summary["model"]==m]
    pass_mean = safe_mean([1.0 if x is True else (0.0 if x is False else None) for x in sub.get("pass@1", []).tolist()])
    rouge_mean = safe_mean(sub.get("ROUGE-L", []).tolist())
    bleu_mean  = safe_mean(sub.get("BLEU", []).tolist())
    tps_mean   = safe_mean(sub.get("tok_per_sec", []).tolist())
    lat_mean   = safe_mean(sub.get("latency_sec", []).tolist())
    report.append({
        "model": m,
        "pass@1(mean)": pass_mean,
        "ROUGE-L(mean)": rouge_mean,
        "BLEU(mean)": bleu_mean,
        "tok_per_sec(mean)": tps_mean,
        "latency_sec(mean)": lat_mean
    })

print("\n== Summary ==")
display(pd.DataFrame(report))

# 生成長・温度などを変えたい場合の例（任意で有効化）
# GENCFG.max_new_tokens = 128
# GENCFG.temperature = 0.7
# GENCFG.do_sample = True
# print("New GenConfig:", GENCFG)


Device: cpu

== Evaluating RakutenAI-2.0-mini-instruct ==


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/755k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/620 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/739 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/3.07G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Unnamed: 0,model,task,pass@1,ROUGE-L,BLEU,latency_sec,tok_per_sec,output
0,RakutenAI-2.0-mini-instruct,JA-QA: 富士山の標高,True,,,29.191549,0.513847,"富士山の標高は3,776メートルです。"
1,RakutenAI-2.0-mini-instruct,算数: 12×(7+5),True,,,19.86546,1.208127,12 × (7 + 5) = 12 × 12 = 144
2,RakutenAI-2.0-mini-instruct,要約: 5文→1文,,0.0,,84.137854,1.045903,奈良公園に生息するシカは観光客に人気がありますが、近年の観光客の増加に伴い、エサの与え方やご...
3,RakutenAI-2.0-mini-instruct,翻訳: EN→JA,,,0.0,24.320581,1.274641,Edge-friendly small LLMsは、クラウドサービスに頼ることなく、プライベ...



== Evaluating TinySwallow-1.5B-Instruct ==


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/740 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Unnamed: 0,model,task,pass@1,ROUGE-L,BLEU,latency_sec,tok_per_sec,output
0,TinySwallow-1.5B-Instruct,JA-QA: 富士山の標高,True,,,25.761466,0.388177,"3,776メートル"
1,TinySwallow-1.5B-Instruct,算数: 12×(7+5),False,,,11.693934,0.342058,94
2,TinySwallow-1.5B-Instruct,要約: 5文→1文,,0.0,,87.900588,1.137649,奈良公園では、シカへの餌やりやゴミ問題など、観光客による影響が課題となっており、地元自治体は...
3,TinySwallow-1.5B-Instruct,翻訳: EN→JA,,,0.0,33.850027,1.299851,エッジフレンドリーな小型LLMは、クラウドサービスに依存せずにプライベートで低遅延のアプリケ...



== Summary ==


Unnamed: 0,model,pass@1(mean),ROUGE-L(mean),BLEU(mean),tok_per_sec(mean),latency_sec(mean)
0,RakutenAI-2.0-mini-instruct,1.0,0.0,0.0,1.010629,39.378861
1,TinySwallow-1.5B-Instruct,0.5,0.0,0.0,0.791934,39.801503


Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.
Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

