In [None]:

# 02: Tokenizer comparison (fugashi/MeCab, SudachiPy, SentencePiece)
import sys, os
# Put the current working directory on sys.path (once), presumably so that
# project-local packages such as `src` can be imported from this notebook.
repo_root = os.path.abspath(os.curdir)
if repo_root not in sys.path:
    sys.path.append(repo_root)

# Make sure the tokenizer dependencies are installed. `-q` keeps pip quiet and
# stdout is redirected to /dev/null, so only errors (stderr) remain visible.
# NOTE(review): `transformers`, `numpy` and `pandas` are imported below but are
# not part of this install list -- presumably preinstalled in the target
# environment; confirm before running elsewhere.
!pip -q install fugashi unidic-lite SudachiPy sudachidict-core sentencepiece > /dev/null

from fugashi import Tagger
from sudachipy import dictionary, tokenizer as tknz
from transformers import AutoTokenizer
from src.data.load_jglue import load_jnli
import numpy as np, pandas as pd, random

# Build each tokenizer once at module level; the helper functions below reuse
# these instances instead of re-creating them per call.
tagger = Tagger()  # fugashi/MeCab tagger with default arguments
sud = dictionary.Dictionary().create()  # SudachiPy tokenizer from the default dictionary
# The rinna model card loads this tokenizer with use_fast=False. Without it,
# AutoTokenizer tries to convert the SentencePiece model to a "fast" tokenizer,
# which may not reproduce the original segmentation being measured here.
spm = AutoTokenizer.from_pretrained("rinna/japanese-gpt-neox-3.6b", use_fast=False)

def mecab_tokens(t):
    """Tokenize ``t`` with the module-level ``tagger`` and return each node's surface string."""
    surfaces = []
    for node in tagger(t):
        surfaces.append(node.surface)
    return surfaces
def sudachi_tokens(t, mode=tknz.Tokenizer.SplitMode.C):
    """Tokenize ``t`` with the module-level ``sud`` tokenizer and return surface strings.

    ``mode`` is forwarded unchanged to ``sud.tokenize`` and defaults to
    ``SplitMode.C``.
    """
    morphemes = sud.tokenize(t, mode)
    return [morpheme.surface() for morpheme in morphemes]
def spm_tokens(t):
    """Encode ``t`` with the module-level ``spm`` tokenizer and return its token strings.

    Special tokens are not added, so only tokens produced from ``t`` itself
    are returned.
    """
    encoding = spm(t, add_special_tokens=False)
    token_ids = encoding["input_ids"]
    return spm.convert_ids_to_tokens(token_ids)

# Side-by-side comparison on one sample sentence: print the raw text, then the
# token list produced by each tokenizer.
sample_text = "本日は晴天なり。生成モデルの分かち書きとサブワード分割を比較します。2024年のデータ。"
print("TEXT:", sample_text)
for label, tokenize in (
    ("MeCab:", mecab_tokens),
    ("Sudachi(C):", sudachi_tokens),
    ("SPM:", spm_tokens),
):
    print(label, tokenize(sample_text))

# Statistics sample: the "sentence1" field of at most the first 500 rows of the
# validation split returned by load_jnli.
ds = load_jnli("validation")
sample_size = min(500, len(ds))
sents = [row["sentence1"] for row in ds.select(range(sample_size))]

def avg_len(fn, texts=None):
    """Return the mean and 95th-percentile token count that ``fn`` produces.

    Args:
        fn: Callable mapping one string to a sized collection of tokens.
        texts: Optional iterable of strings to measure. When omitted, the
            module-level ``sents`` sample is used, which is what the original
            one-argument call did.

    Returns:
        A ``(mean_len, p95_len)`` tuple of plain Python floats.
    """
    if texts is None:
        # Fall back to the notebook-wide sample so existing callers are unaffected.
        texts = sents
    lens = [len(fn(s)) for s in texts]
    return float(np.mean(lens)), float(np.percentile(lens, 95))

# Build one row of length statistics per tokenizer. avg_len tokenizes the whole
# sample on every call, so call it once per tokenizer and unpack both values
# rather than calling it separately for the mean and for the percentile.
rows = []
for label, tokenize in (
    ("MeCab", mecab_tokens),
    ("Sudachi(C)", sudachi_tokens),
    ("SentencePiece", spm_tokens),
):
    mean_len, p95_len = avg_len(tokenize)
    rows.append({"tokenizer": label, "mean_len": mean_len, "p95_len": p95_len})
res = pd.DataFrame(rows)
print(res)

# Save the statistics table as CSV under reports/phase1, creating the directory
# (and any missing parents) first.
import pathlib

out = pathlib.Path("reports/phase1")
out.mkdir(parents=True, exist_ok=True)
csv_path = out / "tokenizer_len_stats.csv"
res.to_csv(csv_path, index=False)
print("Saved:", csv_path)
