# MeCab, SentencePiece の精度評価

- 評価データセット：ldcc
- 評価方法：pipeline
    - ../model/
        - pipe-jptokenizermecab.gz
        - pipe-jptokenizersentencepiece.gz

In [1]:
import numpy
import pandas
import scipy.stats

In [2]:
import sys
sys.path.append('../')

from classify_ldcc import DocRecord, DatasetLdcc
from classify_ldcc import JpTokenizerMeCab, JpTokenizerSentencePiece

'pattern' package not found; tag filters are not available for English


## Pipelineの確認

In [3]:
import os
import joblib
from classify_ldcc import ident_tokener, SparsetoDense, Transer
os.chdir("../")
pipe_mecab = joblib.load("model/pipe-jptokenizermecab.gz")
pipe_sentencepiece = joblib.load("model/pipe-jptokenizersentencepiece.gz")
os.chdir("notebook/")

In [4]:
pipe_mecab

Pipeline(memory=None,
         steps=[('tokenizer',
                 <classify_ldcc.JpTokenizerMeCab object at 0x7f9fec6db908>),
                ('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=False, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf...
                 LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                colsample_bytree=1.0, importance_type='gain',
                                learning_rate=0.1, max_depth=-1,
                                min_child_samples=20, min_child_weight=0.001,
                                min_split_gain=0.0, n_estimators=100, n_j

In [5]:
pipe_sentencepiece

Pipeline(memory=None,
         steps=[('tokenizer',
                 <classify_ldcc.JpTokenizerSentencePiece object at 0x7f9f61290d68>),
                ('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=False, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, sm...
                 LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                colsample_bytree=1.0, importance_type='gain',
                                learning_rate=0.1, max_depth=-1,
                                min_child_samples=20, min_child_weight=0.001,
                                min_split_gain=0.0, n_estimators=100, n_j

In [6]:
result_csv = "../data/result.csv"
columns = ["tokenizer", "train_acc", "valid_acc", "elapsed_time", "cpu_time"]
df = pandas.read_csv(result_csv, header=None, names=columns)
df.head()

Unnamed: 0,tokenizer,train_acc,valid_acc,elapsed_time,cpu_time
0,JpTokenizerMeCab,1.0,0.94256,62.041894,296.817731
1,JpTokenizerSentencePiece,1.0,0.953867,94.967452,549.502217
2,JpTokenizerMeCab,1.0,0.953415,61.877692,294.31305
3,JpTokenizerSentencePiece,1.0,0.951153,95.696913,551.355981
4,JpTokenizerMeCab,1.0,0.947535,61.406188,291.654511


## 回数情報を追加

In [7]:
tokenizers = df["tokenizer"].drop_duplicates()
times = numpy.array([list(range(1, 10+1)) for tkr in tokenizers]).T.ravel()
df["times"] = times[:len(df)]
df.head()

Unnamed: 0,tokenizer,train_acc,valid_acc,elapsed_time,cpu_time,times
0,JpTokenizerMeCab,1.0,0.94256,62.041894,296.817731,1
1,JpTokenizerSentencePiece,1.0,0.953867,94.967452,549.502217,1
2,JpTokenizerMeCab,1.0,0.953415,61.877692,294.31305,2
3,JpTokenizerSentencePiece,1.0,0.951153,95.696913,551.355981,2
4,JpTokenizerMeCab,1.0,0.947535,61.406188,291.654511,3


## 実行時間を評価

In [8]:
_acc_df = df.pivot(index="tokenizer", columns="times", values=["valid_acc", "train_acc", "elapsed_time", "cpu_time"]).T
#_acc_df["mean"] = pvdf.mean(axis=1)
#_acc_df["std"] = pvdf.std(axis=1)
_acc_df.head(10)

Unnamed: 0_level_0,tokenizer,JpTokenizerMeCab,JpTokenizerSentencePiece
Unnamed: 0_level_1,times,Unnamed: 2_level_1,Unnamed: 3_level_1
valid_acc,1,0.94256,0.953867
valid_acc,2,0.953415,0.951153
valid_acc,3,0.947535,0.954772
valid_acc,4,0.952962,0.960651
valid_acc,5,0.954772,0.957033
valid_acc,6,0.953867,0.957485
valid_acc,7,0.951153,0.957485
valid_acc,8,0.94663,0.959294
valid_acc,9,0.947987,0.957033
valid_acc,10,0.949796,0.953415


### 経過時間

In [9]:
edf = _acc_df.loc["elapsed_time"].dropna().T
edf["mean"] = edf.mean(axis=1)
edf["std"] = edf.std(axis=1)
edf

times,1,2,3,4,5,6,7,8,9,10,mean,std
tokenizer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
JpTokenizerMeCab,62.041894,61.877692,61.406188,61.73116,62.640263,61.855734,62.03053,61.769541,61.421218,61.370561,61.814478,0.362408
JpTokenizerSentencePiece,94.967452,95.696913,93.747547,94.643323,94.18893,94.490407,93.761616,95.202418,93.966003,94.462001,94.512661,0.604838


In [10]:
for tkr, m, s in edf[["mean", "std"]].reset_index().values:
    print(f"{tkr}: {m/60:.1f} min ({s:.1f} sec)")

JpTokenizerMeCab: 1.0 min (0.4 sec)
JpTokenizerSentencePiece: 1.6 min (0.6 sec)


### CPU時間

In [11]:
cdf = _acc_df.loc["cpu_time"].dropna().T
cdf["mean"] = cdf.mean(axis=1)
cdf["std"] = cdf.std(axis=1)
cdf

times,1,2,3,4,5,6,7,8,9,10,mean,std
tokenizer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
JpTokenizerMeCab,296.817731,294.31305,291.654511,293.421905,300.750381,292.933979,295.147034,293.366931,291.36748,291.590212,294.136321,2.735782
JpTokenizerSentencePiece,549.502217,551.355981,540.871926,543.07545,542.472539,543.068041,541.148729,546.557152,542.139124,544.121128,544.431229,3.383975


In [12]:
for tkr, m, s in cdf[["mean", "std"]].reset_index().values:
    print(f"{tkr}: {m/60:.1f} min ({s:.1f} sec)")

JpTokenizerMeCab: 4.9 min (2.7 sec)
JpTokenizerSentencePiece: 9.1 min (3.4 sec)


## 精度評価

In [13]:
acc_df = _acc_df.loc["valid_acc"].dropna()
acc_df

tokenizer,JpTokenizerMeCab,JpTokenizerSentencePiece
times,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.94256,0.953867
2,0.953415,0.951153
3,0.947535,0.954772
4,0.952962,0.960651
5,0.954772,0.957033
6,0.953867,0.957485
7,0.951153,0.957485
8,0.94663,0.959294
9,0.947987,0.957033
10,0.949796,0.953415


In [14]:
acc = acc_df.dropna().T.copy()
acc["mean"] = acc.mean(axis=1)
acc["std"] = acc.std(axis=1)
acc["mean"] *= 100
acc["std"] *= 100
acc.sort_values("mean", ascending=False)

times,1,2,3,4,5,6,7,8,9,10,mean,std
tokenizer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
JpTokenizerSentencePiece,0.953867,0.951153,0.954772,0.960651,0.957033,0.957485,0.957485,0.959294,0.957033,0.953415,95.621891,0.273473
JpTokenizerMeCab,0.94256,0.953415,0.947535,0.952962,0.954772,0.953867,0.951153,0.94663,0.947987,0.949796,95.006784,0.369215


In [15]:
for tkr, m, s in acc[["mean", "std"]].reset_index().values:
    print(f"{tkr}: {m:.1f} % ({s:.1f} %)")

JpTokenizerMeCab: 95.0 % (0.4 %)
JpTokenizerSentencePiece: 95.6 % (0.3 %)


## 検定

### 正規性の検定

In [16]:
for tkr in acc_df.columns:
    W, pvalue = scipy.stats.shapiro(acc_df[tkr].dropna())
    print(tkr, W, pvalue, pvalue < 0.05)

JpTokenizerMeCab 0.9387243390083313 0.5389084815979004 False
JpTokenizerSentencePiece 0.9642882943153381 0.833436906337738 False


### 正規乱数で検定に必要なサンプルサイズを評価

In [17]:
x = numpy.random.normal(0, 1, 10)
scipy.stats.shapiro(x)

(0.9630908966064453, 0.8204600811004639)

In [18]:
x = numpy.random.uniform(0, 1, 100)
scipy.stats.shapiro(x)

(0.9465673565864563, 0.0004963289247825742)

In [19]:
x = numpy.random.uniform(0, 1, 50)
scipy.stats.shapiro(x)

(0.9616269469261169, 0.10400641709566116)

- 10 サンプル程度では、正規分布からのサンプルであることを否定するのは難しそう
- 50サンプルで、ギリギリな印象
- 結果的に、50-100サンプルは正規性を否定できるためのサンプルとして取得したい

### t検定（対応あり）
- MeCab, SentencePiece の2群のみを比較するため、t検定でよい
- t検定は、正規性に頑健性があるので、参考として実行する

In [20]:
cols = acc_df.columns
for base in cols:
    for target in [trg for trg in cols if trg != base]:
        t, pvalue = scipy.stats.ttest_rel(acc_df[base], acc_df[target])
        if pvalue < 0.05:
            print(base, target, t, pvalue, (pvalue < 0.05))

JpTokenizerMeCab JpTokenizerSentencePiece -4.353253917718455 0.0018414918124858705 True
JpTokenizerSentencePiece JpTokenizerMeCab 4.353253917718455 0.0018414918124858705 True


### ウィルコクソンの符号順位検定
- 両側検定
- 連続補正なし（精度は、離散分布ではないため）

In [21]:
cols = acc_df.columns
for base in cols:
    for target in [trg for trg in cols if trg != base]:
        w, pvalue = scipy.stats.wilcoxon(acc_df[base], acc_df[target], correction=False)
        if pvalue < 0.05:
            print(base, target, w, pvalue, (pvalue < 0.05))

JpTokenizerMeCab JpTokenizerSentencePiece 1.0 0.0069104298078147995 True
JpTokenizerSentencePiece JpTokenizerMeCab 1.0 0.0069104298078147995 True


### 検定結果

- t検定も、ウィルコクソンの符号順位和検定のいずれも、有意差がある結果になった

| tokenizer name | accuracy mean (std) |
| --------------- | --- |
| JpTokenizerMeCab | 95.0 (0.4) |
| JpTokenizerSentencePiece | 95.6 (0.3) |

- MeCab の平均が、$95.0 \% (\pm 0.4 \%)$、SentencePiece の平均が $95.6 \% (\pm 0.3 \%)$ 
    - 精度は、MeCab < SentencePiece
    - 精度差は、偶然ではかなり発生しづらく（0.7%未満）、何らかの意味・理由があると言える

## まとめ

- MeCab, SentencePiece の精度を比較すると、有意に、SentencePiece の方が(約0.6%)よい
- 精度と実行時間の関係は、以下のようになる
    
| tokenizer name | accuracy mean (std) | elapsed time mean (std) | cpu time mean (std) |
| -------------- | --- | ----------------------- | ------------------- |
| JpTokenizerMeCab | 95.0 % (0.4 %) | 1.0 min (0.4 sec) | 4.9 min (2.7 sec) |
| JpTokenizerSentencePiece | 95.6 % (0.3 %) | 1.6 min (0.6 sec) | 9.1 min (3.4 sec) |
    
- 経過時間（elapsed time）を、比較すると 約 0.6 min = 36 sec の差であった
- CPU時間（cpu time）を、比較すると 約 4.9 min, 9.1 min と、倍近く差がある
    - これは、利用したCPU数が、倍近くであることを示している
    - つまり、MeCab は、4 cpu を利用したのに対し、SentencePiece は、8 cpu を利用したと考えられる
- 以上をまとめると
    - 計算資源が十分（8 cpu 以上）ある場合は、経過時間の差は大きくない（いずれも実用に耐えうる）
    - 計算資源が十分な場合は、若干だがより精度が高い SentencePiece を利用してよく
    - 計算資源が限られている場合（i.e. 経過時間を優先すべきとき）は、MeCab を利用した方が良さそうである

## 残課題

- サンプルサイズが、$10 \times 2=20$ と小さいので、$100 \times 2=200$ 程度で評価しても差があるのか否かを評価する
