In [None]:
import numpy

# Seeded RandomState reused by later cells (sample shuffling and the Ridge
# surrogate's random_state) so results are repeatable on a fresh run.
rs = numpy.random.RandomState(12345)
rs

In [None]:
import os
# NOTE(review): hard-coded absolute path. The relative paths used below
# ("data/model/...", "data/dataset/...") resolve against this working directory,
# and presumably the `modules` import does too — the notebook only runs where
# this path exists.
os.chdir('/home/dsuser/workspace/backend')

In [None]:
import joblib
from modules.model.text_classify import *


In [None]:
itr = "00"
dataset = "aozora"
pipe = joblib.load(f"data/model/pipe-jptokenizermecab_{dataset}set_iter{itr}.gz")
pipe

In [None]:
dataset = joblib.load(f"data/dataset/{dataset}set_iter{itr}.gz")
dataset

In [None]:
# Fetch the train/validation partitions of the inputs and of the labels.
# NOTE(review): the shuffle/split call below is disabled, so the split is
# presumably already stored in the loaded dataset object — confirm.
# dataset.shuffle().split()
X_train, X_valid = dataset.get_data(do_split=True)
y_train, y_valid = dataset.get_labels(do_split=True)
len(X_train), len(X_valid)

In [None]:
# train with trainset
# pipe.fit(X_train, y_train)

In [None]:
# Predict on the training set and compute the training accuracy.
# NOTE(review): accuracy_score is not imported explicitly in this notebook;
# presumably it arrives via the star import of modules.model.text_classify — confirm.
p_train = pipe.predict(X_train)
train_acc = accuracy_score(y_train, p_train)
train_acc

In [None]:
# Class probabilities for the validation set; decoded to labels in the next cell.
prob_valid = pipe.predict_proba(X_valid)

In [None]:
# Last pipeline step; its private `_le` attribute is used here as a label encoder.
lgbm = pipe[-1]
# Map the highest-probability column index of each row back to a label.
p_valid = lgbm._le.inverse_transform(prob_valid.argmax(axis=1))
valid_acc = accuracy_score(y_valid, p_valid)
valid_acc

In [None]:
# Labels in encoder index order: indices 0..n-1 mapped back to label names.
lgbm._le.inverse_transform(range(len(dataset.labelset)))

In [None]:
dataset.labelset

# PR Curve

In [None]:
import numpy
from sklearn import metrics

In [None]:
from matplotlib import pyplot

def plot_pr_curve_by_label(ax, lbl: str, t: numpy.ndarray, p: numpy.ndarray):
    """Draw the precision-recall curve of a single label on ``ax``.

    Parameters
    ----------
    ax : axes object to draw on (modified in place).
    lbl : label name, shown in the subplot title.
    t : ground-truth values handed to the scikit-learn metric functions.
    p : scores for ``lbl`` handed to the same functions.

    The subplot title reports the average precision (AP) and the ROC AUC.
    Nothing is returned.
    """
    # Summary numbers shown in the title.
    ap = metrics.average_precision_score(t, p)
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(t, p)
    auc = metrics.auc(false_pos_rate, true_pos_rate)

    # The curve itself: a step line with a matching shaded area underneath.
    precision, recall, _ = metrics.precision_recall_curve(t, p)
    look = dict(color='g', alpha=0.2)
    ax.step(recall, precision, where='post', **look)
    ax.fill_between(recall, precision, step='post', **look)

    # Axis ranges, labels and title.
    ax.set_xlim([0.0, 1.05])
    ax.set_ylim([0.0, 1.0])
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_title(f'{lbl} : AP={ap:0.3f} / AUC={auc:0.3f}')


In [None]:
def plot_pr_curve(dataset, pipe, X=None, y=None):
    """Plot one precision-recall subplot per label of ``dataset`` in a grid.

    Parameters
    ----------
    dataset : object exposing ``labelset`` (iterable of labels supporting ``len``).
    pipe : pipeline whose ``predict_proba`` supplies the scores; the last
        step's ``_le`` encoder maps a label to its probability column.
    X, y : optional evaluation inputs and labels. When omitted, the notebook
        globals ``X_valid`` / ``y_valid`` are used.

    Raises
    ------
    ValueError
        If ``dataset.labelset`` is empty, since no subplot grid can be built.
    """
    # Fall back to the notebook-level validation split when no data is passed.
    if X is None:
        X = X_valid
    if y is None:
        y = y_valid

    n_classes = len(dataset.labelset)
    if n_classes == 0:
        raise ValueError("dataset.labelset is empty; nothing to plot")

    # Near-square grid: floor(sqrt(n)) rows and just enough columns for all labels.
    n_rows = int(numpy.sqrt(n_classes))
    print("n_rows:", n_rows)
    n_cols = -(-n_classes // n_rows)  # ceiling division

    fig, axes = pyplot.subplots(nrows=n_rows, ncols=n_cols, figsize = (10, 10), squeeze=False, tight_layout=True)
    fig.suptitle('PR-Curve', fontsize=16)

    lgbm = pipe[-1]
    # The probabilities do not depend on the label being plotted, so run the
    # (potentially expensive) pipeline once instead of once per label.
    proba = pipe.predict_proba(X)
    y_true = numpy.asarray(y)

    for idx, lbl in enumerate(dataset.labelset):
        # transform() returns a length-1 array; take the scalar so the column
        # selection below yields a 1-D score vector rather than an (n, 1) matrix.
        col = lgbm._le.transform([lbl])[0]

        t = (y_true == lbl).astype(numpy.int32)
        p = proba[:, col]

        r, c = divmod(idx, n_cols)
        plot_pr_curve_by_label(axes[r, c], lbl, t, p)

    # Hide grid cells that have no label assigned to them.
    for unused_ax in axes.flat[n_classes:]:
        unused_ax.set_axis_off()

    # show plots
    fig.tight_layout()
    fig.show()


In [None]:
import plotly.io as pio
# Set plotly's default renderer.
# NOTE(review): no plotly figure is created in the visible cells (the PR curves
# use matplotlib) — confirm this setting is still needed.
pio.renderers.default = 'iframe_connected'

In [None]:
plot_pr_curve(dataset, pipe)

# LIME

In [None]:
# First pipeline step, used on its own below to build the text handed to LIME.
tokenizer = pipe[0]
tokenizer

In [None]:
def make_feature_text(idx: int):
    """Return validation sample ``idx`` as one space-delimited token string.

    The sample is passed through the notebook-level ``tokenizer`` as a
    one-element batch, and the tokens produced for it are joined with single
    spaces.
    """
    tokenized_batch = tokenizer.transform([X_valid[idx]])
    return " ".join(tokenized_batch[0])



In [None]:
from lime.lime_text import LimeTextExplainer
# Class names for LIME, ordered by encoder index.
labels_indices = range(len(dataset.labelset))
labels = lgbm._le.inverse_transform(labels_indices)
# Split on single spaces so LIME's tokens line up with the space-joined text
# produced by make_feature_text.
explainer = LimeTextExplainer(class_names=labels, split_expression=lambda x: x.split(" "))

In [None]:
labels

In [None]:
y_valid[:5], p_valid[:5]

## Pipeline を作り直す
- LIME の文分割の処理に合わせて、トークナイザをスプリット処理（Splitter）に置き換え、空白を除去して次のパイプライン（CountVectorizer）に渡す。

In [None]:
from sklearn.pipeline import make_pipeline

# Swap the first step of `pipe` (the tokenizer) for a Splitter and reuse the
# remaining, already-loaded steps unchanged.
# NOTE(review): Splitter is not imported explicitly; presumably it comes from
# the star import of modules.model.text_classify — confirm.
_pipe = make_pipeline(Splitter(), pipe[1:])
_pipe

In [None]:
from sklearn.linear_model import Ridge
# Linear surrogate model handed to LIME's explain_instance; seeded through the
# shared RandomState `rs`.
model_regressor = Ridge(alpha=1, fit_intercept=True, random_state=rs)

def _explain(indices_samples, n_pickup=2):
    """Render LIME explanations for the first ``n_pickup`` entries of ``indices_samples``.

    For each index this prints the actual and predicted label plus the first
    64 characters of the space-joined feature text, then shows the explanation
    (10 features, top 2 labels) in the notebook.

    Reads the notebook globals ``y_valid``, ``p_valid``, ``explainer``,
    ``_pipe`` and ``model_regressor``, and calls ``make_feature_text``.
    """
    picked = indices_samples[:n_pickup]
    lime_options = dict(num_features=10, top_labels=2, model_regressor=model_regressor)

    for idx in picked:
        print("idx:", idx, f"actual: {y_valid[idx]}", f"prediction: {p_valid[idx]}")

        x = make_feature_text(idx)
        print("x:", x[:64])

        explanation = explainer.explain_instance(x, _pipe.predict_proba, **lime_options)
        explanation.show_in_notebook(text=True)

In [None]:
# Positions in the validation set where the prediction equals the label,
# then put in a random order drawn from the seeded RandomState.
indices_samples_correct = numpy.arange(len(y_valid))[y_valid == p_valid]
indices_samples_correct = rs.permutation(indices_samples_correct)

In [None]:
_explain(indices_samples_correct)

In [None]:
# Positions in the validation set where the prediction differs from the label,
# then put in a random order drawn from the seeded RandomState.
indices_samples_wrong = numpy.arange(len(y_valid))[y_valid != p_valid]
indices_samples_wrong = rs.permutation(indices_samples_wrong)

In [None]:
_explain(indices_samples_wrong)

- 911 詩歌 　(1,689件)
- 915 日記．書簡．紀行 　(656件)
- 913 小説．物語 　(6,329件)
    - K913: 子ども・小学生向けの本
- 914 評論．エッセイ．随筆 　(4,366件)

---

- 910 日本文学 　(240件)
- 912 戯曲 　(228件)
- 916 記録．手記．ルポルタージュ 　(109件)
- 917 箴言．アフォリズム．寸言 　(19件)
- 918 作品集
- 919 漢詩文．日本漢文学 　(4件)


In [None]:
# Prepend a Splitter to the *full* pipeline (tokenizer included), in contrast to
# `_pipe`, where the Splitter replaced the tokenizer.
# NOTE(review): presumably Splitter(sep=" ") splits each input string on spaces — confirm.
pipe2 = make_pipeline(Splitter(sep=" "), pipe)

In [None]:
# Take one correctly classified validation sample and join its elements with spaces.
idx = indices_samples_correct[0]
x = " ".join(X_valid[idx])
# NOTE(review): this bare expression is evaluated but never displayed; a notebook
# cell only shows the value of its last expression.
x[:200]
type(x)

In [None]:
def _tokenize(x: str) -> list:
    """Split ``x`` on single spaces and tokenize the pieces as one sample.

    The pieces are handed to the notebook-level ``tokenizer`` as a one-element
    batch; the tokenizer's output for that sample is returned.
    """
    batch = [x.split(" ")]
    return tokenizer.transform(batch)[0]

In [None]:
y_valid[idx], p_valid[idx]

In [None]:
# Rebuild the explainer so LIME splits text with _tokenize, then explain the
# sample `x` through pipe2 and render the result.
# NOTE(review): this rebinds `explainer`, which _explain also reads — calling
# _explain after this cell uses the new explainer.
# model_regressor = Ridge(alpha=1, fit_intercept=True, random_state=rs)
explainer = LimeTextExplainer(class_names=labels, split_expression=_tokenize)
exp = explainer.explain_instance(x, pipe2.predict_proba, num_features=10, top_labels=2, model_regressor=model_regressor)
exp.show_in_notebook(text=True)
