# Chapter6-3 文書分類


## 6-3-2 前処理

In [None]:
from glob import glob

In [None]:
# Every entry directly under text/; the loose .txt files among them are
# removed from this list further down.
directories = glob('text/*')
directories

In [None]:
# The .txt files that sit directly under text/ (i.e. not inside a subfolder).
txts = glob('text/*.txt')
txts

In [None]:
# Filter the loose .txt files out of the listing in place, keeping the
# remaining entries in their original order.
directories[:] = [entry for entry in directories if entry not in txts]

In [None]:
directories

In [None]:
filepaths = glob('text/it-life-hack/*.txt')

In [None]:
filepath = filepaths[0]
filepath

In [None]:
# Read the whole file, then drop the two header lines (URL etc.) and glue the
# remaining lines back together into one string.
with open(filepath, encoding='utf-8') as f:
    lines = f.readlines()
text = ''.join(lines[2:])

In [None]:
# Show the first 150 characters
text[:150]

In [None]:
from janome.tokenizer import Tokenizer

In [None]:
# wakati=True: tokenize() produces plain surface strings rather than Token objects.
tagger = Tokenizer(wakati=True)

In [None]:
# Janome 0.4+ returns a generator from tokenize(); materialise it into a list
# so the result can be sliced below regardless of the installed version.
words = list(tagger.tokenize(text))
words[:10]

In [None]:
def preprocessing(filepath):
    """Read one article file and return its body as a list of tokens.

    The first two lines of the file (URL etc.) are skipped, ideographic
    spaces (U+3000) and newlines are removed, and the remaining text is
    split into words with the module-level ``tagger``.

    Args:
        filepath: Path of a UTF-8 encoded text file.

    Returns:
        list: The tokens produced by ``tagger.tokenize`` for the cleaned text.
    """
    with open(filepath, encoding='utf-8') as f:
        lines = f.readlines()

    # The file is no longer needed from here on, so the (slow) tokenization
    # runs after it has been closed.
    text = ''.join(lines[2:])
    text = text.replace('\u3000', '').replace('\n', '')

    # Janome 0.4+ yields tokens lazily. A one-shot generator would be
    # exhausted after the vocabulary is built and could not be sliced, so
    # always hand back a real list.
    return list(tagger.tokenize(text))

In [None]:
words = preprocessing(filepath)
words[:10]

In [None]:
def labeling(directory):
    """Return the binary class label for a category directory path.

    Paths containing ``it-life-hack`` or ``kaden-channel`` are labelled 1;
    every other path is labelled 0.
    """
    positive_markers = ('it-life-hack', 'kaden-channel')
    return int(any(marker in directory for marker in positive_markers))

In [None]:
labeling('text/it-life-hack')

In [None]:
labeling('text/movie-enter')

In [None]:
# Tokenize every article of every category folder, collecting the token
# lists and their class labels in two parallel lists.
word_collect, labels = [], []
for directory in directories:
    for filepath in glob(directory + '/*.txt'):
        word_collect.append(preprocessing(filepath))
        labels.append(labeling(directory))

In [None]:
len(word_collect)

In [None]:
from gensim import corpora, matutils

In [None]:
# Build the token -> integer id vocabulary from all tokenized documents.
dictionary = corpora.Dictionary(word_collect)

In [None]:
n_words = len(dictionary)
n_words

In [None]:
# Prune the vocabulary: drop tokens found in fewer than 20 documents
# (gensim's default no_above / keep_n limits apply as well).
dictionary.filter_extremes(no_below=20)

In [None]:
n_words = len(dictionary)
n_words

In [None]:
# Encode each tokenized document as a sparse bag-of-words list using the
# pruned vocabulary.
bow_ids = [dictionary.doc2bow(tokens) for tokens in word_collect]

In [None]:
# corpus2dense yields a (terms x documents) array; transpose it so that each
# row is one document.
bows = matutils.corpus2dense(bow_ids, n_words).T

In [None]:
bows.shape

In [None]:
import numpy as np

In [None]:
# 'i' sets the dtype to int32
labels = np.array(labels, 'i')

## 6-3-3 モデル構築

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
plt.hist(labels, bins=3)

In [None]:
# Per-class weight = total sample count / sample count of that class, so the
# class with fewer samples receives the larger weight.
n_samples = len(labels)
weight = {cls: n_samples / len(labels[labels == cls]) for cls in (0, 1)}

In [None]:
weight

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the samples as the test set; random_state pins the shuffle.
x_train, x_test, t_train, t_test = train_test_split(bows, labels, test_size=0.3, random_state=0)

In [None]:
x_train.shape, x_test.shape, t_train.shape, t_test.shape

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision tree that uses the per-class weights computed above.
clf = DecisionTreeClassifier(class_weight=weight, random_state=0)

In [None]:
clf.fit(x_train, t_train)

In [None]:
clf.score(x_train, t_train)

In [None]:
clf.score(x_test, t_test)

In [None]:
from sklearn.metrics import precision_score, recall_score, confusion_matrix

In [None]:
# Predicted labels for the test set
y_test = clf.predict(x_test)

In [None]:
# Precision. scikit-learn expects (y_true, y_pred): the ground-truth labels
# come first. With the arguments swapped this cell would report recall.
precision_score(t_test, y_test)

In [None]:
# Recall. scikit-learn expects (y_true, y_pred): the ground-truth labels
# come first. With the arguments swapped this cell would report precision.
recall_score(t_test, y_test)

In [None]:
# Confusion matrix (rows: true labels, columns: predicted labels)
matrix = confusion_matrix(t_test, y_test)
matrix

In [None]:
import seaborn as sns

In [None]:
# Visualize the confusion matrix
sns.heatmap(matrix, annot=True, cmap='Blues');
plt.xlabel('Prediction')
plt.ylabel('Target')

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate hyperparameter values to be explored by the grid search
params = {
    "max_depth": [2, 3, 4],
    "criterion": ["gini", "entropy"],
}

In [None]:
# Use precision as the scoring metric for model selection
# cv is the number of cross-validation folds
clf_grid = GridSearchCV(
    estimator=DecisionTreeClassifier(class_weight=weight, random_state=0),
    scoring='precision',
    param_grid=params,
    cv=5,
)

In [None]:
# Train with every candidate hyperparameter combination
clf_grid.fit(x_train, t_train)

In [None]:
# The best-scoring hyperparameter combination
clf_grid.best_params_

In [None]:
# Take over the already-trained model that has the best hyperparameters
clf = clf_grid.best_estimator_
clf

In [None]:
# Accuracy
clf.score(x_test, t_test)

In [None]:
# Compute the predictions
y_test = clf.predict(x_test)

In [None]:
# Precision
precision_score(t_test, y_test)

In [None]:
# Recall
recall_score(t_test, y_test)

In [None]:
# Confusion matrix (rows: true labels, columns: predicted labels)
matrix = confusion_matrix(t_test, y_test)
matrix

In [None]:
# Visualize the confusion matrix
sns.heatmap(matrix, annot=True, cmap='Blues');
plt.xlabel('Prediction')
plt.ylabel('Target')

In [None]:
import joblib

In [None]:
# Save the model
joblib.dump(clf, 'document_classifier.pkl')

In [None]:
# Save the dictionary
joblib.dump(dictionary, 'dictionary.pkl')