## 勾配ブースティングを使って文書分類モデルを実装

In [1]:
%load_ext lab_black

In [2]:
import os
from glob import glob
import pandas as pd
from tqdm import tqdm_notebook as tqdm

# カテゴリ（ディレクトリ名）をリスト化
categories = [name for name in os.listdir("text") if os.path.isdir("text/" + name)]
print(categories)

# カテゴリをID化
category2id = {}
for i, cat in enumerate(categories):
    category2id[cat] = i

# DataFrame作成
datasets = pd.DataFrame(columns=["document", "category"])

for category in tqdm(categories):
    path = "text/" + category + "/*.txt"
    files = glob(path)
    for text_name in files:
        with open(text_name, "r", encoding="utf-8") as f:
            document = f.read()
            row = pd.Series([document, category], index=datasets.columns)
            datasets = datasets.append(row, ignore_index=True)
print("doc num", len(datasets))

['dokujo-tsushin', 'it-life-hack', 'kaden-channel', 'livedoor-homme', 'movie-enter', 'peachy', 'smax', 'sports-watch', 'topic-news']


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for category in tqdm(categories):


  0%|          | 0/9 [00:00<?, ?it/s]

doc num 7376


#### 形態素解析の定義

In [3]:
import MeCab
import re

tagger = MeCab.Tagger("-Owakati")


def make_wakati(sentence):
    sentence = tagger.parse(sentence)
    sentence = re.sub(r"[0-9０-９a-zA-Zａ-ｚＡ-Ｚ]+", " ", sentence)
    sentence = re.sub(
        r"[\．_－―─！＠＃＄％＾＆\-‐|\\＊\“（）＿■×+α※÷⇒—●★☆〇◎◆▼◇△□(：〜～＋=＝)／*&^%$#@!~`){}［］…\[\]\"\'\”\’:;<>?＜＞〔〕〈〉？、。・,\./『』【】「」→←○《》≪≫\n\u3000]+",
        "",
        sentence,
    )
    wakati = sentence.split(" ")
    wakati = list(filter(("").__ne__, wakati))
    return wakati

#### Word2Vecで単語分散表現作成

In [4]:
from gensim.models import Word2Vec
import numpy as np
import logging

# word2vec parameters
num_features = 200
min_word_count = 5
num_workers = 40
context = 10
downsampling = 1e-3
model_name = "livedoor_corpus_feature200.model"

# コーパス読み込み
corpus = []
for doc in tqdm(datasets["document"]):
    corpus.append(make_wakati(doc))


# word2vecモデルの作成＆モデルの保存
print("cleating word2vec model ...")
logging.basicConfig(
    format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO
)
model = Word2Vec(
    corpus,
    workers=num_workers,
    hs=0,
    sg=1,
    negative=10,
    epochs=25,
    vector_size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
    seed=1,
)
model.save(model_name)
print("Done.")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for doc in tqdm(datasets["document"]):


  0%|          | 0/7376 [00:00<?, ?it/s]

2021-12-18 01:56:50,623 : INFO : collecting all words and their counts
2021-12-18 01:56:50,624 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


cleating word2vec model ...


2021-12-18 01:56:52,388 : INFO : collected 55890 word types from a corpus of 4000496 raw words and 7376 sentences
2021-12-18 01:56:52,390 : INFO : Creating a fresh vocabulary
2021-12-18 01:56:52,708 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 24508 unique words (43.850420468777955%% of original 55890, drops 31382)', 'datetime': '2021-12-18T01:56:52.708476', 'gensim': '4.1.2', 'python': '3.9.9 (main, Dec 18 2021, 00:37:41) \n[GCC 9.3.0]', 'platform': 'Linux-5.10.76-linuxkit-x86_64-with-glibc2.31', 'event': 'prepare_vocab'}
2021-12-18 01:56:52,711 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 3944386 word corpus (98.597423919434%% of original 4000496, drops 56110)', 'datetime': '2021-12-18T01:56:52.711429', 'gensim': '4.1.2', 'python': '3.9.9 (main, Dec 18 2021, 00:37:41) \n[GCC 9.3.0]', 'platform': 'Linux-5.10.76-linuxkit-x86_64-with-glibc2.31', 'event': 'prepare_vocab'}
2021-12-18 01:56:53,198 : INFO : deleting the raw counts dicti

Done.


#### 文書ベクトル作成用関数の定義
単語の和の平均で文書ベクトルを作成

In [5]:
def wordvec2docvec(sentence):
    # 文章ベクトルの初期値（0ベクトルを初期値とする）
    docvecs = np.zeros(num_features, dtype="float32")

    # 文章に現れる単語のうち、モデルに存在しない単語をカウントする
    denomenator = len(sentence)

    # 文章内の各単語ベクトルを足し合わせる
    for word in sentence:
        try:
            temp = model[word]
        except:
            denomenator -= 1
            continue
        docvecs += temp

    # 文章に現れる単語のうち、モデルに存在した単語の数で割る
    if denomenator > 0:
        docvecs = docvecs / denomenator

    return docvecs

#### データ準備②

In [6]:
from sklearn.model_selection import train_test_split

print(len(datasets["document"]))
X, Y = [], []
for doc, category in tqdm(zip(datasets["document"], datasets["category"])):
    wakati = make_wakati(doc)
    docvec = wordvec2docvec(wakati)
    X.append(list(docvec))
    Y.append(category2id[category])
data_X = pd.DataFrame(X, columns=["X" + str(i + 1) for i in range(num_features)])
data_Y = pd.DataFrame(Y, columns=["category_id"])

train_x, test_x, train_y, test_y = train_test_split(data_X, data_Y, train_size=0.7)

7376


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for doc, category in tqdm(zip(datasets["document"], datasets["category"])):


0it [00:00, ?it/s]

#### XGBoostで分類器を作成＆予測

In [7]:
import xgboost as xgb
from sklearn.metrics import classification_report

print("Fitting XGboost model ...")
xgb_model = xgb.XGBClassifier()
xgb_model.fit(train_x, train_y)
print("Done.")

# 予測
pred = xgb_model.predict(test_x)
print(classification_report(pred, test_y["category_id"], target_names=categories))

Fitting XGboost model ...


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Done.
                precision    recall  f1-score   support

dokujo-tsushin       0.00      0.00      0.00         0
  it-life-hack       0.00      0.00      0.00         0
 kaden-channel       0.00      0.00      0.00         0
livedoor-homme       0.00      0.00      0.00         0
   movie-enter       0.00      0.00      0.00         0
        peachy       0.00      0.00      0.00         0
          smax       0.00      0.00      0.00         0
  sports-watch       1.00      0.12      0.22      2213
    topic-news       0.00      0.00      0.00         0

      accuracy                           0.12      2213
     macro avg       0.11      0.01      0.02      2213
  weighted avg       1.00      0.12      0.22      2213



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### LightGBMで分類器の作成＆予測

In [8]:
import lightgbm as lgbm

print("Fitting LightGBM model ...")
lgbm_model = lgbm.LGBMClassifier()
lgbm_model.fit(train_x, train_y)
print("Done.")

# 予測
pred = lgbm_model.predict(test_x)
print(classification_report(pred, test_y["category_id"], target_names=categories))

Fitting LightGBM model ...
Done.
                precision    recall  f1-score   support

dokujo-tsushin       0.00      0.00      0.00         0
  it-life-hack       0.00      0.00      0.00         0
 kaden-channel       0.00      0.00      0.00         0
livedoor-homme       0.00      0.00      0.00         0
   movie-enter       0.00      0.00      0.00         0
        peachy       0.00      0.00      0.00         0
          smax       0.00      0.00      0.00         0
  sports-watch       1.00      0.12      0.22      2213
    topic-news       0.00      0.00      0.00         0

      accuracy                           0.12      2213
     macro avg       0.11      0.01      0.02      2213
  weighted avg       1.00      0.12      0.22      2213



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(train_x, train_y)
pred = rfc.predict(test_x)
print(classification_report(pred, test_y["category_id"], target_names=categories))

  rfc.fit(train_x, train_y)


                precision    recall  f1-score   support

dokujo-tsushin       0.00      0.00      0.00         0
  it-life-hack       0.00      0.00      0.00         0
 kaden-channel       0.00      0.00      0.00         0
livedoor-homme       0.00      0.00      0.00         0
   movie-enter       0.00      0.00      0.00         0
        peachy       0.00      0.00      0.00         0
          smax       0.00      0.00      0.00         0
  sports-watch       1.00      0.12      0.22      2213
    topic-news       0.00      0.00      0.00         0

      accuracy                           0.12      2213
     macro avg       0.11      0.01      0.02      2213
  weighted avg       1.00      0.12      0.22      2213



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### TF-IDFによる文書ベクトル

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = datasets["document"]
tfidf_vectorizer = TfidfVectorizer(analyzer=make_wakati)
tfidfs = tfidf_vectorizer.fit_transform(corpus)
print(tfidfs.shape)
# (7376, 79673)

tfidf_data_X = pd.DataFrame(
    tfidfs.toarray(), columns=["X" + str(i) for i in range(tfidfs.shape[1])]
)
train_x, test_x, train_y, test_y = train_test_split(data_X, data_Y, train_size=0.7)
lgbm_model = lgbm.LGBMClassifier()
lgbm_model.fit(train_x, train_y)
pred = lgbm_model.predict(test_x)
print(classification_report(pred, test_y["category_id"], target_names=categories))

(7376, 55890)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


                precision    recall  f1-score   support

dokujo-tsushin       0.00      0.00      0.00         0
  it-life-hack       0.00      0.00      0.00         0
 kaden-channel       0.00      0.00      0.00         0
livedoor-homme       0.00      0.00      0.00         0
   movie-enter       0.00      0.00      0.00         0
        peachy       0.00      0.00      0.00         0
          smax       0.00      0.00      0.00         0
  sports-watch       1.00      0.12      0.21      2213
    topic-news       0.00      0.00      0.00         0

      accuracy                           0.12      2213
     macro avg       0.11      0.01      0.02      2213
  weighted avg       1.00      0.12      0.21      2213



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
