## 勾配ブースティングを使って文書分類モデルを実装

In [1]:
import os
from glob import glob
import pandas as pd
from tqdm import tqdm_notebook as tqdm

# カテゴリ（ディレクトリ名）をリスト化
categories = [name for name in os.listdir('text') if os.path.isdir("text/" +name)]
print(categories)

# カテゴリをID化
category2id = {}
for i, cat in enumerate(categories):
    category2id[cat] = i

# DataFrame作成
datasets = pd.DataFrame(columns=["document", "category"])

for category in tqdm(categories):
    path = "text/" + category + "/*.txt"
    files = glob(path)
    for text_name in files:
        with open(text_name, 'r', encoding='utf-8') as f:
            document = f.read()
            row = pd.Series([document, category], index=datasets.columns)
            datasets = datasets.append(row, ignore_index=True)
print("doc num", len(datasets))


['dokujo-tsushin', 'it-life-hack', 'kaden-channel', 'livedoor-homme', 'movie-enter', 'peachy', 'smax', 'sports-watch', 'topic-news']


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))


doc num 7376


#### 形態素解析の定義

In [2]:
import MeCab
import re

tagger = MeCab.Tagger("-Owakati")

def make_wakati(sentence):
    sentence = tagger.parse(sentence)
    sentence = re.sub(r'[0-9０-９a-zA-Zａ-ｚＡ-Ｚ]+', " ", sentence)
    sentence = re.sub(r'[\．_－―─！＠＃＄％＾＆\-‐|\\＊\“（）＿■×+α※÷⇒—●★☆〇◎◆▼◇△□(：〜～＋=＝)／*&^%$#@!~`){}［］…\[\]\"\'\”\’:;<>?＜＞〔〕〈〉？、。・,\./『』【】「」→←○《》≪≫\n\u3000]+', "", sentence)
    wakati = sentence.split(" ")
    wakati = list(filter(("").__ne__, wakati))
    return wakati

#### Word2Vecで単語分散表現作成

In [3]:
from gensim.models import Word2Vec
import numpy as np
import logging

# word2vec parameters
num_features    = 200
min_word_count  = 5
num_workers     = 40
context         = 10
downsampling    = 1e-3
model_name = "livedoor_corpus_feature200.model"

# コーパス読み込み
corpus = []
for doc in tqdm(datasets["document"]):
    corpus.append(make_wakati(doc))


# word2vecモデルの作成＆モデルの保存
print("cleating word2vec model ...")
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
model = Word2Vec(corpus, workers=num_workers, hs = 0, sg = 1, negative = 10, iter = 25, 
                 size=num_features, min_count = min_word_count,  window = context, sample = downsampling, seed=1)
model.save(model_name)
print("Done.")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  from ipykernel import kernelapp as app


HBox(children=(FloatProgress(value=0.0, max=7376.0), HTML(value='')))

2021-03-06 13:31:11,821 : INFO : collecting all words and their counts
2021-03-06 13:31:11,839 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types



cleating word2vec model ...


2021-03-06 13:31:13,783 : INFO : collected 59937 word types from a corpus of 3848224 raw words and 7376 sentences
2021-03-06 13:31:13,791 : INFO : Loading a fresh vocabulary
2021-03-06 13:31:14,962 : INFO : effective_min_count=5 retains 24994 unique words (41% of original 59937, drops 34943)
2021-03-06 13:31:14,971 : INFO : effective_min_count=5 leaves 3786969 word corpus (98% of original 3848224, drops 61255)
2021-03-06 13:31:15,497 : INFO : deleting the raw counts dictionary of 59937 items
2021-03-06 13:31:15,506 : INFO : sample=0.001 downsamples 34 most-common words
2021-03-06 13:31:15,516 : INFO : downsampling leaves estimated 2737656 word corpus (72.3% of prior 3786969)
2021-03-06 13:31:15,711 : INFO : estimated required memory for 24994 words and 200 dimensions: 52487400 bytes
2021-03-06 13:31:15,718 : INFO : resetting layer weights
2021-03-06 13:31:33,112 : INFO : training model with 40 workers on 24994 vocabulary and 200 features, using sg=1 hs=0 sample=0.001 negative=10 window

2021-03-06 13:34:17,846 : INFO : EPOCH 1 - PROGRESS: at 73.41% examples, 13498 words/s, in_qsize 75, out_qsize 0
2021-03-06 13:34:19,920 : INFO : EPOCH 1 - PROGRESS: at 73.59% examples, 13369 words/s, in_qsize 74, out_qsize 0
2021-03-06 13:34:21,448 : INFO : EPOCH 1 - PROGRESS: at 73.98% examples, 13324 words/s, in_qsize 71, out_qsize 1
2021-03-06 13:34:23,436 : INFO : EPOCH 1 - PROGRESS: at 74.81% examples, 13326 words/s, in_qsize 68, out_qsize 0
2021-03-06 13:34:24,504 : INFO : EPOCH 1 - PROGRESS: at 75.22% examples, 13325 words/s, in_qsize 66, out_qsize 0
2021-03-06 13:34:26,091 : INFO : EPOCH 1 - PROGRESS: at 75.89% examples, 13325 words/s, in_qsize 63, out_qsize 0
2021-03-06 13:34:27,537 : INFO : EPOCH 1 - PROGRESS: at 76.91% examples, 13372 words/s, in_qsize 56, out_qsize 3
2021-03-06 13:34:28,596 : INFO : EPOCH 1 - PROGRESS: at 77.18% examples, 13333 words/s, in_qsize 49, out_qsize 9
2021-03-06 13:34:29,716 : INFO : EPOCH 1 - PROGRESS: at 81.10% examples, 13644 words/s, in_qsize

2021-03-06 13:35:42,217 : INFO : EPOCH 2 - PROGRESS: at 19.54% examples, 11995 words/s, in_qsize 79, out_qsize 0
2021-03-06 13:35:43,233 : INFO : EPOCH 2 - PROGRESS: at 20.25% examples, 12142 words/s, in_qsize 79, out_qsize 1
2021-03-06 13:35:44,913 : INFO : EPOCH 2 - PROGRESS: at 21.65% examples, 12494 words/s, in_qsize 79, out_qsize 10
2021-03-06 13:35:49,220 : INFO : EPOCH 2 - PROGRESS: at 25.07% examples, 13101 words/s, in_qsize 79, out_qsize 0
2021-03-06 13:35:52,216 : INFO : EPOCH 2 - PROGRESS: at 25.52% examples, 12608 words/s, in_qsize 80, out_qsize 0
2021-03-06 13:35:53,413 : INFO : EPOCH 2 - PROGRESS: at 26.32% examples, 12593 words/s, in_qsize 80, out_qsize 0
2021-03-06 13:35:54,968 : INFO : EPOCH 2 - PROGRESS: at 28.28% examples, 12824 words/s, in_qsize 79, out_qsize 0
2021-03-06 13:35:56,168 : INFO : EPOCH 2 - PROGRESS: at 29.20% examples, 12806 words/s, in_qsize 80, out_qsize 0
2021-03-06 13:35:57,461 : INFO : EPOCH 2 - PROGRESS: at 29.98% examples, 12767 words/s, in_qsiz

2021-03-06 13:37:31,119 : INFO : worker thread finished; awaiting finish of 37 more threads
2021-03-06 13:37:33,312 : INFO : EPOCH 2 - PROGRESS: at 85.79% examples, 14953 words/s, in_qsize 36, out_qsize 1
2021-03-06 13:37:33,315 : INFO : worker thread finished; awaiting finish of 36 more threads
2021-03-06 13:37:33,971 : INFO : worker thread finished; awaiting finish of 35 more threads
2021-03-06 13:37:34,186 : INFO : worker thread finished; awaiting finish of 34 more threads
2021-03-06 13:37:34,394 : INFO : EPOCH 2 - PROGRESS: at 87.12% examples, 14980 words/s, in_qsize 33, out_qsize 1
2021-03-06 13:37:34,396 : INFO : worker thread finished; awaiting finish of 33 more threads
2021-03-06 13:37:34,550 : INFO : worker thread finished; awaiting finish of 32 more threads
2021-03-06 13:37:34,689 : INFO : worker thread finished; awaiting finish of 31 more threads
2021-03-06 13:37:34,919 : INFO : worker thread finished; awaiting finish of 30 more threads
2021-03-06 13:37:35,886 : INFO : EPOCH

2021-03-06 13:39:21,088 : INFO : EPOCH 3 - PROGRESS: at 45.35% examples, 13621 words/s, in_qsize 79, out_qsize 0
2021-03-06 13:39:23,328 : INFO : EPOCH 3 - PROGRESS: at 45.55% examples, 13390 words/s, in_qsize 80, out_qsize 2
2021-03-06 13:39:24,467 : INFO : EPOCH 3 - PROGRESS: at 46.88% examples, 13642 words/s, in_qsize 79, out_qsize 0
2021-03-06 13:39:25,879 : INFO : EPOCH 3 - PROGRESS: at 47.55% examples, 13654 words/s, in_qsize 80, out_qsize 0
2021-03-06 13:39:27,294 : INFO : EPOCH 3 - PROGRESS: at 47.80% examples, 13539 words/s, in_qsize 79, out_qsize 0
2021-03-06 13:39:28,360 : INFO : EPOCH 3 - PROGRESS: at 48.58% examples, 13595 words/s, in_qsize 80, out_qsize 0
2021-03-06 13:39:29,870 : INFO : EPOCH 3 - PROGRESS: at 48.78% examples, 13465 words/s, in_qsize 79, out_qsize 0
2021-03-06 13:39:31,339 : INFO : EPOCH 3 - PROGRESS: at 49.20% examples, 13406 words/s, in_qsize 80, out_qsize 5
2021-03-06 13:39:32,441 : INFO : EPOCH 3 - PROGRESS: at 52.70% examples, 14261 words/s, in_qsize

2021-03-06 13:40:34,364 : INFO : worker thread finished; awaiting finish of 12 more threads
2021-03-06 13:40:34,366 : INFO : worker thread finished; awaiting finish of 11 more threads
2021-03-06 13:40:34,564 : INFO : worker thread finished; awaiting finish of 10 more threads
2021-03-06 13:40:34,678 : INFO : worker thread finished; awaiting finish of 9 more threads
2021-03-06 13:40:34,734 : INFO : worker thread finished; awaiting finish of 8 more threads
2021-03-06 13:40:34,846 : INFO : worker thread finished; awaiting finish of 7 more threads
2021-03-06 13:40:34,865 : INFO : worker thread finished; awaiting finish of 6 more threads
2021-03-06 13:40:34,869 : INFO : worker thread finished; awaiting finish of 5 more threads
2021-03-06 13:40:34,922 : INFO : worker thread finished; awaiting finish of 4 more threads
2021-03-06 13:40:34,929 : INFO : worker thread finished; awaiting finish of 3 more threads
2021-03-06 13:40:34,951 : INFO : worker thread finished; awaiting finish of 2 more thre

2021-03-06 13:42:56,194 : INFO : EPOCH 4 - PROGRESS: at 73.26% examples, 15727 words/s, in_qsize 75, out_qsize 0
2021-03-06 13:42:57,324 : INFO : EPOCH 4 - PROGRESS: at 73.83% examples, 15741 words/s, in_qsize 71, out_qsize 1
2021-03-06 13:42:58,794 : INFO : EPOCH 4 - PROGRESS: at 74.80% examples, 15778 words/s, in_qsize 68, out_qsize 0
2021-03-06 13:42:59,800 : INFO : EPOCH 4 - PROGRESS: at 75.20% examples, 15764 words/s, in_qsize 64, out_qsize 2
2021-03-06 13:43:01,130 : INFO : EPOCH 4 - PROGRESS: at 76.46% examples, 15909 words/s, in_qsize 59, out_qsize 1
2021-03-06 13:43:02,433 : INFO : EPOCH 4 - PROGRESS: at 77.64% examples, 15961 words/s, in_qsize 56, out_qsize 0
2021-03-06 13:43:03,499 : INFO : EPOCH 4 - PROGRESS: at 79.43% examples, 16035 words/s, in_qsize 52, out_qsize 0
2021-03-06 13:43:04,759 : INFO : EPOCH 4 - PROGRESS: at 80.31% examples, 15992 words/s, in_qsize 47, out_qsize 3
2021-03-06 13:43:05,790 : INFO : EPOCH 4 - PROGRESS: at 83.16% examples, 16203 words/s, in_qsize

2021-03-06 13:44:34,985 : INFO : EPOCH 5 - PROGRESS: at 39.64% examples, 15054 words/s, in_qsize 79, out_qsize 0
2021-03-06 13:44:36,492 : INFO : EPOCH 5 - PROGRESS: at 40.47% examples, 15103 words/s, in_qsize 80, out_qsize 0
2021-03-06 13:44:37,687 : INFO : EPOCH 5 - PROGRESS: at 41.02% examples, 15135 words/s, in_qsize 79, out_qsize 1
2021-03-06 13:44:39,165 : INFO : EPOCH 5 - PROGRESS: at 42.22% examples, 15353 words/s, in_qsize 80, out_qsize 2
2021-03-06 13:44:40,588 : INFO : EPOCH 5 - PROGRESS: at 43.32% examples, 15582 words/s, in_qsize 77, out_qsize 4
2021-03-06 13:44:41,781 : INFO : EPOCH 5 - PROGRESS: at 45.35% examples, 16181 words/s, in_qsize 80, out_qsize 0
2021-03-06 13:44:47,442 : INFO : EPOCH 5 - PROGRESS: at 45.55% examples, 15239 words/s, in_qsize 80, out_qsize 0
2021-03-06 13:44:49,054 : INFO : EPOCH 5 - PROGRESS: at 46.49% examples, 15265 words/s, in_qsize 79, out_qsize 1
2021-03-06 13:44:50,844 : INFO : EPOCH 5 - PROGRESS: at 47.75% examples, 15412 words/s, in_qsize

2021-03-06 13:45:57,459 : INFO : worker thread finished; awaiting finish of 16 more threads
2021-03-06 13:45:57,638 : INFO : worker thread finished; awaiting finish of 15 more threads
2021-03-06 13:45:57,697 : INFO : worker thread finished; awaiting finish of 14 more threads
2021-03-06 13:45:57,787 : INFO : worker thread finished; awaiting finish of 13 more threads
2021-03-06 13:45:57,840 : INFO : worker thread finished; awaiting finish of 12 more threads
2021-03-06 13:45:57,873 : INFO : worker thread finished; awaiting finish of 11 more threads
2021-03-06 13:45:57,877 : INFO : worker thread finished; awaiting finish of 10 more threads
2021-03-06 13:45:58,022 : INFO : worker thread finished; awaiting finish of 9 more threads
2021-03-06 13:45:58,054 : INFO : worker thread finished; awaiting finish of 8 more threads
2021-03-06 13:45:58,070 : INFO : worker thread finished; awaiting finish of 7 more threads
2021-03-06 13:45:58,078 : INFO : worker thread finished; awaiting finish of 6 more 

2021-03-06 13:48:26,585 : INFO : EPOCH 6 - PROGRESS: at 56.20% examples, 11405 words/s, in_qsize 80, out_qsize 0
2021-03-06 13:48:30,327 : INFO : EPOCH 6 - PROGRESS: at 56.91% examples, 11256 words/s, in_qsize 79, out_qsize 0
2021-03-06 13:48:33,346 : INFO : EPOCH 6 - PROGRESS: at 57.17% examples, 11083 words/s, in_qsize 80, out_qsize 0
2021-03-06 13:48:35,424 : INFO : EPOCH 6 - PROGRESS: at 57.42% examples, 10981 words/s, in_qsize 76, out_qsize 3
2021-03-06 13:48:37,473 : INFO : EPOCH 6 - PROGRESS: at 59.37% examples, 11184 words/s, in_qsize 79, out_qsize 0
2021-03-06 13:48:38,641 : INFO : EPOCH 6 - PROGRESS: at 60.07% examples, 11231 words/s, in_qsize 79, out_qsize 0
2021-03-06 13:48:40,158 : INFO : EPOCH 6 - PROGRESS: at 60.24% examples, 11165 words/s, in_qsize 79, out_qsize 0
2021-03-06 13:48:41,568 : INFO : EPOCH 6 - PROGRESS: at 60.43% examples, 11106 words/s, in_qsize 80, out_qsize 2
2021-03-06 13:48:43,368 : INFO : EPOCH 6 - PROGRESS: at 61.50% examples, 11192 words/s, in_qsize

2021-03-06 13:49:54,517 : INFO : worker thread finished; awaiting finish of 13 more threads
2021-03-06 13:49:54,572 : INFO : worker thread finished; awaiting finish of 12 more threads
2021-03-06 13:49:54,612 : INFO : EPOCH 6 - PROGRESS: at 95.78% examples, 11258 words/s, in_qsize 11, out_qsize 1
2021-03-06 13:49:54,615 : INFO : worker thread finished; awaiting finish of 11 more threads
2021-03-06 13:49:54,913 : INFO : worker thread finished; awaiting finish of 10 more threads
2021-03-06 13:49:55,149 : INFO : worker thread finished; awaiting finish of 9 more threads
2021-03-06 13:49:55,191 : INFO : worker thread finished; awaiting finish of 8 more threads
2021-03-06 13:49:55,278 : INFO : worker thread finished; awaiting finish of 7 more threads
2021-03-06 13:49:55,366 : INFO : worker thread finished; awaiting finish of 6 more threads
2021-03-06 13:49:55,718 : INFO : EPOCH 6 - PROGRESS: at 98.02% examples, 11380 words/s, in_qsize 5, out_qsize 1
2021-03-06 13:49:55,730 : INFO : worker thr

2021-03-06 13:52:12,077 : INFO : EPOCH 7 - PROGRESS: at 45.81% examples, 10135 words/s, in_qsize 80, out_qsize 0
2021-03-06 13:52:13,383 : INFO : EPOCH 7 - PROGRESS: at 46.43% examples, 10186 words/s, in_qsize 79, out_qsize 0
2021-03-06 13:52:15,457 : INFO : EPOCH 7 - PROGRESS: at 47.11% examples, 10183 words/s, in_qsize 80, out_qsize 0
2021-03-06 13:52:16,510 : INFO : EPOCH 7 - PROGRESS: at 47.55% examples, 10205 words/s, in_qsize 79, out_qsize 0
2021-03-06 13:52:17,780 : INFO : EPOCH 7 - PROGRESS: at 48.01% examples, 10210 words/s, in_qsize 79, out_qsize 0
2021-03-06 13:52:19,445 : INFO : EPOCH 7 - PROGRESS: at 48.48% examples, 10184 words/s, in_qsize 79, out_qsize 0
2021-03-06 13:52:21,683 : INFO : EPOCH 7 - PROGRESS: at 48.79% examples, 10075 words/s, in_qsize 79, out_qsize 9
2021-03-06 13:52:23,267 : INFO : EPOCH 7 - PROGRESS: at 50.91% examples, 10429 words/s, in_qsize 79, out_qsize 0
2021-03-06 13:52:24,519 : INFO : EPOCH 7 - PROGRESS: at 51.80% examples, 10526 words/s, in_qsize

2021-03-06 13:53:28,927 : INFO : worker thread finished; awaiting finish of 12 more threads
2021-03-06 13:53:28,981 : INFO : worker thread finished; awaiting finish of 11 more threads
2021-03-06 13:53:29,020 : INFO : worker thread finished; awaiting finish of 10 more threads
2021-03-06 13:53:29,033 : INFO : worker thread finished; awaiting finish of 9 more threads
2021-03-06 13:53:29,095 : INFO : worker thread finished; awaiting finish of 8 more threads
2021-03-06 13:53:29,133 : INFO : worker thread finished; awaiting finish of 7 more threads
2021-03-06 13:53:29,145 : INFO : EPOCH 7 - PROGRESS: at 97.65% examples, 12658 words/s, in_qsize 6, out_qsize 1
2021-03-06 13:53:29,147 : INFO : worker thread finished; awaiting finish of 6 more threads
2021-03-06 13:53:29,178 : INFO : worker thread finished; awaiting finish of 5 more threads
2021-03-06 13:53:29,228 : INFO : worker thread finished; awaiting finish of 4 more threads
2021-03-06 13:53:29,271 : INFO : worker thread finished; awaiting 

2021-03-06 13:55:50,105 : INFO : EPOCH 8 - PROGRESS: at 72.67% examples, 15642 words/s, in_qsize 78, out_qsize 0
2021-03-06 13:55:51,507 : INFO : EPOCH 8 - PROGRESS: at 73.13% examples, 15585 words/s, in_qsize 75, out_qsize 1
2021-03-06 13:55:53,116 : INFO : EPOCH 8 - PROGRESS: at 74.32% examples, 15686 words/s, in_qsize 70, out_qsize 0
2021-03-06 13:55:54,419 : INFO : EPOCH 8 - PROGRESS: at 74.57% examples, 15596 words/s, in_qsize 66, out_qsize 3
2021-03-06 13:55:55,563 : INFO : EPOCH 8 - PROGRESS: at 75.80% examples, 15765 words/s, in_qsize 63, out_qsize 0
2021-03-06 13:55:56,578 : INFO : EPOCH 8 - PROGRESS: at 76.48% examples, 15801 words/s, in_qsize 60, out_qsize 0
2021-03-06 13:55:57,727 : INFO : EPOCH 8 - PROGRESS: at 77.87% examples, 15863 words/s, in_qsize 56, out_qsize 0
2021-03-06 13:55:58,764 : INFO : EPOCH 8 - PROGRESS: at 79.73% examples, 15941 words/s, in_qsize 50, out_qsize 2
2021-03-06 13:56:00,815 : INFO : EPOCH 8 - PROGRESS: at 82.43% examples, 16047 words/s, in_qsize

2021-03-06 13:57:24,950 : INFO : EPOCH 9 - PROGRESS: at 37.69% examples, 14907 words/s, in_qsize 79, out_qsize 1
2021-03-06 13:57:30,741 : INFO : EPOCH 9 - PROGRESS: at 38.04% examples, 13977 words/s, in_qsize 79, out_qsize 0
2021-03-06 13:57:32,716 : INFO : EPOCH 9 - PROGRESS: at 38.57% examples, 13965 words/s, in_qsize 78, out_qsize 1
2021-03-06 13:57:33,997 : INFO : EPOCH 9 - PROGRESS: at 39.34% examples, 14154 words/s, in_qsize 79, out_qsize 0
2021-03-06 13:57:35,257 : INFO : EPOCH 9 - PROGRESS: at 40.08% examples, 14192 words/s, in_qsize 80, out_qsize 1
2021-03-06 13:57:36,273 : INFO : EPOCH 9 - PROGRESS: at 40.59% examples, 14187 words/s, in_qsize 80, out_qsize 0
2021-03-06 13:57:37,459 : INFO : EPOCH 9 - PROGRESS: at 41.84% examples, 14529 words/s, in_qsize 80, out_qsize 0
2021-03-06 13:57:38,665 : INFO : EPOCH 9 - PROGRESS: at 42.19% examples, 14479 words/s, in_qsize 78, out_qsize 1
2021-03-06 13:57:39,789 : INFO : EPOCH 9 - PROGRESS: at 42.41% examples, 14372 words/s, in_qsize

2021-03-06 13:58:56,720 : INFO : worker thread finished; awaiting finish of 16 more threads
2021-03-06 13:58:56,955 : INFO : worker thread finished; awaiting finish of 15 more threads
2021-03-06 13:58:56,994 : INFO : worker thread finished; awaiting finish of 14 more threads
2021-03-06 13:58:57,057 : INFO : worker thread finished; awaiting finish of 13 more threads
2021-03-06 13:58:57,175 : INFO : worker thread finished; awaiting finish of 12 more threads
2021-03-06 13:58:57,200 : INFO : worker thread finished; awaiting finish of 11 more threads
2021-03-06 13:58:57,250 : INFO : worker thread finished; awaiting finish of 10 more threads
2021-03-06 13:58:57,314 : INFO : worker thread finished; awaiting finish of 9 more threads
2021-03-06 13:58:57,365 : INFO : worker thread finished; awaiting finish of 8 more threads
2021-03-06 13:58:57,444 : INFO : EPOCH 9 - PROGRESS: at 97.30% examples, 16261 words/s, in_qsize 7, out_qsize 1
2021-03-06 13:58:57,447 : INFO : worker thread finished; await

2021-03-06 14:01:13,505 : INFO : EPOCH 10 - PROGRESS: at 60.67% examples, 13407 words/s, in_qsize 75, out_qsize 4
2021-03-06 14:01:17,804 : INFO : EPOCH 10 - PROGRESS: at 60.90% examples, 13043 words/s, in_qsize 80, out_qsize 9
2021-03-06 14:01:20,409 : INFO : EPOCH 10 - PROGRESS: at 61.09% examples, 12844 words/s, in_qsize 80, out_qsize 9
2021-03-06 14:01:23,111 : INFO : EPOCH 10 - PROGRESS: at 63.25% examples, 13129 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:01:24,309 : INFO : EPOCH 10 - PROGRESS: at 63.46% examples, 13067 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:01:26,474 : INFO : EPOCH 10 - PROGRESS: at 64.06% examples, 13014 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:01:28,224 : INFO : EPOCH 10 - PROGRESS: at 64.22% examples, 12908 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:01:31,158 : INFO : EPOCH 10 - PROGRESS: at 64.66% examples, 12754 words/s, in_qsize 78, out_qsize 1
2021-03-06 14:01:32,654 : INFO : EPOCH 10 - PROGRESS: at 64.90% examples, 12676 words/s,

2021-03-06 14:03:15,883 : INFO : EPOCH 11 - PROGRESS: at 0.68% examples, 809 words/s, in_qsize 76, out_qsize 3
2021-03-06 14:03:17,154 : INFO : EPOCH 11 - PROGRESS: at 0.87% examples, 982 words/s, in_qsize 50, out_qsize 28
2021-03-06 14:03:18,499 : INFO : EPOCH 11 - PROGRESS: at 6.68% examples, 7463 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:03:22,839 : INFO : EPOCH 11 - PROGRESS: at 6.89% examples, 6810 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:03:31,051 : INFO : EPOCH 11 - PROGRESS: at 7.05% examples, 5770 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:03:34,241 : INFO : EPOCH 11 - PROGRESS: at 7.23% examples, 5542 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:03:35,605 : INFO : EPOCH 11 - PROGRESS: at 7.40% examples, 5523 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:03:37,431 : INFO : EPOCH 11 - PROGRESS: at 7.58% examples, 5453 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:03:38,510 : INFO : EPOCH 11 - PROGRESS: at 8.09% examples, 5709 words/s, in_qsize 80, out_q

2021-03-06 14:05:44,605 : INFO : EPOCH 11 - PROGRESS: at 56.70% examples, 9425 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:05:46,105 : INFO : EPOCH 11 - PROGRESS: at 56.91% examples, 9381 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:05:47,363 : INFO : EPOCH 11 - PROGRESS: at 57.13% examples, 9355 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:05:48,465 : INFO : EPOCH 11 - PROGRESS: at 57.39% examples, 9336 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:05:49,696 : INFO : EPOCH 11 - PROGRESS: at 57.88% examples, 9348 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:05:50,788 : INFO : EPOCH 11 - PROGRESS: at 58.09% examples, 9329 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:05:51,978 : INFO : EPOCH 11 - PROGRESS: at 58.66% examples, 9345 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:05:53,086 : INFO : EPOCH 11 - PROGRESS: at 58.91% examples, 9326 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:05:54,188 : INFO : EPOCH 11 - PROGRESS: at 59.34% examples, 9346 words/s, in_qsize

2021-03-06 14:07:21,240 : INFO : worker thread finished; awaiting finish of 10 more threads
2021-03-06 14:07:21,486 : INFO : worker thread finished; awaiting finish of 9 more threads
2021-03-06 14:07:21,517 : INFO : worker thread finished; awaiting finish of 8 more threads
2021-03-06 14:07:21,613 : INFO : worker thread finished; awaiting finish of 7 more threads
2021-03-06 14:07:21,664 : INFO : worker thread finished; awaiting finish of 6 more threads
2021-03-06 14:07:21,695 : INFO : worker thread finished; awaiting finish of 5 more threads
2021-03-06 14:07:21,717 : INFO : worker thread finished; awaiting finish of 4 more threads
2021-03-06 14:07:21,813 : INFO : worker thread finished; awaiting finish of 3 more threads
2021-03-06 14:07:21,852 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-03-06 14:07:21,960 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-03-06 14:07:21,979 : INFO : worker thread finished; awaiting finish of 0 more thread

2021-03-06 14:10:05,483 : INFO : EPOCH 12 - PROGRESS: at 56.28% examples, 10338 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:10:06,913 : INFO : EPOCH 12 - PROGRESS: at 56.71% examples, 10327 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:10:08,243 : INFO : EPOCH 12 - PROGRESS: at 56.91% examples, 10285 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:10:09,299 : INFO : EPOCH 12 - PROGRESS: at 57.93% examples, 10387 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:10:10,558 : INFO : EPOCH 12 - PROGRESS: at 58.66% examples, 10430 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:10:11,720 : INFO : EPOCH 12 - PROGRESS: at 58.89% examples, 10398 words/s, in_qsize 79, out_qsize 7
2021-03-06 14:10:12,739 : INFO : EPOCH 12 - PROGRESS: at 60.63% examples, 10655 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:10:14,015 : INFO : EPOCH 12 - PROGRESS: at 61.05% examples, 10648 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:10:15,015 : INFO : EPOCH 12 - PROGRESS: at 61.73% examples, 10706 words/s,

2021-03-06 14:11:07,271 : INFO : EPOCH - 12 : training on 3848224 raw words (2737455 effective words) took 225.5s, 12140 effective words/s
2021-03-06 14:11:21,382 : INFO : EPOCH 13 - PROGRESS: at 0.16% examples, 479 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:11:23,062 : INFO : EPOCH 13 - PROGRESS: at 0.34% examples, 837 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:11:24,587 : INFO : EPOCH 13 - PROGRESS: at 0.50% examples, 1121 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:11:27,049 : INFO : EPOCH 13 - PROGRESS: at 1.17% examples, 2317 words/s, in_qsize 60, out_qsize 21
2021-03-06 14:11:37,491 : INFO : EPOCH 13 - PROGRESS: at 7.06% examples, 9056 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:11:38,635 : INFO : EPOCH 13 - PROGRESS: at 7.23% examples, 8940 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:11:40,627 : INFO : EPOCH 13 - PROGRESS: at 7.58% examples, 8798 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:11:41,853 : INFO : EPOCH 13 - PROGRESS: at 7.92% examples, 8866 

2021-03-06 14:13:48,767 : INFO : EPOCH 13 - PROGRESS: at 66.47% examples, 12455 words/s, in_qsize 78, out_qsize 1
2021-03-06 14:13:49,880 : INFO : EPOCH 13 - PROGRESS: at 67.18% examples, 12502 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:13:51,442 : INFO : EPOCH 13 - PROGRESS: at 68.48% examples, 12601 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:13:52,577 : INFO : EPOCH 13 - PROGRESS: at 69.35% examples, 12686 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:13:54,100 : INFO : EPOCH 13 - PROGRESS: at 69.93% examples, 12694 words/s, in_qsize 77, out_qsize 2
2021-03-06 14:13:55,271 : INFO : EPOCH 13 - PROGRESS: at 70.95% examples, 12812 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:13:56,454 : INFO : EPOCH 13 - PROGRESS: at 71.71% examples, 12846 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:13:57,488 : INFO : EPOCH 13 - PROGRESS: at 72.22% examples, 12852 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:13:59,031 : INFO : EPOCH 13 - PROGRESS: at 72.45% examples, 12778 words/s,

2021-03-06 14:15:34,193 : INFO : EPOCH 14 - PROGRESS: at 10.32% examples, 6226 words/s, in_qsize 77, out_qsize 2
2021-03-06 14:15:36,108 : INFO : EPOCH 14 - PROGRESS: at 11.21% examples, 6457 words/s, in_qsize 80, out_qsize 10
2021-03-06 14:15:47,214 : INFO : EPOCH 14 - PROGRESS: at 14.89% examples, 6937 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:15:50,581 : INFO : EPOCH 14 - PROGRESS: at 15.18% examples, 6731 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:15:52,165 : INFO : EPOCH 14 - PROGRESS: at 15.67% examples, 6769 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:15:53,662 : INFO : EPOCH 14 - PROGRESS: at 16.47% examples, 6894 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:15:55,974 : INFO : EPOCH 14 - PROGRESS: at 16.88% examples, 6864 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:15:57,973 : INFO : EPOCH 14 - PROGRESS: at 18.37% examples, 7172 words/s, in_qsize 78, out_qsize 1
2021-03-06 14:15:59,705 : INFO : EPOCH 14 - PROGRESS: at 19.08% examples, 7267 words/s, in_qsiz

2021-03-06 14:18:11,918 : INFO : EPOCH 14 - PROGRESS: at 77.07% examples, 10536 words/s, in_qsize 58, out_qsize 0
2021-03-06 14:18:13,735 : INFO : EPOCH 14 - PROGRESS: at 78.38% examples, 10581 words/s, in_qsize 54, out_qsize 0
2021-03-06 14:18:15,461 : INFO : EPOCH 14 - PROGRESS: at 80.45% examples, 10622 words/s, in_qsize 50, out_qsize 0
2021-03-06 14:18:16,541 : INFO : EPOCH 14 - PROGRESS: at 81.70% examples, 10662 words/s, in_qsize 47, out_qsize 0
2021-03-06 14:18:18,378 : INFO : EPOCH 14 - PROGRESS: at 82.81% examples, 10665 words/s, in_qsize 44, out_qsize 0
2021-03-06 14:18:19,515 : INFO : EPOCH 14 - PROGRESS: at 83.51% examples, 10673 words/s, in_qsize 42, out_qsize 0
2021-03-06 14:18:20,690 : INFO : EPOCH 14 - PROGRESS: at 83.89% examples, 10649 words/s, in_qsize 39, out_qsize 3
2021-03-06 14:18:20,699 : INFO : worker thread finished; awaiting finish of 39 more threads
2021-03-06 14:18:21,675 : INFO : worker thread finished; awaiting finish of 38 more threads
2021-03-06 14:18:2

2021-03-06 14:19:41,093 : INFO : EPOCH 15 - PROGRESS: at 25.07% examples, 12314 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:19:43,400 : INFO : EPOCH 15 - PROGRESS: at 26.32% examples, 12213 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:19:44,817 : INFO : EPOCH 15 - PROGRESS: at 27.07% examples, 12170 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:19:45,987 : INFO : EPOCH 15 - PROGRESS: at 27.94% examples, 12167 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:19:47,086 : INFO : EPOCH 15 - PROGRESS: at 29.20% examples, 12274 words/s, in_qsize 80, out_qsize 1
2021-03-06 14:19:48,812 : INFO : EPOCH 15 - PROGRESS: at 30.83% examples, 12361 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:19:50,951 : INFO : EPOCH 15 - PROGRESS: at 31.94% examples, 12335 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:19:52,165 : INFO : EPOCH 15 - PROGRESS: at 32.38% examples, 12197 words/s, in_qsize 77, out_qsize 2
2021-03-06 14:19:53,710 : INFO : EPOCH 15 - PROGRESS: at 34.82% examples, 12566 words/s,

2021-03-06 14:21:32,314 : INFO : EPOCH 15 - PROGRESS: at 81.63% examples, 13590 words/s, in_qsize 47, out_qsize 0
2021-03-06 14:21:33,339 : INFO : EPOCH 15 - PROGRESS: at 83.55% examples, 13705 words/s, in_qsize 41, out_qsize 1
2021-03-06 14:21:34,382 : INFO : EPOCH 15 - PROGRESS: at 84.31% examples, 13703 words/s, in_qsize 40, out_qsize 0
2021-03-06 14:21:34,722 : INFO : worker thread finished; awaiting finish of 39 more threads
2021-03-06 14:21:35,100 : INFO : worker thread finished; awaiting finish of 38 more threads
2021-03-06 14:21:36,207 : INFO : EPOCH 15 - PROGRESS: at 85.44% examples, 13680 words/s, in_qsize 37, out_qsize 1
2021-03-06 14:21:36,209 : INFO : worker thread finished; awaiting finish of 37 more threads
2021-03-06 14:21:36,417 : INFO : worker thread finished; awaiting finish of 36 more threads
2021-03-06 14:21:37,518 : INFO : EPOCH 15 - PROGRESS: at 86.27% examples, 13658 words/s, in_qsize 35, out_qsize 1
2021-03-06 14:21:37,522 : INFO : worker thread finished; await

2021-03-06 14:23:11,690 : INFO : EPOCH 16 - PROGRESS: at 35.18% examples, 11813 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:23:12,804 : INFO : EPOCH 16 - PROGRESS: at 35.39% examples, 11743 words/s, in_qsize 80, out_qsize 7
2021-03-06 14:23:14,286 : INFO : EPOCH 16 - PROGRESS: at 37.95% examples, 12329 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:23:18,962 : INFO : EPOCH 16 - PROGRESS: at 38.04% examples, 11774 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:23:20,251 : INFO : EPOCH 16 - PROGRESS: at 38.35% examples, 11756 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:23:22,925 : INFO : EPOCH 16 - PROGRESS: at 38.57% examples, 11567 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:23:23,992 : INFO : EPOCH 16 - PROGRESS: at 39.18% examples, 11712 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:23:25,168 : INFO : EPOCH 16 - PROGRESS: at 39.64% examples, 11714 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:23:26,418 : INFO : EPOCH 16 - PROGRESS: at 40.39% examples, 11777 words/s,

2021-03-06 14:24:59,767 : INFO : worker thread finished; awaiting finish of 36 more threads
2021-03-06 14:25:01,099 : INFO : EPOCH 16 - PROGRESS: at 86.27% examples, 12795 words/s, in_qsize 35, out_qsize 1
2021-03-06 14:25:01,103 : INFO : worker thread finished; awaiting finish of 35 more threads
2021-03-06 14:25:01,488 : INFO : worker thread finished; awaiting finish of 34 more threads
2021-03-06 14:25:01,601 : INFO : worker thread finished; awaiting finish of 33 more threads
2021-03-06 14:25:01,836 : INFO : worker thread finished; awaiting finish of 32 more threads
2021-03-06 14:25:01,874 : INFO : worker thread finished; awaiting finish of 31 more threads
2021-03-06 14:25:01,990 : INFO : worker thread finished; awaiting finish of 30 more threads
2021-03-06 14:25:02,026 : INFO : worker thread finished; awaiting finish of 29 more threads
2021-03-06 14:25:03,013 : INFO : EPOCH 16 - PROGRESS: at 89.02% examples, 12914 words/s, in_qsize 28, out_qsize 1
2021-03-06 14:25:03,019 : INFO : wor

2021-03-06 14:26:54,275 : INFO : EPOCH 17 - PROGRESS: at 46.48% examples, 13190 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:26:57,410 : INFO : EPOCH 17 - PROGRESS: at 47.13% examples, 12999 words/s, in_qsize 78, out_qsize 1
2021-03-06 14:26:59,042 : INFO : EPOCH 17 - PROGRESS: at 47.75% examples, 12989 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:27:00,247 : INFO : EPOCH 17 - PROGRESS: at 48.32% examples, 12973 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:27:02,681 : INFO : EPOCH 17 - PROGRESS: at 48.79% examples, 12816 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:27:04,374 : INFO : EPOCH 17 - PROGRESS: at 49.00% examples, 12685 words/s, in_qsize 78, out_qsize 1
2021-03-06 14:27:05,490 : INFO : EPOCH 17 - PROGRESS: at 49.23% examples, 12622 words/s, in_qsize 80, out_qsize 4
2021-03-06 14:27:06,708 : INFO : EPOCH 17 - PROGRESS: at 49.46% examples, 12550 words/s, in_qsize 78, out_qsize 7
2021-03-06 14:27:08,169 : INFO : EPOCH 17 - PROGRESS: at 51.34% examples, 12909 words/s,

2021-03-06 14:29:01,928 : INFO : worker thread finished; awaiting finish of 28 more threads
2021-03-06 14:29:03,985 : INFO : EPOCH 17 - PROGRESS: at 89.24% examples, 10823 words/s, in_qsize 27, out_qsize 1
2021-03-06 14:29:03,989 : INFO : worker thread finished; awaiting finish of 27 more threads
2021-03-06 14:29:04,847 : INFO : worker thread finished; awaiting finish of 26 more threads
2021-03-06 14:29:05,075 : INFO : EPOCH 17 - PROGRESS: at 89.98% examples, 10831 words/s, in_qsize 25, out_qsize 1
2021-03-06 14:29:05,087 : INFO : worker thread finished; awaiting finish of 25 more threads
2021-03-06 14:29:05,255 : INFO : worker thread finished; awaiting finish of 24 more threads
2021-03-06 14:29:05,340 : INFO : worker thread finished; awaiting finish of 23 more threads
2021-03-06 14:29:05,861 : INFO : worker thread finished; awaiting finish of 22 more threads
2021-03-06 14:29:06,334 : INFO : EPOCH 17 - PROGRESS: at 91.69% examples, 10887 words/s, in_qsize 21, out_qsize 1
2021-03-06 14:

2021-03-06 14:31:07,017 : INFO : EPOCH 18 - PROGRESS: at 45.35% examples, 11767 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:31:12,300 : INFO : EPOCH 18 - PROGRESS: at 45.55% examples, 11310 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:31:13,347 : INFO : EPOCH 18 - PROGRESS: at 45.80% examples, 11268 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:31:14,670 : INFO : EPOCH 18 - PROGRESS: at 46.46% examples, 11313 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:31:16,149 : INFO : EPOCH 18 - PROGRESS: at 47.30% examples, 11400 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:31:17,917 : INFO : EPOCH 18 - PROGRESS: at 48.29% examples, 11452 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:31:19,191 : INFO : EPOCH 18 - PROGRESS: at 49.48% examples, 11610 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:31:20,754 : INFO : EPOCH 18 - PROGRESS: at 50.12% examples, 11631 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:31:22,039 : INFO : EPOCH 18 - PROGRESS: at 50.53% examples, 11621 words/s,

2021-03-06 14:32:33,815 : INFO : worker thread finished; awaiting finish of 19 more threads
2021-03-06 14:32:34,021 : INFO : worker thread finished; awaiting finish of 18 more threads
2021-03-06 14:32:34,302 : INFO : worker thread finished; awaiting finish of 17 more threads
2021-03-06 14:32:34,408 : INFO : worker thread finished; awaiting finish of 16 more threads
2021-03-06 14:32:34,751 : INFO : EPOCH 18 - PROGRESS: at 94.21% examples, 12936 words/s, in_qsize 15, out_qsize 1
2021-03-06 14:32:34,755 : INFO : worker thread finished; awaiting finish of 15 more threads
2021-03-06 14:32:35,012 : INFO : worker thread finished; awaiting finish of 14 more threads
2021-03-06 14:32:35,052 : INFO : worker thread finished; awaiting finish of 13 more threads
2021-03-06 14:32:35,193 : INFO : worker thread finished; awaiting finish of 12 more threads
2021-03-06 14:32:35,265 : INFO : worker thread finished; awaiting finish of 11 more threads
2021-03-06 14:32:35,310 : INFO : worker thread finished; a

2021-03-06 14:34:56,443 : INFO : EPOCH 19 - PROGRESS: at 58.88% examples, 12568 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:34:58,293 : INFO : EPOCH 19 - PROGRESS: at 59.37% examples, 12500 words/s, in_qsize 78, out_qsize 2
2021-03-06 14:34:59,519 : INFO : EPOCH 19 - PROGRESS: at 60.03% examples, 12534 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:35:00,727 : INFO : EPOCH 19 - PROGRESS: at 61.14% examples, 12656 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:35:01,804 : INFO : EPOCH 19 - PROGRESS: at 62.45% examples, 12894 words/s, in_qsize 80, out_qsize 2
2021-03-06 14:35:03,067 : INFO : EPOCH 19 - PROGRESS: at 63.06% examples, 12924 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:35:04,413 : INFO : EPOCH 19 - PROGRESS: at 63.46% examples, 12896 words/s, in_qsize 78, out_qsize 1
2021-03-06 14:35:06,381 : INFO : EPOCH 19 - PROGRESS: at 63.84% examples, 12817 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:35:07,845 : INFO : EPOCH 19 - PROGRESS: at 64.63% examples, 12876 words/s,

2021-03-06 14:36:04,814 : INFO : worker thread finished; awaiting finish of 4 more threads
2021-03-06 14:36:04,832 : INFO : worker thread finished; awaiting finish of 3 more threads
2021-03-06 14:36:04,858 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-03-06 14:36:04,939 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-03-06 14:36:04,979 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-03-06 14:36:04,983 : INFO : EPOCH - 19 : training on 3848224 raw words (2737764 effective words) took 209.2s, 13088 effective words/s
2021-03-06 14:36:26,040 : INFO : EPOCH 20 - PROGRESS: at 0.16% examples, 322 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:36:27,237 : INFO : EPOCH 20 - PROGRESS: at 0.31% examples, 592 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:36:28,861 : INFO : EPOCH 20 - PROGRESS: at 0.66% examples, 1088 words/s, in_qsize 77, out_qsize 2
2021-03-06 14:36:29,929 : INFO : EPOCH 20 - PROGRESS: at 1.69% examples,

2021-03-06 14:38:30,887 : INFO : EPOCH 20 - PROGRESS: at 66.96% examples, 13886 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:38:32,510 : INFO : EPOCH 20 - PROGRESS: at 67.99% examples, 13928 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:38:33,818 : INFO : EPOCH 20 - PROGRESS: at 68.90% examples, 14001 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:38:34,822 : INFO : EPOCH 20 - PROGRESS: at 69.52% examples, 14040 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:38:36,250 : INFO : EPOCH 20 - PROGRESS: at 70.30% examples, 14091 words/s, in_qsize 71, out_qsize 4
2021-03-06 14:38:39,214 : INFO : EPOCH 20 - PROGRESS: at 72.45% examples, 14231 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:38:40,347 : INFO : EPOCH 20 - PROGRESS: at 72.70% examples, 14174 words/s, in_qsize 78, out_qsize 0
2021-03-06 14:38:42,029 : INFO : EPOCH 20 - PROGRESS: at 73.13% examples, 14108 words/s, in_qsize 76, out_qsize 0
2021-03-06 14:38:43,219 : INFO : EPOCH 20 - PROGRESS: at 74.08% examples, 14206 words/s,

2021-03-06 14:39:47,647 : INFO : EPOCH 21 - PROGRESS: at 9.30% examples, 9584 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:39:52,068 : INFO : EPOCH 21 - PROGRESS: at 10.18% examples, 9359 words/s, in_qsize 80, out_qsize 16
2021-03-06 14:39:56,899 : INFO : EPOCH 21 - PROGRESS: at 15.13% examples, 11597 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:39:58,895 : INFO : EPOCH 21 - PROGRESS: at 15.96% examples, 11556 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:40:00,053 : INFO : EPOCH 21 - PROGRESS: at 16.40% examples, 11551 words/s, in_qsize 78, out_qsize 1
2021-03-06 14:40:01,636 : INFO : EPOCH 21 - PROGRESS: at 17.37% examples, 11728 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:40:02,641 : INFO : EPOCH 21 - PROGRESS: at 18.17% examples, 11905 words/s, in_qsize 80, out_qsize 1
2021-03-06 14:40:03,766 : INFO : EPOCH 21 - PROGRESS: at 19.17% examples, 12161 words/s, in_qsize 80, out_qsize 1
2021-03-06 14:40:05,488 : INFO : EPOCH 21 - PROGRESS: at 20.46% examples, 12530 words/s, i

2021-03-06 14:41:52,851 : INFO : EPOCH 21 - PROGRESS: at 76.25% examples, 14248 words/s, in_qsize 61, out_qsize 0
2021-03-06 14:41:55,238 : INFO : EPOCH 21 - PROGRESS: at 76.52% examples, 14088 words/s, in_qsize 56, out_qsize 4
2021-03-06 14:41:56,447 : INFO : EPOCH 21 - PROGRESS: at 79.81% examples, 14361 words/s, in_qsize 51, out_qsize 0
2021-03-06 14:41:57,597 : INFO : EPOCH 21 - PROGRESS: at 80.73% examples, 14345 words/s, in_qsize 49, out_qsize 0
2021-03-06 14:41:58,606 : INFO : EPOCH 21 - PROGRESS: at 82.46% examples, 14423 words/s, in_qsize 45, out_qsize 0
2021-03-06 14:41:59,926 : INFO : EPOCH 21 - PROGRESS: at 83.55% examples, 14434 words/s, in_qsize 41, out_qsize 1
2021-03-06 14:42:06,551 : INFO : EPOCH 21 - PROGRESS: at 84.29% examples, 13978 words/s, in_qsize 40, out_qsize 0
2021-03-06 14:42:10,937 : INFO : EPOCH 21 - PROGRESS: at 84.69% examples, 13670 words/s, in_qsize 39, out_qsize 1
2021-03-06 14:42:11,274 : INFO : worker thread finished; awaiting finish of 39 more thre

2021-03-06 14:43:36,674 : INFO : EPOCH 22 - PROGRESS: at 24.66% examples, 11169 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:43:43,161 : INFO : EPOCH 22 - PROGRESS: at 25.11% examples, 10347 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:43:44,609 : INFO : EPOCH 22 - PROGRESS: at 26.64% examples, 10512 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:43:46,410 : INFO : EPOCH 22 - PROGRESS: at 28.80% examples, 10713 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:43:49,585 : INFO : EPOCH 22 - PROGRESS: at 30.83% examples, 10724 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:43:50,684 : INFO : EPOCH 22 - PROGRESS: at 31.83% examples, 10831 words/s, in_qsize 77, out_qsize 2
2021-03-06 14:43:52,719 : INFO : EPOCH 22 - PROGRESS: at 34.45% examples, 11270 words/s, in_qsize 72, out_qsize 8
2021-03-06 14:43:54,951 : INFO : EPOCH 22 - PROGRESS: at 37.95% examples, 11985 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:43:59,054 : INFO : EPOCH 22 - PROGRESS: at 38.04% examples, 11535 words/s,

2021-03-06 14:45:25,833 : INFO : worker thread finished; awaiting finish of 19 more threads
2021-03-06 14:45:25,917 : INFO : worker thread finished; awaiting finish of 18 more threads
2021-03-06 14:45:26,065 : INFO : worker thread finished; awaiting finish of 17 more threads
2021-03-06 14:45:26,210 : INFO : worker thread finished; awaiting finish of 16 more threads
2021-03-06 14:45:26,213 : INFO : worker thread finished; awaiting finish of 15 more threads
2021-03-06 14:45:26,281 : INFO : worker thread finished; awaiting finish of 14 more threads
2021-03-06 14:45:26,283 : INFO : worker thread finished; awaiting finish of 13 more threads
2021-03-06 14:45:26,318 : INFO : worker thread finished; awaiting finish of 12 more threads
2021-03-06 14:45:26,382 : INFO : EPOCH 22 - PROGRESS: at 95.78% examples, 14568 words/s, in_qsize 10, out_qsize 3
2021-03-06 14:45:26,383 : INFO : worker thread finished; awaiting finish of 11 more threads
2021-03-06 14:45:26,385 : INFO : worker thread finished; a

2021-03-06 14:47:29,699 : INFO : EPOCH 23 - PROGRESS: at 62.26% examples, 15242 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:47:30,726 : INFO : EPOCH 23 - PROGRESS: at 63.06% examples, 15336 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:47:34,519 : INFO : EPOCH 23 - PROGRESS: at 63.46% examples, 14985 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:47:36,466 : INFO : EPOCH 23 - PROGRESS: at 64.00% examples, 14918 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:47:38,270 : INFO : EPOCH 23 - PROGRESS: at 65.08% examples, 14977 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:47:39,284 : INFO : EPOCH 23 - PROGRESS: at 65.51% examples, 14971 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:47:40,631 : INFO : EPOCH 23 - PROGRESS: at 65.74% examples, 14872 words/s, in_qsize 78, out_qsize 1
2021-03-06 14:47:41,823 : INFO : EPOCH 23 - PROGRESS: at 67.01% examples, 15004 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:47:43,513 : INFO : EPOCH 23 - PROGRESS: at 67.99% examples, 15024 words/s,

2021-03-06 14:48:53,691 : INFO : EPOCH 24 - PROGRESS: at 11.28% examples, 11330 words/s, in_qsize 80, out_qsize 15
2021-03-06 14:49:01,462 : INFO : EPOCH 24 - PROGRESS: at 15.13% examples, 11803 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:49:02,908 : INFO : EPOCH 24 - PROGRESS: at 16.40% examples, 12162 words/s, in_qsize 80, out_qsize 2
2021-03-06 14:49:04,319 : INFO : EPOCH 24 - PROGRESS: at 18.14% examples, 12795 words/s, in_qsize 79, out_qsize 2
2021-03-06 14:49:05,538 : INFO : EPOCH 24 - PROGRESS: at 19.58% examples, 13311 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:49:07,179 : INFO : EPOCH 24 - PROGRESS: at 19.94% examples, 13161 words/s, in_qsize 80, out_qsize 1
2021-03-06 14:49:08,550 : INFO : EPOCH 24 - PROGRESS: at 20.93% examples, 13334 words/s, in_qsize 78, out_qsize 1
2021-03-06 14:49:09,895 : INFO : EPOCH 24 - PROGRESS: at 22.30% examples, 13767 words/s, in_qsize 76, out_qsize 3
2021-03-06 14:49:20,089 : INFO : EPOCH 24 - PROGRESS: at 25.04% examples, 12683 words/s

2021-03-06 14:51:19,707 : INFO : EPOCH 24 - PROGRESS: at 88.56% examples, 13762 words/s, in_qsize 29, out_qsize 1
2021-03-06 14:51:19,711 : INFO : worker thread finished; awaiting finish of 29 more threads
2021-03-06 14:51:19,783 : INFO : worker thread finished; awaiting finish of 28 more threads
2021-03-06 14:51:19,970 : INFO : worker thread finished; awaiting finish of 27 more threads
2021-03-06 14:51:20,013 : INFO : worker thread finished; awaiting finish of 26 more threads
2021-03-06 14:51:20,169 : INFO : worker thread finished; awaiting finish of 25 more threads
2021-03-06 14:51:20,305 : INFO : worker thread finished; awaiting finish of 24 more threads
2021-03-06 14:51:20,330 : INFO : worker thread finished; awaiting finish of 23 more threads
2021-03-06 14:51:20,434 : INFO : worker thread finished; awaiting finish of 22 more threads
2021-03-06 14:51:20,761 : INFO : EPOCH 24 - PROGRESS: at 91.69% examples, 13976 words/s, in_qsize 21, out_qsize 1
2021-03-06 14:51:20,765 : INFO : wor

2021-03-06 14:53:29,427 : INFO : EPOCH 25 - PROGRESS: at 44.94% examples, 10655 words/s, in_qsize 80, out_qsize 2
2021-03-06 14:53:37,413 : INFO : EPOCH 25 - PROGRESS: at 45.61% examples, 10178 words/s, in_qsize 79, out_qsize 1
2021-03-06 14:53:47,818 : INFO : EPOCH 25 - PROGRESS: at 45.99% examples, 9542 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:53:52,020 : INFO : EPOCH 25 - PROGRESS: at 46.23% examples, 9320 words/s, in_qsize 79, out_qsize 0
2021-03-06 14:53:53,382 : INFO : EPOCH 25 - PROGRESS: at 46.92% examples, 9370 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:53:55,559 : INFO : EPOCH 25 - PROGRESS: at 47.38% examples, 9325 words/s, in_qsize 78, out_qsize 1
2021-03-06 14:53:57,010 : INFO : EPOCH 25 - PROGRESS: at 48.48% examples, 9463 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:53:58,107 : INFO : EPOCH 25 - PROGRESS: at 49.67% examples, 9614 words/s, in_qsize 80, out_qsize 0
2021-03-06 14:54:01,281 : INFO : EPOCH 25 - PROGRESS: at 49.93% examples, 9467 words/s, in_qsi

2021-03-06 14:55:31,828 : INFO : worker thread finished; awaiting finish of 16 more threads
2021-03-06 14:55:31,853 : INFO : EPOCH 25 - PROGRESS: at 94.12% examples, 10573 words/s, in_qsize 15, out_qsize 1
2021-03-06 14:55:31,909 : INFO : worker thread finished; awaiting finish of 15 more threads
2021-03-06 14:55:31,977 : INFO : worker thread finished; awaiting finish of 14 more threads
2021-03-06 14:55:32,011 : INFO : worker thread finished; awaiting finish of 13 more threads
2021-03-06 14:55:32,184 : INFO : worker thread finished; awaiting finish of 12 more threads
2021-03-06 14:55:32,235 : INFO : worker thread finished; awaiting finish of 11 more threads
2021-03-06 14:55:32,347 : INFO : worker thread finished; awaiting finish of 10 more threads
2021-03-06 14:55:32,463 : INFO : worker thread finished; awaiting finish of 9 more threads
2021-03-06 14:55:32,488 : INFO : worker thread finished; awaiting finish of 8 more threads
2021-03-06 14:55:32,549 : INFO : worker thread finished; awa

Done.


#### 文書ベクトル作成用関数の定義
単語の和の平均で文書ベクトルを作成

In [4]:
def wordvec2docvec(sentence):
    # 文章ベクトルの初期値（0ベクトルを初期値とする）
    docvecs = np.zeros(num_features, dtype="float32")

    # 文章に現れる単語のうち、モデルに存在しない単語をカウントする
    denomenator = len(sentence)

    # 文章内の各単語ベクトルを足し合わせる
    for word in sentence:
        try:
            temp = model[word]
        except:
            denomenator -= 1
            continue
        docvecs += temp

    # 文章に現れる単語のうち、モデルに存在した単語の数で割る
    if denomenator > 0:
        docvecs =  docvecs / denomenator

    return docvecs

#### データ準備②

In [5]:
from sklearn.model_selection import train_test_split

print(len(datasets["document"]))
X, Y = [], []
for doc, category in tqdm(zip(datasets["document"], datasets["category"])):
    wakati = make_wakati(doc)
    docvec = wordvec2docvec(wakati)
    X.append(list(docvec))
    Y.append(category2id[category])
data_X = pd.DataFrame(X, columns=["X" + str(i + 1) for i in range(num_features)])
data_Y = pd.DataFrame(Y, columns=["category_id"])

train_x, test_x, train_y, test_y = train_test_split(data_X, data_Y, train_size= 0.7)

7376


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

  # This is added back by InteractiveShellApp.init_path()





#### XGBoostで分類器を作成＆予測

In [8]:
import xgboost as xgb
from sklearn.metrics import classification_report

print("Fitting XGboost model ...")
xgb_model = xgb.XGBClassifier()
xgb_model.fit(train_x, train_y)
print("Done.")

# 予測
pred = xgb_model.predict(test_x)
print(classification_report(pred, test_y["category_id"], target_names=categories))

Fitting XGboost model ...




Done.
                precision    recall  f1-score   support

dokujo-tsushin       0.89      0.83      0.86       277
  it-life-hack       0.93      0.91      0.92       252
 kaden-channel       0.90      0.95      0.92       247
livedoor-homme       0.71      0.87      0.78       125
   movie-enter       0.94      0.91      0.93       279
        peachy       0.81      0.84      0.82       247
          smax       0.96      0.95      0.95       248
  sports-watch       0.93      0.92      0.93       275
    topic-news       0.92      0.88      0.90       263

      accuracy                           0.90      2213
     macro avg       0.89      0.89      0.89      2213
  weighted avg       0.90      0.90      0.90      2213



#### LightGBMで分類器の作成＆予測

In [9]:
import lightgbm as lgbm

print("Fitting LightGBM model ...")
lgbm_model = lgbm.LGBMClassifier()
lgbm_model.fit(train_x, train_y)
print("Done.")

# 予測
pred = lgbm_model.predict(test_x)
print(classification_report(pred, test_y["category_id"], target_names=categories))

Fitting LightGBM model ...
Done.
                precision    recall  f1-score   support

dokujo-tsushin       0.88      0.83      0.85       274
  it-life-hack       0.91      0.93      0.92       243
 kaden-channel       0.92      0.94      0.93       253
livedoor-homme       0.69      0.86      0.77       123
   movie-enter       0.95      0.92      0.93       277
        peachy       0.81      0.80      0.80       264
          smax       0.96      0.96      0.96       247
  sports-watch       0.93      0.92      0.92       278
    topic-news       0.89      0.87      0.88       254

      accuracy                           0.89      2213
     macro avg       0.88      0.89      0.89      2213
  weighted avg       0.89      0.89      0.89      2213



#### Random Forest

In [10]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(train_x, train_y)
pred = rfc.predict(test_x)
print(classification_report(pred, test_y["category_id"], target_names=categories))

  after removing the cwd from sys.path.


                precision    recall  f1-score   support

dokujo-tsushin       0.85      0.77      0.81       284
  it-life-hack       0.85      0.87      0.86       242
 kaden-channel       0.87      0.87      0.87       260
livedoor-homme       0.53      0.89      0.66        91
   movie-enter       0.93      0.87      0.90       289
        peachy       0.75      0.77      0.76       252
          smax       0.96      0.95      0.96       247
  sports-watch       0.91      0.90      0.90       277
    topic-news       0.88      0.81      0.84       271

      accuracy                           0.85      2213
     macro avg       0.84      0.85      0.84      2213
  weighted avg       0.86      0.85      0.85      2213



#### TF-IDFによる文書ベクトル

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = datasets["document"]
tfidf_vectorizer = TfidfVectorizer(analyzer=make_wakati)
tfidfs = tfidf_vectorizer.fit_transform(corpus)
print(tfidfs.shape)
# (7376, 79673)

tfidf_data_X = pd.DataFrame(tfidfs.toarray(),columns=["X"+str(i) for i in range(tfidfs.shape[1])])
train_x, test_x, train_y, test_y = train_test_split(data_X, data_Y, train_size=0.7)
lgbm_model = lgbm.LGBMClassifier()
lgbm_model.fit(train_x, train_y)
pred = lgbm_model.predict(test_x)
print(classification_report(pred, test_y["category_id"], target_names=categories))

(7376, 59937)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


                precision    recall  f1-score   support

dokujo-tsushin       0.86      0.87      0.86       262
  it-life-hack       0.86      0.87      0.86       271
 kaden-channel       0.86      0.92      0.89       222
livedoor-homme       0.67      0.81      0.74       127
   movie-enter       0.94      0.88      0.91       271
        peachy       0.81      0.78      0.79       268
          smax       0.92      0.89      0.91       274
  sports-watch       0.94      0.93      0.94       270
    topic-news       0.91      0.89      0.90       248

      accuracy                           0.87      2213
     macro avg       0.86      0.87      0.87      2213
  weighted avg       0.88      0.87      0.87      2213

