# 中文文本大亂鬥
本文出自: https://tlyu0419.github.io/2020/04/04/Text-Classification/

In [1]:
import sys
import os

import logging
import multiprocessing
from gensim.models import Word2Vec

import pandas as pd
import sqlite3
import numpy as np
import re
import jieba

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split, cross_validate, RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB  
from sklearn.linear_model import LogisticRegression 
from sklearn.neighbors import KNeighborsClassifier  
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier 
import xgboost as xgb
import lightgbm as lgb
from sklearn import metrics

from keras.utils import to_categorical
from tensorflow.python.keras import layers, models, optimizers
from tensorflow.python.keras.layers import Conv1D, MaxPooling1D, Embedding
from tensorflow.python.keras.preprocessing import text, sequence
from tensorflow.python.keras.layers import Dense, Input, Flatten, Dropout, LSTM, BatchNormalization
from tensorflow.python.keras import Sequential
from tensorflow.python import keras
from tensorflow.python.keras.callbacks import EarlyStopping
import tensorflow as tf

In [2]:
df = pd.read_pickle('data/undnews.pickle')

In [3]:
# 將資料中切出 20% 作為測試資料
X_train, X_test, y_train, y_test = train_test_split(df['CONTENT_SEG'], df['SUB2'], test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(40000,) (10000,) (40000,) (10000,)


## 文本特徵處理

### CountVector
將文本中的詞語轉換為詞頻矩陣。

### TF-IDF
文本加權方，採用統計思想，使用文本出現的次數和整個與料庫中文黨頻率進行計算字詞重要度。\
TF(Term Frequency): 表示某個關鍵字在整篇文章中出現的頻率。\
IDF: 表示計算倒文本頻率。倒文本頻率又稱逆文黨頻率，是文檔的倒數，主要用於降低所有文檔中一些常見卻對文檔影響不大的詞語權重。\

優點: 簡單快速，容易理解。過濾一些常見但是不重要的字詞。

缺點: 用詞頻來衡量文章中的一個詞重要性不夠全面。

### word2Vec
word2vec 是 word Embedding 的方法之一， word2vec 有兩種訓練模式:
* CBOW(Continuous Bag-of-Words Model)，通過上下文來預測當前值。
* Skip-gram(Continuous Skip-gram Model)，用當前值來預測上下文。

優點
* 由於 Word2vec 會考慮上下文，跟之前的 Embedding 方法相比，效果要更好。
* 比之前的 Embedding 方法维度更少，所以速度更快。
* 通用性很强，可以用在各種 NLP 任務中。

缺點
* 由於詞和向量是一對一的關係，所以多義詞的問題無法解決。
* Word2vec 是一種靜態的方式，雖然通用性强，但是無法針對特定任務做動態優化。

In [4]:
# CountVector：計算文本中每個詞出現的次數，整理出來的維度數是文本中所有詞的數量
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', min_df=10)
count_vect.fit(df['CONTENT_SEG'])
counts_train = CountVectorizer(vocabulary=count_vect.vocabulary_).fit_transform(X_train)
counts_test = CountVectorizer(vocabulary=count_vect.vocabulary_).fit_transform(X_test)
counts_train

<40000x49471 sparse matrix of type '<class 'numpy.int64'>'
	with 5199910 stored elements in Compressed Sparse Row format>

In [5]:
# TFIDFVector：在 CountVector 的基礎上，進一步考量文本的長度資訊(TF)與每個詞在多少文本中出現的狀況(IDF)的分數。
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', min_df=10)
tfidf_vect.fit(df['CONTENT_SEG'])
tfidf_train = TfidfVectorizer(vocabulary=tfidf_vect.vocabulary_).fit_transform(X_train)
tfidf_test = TfidfVectorizer(vocabulary=tfidf_vect.vocabulary_).fit_transform(X_test)
tfidf_train

<40000x49471 sparse matrix of type '<class 'numpy.float64'>'
	with 5199910 stored elements in Compressed Sparse Row format>

In [6]:
# word2vec：詞向量是用一個多維度的向量來表示詞意的方法，用這個方法可以解決 CountVector/TFIDFVector 在處理文本特徵的資料時出現非常龐大且稀疏的矩陣的問題。詞向量的取得方式有以下兩種
# 運用我們這裡的資料自行訓練詞向量
program = os.path.basename(sys.argv[0])
logger = logging.getLogger(program)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
logger.info("running %s" % ' '.join(sys.argv))
w2v_model = Word2Vec(df['CONTENT_SEG'].apply(lambda x: x.split(' ',-1)), # input要是list不是str
                     min_count=10,
                     size=200,
                     workers=multiprocessing.cpu_count())  # 訓練skip-gram模型

2021-11-18 10:29:09,660 : INFO : running C:\Users\l8527\.conda\envs\tf-image\lib\site-packages\ipykernel_launcher.py -f C:\Users\l8527\AppData\Roaming\jupyter\runtime\kernel-0ced018c-979f-44f9-ade8-50d7a586db14.json
2021-11-18 10:29:11,210 : INFO : collecting all words and their counts
2021-11-18 10:29:11,210 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-11-18 10:29:11,888 : INFO : PROGRESS: at sentence #10000, processed 4607451 words, keeping 163203 word types
2021-11-18 10:29:12,601 : INFO : PROGRESS: at sentence #20000, processed 9274490 words, keeping 247204 word types
2021-11-18 10:29:13,321 : INFO : PROGRESS: at sentence #30000, processed 13783730 words, keeping 352486 word types
2021-11-18 10:29:13,905 : INFO : PROGRESS: at sentence #40000, processed 17253409 words, keeping 401667 word types
2021-11-18 10:29:14,621 : INFO : collected 450076 word types from a corpus of 21451617 raw words and 50000 sentences
2021-11-18 10:29:14,622 : INFO : Loadin

2021-11-18 10:29:55,292 : INFO : worker thread finished; awaiting finish of 3 more threads
2021-11-18 10:29:55,293 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-11-18 10:29:55,294 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-11-18 10:29:55,301 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-11-18 10:29:55,302 : INFO : EPOCH - 3 : training on 21451617 raw words (13580460 effective words) took 10.9s, 1245062 effective words/s
2021-11-18 10:29:56,327 : INFO : EPOCH 4 - PROGRESS: at 8.43% examples, 1193341 words/s, in_qsize 20, out_qsize 3
2021-11-18 10:29:57,342 : INFO : EPOCH 4 - PROGRESS: at 17.07% examples, 1223833 words/s, in_qsize 21, out_qsize 2
2021-11-18 10:29:58,372 : INFO : EPOCH 4 - PROGRESS: at 26.10% examples, 1239414 words/s, in_qsize 22, out_qsize 1
2021-11-18 10:29:59,379 : INFO : EPOCH 4 - PROGRESS: at 34.93% examples, 1254396 words/s, in_qsize 24, out_qsize 1
2021-11-18 10:30:00,380 : INFO : EPO

In [7]:
def AvgVector(w2v_model, sentence):
    vec = []
    for i in sentence.split(' ', -1):
        if i in w2v_model.wv.index2word:
            vec.append(w2v_model[i])
    vector = np.mean(vec, axis=0)
    vector = pd.Series(vector)
    return vector

In [8]:
%time
w2v_train = X_train.apply(lambda x: AvgVector(w2v_model, x))
w2v_test = X_test.apply(lambda x: AvgVector(w2v_model, x))
w2v_train.shape

Wall time: 0 ns


  vec.append(w2v_model[i])


(40000, 200)

In [9]:
# 借用外部大量的文本訓練而得的詞向量
# 直接讀取詞向量的模型即可
pretrain_w2v_model = Word2Vec.load('data/wordvec_wiki/wiki.zh.text.model')

2021-11-18 11:53:58,973 : INFO : loading Word2Vec object from data/wordvec_wiki/wiki.zh.text.model
2021-11-18 11:54:01,657 : INFO : loading wv recursively from data/wordvec_wiki/wiki.zh.text.model.wv.* with mmap=None
2021-11-18 11:54:01,657 : INFO : loading vectors from data/wordvec_wiki/wiki.zh.text.model.wv.vectors.npy with mmap=None
2021-11-18 11:54:14,099 : INFO : setting ignored attribute vectors_norm to None
2021-11-18 11:54:14,099 : INFO : loading vocabulary recursively from data/wordvec_wiki/wiki.zh.text.model.vocabulary.* with mmap=None
2021-11-18 11:54:14,100 : INFO : loading trainables recursively from data/wordvec_wiki/wiki.zh.text.model.trainables.* with mmap=None
2021-11-18 11:54:14,100 : INFO : loading syn1neg from data/wordvec_wiki/wiki.zh.text.model.trainables.syn1neg.npy with mmap=None
2021-11-18 11:54:26,848 : INFO : setting ignored attribute cum_table to None
2021-11-18 11:54:26,849 : INFO : loaded data/wordvec_wiki/wiki.zh.text.model


## 機器學習模型

## 監督式學習
### Logistic Regression(對數機率模型)
處理問題上的區別:
  * Linear Regression 線性迴歸屬於連續型的模型值，也就是預測一個連續的應變數
  * Logistic Regression 邏輯迴歸使迴歸可以用來處理二元分類問題

建立迴歸方程式的區別
  * Linear Regression 線性迴歸使用特徵對目標直接建立迴歸方程式
  * Logistic Regression邏輯迴歸對勝算比(Odds Ratio)，也就是對與不對的比率，取對數log來建立迴歸方程式

優點: 容易理解與實作，計算成本不高 \
缺點: 分類的準確度不高，不能很好處理大量多類特徵或變量，容易產生低度擬和的問題(基於以上缺點而衍生出softmax可用於多分類)

資料來源: https://matters.news/@CHWang/machine-learning-%E7%B5%A6%E8%87%AA%E5%B7%B1%E7%9A%84%E6%A9%9F%E5%99%A8%E5%AD%B8%E7%BF%92%E7%AD%86-logistic-regression%E9%82%8F%E8%BC%AF%E8%BF%B4%E6%AD%B8-%E4%BA%8C%E5%85%83%E5%88%86%E9%A1%9E%E5%95%8F%E9%A1%8C-%E5%8E%9F%E7%90%86%E8%A9%B3%E7%B4%B0%E4%BB%8B%E7%B4%B9-bafyreiettlsnp4azq5dqwyubb5w76j5t4x4pxxhnuteofrqmqszg4jbuve

In [10]:
# 詞頻特徵
%time
clf = LogisticRegression(max_iter=1000, n_jobs=-1)   
clf.fit(counts_train, y_train)
print('='*40, ' Score on Counts feature ', '='*40)
print('Score on Train: ', metrics.accuracy_score(y_train, clf.predict(counts_train)))
print('Score on Test: ', metrics.accuracy_score(y_test, clf.predict(counts_test)))
print('='*100)

Wall time: 0 ns
Score on Train:  0.999925
Score on Test:  0.865


In [11]:
# TFIDF特徵
%time
clf = LogisticRegression(max_iter=1000, n_jobs=-1)   
clf.fit(tfidf_train, y_train)
print('='*40, ' Score on TFIDF feature ', '='*40)
print('Score on Train: ', metrics.accuracy_score(y_train, clf.predict(tfidf_train)))
print('Score on Test: ', metrics.accuracy_score(y_test, clf.predict(tfidf_test)))
print('='*100)

Wall time: 0 ns
Score on Train:  0.934725
Score on Test:  0.8802


In [12]:
# 平均詞向量特徵
%time
clf = LogisticRegression(max_iter=1000, n_jobs=-1)   
clf.fit(w2v_train, y_train)
print('='*40, ' Score on AvgVector feature ', '='*40)
print('Score on Train: ', metrics.accuracy_score(y_train, clf.predict(w2v_train)))
print('Score on Test: ', metrics.accuracy_score(y_test, clf.predict(w2v_test)))
print('='*100)

Wall time: 0 ns
Score on Train:  0.861125
Score on Test:  0.8459


## 監督式學習
### Naive Bayes(預測建模算法)
優點: Naive Bayes 發源於古典數學理論，有著堅實的數學理論，及穩定的分類效率。對大量訓練和查詢具有較高的速度。既使面對超大規模的訓練集也只會有相對較少數的特徵數，並且對項目的訓練和分類也僅僅試特徵概率的數學運算。適合增量式訓練，對於缺失數據不太敏感。

缺點: 需要計算先驗概率，決策分類存在錯誤率，對輸入數據的表達形式很敏感，使用了樣本數性獨立性的假設，因此如果樣本屬性有關聯時期效果不佳。

In [13]:
### 詞頻特徵
%time
param_grid = {'alpha':[1.4, 1.2, 1, 0.8, 0.6]}
estimators = GridSearchCV(estimator = MultinomialNB(),
                      param_grid = param_grid,
                      n_jobs = -1,
                      cv = 5)
estimators.fit(counts_train, y_train)
print('='*40, ' Score on CV result ', '='*40)
print('Best Score: ', estimators.best_score_)
print('Best Params: ', estimators.best_params_)
clf = MultinomialNB(alpha = estimators.best_params_['alpha'])   
clf.fit(counts_train, y_train)
print('='*40, ' Score on Counts feature ', '='*40)
print('Score on Train: ', metrics.accuracy_score(y_train, clf.predict(counts_train)))
print('Score on Test: ', metrics.accuracy_score(y_test, clf.predict(counts_test)))
print('='*100)

Wall time: 0 ns
Best Score:  0.8553499999999999
Best Params:  {'alpha': 0.6}
Score on Train:  0.893675
Score on Test:  0.8548


In [14]:
# TFIDF特徵
%time
param_grid = {'alpha':[1.4, 1.2, 1, 0.8, 0.6]}
estimators = GridSearchCV(estimator = MultinomialNB(),
                      param_grid = param_grid,
                      n_jobs = -1,
                      cv = 5)
estimators.fit(tfidf_train, y_train)
print('='*40, ' Score on CV result ', '='*40)
print('Best Score: ', estimators.best_score_)
print('Best Params: ', estimators.best_params_)
%time
clf = MultinomialNB(alpha = estimators.best_params_['alpha'])   
clf.fit(tfidf_train, y_train)
print('='*40, ' Score on TFIDF feature ', '='*40)
print('Score on Train: ', metrics.accuracy_score(y_train, clf.predict(tfidf_train)))
print('Score on Test: ', metrics.accuracy_score(y_test, clf.predict(tfidf_test)))
print('='*100)

Wall time: 0 ns
Best Score:  0.84795
Best Params:  {'alpha': 0.6}
Wall time: 0 ns
Score on Train:  0.879375
Score on Test:  0.849


In [15]:
# 平均詞向量特徵
%time
scaler = MinMaxScaler()
scaler.fit(w2v_train)

w2v_train_mm = scaler.transform(w2v_train)
w2v_test_mm = scaler.transform(w2v_test)

param_grid = {'alpha':[1.4, 1.2, 1, 0.8, 0.6]}
estimators = GridSearchCV(estimator = MultinomialNB(),
                        param_grid = param_grid,
                        n_jobs = -1,
                        cv = 5)
estimators.fit(w2v_train_mm, y_train)
print('='*40, ' Score on CV result ', '='*40)
print('Best Score: ', estimators.best_score_)
print('Best Params: ', estimators.best_params_)
%time
clf = MultinomialNB(alpha = estimators.best_params_['alpha'])   
clf.fit(w2v_train_mm, y_train)
print('='*40, ' Score on AvgVector feature ', '='*40)
print('Score on Train: ', metrics.accuracy_score(y_train, clf.predict(w2v_train_mm)))
print('Score on Test: ', metrics.accuracy_score(y_test, clf.predict(w2v_test_mm)))
print('='*100)

Wall time: 0 ns
Best Score:  0.7050750000000001
Best Params:  {'alpha': 0.8}
Wall time: 0 ns
Score on Train:  0.70595
Score on Test:  0.6999


## 監督式學習
### SVM

In [16]:
# 詞頻特徵
%time
clf = SVC(kernel='linear')
clf.fit(counts_train, y_train)
print('='*40, ' Score on Counts feature ', '='*40)
print('Score on Train: ', metrics.accuracy_score(y_train, clf.predict(counts_train)))
print('Score on Test: ', metrics.accuracy_score(y_test, clf.predict(counts_test)))
print('='*100)

Wall time: 0 ns
Score on Train:  0.99985
Score on Test:  0.8523


In [17]:
# TFIDF特徵
%time
clf = SVC(kernel='linear')
clf.fit(tfidf_train, y_train)
print('='*40, ' Score on TFIDF feature ', '='*40)
print('Score on Train: ', metrics.accuracy_score(y_train, clf.predict(tfidf_train)))
print('Score on Test: ', metrics.accuracy_score(y_test, clf.predict(tfidf_test)))
print('='*100)

Wall time: 0 ns
Score on Train:  0.955325
Score on Test:  0.8846


In [18]:
# 平均詞向量特徵
%time
clf = SVC(kernel='linear')
clf.fit(w2v_train, y_train)
print('='*40, ' Score on AvgVector feature ', '='*40)
print('Score on Train: ', metrics.accuracy_score(y_train, clf.predict(w2v_train)))
print('Score on Test: ', metrics.accuracy_score(y_test, clf.predict(w2v_test)))
print('='*100)

Wall time: 0 ns
Score on Train:  0.86935
Score on Test:  0.8525


## 監督式學習
### KNN
一般監督式學習是透過資料訓練出一個模型，但KNN並沒有做訓練的動作。\
KNN如果做訓練的動作主要是為了以某資料料結構儲存個點的關係，進而加速搜尋鄰近K點鄰距的效果。

優點︰精度高、對異常值不敏感、無資料輸入假定。\
缺點︰時間複雜度高、空間複雜度高，訓練模型依賴訓練集資料且不可丟棄。

In [19]:
# 詞頻特徵
%time
param_grid = {'n_neighbors':list(range(1,9))}
estimators = GridSearchCV(estimator = KNeighborsClassifier(),
                        param_grid = param_grid,
                        n_jobs = -1,
                        cv = 5)
estimators.fit(counts_train, y_train)
print('='*40, ' Score on CV result ', '='*40)
print('Best Score: ', estimators.best_score_)
print('Best Params: ', estimators.best_params_)
%time
clf = KNeighborsClassifier(n_neighbors = estimators.best_params_['n_neighbors'])   
clf.fit(counts_train, y_train)
print('='*40, ' Score on Counts feature ', '='*40)
print('Score on Train: ', metrics.accuracy_score(y_train, clf.predict(counts_train)))
print('Score on Test: ', metrics.accuracy_score(y_test, clf.predict(counts_test)))
print('='*100)

Wall time: 0 ns




Best Score:  nan
Best Params:  {'n_neighbors': 1}
Wall time: 0 ns
Score on Train:  0.99995
Score on Test:  0.6639


In [21]:
# TFIDF特徵
%time
param_grid = {'n_neighbors':list(range(1,9))}
estimators = GridSearchCV(estimator = KNeighborsClassifier(),
                        param_grid = param_grid,
                        n_jobs = -1,
                        cv = 5)
estimators.fit(tfidf_train, y_train)
print('='*40, ' Score on CV result ', '='*40)
print('Best Score: ', estimators.best_score_)
print('Best Params: ', estimators.best_params_)
%time
clf = KNeighborsClassifier(n_neighbors = estimators.best_params_['n_neighbors'], n_jobs=-1)   
clf.fit(tfidf_train, y_train)
print('='*40, ' Score on TFIDF feature ', '='*40)
print('Score on Train: ', metrics.accuracy_score(y_train, clf.predict(tfidf_train)))
print('Score on Test: ', metrics.accuracy_score(y_test, clf.predict(tfidf_test)))
print('='*100)

Wall time: 0 ns




Best Score:  nan
Best Params:  {'n_neighbors': 1}
Wall time: 0 ns
Score on Train:  0.99995
Score on Test:  0.7869


In [22]:
# 平均詞向量特徵
%time
param_grid = {'n_neighbors':list(range(1,9))}
estimators = GridSearchCV(estimator = KNeighborsClassifier(),
                        param_grid = param_grid,
                        n_jobs = -1,
                        cv = 5)
estimators.fit(w2v_train, y_train)
print('='*40, ' Score on CV result ', '='*40)
print('Best Score: ', estimators.best_score_)
print('Best Params: ', estimators.best_params_)
%time
clf = KNeighborsClassifier(n_neighbors = estimators.best_params_['n_neighbors'], n_jobs=-1)   
clf.fit(w2v_train, y_train)
print('='*40, ' Score on AvgVector feature ', '='*40)
print('Score on Train: ', metrics.accuracy_score(y_train, clf.predict(w2v_train)))
print('Score on Test: ', metrics.accuracy_score(y_test, clf.predict(w2v_test)))
print('='*100)

Wall time: 0 ns




Best Score:  0.8205500000000001
Best Params:  {'n_neighbors': 8}
Wall time: 0 ns
Score on Train:  0.8657
Score on Test:  0.8272


### RandomForest

In [23]:
# 詞頻特徵
%time
clf = RandomForestClassifier(n_estimators = 500, max_features = 'sqrt', n_jobs=-1, random_state = 10)  
clf.fit(counts_train, y_train)
print('='*40, ' Score on Counts feature ', '='*40)
print('Score on Train: ', metrics.accuracy_score(y_train, clf.predict(counts_train)))
print('Score on Test: ', metrics.accuracy_score(y_test, clf.predict(counts_test)))
print('='*100)

Wall time: 0 ns
Score on Train:  0.99995
Score on Test:  0.8595


In [24]:
# TFIDF特徵
%time
clf = RandomForestClassifier(n_estimators = 500, max_features = 'sqrt', n_jobs=-1, random_state = 10)  
clf.fit(tfidf_train, y_train)
print('='*40, ' Score on TFIDF feature ', '='*40)
print('Score on Train: ', metrics.accuracy_score(y_train, clf.predict(tfidf_train)))
print('Score on Test: ', metrics.accuracy_score(y_test, clf.predict(tfidf_test)))
print('='*100)

Wall time: 0 ns
Score on Train:  0.99995
Score on Test:  0.8546


In [25]:
# 平均詞向量特徵
%time
clf = RandomForestClassifier(n_estimators = 500, max_features = 'sqrt', n_jobs=-1, random_state = 10)  
clf.fit(w2v_train, y_train)
print('='*40, ' Score on AvgVector feature ', '='*40)
print('Score on Train: ', metrics.accuracy_score(y_train, clf.predict(w2v_train)))
print('Score on Test: ', metrics.accuracy_score(y_test, clf.predict(w2v_test)))
print('='*100)

Wall time: 0 ns
Score on Train:  0.99995
Score on Test:  0.8292


### XGBoost

In [26]:
# 詞頻特徵
%time
clf = xgb.XGBClassifier(n_estimators=500, objective='multi:softmax', n_jobs=-1, silent=False)
clf.fit(counts_train, y_train)
print('='*40, ' Score on Counts feature ', '='*40)
print('Score on Train: ', metrics.accuracy_score(y_train, clf.predict(counts_train)))
print('Score on Test: ', metrics.accuracy_score(y_test, clf.predict(counts_test)))
print('='*100)

Wall time: 0 ns




Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Score on Train:  0.999925
Score on Test:  0.8742


In [27]:
# TFIDF特徵
%time
clf = xgb.XGBClassifier(n_estimators=500, objective='multi:softmax', n_jobs=-1, silent=False)
clf.fit(tfidf_train, y_train)
print('='*40, ' Score on TFIDF feature ', '='*40)
print('Score on Train: ', metrics.accuracy_score(y_train, clf.predict(tfidf_train)))
print('Score on Test: ', metrics.accuracy_score(y_test, clf.predict(tfidf_test)))
print('='*100)

Wall time: 0 ns




Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Score on Train:  0.99995
Score on Test:  0.8745


In [28]:
# 平均詞向量特徵
%time
clf = xgb.XGBClassifier(n_estimators=500, objective='multi:softmax', n_jobs=-1, silent=False)
clf.fit(w2v_train, y_train)
print('='*40, ' Score on AvgVector feature ', '='*40)
print('Score on Train: ', metrics.accuracy_score(y_train, clf.predict(w2v_train)))
print('Score on Test: ', metrics.accuracy_score(y_test, clf.predict(w2v_test)))
print('='*100)

Wall time: 0 ns
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.






Score on Train:  0.99995
Score on Test:  0.8537


## 深度學習模型

## MLP

In [32]:
# 詞頻特徵
## 設定超參數
LEARNING_RATE = 1e-3
EPOCHS = 100
BATCH_SIZE = 128
MOMENTUM = 0.95
# 建立資料格式
# 考量時間與記憶體容量，僅保留數量最多的1萬個詞
tokenizer = text.Tokenizer(num_words=10000) 
tokenizer.fit_on_texts(df['CONTENT_SEG'])
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

counts_train = tokenizer.texts_to_sequences(X_train) 
counts_train = tokenizer.sequences_to_matrix(counts_train, mode='freq')
counts_test = tokenizer.texts_to_sequences(X_test)
counts_test = tokenizer.sequences_to_matrix(counts_test, mode='freq')
print('Shape of counts_train tensor:', counts_train.shape)
print('Shape of counts_test tensor:', counts_test.shape)

y_train_dummy = to_categorical(np.asarray(y_train))
y_test_dummy = to_categorical(np.asarray(y_test))
print('Shape of y_train_dummy tensor:', y_train_dummy.shape)
print('Shape of y_test_dummy tensor:', y_test_dummy.shape)
# 搭建模型框架
keras.backend.clear_session()
model = Sequential()
model.add(Dense(units=512, input_shape=(counts_train.shape[1],), activation='relu'))
model.add(BatchNormalization())
model.add(Dense(256, activation='relu'))
model.add(BatchNormalization())
model.add(Dense(128, activation='relu'))
model.add(Dense(y_train_dummy.shape[1], activation='softmax'))
model.summary()
# 載入 Callbacks, 並將 monitor 設定為監控 validation loss
earlystop = EarlyStopping(monitor="val_acc", 
                        patience=10, 
                        verbose=1)
model.compile(loss='categorical_crossentropy',
            optimizer=tf.keras.optimizers.Adam(lr=LEARNING_RATE, epsilon=None, decay=0.0),
            metrics=['accuracy'])
%time
model.fit(counts_train, y_train_dummy,
        epochs=EPOCHS, 
        validation_split=0.2,
        batch_size=BATCH_SIZE,
        shuffle=True,
        callbacks=[earlystop])
model.evaluate(counts_test, y_test_dummy)

Found 450075 unique tokens.
Shape of counts_train tensor: (40000, 10000)
Shape of counts_test tensor: (10000, 10000)
Shape of y_train_dummy tensor: (40000, 10)
Shape of y_test_dummy tensor: (10000, 10)
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               5120512   
_________________________________________________________________
batch_normalization (BatchNo (None, 512)               2048      
_________________________________________________________________
dense_1 (Dense)              (None, 256)               131328    
_________________________________________________________________
batch_normalization_1 (Batch (None, 256)               1024      
_________________________________________________________________
dense_2 (Dense)              (None, 128)               32896     
____________________________________________________



Epoch 2/100




Epoch 3/100




Epoch 4/100




Epoch 5/100




Epoch 6/100




Epoch 7/100




Epoch 8/100




Epoch 9/100




Epoch 10/100




Epoch 11/100




Epoch 12/100




Epoch 13/100




Epoch 14/100




Epoch 15/100




Epoch 16/100




Epoch 17/100




Epoch 18/100




Epoch 19/100




Epoch 20/100




Epoch 21/100




Epoch 22/100




Epoch 23/100




Epoch 24/100




Epoch 25/100




Epoch 26/100




Epoch 27/100




Epoch 28/100




Epoch 29/100




Epoch 30/100




Epoch 31/100




Epoch 32/100




Epoch 33/100




Epoch 34/100




Epoch 35/100




Epoch 36/100




Epoch 37/100




Epoch 38/100




Epoch 39/100




Epoch 40/100




Epoch 41/100




Epoch 42/100




Epoch 43/100




Epoch 44/100




Epoch 45/100




Epoch 46/100




Epoch 47/100




Epoch 48/100




Epoch 49/100




Epoch 50/100




Epoch 51/100




Epoch 52/100




Epoch 53/100




Epoch 54/100




Epoch 55/100




Epoch 56/100




Epoch 57/100




Epoch 58/100




Epoch 59/100




Epoch 60/100




Epoch 61/100




Epoch 62/100




Epoch 63/100




Epoch 64/100




Epoch 65/100




Epoch 66/100




Epoch 67/100




Epoch 68/100




Epoch 69/100




Epoch 70/100




Epoch 71/100




Epoch 72/100




Epoch 73/100




Epoch 74/100




Epoch 75/100




Epoch 76/100




Epoch 77/100




Epoch 78/100




Epoch 79/100




Epoch 80/100




Epoch 81/100




Epoch 82/100




Epoch 83/100




Epoch 84/100




Epoch 85/100




Epoch 86/100




Epoch 87/100




Epoch 88/100




Epoch 89/100




Epoch 90/100




Epoch 91/100




Epoch 92/100




Epoch 93/100




Epoch 94/100




Epoch 95/100




Epoch 96/100




Epoch 97/100




Epoch 98/100




Epoch 99/100




Epoch 100/100






[1.6590462923049927, 0.8503999710083008]

In [33]:
# TF-IDF特徵
## 超參數設定
LEARNING_RATE = 1e-3
EPOCHS = 100
BATCH_SIZE = 128
MOMENTUM = 0.95
# 建立資料格式
# 考量時間與記憶體容量，僅保留數量最多的1萬個詞
tokenizer = text.Tokenizer(num_words=10000) 
tokenizer.fit_on_texts(df['CONTENT_SEG'])
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

tfidf_train = tokenizer.texts_to_sequences(X_train) 
tfidf_train = tokenizer.sequences_to_matrix(tfidf_train, mode='tfidf')
tfidf_test = tokenizer.texts_to_sequences(X_test)
tfidf_test = tokenizer.sequences_to_matrix(tfidf_test, mode='tfidf')
print('Shape of tfidf_train tensor:', tfidf_train.shape)
print('Shape of tfidf_test tensor:', tfidf_test.shape)

y_train_dummy = to_categorical(np.asarray(y_train))
y_test_dummy = to_categorical(np.asarray(y_test))
print('Shape of y_train tensor:', y_train_dummy.shape)
print('Shape of y_test tensor:', y_test_dummy.shape)
# 搭建模型框架
keras.backend.clear_session()
model = Sequential()
model.add(Dense(units=512, input_shape=(tfidf_train.shape[1],), activation='relu')) ################
model.add(BatchNormalization())
model.add(Dense(256, activation='relu'))
model.add(BatchNormalization())
model.add(Dense(128, activation='relu'))
model.add(Dense(y_train_dummy.shape[1], activation='softmax'))
model.summary()
# 載入 Callbacks, 並將 monitor 設定為監控 validation loss
earlystop = EarlyStopping(monitor="val_acc", 
                        patience=5, 
                        verbose=1)
model.compile(loss='categorical_crossentropy',
            optimizer=tf.keras.optimizers.Adam(lr=LEARNING_RATE, epsilon=None, decay=0.0),
            metrics=['accuracy'])
%time
model.fit(tfidf_train, y_train_dummy,
        epochs=EPOCHS, 
        validation_split=0.2,
        batch_size=BATCH_SIZE,
        shuffle=True,
        callbacks=[earlystop])
model.evaluate(tfidf_test, y_test_dummy)

Found 450075 unique tokens.
Shape of tfidf_train tensor: (40000, 10000)
Shape of tfidf_test tensor: (10000, 10000)
Shape of y_train tensor: (40000, 10)
Shape of y_test tensor: (10000, 10)
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               5120512   
_________________________________________________________________
batch_normalization (BatchNo (None, 512)               2048      
_________________________________________________________________
dense_1 (Dense)              (None, 256)               131328    
_________________________________________________________________
batch_normalization_1 (Batch (None, 256)               1024      
_________________________________________________________________
dense_2 (Dense)              (None, 128)               32896     
_________________________________________________________________




Epoch 2/100




Epoch 3/100




Epoch 4/100




Epoch 5/100




Epoch 6/100




Epoch 7/100




Epoch 8/100




Epoch 9/100




Epoch 10/100




Epoch 11/100




Epoch 12/100




Epoch 13/100




Epoch 14/100




Epoch 15/100




Epoch 16/100




Epoch 17/100




Epoch 18/100




Epoch 19/100




Epoch 20/100




Epoch 21/100




Epoch 22/100




Epoch 23/100




Epoch 24/100




Epoch 25/100




Epoch 26/100




Epoch 27/100




Epoch 28/100




Epoch 29/100




Epoch 30/100




Epoch 31/100




Epoch 32/100




Epoch 33/100




Epoch 34/100




Epoch 35/100




Epoch 36/100




Epoch 37/100




Epoch 38/100




Epoch 39/100




Epoch 40/100




Epoch 41/100




Epoch 42/100




Epoch 43/100




Epoch 44/100




Epoch 45/100




Epoch 46/100




Epoch 47/100




Epoch 48/100




Epoch 49/100




Epoch 50/100




Epoch 51/100




Epoch 52/100




Epoch 53/100




Epoch 54/100




Epoch 55/100




Epoch 56/100




Epoch 57/100




Epoch 58/100




Epoch 59/100




Epoch 60/100




Epoch 61/100




Epoch 62/100




Epoch 63/100




Epoch 64/100




Epoch 65/100




Epoch 66/100




Epoch 67/100




Epoch 68/100




Epoch 69/100




Epoch 70/100




Epoch 71/100




Epoch 72/100




Epoch 73/100




Epoch 74/100




Epoch 75/100




Epoch 76/100




Epoch 77/100




Epoch 78/100




Epoch 79/100




Epoch 80/100




Epoch 81/100




Epoch 82/100




Epoch 83/100




Epoch 84/100




Epoch 85/100




Epoch 86/100




Epoch 87/100




Epoch 88/100




Epoch 89/100




Epoch 90/100




Epoch 91/100




Epoch 92/100




Epoch 93/100




Epoch 94/100




Epoch 95/100




Epoch 96/100




Epoch 97/100




Epoch 98/100




Epoch 99/100




Epoch 100/100






[1.3170613050460815, 0.8665000200271606]

### CNN

In [37]:
# 詞向量特徵
MAX_SEQUENCE_LENGTH = 300 # 每条新闻最大长度
EMBEDDING_DIM = 200 # 词向量空间维度

## 超參數設定
LEARNING_RATE = 1e-3
EPOCHS = 10
BATCH_SIZE = 128
MOMENTUM = 0.95
# 建立資料格式
tokenizer = text.Tokenizer(num_words=10000)
tokenizer.fit_on_texts(df['CONTENT_SEG'])
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

vec_train = tokenizer.texts_to_sequences(X_train) 
vec_train = sequence.pad_sequences(vec_train, maxlen=MAX_SEQUENCE_LENGTH)
vec_test = tokenizer.texts_to_sequences(X_test)
vec_test = sequence.pad_sequences(vec_test, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of vec_train tensor:', vec_train.shape)
print('Shape of vec_test tensor:', vec_test.shape)

y_train_dummy = to_categorical(np.asarray(y_train))
y_test_dummy = to_categorical(np.asarray(y_test))
print('Shape of y_train_dummy tensor:', y_train_dummy.shape)
print('Shape of y_test_dummy tensor:', y_test_dummy.shape)
# 將詞替換成對應的向量
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items(): 
    if word in w2v_model:
        embedding_matrix[i] = np.asarray(w2v_model[word],
                                        dtype='float32')
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
# 搭建模型框架
keras.backend.clear_session()
model = Sequential()
model.add(embedding_layer)
model.add(BatchNormalization())

model.add(Conv1D(256, 3, padding='valid', activation='relu', strides=1))
model.add(MaxPooling1D(3))

model.add(Flatten())
model.add(Dense(EMBEDDING_DIM, activation='relu'))
model.add(Dense(y_train_dummy.shape[1], activation='softmax'))
model.summary()
# 載入 Callbacks, 並將 monitor 設定為監控 validation loss
earlystop = EarlyStopping(monitor="val_acc", 
                        patience=5, 
                        verbose=1)
model.compile(loss='categorical_crossentropy',
            optimizer= tf.keras.optimizers.Adam(lr=LEARNING_RATE, epsilon=None, decay=0.0),
            metrics=['accuracy'])

model.fit(vec_train, y_train_dummy,
        epochs=EPOCHS, 
        validation_split=0.2,
        batch_size=BATCH_SIZE,
        shuffle=True,
        callbacks=[earlystop])
model.evaluate(vec_test, y_test_dummy)

Found 450075 unique tokens.
Shape of vec_train tensor: (40000, 300)
Shape of vec_test tensor: (10000, 300)
Shape of y_train_dummy tensor: (40000, 10)
Shape of y_test_dummy tensor: (10000, 10)


  if word in w2v_model:
  embedding_matrix[i] = np.asarray(w2v_model[word],


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 200)          90015200  
_________________________________________________________________
batch_normalization (BatchNo (None, 300, 200)          800       
_________________________________________________________________
conv1d (Conv1D)              (None, 298, 256)          153856    
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 99, 256)           0         
_________________________________________________________________
flatten (Flatten)            (None, 25344)             0         
_________________________________________________________________
dense (Dense)                (None, 200)               5069000   
_________________________________________________________________
dense_1 (Dense)              (None, 10)                2



Epoch 2/10




Epoch 3/10




Epoch 4/10




Epoch 5/10




Epoch 6/10




Epoch 7/10




Epoch 8/10




Epoch 9/10




Epoch 10/10






[1.1649012565612793, 0.7645000219345093]

In [40]:
# 預訓練的詞向量特徵
MAX_SEQUENCE_LENGTH = 300 # 每条新闻最大长度
EMBEDDING_DIM = 400 # 词向量空间维度

## 超參數設定
LEARNING_RATE = 1e-3
EPOCHS = 10
BATCH_SIZE = 128
MOMENTUM = 0.95

# 建立資料格式
tokenizer = text.Tokenizer(num_words=10000)
tokenizer.fit_on_texts(df['CONTENT_SEG'])
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

vec_train = tokenizer.texts_to_sequences(X_train) 
vec_train = sequence.pad_sequences(vec_train, maxlen=MAX_SEQUENCE_LENGTH)
vec_test = tokenizer.texts_to_sequences(X_test)
vec_test = sequence.pad_sequences(vec_test, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of vec_train tensor:', vec_train.shape)
print('Shape of vec_test tensor:', vec_test.shape)

y_train_dummy = to_categorical(np.asarray(y_train))
y_test_dummy = to_categorical(np.asarray(y_test))
print('Shape of y_train tensor:', y_train_dummy.shape)
print('Shape of y_test tensor:', y_test_dummy.shape)
# 將詞替換成對應的向量
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items(): 
    if word in pretrain_w2v_model:
        embedding_matrix[i] = np.asarray(pretrain_w2v_model[word],
                                        dtype='float32')
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
# 搭建模型框架
keras.backend.clear_session()
model = Sequential()
model.add(embedding_layer)
model.add(BatchNormalization())

model.add(Conv1D(256, 3, padding='valid', activation='relu', strides=1))
model.add(MaxPooling1D(3))

model.add(Flatten())
model.add(Dense(EMBEDDING_DIM, activation='relu'))
model.add(Dense(y_train_dummy.shape[1], activation='softmax'))
model.summary()
# 載入 Callbacks, 並將 monitor 設定為監控 validation loss
earlystop = EarlyStopping(monitor="val_loss", 
                        patience=5, 
                        verbose=1)
model.compile(loss='categorical_crossentropy',
            optimizer=  tf.keras.optimizers.Adam(lr=LEARNING_RATE, epsilon=None, decay=0.0),
            metrics=['accuracy'])

model.fit(vec_train, y_train_dummy,
        epochs=EPOCHS, 
        validation_split=0.2,
        batch_size=BATCH_SIZE,
        shuffle=True,
        callbacks=[earlystop])
model.evaluate(vec_test, y_test_dummy)

Found 450075 unique tokens.
Shape of vec_train tensor: (40000, 300)
Shape of vec_test tensor: (10000, 300)
Shape of y_train tensor: (40000, 10)
Shape of y_test tensor: (10000, 10)


  if word in pretrain_w2v_model:
  embedding_matrix[i] = np.asarray(pretrain_w2v_model[word],


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 400)          180030400 
_________________________________________________________________
batch_normalization (BatchNo (None, 300, 400)          1600      
_________________________________________________________________
conv1d (Conv1D)              (None, 298, 256)          307456    
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 99, 256)           0         
_________________________________________________________________
flatten (Flatten)            (None, 25344)             0         
_________________________________________________________________
dense (Dense)                (None, 400)               10138000  
_________________________________________________________________
dense_1 (Dense)              (None, 10)                4

[0.8943570256233215, 0.7912999987602234]

### RNN

In [11]:
# 詞向量特徵
MAX_SEQUENCE_LENGTH = 300 # 每条新闻最大长度
EMBEDDING_DIM = 200 # 词向量空间维度

## 超參數設定
LEARNING_RATE = 1e-3
EPOCHS = 10
BATCH_SIZE = 128
MOMENTUM = 0.95
# 建立資料格式
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(df['CONTENT_SEG'])
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

pre_vec_train = tokenizer.texts_to_sequences(X_train) 
pre_vec_train = sequence.pad_sequences(pre_vec_train, maxlen=MAX_SEQUENCE_LENGTH)
pre_vec_test = tokenizer.texts_to_sequences(X_test)
pre_vec_test = sequence.pad_sequences(pre_vec_test, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of pre_vec_train tensor:', pre_vec_train.shape)
print('Shape of pre_vec_test tensor:', pre_vec_test.shape)

y_train_dummy = to_categorical(np.asarray(y_train))
y_test_dummy = to_categorical(np.asarray(y_test))
print('Shape of y_train tensor:', y_train_dummy.shape)
print('Shape of y_test tensor:', y_test_dummy.shape)
# 將詞替換成對應的向量
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items(): 
    if word in w2v_model:
        embedding_matrix[i] = np.asarray(w2v_model[word],
                                        dtype='float32')
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
# 搭建模型框架
keras.backend.clear_session()
model = Sequential()
model.add(embedding_layer)
model.add(LSTM(200, dropout=0.2, recurrent_dropout=0.2))
model.add(BatchNormalization())
model.add(Dense(y_train_dummy.shape[1], activation='softmax'))
model.summary()
# 載入 Callbacks, 並將 monitor 設定為監控 validation loss
earlystop = EarlyStopping(monitor="val_acc", 
                        patience=5, 
                        verbose=1)
model.compile(loss='categorical_crossentropy',
            optimizer= tf.keras.optimizers.Adam(lr=LEARNING_RATE, epsilon=None, decay=0.0),
            metrics=['acc'])

model.fit(pre_vec_train, y_train_dummy,
        epochs=EPOCHS, 
        validation_split=0.2,
        batch_size=BATCH_SIZE,
        shuffle=True,
        callbacks=[earlystop]
        )
model.evaluate(pre_vec_test, y_test_dummy)

Found 450075 unique tokens.
Shape of pre_vec_train tensor: (40000, 300)
Shape of pre_vec_test tensor: (10000, 300)
Shape of y_train tensor: (40000, 10)
Shape of y_test tensor: (10000, 10)


  if word in w2v_model:
  embedding_matrix[i] = np.asarray(w2v_model[word],


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 200)          90015200  
_________________________________________________________________
lstm (LSTM)                  (None, 200)               320800    
_________________________________________________________________
batch_normalization (BatchNo (None, 200)               800       
_________________________________________________________________
dense (Dense)                (None, 10)                2010      
Total params: 90,338,810
Trainable params: 323,210
Non-trainable params: 90,015,600
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.4427754878997803, 0.8496999740600586]

In [12]:
# 預訓練的詞向量特徵
MAX_SEQUENCE_LENGTH = 300 # 每条新闻最大长度
EMBEDDING_DIM = 400 # 词向量空间维度

## 超參數設定
LEARNING_RATE = 1e-3
EPOCHS = 10
BATCH_SIZE = 128
MOMENTUM = 0.95
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(df['CONTENT_SEG'])
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

pre_vec_train = tokenizer.texts_to_sequences(X_train) 
pre_vec_train = sequence.pad_sequences(pre_vec_train, maxlen=MAX_SEQUENCE_LENGTH)
pre_vec_test = tokenizer.texts_to_sequences(X_test)
pre_vec_test = sequence.pad_sequences(pre_vec_test, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of pre_vec_train tensor:', pre_vec_train.shape)
print('Shape of pre_vec_test tensor:', pre_vec_test.shape)

y_train_dummy = to_categorical(np.asarray(y_train))
y_test_dummy = to_categorical(np.asarray(y_test))
print('Shape of y_train_dummy tensor:', y_train_dummy.shape)
print('Shape of y_test_dummy tensor:', y_test_dummy.shape)

embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items(): 
    if word in pretrain_w2v_model:
        embedding_matrix[i] = np.asarray(pretrain_w2v_model[word],
                                        dtype='float32')
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

keras.backend.clear_session()
model = Sequential()
model.add(embedding_layer)
model.add(LSTM(200, dropout=0.2, recurrent_dropout=0.2))
model.add(BatchNormalization())
model.add(Dense(y_train_dummy.shape[1], activation='softmax'))
model.summary()

# 載入 Callbacks, 並將 monitor 設定為監控 validation loss
earlystop = EarlyStopping(monitor="val_acc", 
                        patience=5, 
                        verbose=1)

model.compile(loss='categorical_crossentropy',
            optimizer= tf.keras.optimizers.Adam(lr=LEARNING_RATE, epsilon=None, decay=0.0),
            metrics=['acc'])

%time
model.fit(pre_vec_train, y_train_dummy,
        epochs=EPOCHS, 
        validation_split=0.2,
        batch_size=BATCH_SIZE,
        shuffle=True,
        callbacks=[earlystop]
        )
model.evaluate(pre_vec_test, y_test_dummy)

Found 450075 unique tokens.
Shape of pre_vec_train tensor: (40000, 300)
Shape of pre_vec_test tensor: (10000, 300)
Shape of y_train_dummy tensor: (40000, 10)
Shape of y_test_dummy tensor: (10000, 10)


  if word in pretrain_w2v_model:
  embedding_matrix[i] = np.asarray(pretrain_w2v_model[word],


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 400)          180030400 
_________________________________________________________________
lstm (LSTM)                  (None, 200)               480800    
_________________________________________________________________
batch_normalization (BatchNo (None, 200)               800       
_________________________________________________________________
dense (Dense)                (None, 10)                2010      
Total params: 180,514,010
Trainable params: 483,210
Non-trainable params: 180,030,800
_________________________________________________________________
Wall time: 0 ns
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.51242595911026, 0.8428999781608582]