### Tensorflow 2.0 RNN(遞歸神經網路)-LSTM(長短期記憶網路)情感分類範例
資料集使用keras.dataset中的IMDB影評資料，共50000條評論，訓練集25000和測試集25000被標記為正向(1)/負向(0)評論。<br>
Ref: (https://blog.csdn.net/Forlogen/article/details/101363987)<br>
     (https://medium.com/bandai的機器學習筆記/05-29-機器學習筆記-讓a-i自己寫文章-英文實作篇-莎士比亞-92b3fba71126)

In [1]:
# tensorflow的版本要2.0以上才能用

In [2]:
# 『情緒分析』(Sentiment Analysis) 
# 匯入需要的套件
from __future__ import absolute_import, division, print_function, unicode_literals
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass

import tensorflow as tf
import tensorflow.keras as keras
import numpy as np
import os
import matplotlib.pyplot as plt
# IPython 提供的Magic Command
%matplotlib inline

In [3]:
# 其他常用Magic Command
# %load %save

In [4]:
tf.__version__

'2.9.0'

In [5]:
# 超参数
vocab_size = 10000
max_review_length = 256  # RNN句子長度要相同
embedding_dim = 100  # 維度
units = 64
num_classes = 2
batch_size = 256
epochs = 5

In [6]:
# 載入IMDB資料集
imdb = keras.datasets.imdb

In [7]:
(train_data,train_labels),(test_data,test_labels) = imdb.load_data(num_words=vocab_size)
train_labels[0]
# 正向

1

In [8]:
test_labels[0]
# 負向

0

In [9]:
print(len(train_labels), len(train_data))

25000 25000


In [10]:
print(len(test_labels), len(test_data))

25000 25000


In [11]:
# 訓練集的資料個數和標籤個數
print("train_data length: {}, train_labels length: {}".format(len(train_data),len(train_labels)))

train_data length: 25000, train_labels length: 25000


In [12]:
train_data
# 把評論的文字拆成對應的編號

array([list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]),
       list([1, 194, 1153, 194, 8255, 78, 228,

In [13]:
len(train_data[0])
# 看一下每筆資料長度是否相同

218

In [14]:
# 訓練集資料0和資料1的長度
# 因為只取最常見的前10000個單詞，單詞會被編為1-10000的整數值
print("train_data 0 length: {}, 1 length: {}".format(len(train_data[0]), len(train_data[1])))

train_data 0 length: 218, 1 length: 189


In [15]:
# 建立字典，這裡不須自己寫程式處理，直接使用imdb.get_word_index()就可抓到字典
word_index = imdb.get_word_index()

In [16]:
word_index

{'fawn': 34701,
 'tsukino': 52006,
 'nunnery': 52007,
 'sonja': 16816,
 'vani': 63951,
 'woods': 1408,
 'spiders': 16115,
 'hanging': 2345,
 'woody': 2289,
 'trawling': 52008,
 "hold's": 52009,
 'comically': 11307,
 'localized': 40830,
 'disobeying': 30568,
 "'royale": 52010,
 "harpo's": 40831,
 'canet': 52011,
 'aileen': 19313,
 'acurately': 52012,
 "diplomat's": 52013,
 'rickman': 25242,
 'arranged': 6746,
 'rumbustious': 52014,
 'familiarness': 52015,
 "spider'": 52016,
 'hahahah': 68804,
 "wood'": 52017,
 'transvestism': 40833,
 "hangin'": 34702,
 'bringing': 2338,
 'seamier': 40834,
 'wooded': 34703,
 'bravora': 52018,
 'grueling': 16817,
 'wooden': 1636,
 'wednesday': 16818,
 "'prix": 52019,
 'altagracia': 34704,
 'circuitry': 52020,
 'crotch': 11585,
 'busybody': 57766,
 "tart'n'tangy": 52021,
 'burgade': 14129,
 'thrace': 52023,
 "tom's": 11038,
 'snuggles': 52025,
 'francesco': 29114,
 'complainers': 52027,
 'templarios': 52125,
 '272': 40835,
 '273': 52028,
 'zaniacs': 52130,

In [17]:
# 定義反編碼函式
word_index = {key: (value + 3) for key, value in word_index.items()}

word_index["<PAD>"] = 0   # 把句子補成相同長度
word_index["<START>"] = 1
word_index["<UNK>"] = 2
word_index["<UNSED>"] = 3

In [18]:
def decode_review(text):
    # dict.get(key, default=None)，字典中查找key值，有就回傳對應的value；否則回傳預設值。
    return ' '.join([reversed_word_index.get(i, '?') for i in text])

In [19]:
print(list(word_index.items())[:5])

[('fawn', 34704), ('tsukino', 52009), ('nunnery', 52010), ('sonja', 16819), ('vani', 63954)]


In [20]:
reversed_word_index = dict([(value, key) for (key, value) in word_index.items()])

In [21]:
print(list(reversed_word_index.items())[:5])

[(34704, 'fawn'), (52009, 'tsukino'), (52010, 'nunnery'), (16819, 'sonja'), (63954, 'vani')]


In [22]:
#顯示訓練集資料0的編碼值
print("Coded review:")
print(train_data[0])

#顯示訓練集資料0的反編碼(英語)值
print("Decoded review:")
print(decode_review(train_data[0]))

Coded review:
[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
Decoded review:
<START> this film was just bri

In [23]:
"""
為了後續資料訓練的需求，每筆資料序列的長度必須一致，所以要進行補齊或裁切的動作
將所有的輸入填充或縮減為最長資料的長度
資料前處理
設定最長序列為256，將每筆資料不足的部分填充為<PAD>
maxlen: None或整數，為序列的最大長度。大於此長度的序列將被截短；小於此長度的序列將在後面補0或指定的值(value)。
padding: 'pre'或'post'，補在序列前面或是後面。 
"""

train_data = keras.preprocessing.sequence.pad_sequences(train_data, 
                                                        value=word_index["<PAD>"],
                                                        padding='post',
                                                        maxlen=max_review_length)
test_data = keras.preprocessing.sequence.pad_sequences(test_data, 
                                                       value=word_index["<PAD>"],
                                                       padding='post',
                                                       maxlen=max_review_length )

tf.keras.utils.pad_sequences #2.9的版本用這個

In [24]:
# 補齊長度後的訓練集資料0和資料1的長度
print("train_data 0 length: {}, 1 length: {}".format(len(train_data[0]), len(train_data[1])))

train_data 0 length: 256, 1 length: 256


In [25]:
len(word_index)

88588

In [26]:
# 建立模型，方法一:
"""
        Embedding: Turns positive integers (indexes) into dense vectors of fixed size.
        e.g. [[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]]
        This layer can only be used as the first layer in a model.
        輸入圖層。一個可訓練的查找表，它將每個字符的數字映射到具有embedding_dim維度的向量
"""
"""
LSTM: 四個unit
    Input Gate : 控制是否讓此次的值輸入下一次運算，並計算值(激勵函數))
    Forget Gate : 清空memory?
    Memory Cell : 要不要把計算出來的值儲存，讓下一個cell使用
                  (此次計算的值 + 原來memory紀錄的值) * Forget Gate(機率)
    Output Gate : 值要不要輸出
LSTM 除了從頭開始訓練，也會從後面往回推回來訓練(雙向)
"""
def get_model():
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(len(word_index), embedding_dim),   # 把正整數轉成密度向量，固定大小 = 100
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units)),
        tf.keras.layers.Dense(units, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    return model

In [27]:
model = get_model()
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         8858800   
                                                                 
 bidirectional (Bidirectiona  (None, 128)              84480     
 l)                                                              
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 8,951,601
Trainable params: 8,951,601
Non-trainable params: 0
_________________________________________________________________


In [30]:
model.compile(optimizer = keras.optimizers.Adam(0.001),
              loss = keras.losses.BinaryCrossentropy(from_logits=True),
              metrics = ['accuracy'])

In [31]:
model.fit(train_data, train_labels,
          epochs=epochs, batch_size=batch_size,
          validation_data = (test_data, test_labels))

Epoch 1/5


  return dispatch_target(*args, **kwargs)


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x17b487c4c10>

In [32]:
# Gated Recurrent Unit - RNN的變形，能加快執行速度及減少記憶體的耗用
def GRU_Model():
    model = keras.Sequential([
        keras.layers.Embedding(input_dim=vocab_size, output_dim=32, input_length=max_review_length),
        keras.layers.GRU(32, return_sequences = True),
        keras.layers.GRU(1, activation='sigmoid', return_sequences=False)
    ])
    
    model.compile(optimizer=keras.optimizers.Adam(0.001),
                  loss=keras.losses.BinaryCrossentropy(from_logits=True),
                  metrics=['accuracy'])
    
    return model

In [33]:
model = GRU_Model()
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 256, 32)           320000    
                                                                 
 gru (GRU)                   (None, 256, 32)           6336      
                                                                 
 gru_1 (GRU)                 (None, 1)                 105       
                                                                 
Total params: 326,441
Trainable params: 326,441
Non-trainable params: 0
_________________________________________________________________


In [34]:
model.fit(train_data, train_labels,
          epochs=epochs, batch_size=batch_size,
          validation_data = (test_data, test_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x17b43203e80>

In [35]:
# 建立模型，方法二: 自定義模型，比較有彈性
class RNNModel(keras.Model):

    def __init__(self, units, num_classes, num_layers):
        super(RNNModel,self).__init__()
        self.units = units
        self.embedding = keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_review_length)
        self.lstm = keras.layers.Bidirectional(keras.layers.LSTM(self.units))
        self.dense = keras.layers.Dense(1)
        
    def call(self, x, training=None, mask=None):
        x = self.embedding(x)
        x = self.lstm(x)
        x = self.dense(x)
        
        return x

In [36]:
model = RNNModel(units, num_classes, num_layers=2)

model.compile(optimizer = keras.optimizers.Adam(0.001),
              loss = keras.losses.BinaryCrossentropy(from_logits=True),
              metrics = ['accuracy'])

In [37]:
model.fit(train_data, train_labels,
          epochs=epochs, batch_size=batch_size,
          validation_data = (test_data, test_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x17b3ee30bb0>

In [39]:
get_ipython().run_cell_magic('time', '', 'history = model.fit(train_data, train_labels, batch_size=batch_size, epochs=epochs, validation_split=0.3)')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5

KeyboardInterrupt: 

In [None]:
result = model.evaluate(test_data, test_labels)

In [None]:
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.legend(['training','validation'], loc = 'upper left')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.show()