## LSTM by Hand

通过自定义层实现LSTM，学习自Tensorflow Codelab线下活动(20201114)  
分享内容参考      https://zhuanlan.zhihu.com/p/293208563  
自定义LSTM层来源: https://www.bilibili.com/video/BV1FV41117Uz/  
  
  
[LSTM简介](https://zh.wikipedia.org/wiki/%E9%95%B7%E7%9F%AD%E6%9C%9F%E8%A8%98%E6%86%B6)

In [1]:
import tensorflow as tf
import numpy as np
import jieba
import pandas as pd

from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# pandas display settings used when frames are rendered in this notebook.
pd.options.display.max_columns = None  # never hide columns
pd.options.display.max_colwidth = 80  # truncate long cell text at 80 characters
pd.options.display.precision = 4
pd.options.display.max_rows = 999
pd.options.display.float_format = '{:.4f}'.format  # avoid scientific notation; show 4 decimal places

In [2]:
# Turn on jieba's paddle-based segmentation mode.
# NOTE(review): the jieba.cut calls later in this notebook pass only cut_all=False
# (no use_paddle=True), so paddle mode does not appear to be used — confirm this call is needed.
jieba.enable_paddle()

Paddle enabled successfully......


In [3]:
# Toy dimensions for the hand-written LSTM walkthrough.
batch_size = 4
sequence_length = 5  # number of time steps per sample
input_size = 30  # features per time step
output_size = 20  # width of the hidden state / layer output

# Random input batch of shape (batch_size, sequence_length, input_size).
x = tf.random.uniform((batch_size, sequence_length, input_size))

In [4]:
# Confirm the input batch shape: (batch_size, sequence_length, input_size).
x.shape

TensorShape([4, 5, 30])

In [5]:
# LSTM's input: [batch_size, sequence_length, input_size]
# LSTM's output1: [batch_size, sequence_length, output_size]  (hidden state of every time step)
#        output2: [batch_size, output_size]                   (hidden state of the last time step only)


In [6]:
# Slice out the first time step: every batch row, time index 0, every feature.
xt = x[:, 0, :]

In [7]:
# Indexing a single time step drops the time axis: (batch_size, input_size).
xt.shape

TensorShape([4, 30])

### 按照LSTM的公式写出计算过程

In [8]:
# Gate suffixes: f = forget, i = input, o = output, c = candidate cell state.
# Each group is drawn in the order f, i, o, c.

# Input-to-hidden kernels W_*: (input_size, output_size).
wf, wi, wo, wc = [tf.random.uniform((input_size, output_size)) for _ in range(4)]

# Hidden-to-hidden (recurrent) kernels U_*: (output_size, output_size).
uf, ui, uo, uc = [tf.random.uniform((output_size, output_size)) for _ in range(4)]

# Biases b_*: (1, output_size), broadcast over the batch axis.
bf, bi, bo, bc = [tf.random.uniform((1, output_size)) for _ in range(4)]



In [9]:
# Unroll the LSTM over the time axis, one step per iteration:
#
#   f_t  = sigmoid(x_t W_f + h_{t-1} U_f + b_f)    forget gate
#   i_t  = sigmoid(x_t W_i + h_{t-1} U_i + b_i)    input gate
#   o_t  = sigmoid(x_t W_o + h_{t-1} U_o + b_o)    output gate
#   c~_t = tanh(   x_t W_c + h_{t-1} U_c + b_c)    candidate cell state
#   c_t  = f_t * c_{t-1} + i_t * c~_t
#   h_t  = o_t * tanh(c_t)
#
# Starting from h_0 = c_0 = 0 makes the first step a regular step
# (f_1 * 0 vanishes and the U terms are zero), so no special case is needed.
sequence_outputs = []
ht = tf.zeros((batch_size, output_size))
ct = tf.zeros((batch_size, output_size))

for i in range(sequence_length):
    # Input of the *current* time step (not always step 0).
    xt = x[:, i, :]

    # The gates see both the current input and the previous hidden state.
    ft = tf.sigmoid(tf.matmul(xt, wf) + tf.matmul(ht, uf) + bf)
    it = tf.sigmoid(tf.matmul(xt, wi) + tf.matmul(ht, ui) + bi)
    ot = tf.sigmoid(tf.matmul(xt, wo) + tf.matmul(ht, uo) + bo)
    cht = tf.tanh(tf.matmul(xt, wc) + tf.matmul(ht, uc) + bc)

    # Update the cell state, then expose a gated view of it as the hidden state.
    ct = ft * ct + it * cht
    ht = ot * tf.tanh(ct)

    sequence_outputs.append(ht)

In [10]:
# Join the per-step hidden states along a new time axis placed at position 1,
# giving a batch-major tensor (batch_size, sequence_length, output_size).
sequence_outputs = tf.stack(sequence_outputs, axis=1)

In [11]:
# Display the stacked hidden states: (batch_size, sequence_length, output_size).
sequence_outputs

<tf.Tensor: shape=(4, 5, 20), dtype=float32, numpy=
array([[[0.7571146 , 0.76023316, 0.75638837, 0.7600371 , 0.7607675 ,
         0.76129615, 0.7582463 , 0.7598633 , 0.76045233, 0.7598843 ,
         0.7596849 , 0.7601457 , 0.75801605, 0.7604064 , 0.76047444,
         0.75955904, 0.7567714 , 0.7599652 , 0.7603811 , 0.7583166 ],
        [0.9596851 , 0.9629085 , 0.9577052 , 0.9630542 , 0.96312475,
         0.9633364 , 0.96006715, 0.96258503, 0.9626385 , 0.9621381 ,
         0.9618838 , 0.96242076, 0.96143407, 0.96283245, 0.9631819 ,
         0.96082014, 0.9594744 , 0.9626155 , 0.96266365, 0.96117103],
        [0.9909878 , 0.99412656, 0.9886444 , 0.9943617 , 0.9942011 ,
         0.9946081 , 0.99110043, 0.9939784 , 0.9937889 , 0.99327695,
         0.9929588 , 0.9934904 , 0.99324894, 0.99398035, 0.9944741 ,
         0.9922902 , 0.99102724, 0.99385715, 0.9938059 , 0.99254787],
        [0.99533576, 0.9984539 , 0.99292237, 0.99870116, 0.9984954 ,
         0.9990082 , 0.9953959 , 0.9983723 , 0.9

### 利用LSTM计算过程创建自定义LSTM层

In [12]:
class CustomLSTM(tf.keras.layers.Layer):
    """Minimal LSTM layer written out gate by gate.

    For every time step t the layer computes

        f_t  = sigmoid(x_t @ W_f + h_{t-1} @ U_f + b_f)    forget gate
        i_t  = sigmoid(x_t @ W_i + h_{t-1} @ U_i + b_i)    input gate
        o_t  = sigmoid(x_t @ W_o + h_{t-1} @ U_o + b_o)    output gate
        c~_t = tanh(   x_t @ W_c + h_{t-1} @ U_c + b_c)    candidate cell state
        c_t  = f_t * c_{t-1} + i_t * c~_t
        h_t  = o_t * tanh(c_t)

    starting from h_0 = c_0 = 0. No masking is applied, so padded time
    steps are processed like any other step.

    Input shape:
        [batch_size, sequence_length, input_size]
    Output shape:
        [batch_size, sequence_length, output_size] if ``return_sequence``
        else [batch_size, output_size] (hidden state of the last step).

    Args:
        output_size: Width of the hidden state and of the layer output.
        return_sequence: If True return the hidden state of every time
            step, otherwise only the last one.
        **kwargs: Forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``).
    """

    def __init__(self, output_size, return_sequence=False, **kwargs):
        super(CustomLSTM, self).__init__(**kwargs)
        self.output_size = output_size
        self.return_sequence = return_sequence

    def build(self, input_shape):
        """Create the kernels, recurrent kernels and biases of the four gates."""
        input_size = int(input_shape[-1])
        kernel_shape = (input_size, self.output_size)
        recurrent_shape = (self.output_size, self.output_size)
        bias_shape = (1, self.output_size)  # broadcast over the batch axis

        # `name` is passed by keyword: newer Keras versions take `shape` as the
        # first positional argument of add_weight.
        self.wf = self.add_weight(name='wf', shape=kernel_shape)
        self.wi = self.add_weight(name='wi', shape=kernel_shape)
        self.wo = self.add_weight(name='wo', shape=kernel_shape)
        self.wc = self.add_weight(name='wc', shape=kernel_shape)

        self.uf = self.add_weight(name='uf', shape=recurrent_shape)
        self.ui = self.add_weight(name='ui', shape=recurrent_shape)
        self.uo = self.add_weight(name='uo', shape=recurrent_shape)
        self.uc = self.add_weight(name='uc', shape=recurrent_shape)

        self.bf = self.add_weight(name='bf', shape=bias_shape)
        self.bi = self.add_weight(name='bi', shape=bias_shape)
        self.bo = self.add_weight(name='bo', shape=bias_shape)
        self.bc = self.add_weight(name='bc', shape=bias_shape)

        super(CustomLSTM, self).build(input_shape)

    def _step(self, state, xt):
        """Advance the recurrence by one time step.

        Args:
            state: Tuple ``(h_{t-1}, c_{t-1})``, each [batch_size, output_size].
            xt: Input of the current step, [batch_size, input_size].

        Returns:
            Tuple ``(h_t, c_t)`` with the same shapes as ``state``.
        """
        ht_prev, ct_prev = state
        ft = tf.sigmoid(tf.matmul(xt, self.wf) + tf.matmul(ht_prev, self.uf) + self.bf)
        it = tf.sigmoid(tf.matmul(xt, self.wi) + tf.matmul(ht_prev, self.ui) + self.bi)
        ot = tf.sigmoid(tf.matmul(xt, self.wo) + tf.matmul(ht_prev, self.uo) + self.bo)
        cht = tf.tanh(tf.matmul(xt, self.wc) + tf.matmul(ht_prev, self.uc) + self.bc)
        ct = ft * ct_prev + it * cht
        ht = ot * tf.tanh(ct)
        return ht, ct

    def call(self, x):
        """Run the LSTM over the time axis of ``x``.

        The number of steps is taken from ``x`` itself rather than from a
        notebook-level constant, and ``tf.scan`` is used so that the layer
        also works when the sequence length is unknown while the model is
        being built (e.g. behind an Embedding layer).
        """
        # Zero initial hidden and cell state, sized from the runtime batch.
        batch = tf.shape(x)[0]
        zero_state = tf.zeros([batch, self.output_size], dtype=x.dtype)

        # tf.scan iterates over axis 0, so put time first.
        x_time_major = tf.transpose(x, (1, 0, 2))
        hidden_states, _ = tf.scan(
            self._step, x_time_major, initializer=(zero_state, zero_state)
        )

        if self.return_sequence:
            # Back to batch-major: [batch_size, sequence_length, output_size].
            return tf.transpose(hidden_states, (1, 0, 2))
        # Hidden state of the final time step: [batch_size, output_size].
        return hidden_states[-1]

### 模拟数据观察自定义LSTM层的输出结果

In [13]:
# Fresh random batch (batch_size, sequence_length, input_size) to feed the custom layer.
x = tf.random.uniform((batch_size, sequence_length, input_size))

In [14]:
# return_sequence keeps its default (False), so only the last step's output is returned.
lstm = CustomLSTM(output_size=output_size)

In [15]:
# The first call creates the layer's weights via build(), then runs the forward pass.
lstm(x)

<tf.Tensor: shape=(4, 20), dtype=float32, numpy=
array([[-1.13457926e-01, -1.16072342e-01,  1.11910909e-01,
        -7.60515332e-02,  5.98283000e-02, -7.51171052e-01,
         2.45601207e-01,  1.67988434e-01, -2.49476284e-01,
         1.23574570e-01, -1.69230461e-01,  2.44344841e-03,
         2.81893879e-01, -1.09449737e-01,  5.97361065e-02,
        -5.30856311e-01, -3.31262834e-02, -8.70979056e-02,
        -3.75998281e-02,  1.13871396e-01],
       [-3.98984402e-01, -2.50022531e-01,  3.39957178e-01,
        -7.99013376e-02,  1.78711563e-01, -7.31003881e-01,
         1.55839369e-01, -7.69722238e-02, -4.26059775e-02,
        -2.04587296e-01, -1.92697451e-01,  1.14462167e-01,
         2.89757639e-01, -4.26560223e-01,  2.42148545e-02,
        -3.62793118e-01, -1.44181341e-01, -5.50301820e-02,
        -2.69002914e-01, -6.95905909e-02],
       [-1.25852779e-01, -2.03309968e-01,  1.47665858e-01,
        -5.91915734e-02,  3.53709096e-03, -6.62378013e-01,
         1.45682007e-01,  1.52762264e-0

### 使用自定义的LSTM层在随机数据上进行训练

In [16]:
# Binary classifier for the random-data experiment:
# custom LSTM (last hidden state, 32 units) -> 2-way softmax.
model = tf.keras.Sequential()
model.add(CustomLSTM(output_size=32))
model.add(tf.keras.layers.Dense(2, activation='softmax'))

# Integer class labels are used, hence the sparse categorical cross-entropy.
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
)

In [17]:
# One toy batch: random features plus random integer labels drawn from {0, 1}.
x_batch = tf.random.uniform((batch_size, sequence_length, input_size))
y_batch = tf.random.uniform((batch_size,), maxval=2, dtype=tf.int32)

In [18]:
# Feature batch shape: (batch_size, sequence_length, input_size).
x_batch.shape

TensorShape([4, 5, 30])

In [19]:
# One integer label per sample: (batch_size,).
y_batch.shape

TensorShape([4])

In [20]:
# Inspect the random labels.
y_batch

<tf.Tensor: shape=(4,), dtype=int32, numpy=array([0, 1, 0, 1], dtype=int32)>

In [21]:
# A single gradient step on the toy batch; returns the scalar loss
# (the model was compiled without metrics).
model.train_on_batch(x_batch, y_batch)



0.720816969871521

In [22]:
# Larger synthetic dataset: batch_size * 1000 samples with random binary labels.
# Features and labels are independent random draws, so there is no real signal to learn.
x_data = tf.random.uniform((batch_size * 1000, sequence_length, input_size))
y_data = tf.random.uniform((batch_size * 1000,), maxval=2, dtype=tf.int32)

In [23]:
# Train on the synthetic data in mini-batches of 4; epochs is left at the Keras default.
model.fit(x_data, y_data, batch_size=4)



<tensorflow.python.keras.callbacks.History at 0x7fa1861d6390>

In [24]:
# Same fit call again: training continues from the model's current weights.
model.fit(x_data, y_data, batch_size=4)



<tensorflow.python.keras.callbacks.History at 0x7fa18d48c550>

In [25]:
# Another pass with the same fit call; training continues from the model's current weights.
model.fit(x_data, y_data, batch_size=4)



<tensorflow.python.keras.callbacks.History at 0x7fa18d4b2710>

### 使用自定义LSTM层对文本数据集进行实战

In [26]:
from zh_dataset_inews import title_train, label_train, content_train, title_test, label_test, content_test

In [87]:
# Peek at the first ten (title, label) training pairs.
# Dedicated loop names are used so the loop does not overwrite the
# notebook-level tensor `x` defined in earlier cells.
for title, label in zip(title_train[:10], label_train[:10]):
    print(title, label)

周六晚到卖场听夜场摇滚 1
北京老教授泄露，持有山河药辅节后下跌公告，速速看看！！！ 1
张滩镇积极开展基干民兵训练活动 0
俩小伙无证骑摩托，未成年还试图闯卡！ 2
不好意思，你不配做深圳人!_搜狐汽车_搜狐网 2
蔡英文元旦升旗遇抗议 民众：枪毙蔡英文 2
巢湖市绞吸机械清淤公司重在回访-照明器材项目合作–光波网 1
出租屋半年被偷8次：整栋楼共用一个锁芯 2
从林芝到拉萨，还可以这样玩! 1
为何说奇瑞是技术达人? 看了“雄狮”你就懂了 1


In [27]:
# Segment every title with jieba (cut_all=False) and re-join the tokens with single
# spaces, so that a whitespace-based tokenizer can recover the word boundaries.
title_train_cut = [" ".join(jieba.cut(title, cut_all=False)) for title in title_train]
title_test_cut = [" ".join(jieba.cut(title, cut_all=False)) for title in title_test]

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/w2/n3x0x93n5klgjv3j05dp2wqc0000gp/T/jieba.cache
Loading model cost 1.064 seconds.
Prefix dict has been built successfully.


In [28]:
# Number of segmented training titles.
len(title_train_cut)

5355

In [29]:
# Preview the first ten segmented titles.
title_train_cut[:10]

['周六 晚到 卖场 听 夜场 摇滚',
 '北京 老 教授 泄露 ， 持有 山河 药辅 节后 下跌 公告 ， 速速 看看 ！ ！ ！',
 '张滩 镇 积极开展 基干民兵 训练 活动',
 '俩 小伙 无证 骑 摩托 ， 未成年 还 试图 闯卡 ！',
 '不好意思 ， 你 不配 做 深圳 人 ! _ 搜狐 汽车 _ 搜狐网',
 '蔡 英文 元旦 升旗 遇 抗议   民众 ： 枪毙 蔡 英文',
 '巢湖市 绞吸 机械 清淤 公司 重在 回访 - 照明 器材 项目 合作 – 光波 网',
 '出租屋 半年 被 偷 8 次 ： 整栋 楼 共用 一个 锁 芯',
 '从 林芝 到 拉萨 ， 还 可以 这样 玩 !',
 '为何 说 奇瑞 是 技术 达 人 ?   看 了 “ 雄狮 ” 你 就 懂 了']

In [30]:
# Text -> integer-id layer, created with its default settings.
text_vector = tf.keras.layers.experimental.preprocessing.TextVectorization()
# Learn the vocabulary from the pre-segmented training titles.
text_vector.adapt(title_train_cut)


In [46]:
# Size of the vocabulary learned by adapt(); used as the Embedding input dimension.
vocab_size = len(text_vector.get_vocabulary())
embedding_dim = 128  # width of each token embedding vector

In [47]:
# NOTE(review): x_data is the synthetic tensor from the random-data section;
# this looks like a leftover check unrelated to the text pipeline — confirm and remove.
type(x_data)

tensorflow.python.framework.ops.EagerTensor

对比 text_vector('你 好') 和 text_vector('你好') 的输出可以发现：TextVectorization 只按空白字符切分，不会对中文进行分词（'你好' 被当作一个整体映射为单个 id），因此需要先用 jieba 分词并用空格连接。   

In [48]:
# Space-separated input: two tokens, hence two ids.
text_vector('你 好')

<tf.Tensor: shape=(2,), dtype=int64, numpy=array([18, 98])>

In [49]:
# No space: the whole string is treated as one token and maps to a single id.
text_vector('你好')

<tf.Tensor: shape=(1,), dtype=int64, numpy=array([2896])>

In [50]:
# Vectorize each list in a single call (rather than title by title) so the result
# is one 2-D integer tensor per split.
# NOTE(review): train and test are vectorized in separate calls, so their padded
# widths may differ — confirm this is acceptable downstream.
title_train_text_vector = text_vector(title_train_cut)
title_test_text_vector  = text_vector(title_test_cut)


In [51]:
# Wrap the vectorized training titles in a tf.data.Dataset (one element per title).
# NOTE(review): not referenced by any later cell visible in this notebook.
test_input_dataset = tf.data.Dataset.from_tensor_slices(title_train_text_vector)

In [52]:
# Shape of the first ten vectorized titles: (10, padded_length).
title_train_text_vector[:10].shape

TensorShape([10, 44])

In [53]:
# Model inputs for the text experiment.
# NOTE(review): the vectorized titles appear to be tensors already, so
# convert_to_tensor is presumably just a re-binding here — confirm.
x_train = tf.convert_to_tensor(title_train_text_vector)
x_test  = tf.convert_to_tensor(title_test_text_vector)

In [54]:
# Check the container type of the training inputs.
type(x_train)

tensorflow.python.framework.ops.EagerTensor

In [55]:
# Labels as tensors, paired with x_train / x_test.
y_train = tf.convert_to_tensor(label_train)
y_test  = tf.convert_to_tensor(label_test)

In [56]:
# Standalone embedding layer used to pre-compute embeddings outside of any model.
# It is not part of a trained model in this notebook, so its weights stay at their
# initial values.
test_embedding_layer = tf.keras.layers.Embedding(vocab_size, embedding_dim)

In [57]:
# (number of training titles, padded title length).
x_train.shape

TensorShape([5355, 44])

In [58]:
# Embed the whole training set at once with the standalone layer, producing a
# fixed float tensor of shape (titles, tokens, embedding_dim).
x_train_embedding = test_embedding_layer(x_train)

In [59]:
# Embedding adds a trailing axis of size embedding_dim.
x_train_embedding.shape

TensorShape([5355, 44, 128])

In [80]:
# End-to-end text classifier: token ids -> trainable embedding -> custom LSTM
# (last hidden state, 32 units) -> 3-way softmax.
model_text = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    CustomLSTM(output_size=32), 
    tf.keras.layers.Dense(3, activation='softmax')
])

# Integer class labels are used, hence the sparse categorical cross-entropy;
# accuracy is tracked as an additional metric.
model_text.compile(
    loss = tf.keras.losses.SparseCategoricalCrossentropy(), 
    optimizer = tf.keras.optimizers.Adam(),
    metrics=['accuracy']
)

In [81]:
# Sanity-check the input shape before fitting.
x_train.shape

TensorShape([5355, 44])

In [82]:
# One label per training title.
y_train.shape

TensorShape([5355])

In [83]:
# Print the layer-by-layer output shapes and parameter counts.
model_text.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 128)         2328064   
_________________________________________________________________
custom_lstm_3 (CustomLSTM)   (None, 32)                20608     
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 99        
Total params: 2,348,771
Trainable params: 2,348,771
Non-trainable params: 0
_________________________________________________________________


In [84]:
# Train the end-to-end model (embedding included) for 20 epochs, holding out
# 10% of the training data for validation. Batch size is left at the Keras default.
history_model_text = model_text.fit(
    x_train, y_train, 
    validation_split=0.1, 
    epochs=20
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [60]:
# Variant that consumes pre-computed embeddings: there is no Embedding layer
# inside the model, only the custom LSTM (32 units) and a 3-way softmax.
model_text_after_embedding = tf.keras.Sequential([
    CustomLSTM(output_size=32), 
    tf.keras.layers.Dense(3, activation='softmax')
])

# Same loss, optimizer and metric as the end-to-end text model.
model_text_after_embedding.compile(
    loss = tf.keras.losses.SparseCategoricalCrossentropy(), 
    optimizer = tf.keras.optimizers.Adam(),
    metrics=['accuracy']
)

In [63]:
# Train on the pre-computed embedding tensor for 20 epochs with batches of 128,
# holding out 10% for validation. The embeddings are fixed inputs here, so only
# the LSTM and Dense weights are updated.
history_model_text_after_embedding = model_text_after_embedding.fit(
    x_train_embedding, y_train, 
    validation_split=0.1, 
    epochs=20,
    batch_size=128
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [65]:
# The first four embedded titles as a small batch, followed by its shape.
x_train_embedding_batch = x_train_embedding[:4, :, :]
x_train_embedding_batch.shape

TensorShape([4, 44, 128])

In [67]:
# Labels for the same four titles, followed by their shape.
y_train_embedding_batch = y_train[:4]
y_train_embedding_batch.shape

TensorShape([4])

In [69]:
# One gradient step on the small batch; returns [loss, accuracy] because the
# model was compiled with metrics=['accuracy'].
model_text_after_embedding.train_on_batch(x_train_embedding_batch, y_train_embedding_batch)



[0.9686881899833679, 0.5148147940635681]

In [75]:
# Re-check the shape of the pre-computed embeddings.
x_train_embedding.shape

TensorShape([5355, 44, 128])

In [74]:
# One more pass over the pre-embedded data, this time with batch_size=10.
model_text_after_embedding.fit(x_train_embedding, y_train, batch_size = 10)



<tensorflow.python.keras.callbacks.History at 0x7fa1118b1f10>

In [71]:
# Shape of the pre-computed embeddings (same check as in an earlier cell).
x_train_embedding.shape

TensorShape([5355, 44, 128])

In [None]:
# Loss and accuracy of the end-to-end text model on the test split.
model_text.evaluate(x_test, y_test)

In [None]:
# Softmax output for every test title: one probability per class.
y_test_pred = model_text.predict(x_test)

In [None]:
# Number of predicted labels (argmax over the class axis).
len(y_test_pred.argmax(axis=1))

In [None]:
# Number of test samples; should match the number of predictions.
len(x_test)

In [None]:
# Side-by-side table of raw title, true label and predicted label (argmax of the softmax output).
output_check = pd.DataFrame({'title_test': title_test, 'label_test': label_test, 'y_test_pred': y_test_pred.argmax(axis=1)})

In [None]:
# Display the comparison table.
output_check

In [None]:
# Rows where the predicted label disagrees with the true label.
output_check.query('label_test != y_test_pred')

In [85]:
# NOTE(review): looks like a leftover debugging cell; it only prints 1.
print(1)

1
