### A template for a seq2seq model with an attention layer

#### Description:

- 使用Seq2Seq無掩碼、Seq2Seq基礎模型、Seq2Seq+Attention三種方法
- Key concept: Encoder, Decoder


Seq2Seq模型實現並不難，但是需要考慮如下幾個細節：

- 輸入序列長度不一致如何處理，掩碼屏蔽存在什麼問題
- 如何解決RNN模型固有的長期依賴問題
- Encoder 和 Decoder 之間傳遞的到底是什麼


Ref:https://hk.codetreasures.com/blog/detail/e5rBqs3qAw

- Sequence pad: 透過填充將不同長度的句子轉為相同長度，例如以填充字符 "#"（id 0）補齊較短的 sequence


- Mask: 填充之後的句子存在大量無意義的填充字符 "#"，這是我們實際結果不需要的。Mask 的功用為告知模型忽略填充字符的影響，專注於實際重要的數據，也能避免填充位置在 loss 計算時被考慮進去
>在 Keras 中只需在 Embedding 層加入 mask_zero = True 即可

- Encoder和Decoder之間存在一個context，這個 context 其實是：Encoder最後一個時間步RNN的隱藏層狀態

- Teacher forcing:Decoder 在訓練過程中引入先驗知識：將標籤 Y 作為輸入；而推理過程則採用上一個時間步的輸出作為下一個時間步的輸入

In [4]:
import numpy as np
import tensorflow as tf
import collections
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import layers, optimizers, datasets
import os,sys,tqdm



import random
import string


## 1. Create dataset

- 隨機生成文字序列, 非正常數據
- y1 開頭加入 `<start>`（作為 Decoder 的輸入）, y2 不加入（作為預測目標）

In [10]:
# Vocabulary: "#" takes id 0 (the value create_dataset pads with), followed
# by the start/end markers and the 26 uppercase letters.
words = ["#", "<start>", "<end>", *string.ascii_uppercase]

# Each id is simply the position of the token in `words` (0..28).
ids = list(range(len(words)))
words_to_ids = {token: index for index, token in enumerate(words)}
ids_to_words = {index: token for index, token in enumerate(words)}

MAX_LEN = 11  # sequence length after padding

def gen_string():
    """Generate a random uppercase string and its reverse.

    Returns:
        A pair ``(chars, chars_reversed)`` of lists of single-character
        strings. The length is drawn from 6 to 10 inclusive.
    """
    alphabet = list(string.ascii_uppercase)
    n_chars = np.random.randint(6, 11)  # upper bound is exclusive
    chars = [np.random.choice(alphabet) for _ in range(n_chars)]
    return list(chars), chars[::-1]


def create_dataset(batch_size):
    """Build a padded batch of random string-reversal examples.

    Args:
        batch_size: Number of examples to generate.

    Returns:
        A tuple ``(x, y1, y2)`` of post-padded id arrays. ``x`` holds the
        source strings (padded to length 10), ``y1`` the reversed strings
        prefixed with id 1 (``<start>``), and ``y2`` the reversed strings
        without that prefix; ``y1`` and ``y2`` are padded to ``MAX_LEN``.
    """
    sources, shifted_targets, targets = [], [], []
    for _ in range(batch_size):
        forward, backward = gen_string()
        backward_ids = [words_to_ids[ch] for ch in backward]
        sources.append([words_to_ids[ch] for ch in forward])
        shifted_targets.append([1] + backward_ids)
        targets.append(backward_ids)

    pad = keras.preprocessing.sequence.pad_sequences
    x = pad(sources, maxlen=10, padding='post')
    y1 = pad(shifted_targets, maxlen=MAX_LEN, padding='post')
    y2 = pad(targets, maxlen=MAX_LEN, padding='post')
    return x, y1, y2
        
        
        

In [15]:
# Generate 32 examples: encoder inputs (x), decoder inputs prefixed with the
# start id (y1), and the un-prefixed reversed sequences (y2).
train_x,train_y1,train_y2 = create_dataset(32)

(array([[28, 27, 23, 14,  9, 18, 27,  3,  0,  0],
        [13, 18,  7, 17,  5, 10, 28,  0,  0,  0],
        [20, 27,  8, 22, 20,  7,  9,  4,  0,  0],
        [15, 19,  7, 28, 26, 27,  0,  0,  0,  0],
        [19,  7,  9,  3, 12, 21, 20, 12,  0,  0],
        [26, 16, 20, 14, 14, 27,  4,  4,  0,  0],
        [15, 17, 19,  9, 15, 19, 26,  6, 18,  0],
        [16,  5, 19, 27, 11, 15,  0,  0,  0,  0],
        [17, 11, 24, 20,  7, 17,  0,  0,  0,  0],
        [16, 19,  9, 28,  4,  3,  0,  0,  0,  0],
        [23, 18,  4,  8, 17,  4, 20, 14,  0,  0],
        [24, 21, 19,  5,  4, 10,  8, 18, 26,  0],
        [ 6, 22,  5, 11,  7, 23,  7, 17,  0,  0],
        [19, 15, 25, 13,  4, 13,  0,  0,  0,  0],
        [26,  7, 21,  5, 22, 19,  0,  0,  0,  0],
        [ 5,  9,  4, 26, 13, 20,  9, 15,  0,  0],
        [10, 23, 28,  6, 22, 11,  6, 12, 18, 15],
        [ 3, 17, 23,  6,  8, 23,  4,  0,  0,  0],
        [ 5, 24, 20, 11, 18, 23, 12,  3, 26,  0],
        [20, 11, 22,  5,  5, 18, 24,  0,  0,  0],


## 2. Basic model

- No mask

In [12]:
class Encoder(keras.models.Model):
    """Embedding + LSTM encoder (no masking is applied).

    Args:
        vocab_size: Size of the embedding vocabulary.
        embed_size: Dimension of the token embeddings.
        units: Number of LSTM units.
        **kwargs: Forwarded to ``keras.models.Model``.
    """

    def __init__(self, vocab_size, embed_size=50, units=128, **kwargs):
        super().__init__(**kwargs)
        self.embedding_layer = keras.layers.Embedding(
            input_dim=vocab_size,
            output_dim=embed_size,
        )
        # Return both the per-step outputs and the final (h, c) states.
        self.lstm_layer = keras.layers.LSTM(
            units,
            return_sequences=True,
            return_state=True,
        )

    def call(self, inputs):
        """Encode token ids.

        Returns:
            ``(encoder_output, [state_h, state_c])``: the LSTM output
            sequence and its final hidden and cell states.
        """
        embedded = self.embedding_layer(inputs)
        sequence_output, final_h, final_c = self.lstm_layer(embedded)
        return sequence_output, [final_h, final_c]
    
    
class Decoder(keras.models.Model):
    """Embedding + LSTM decoder whose LSTM starts from a supplied state.

    Args:
        vocab_size: Size of the embedding vocabulary.
        embed_size: Dimension of the token embeddings.
        units: Number of LSTM units.
        **kwargs: Forwarded to ``keras.models.Model``.
    """

    def __init__(self, vocab_size, embed_size=50, units=128, **kwargs):
        super().__init__(**kwargs)
        self.embedding_layer = keras.layers.Embedding(
            input_dim=vocab_size,
            output_dim=embed_size,
        )
        # Return both the per-step outputs and the final (h, c) states.
        self.lstm_layer = keras.layers.LSTM(
            units,
            return_sequences=True,
            return_state=True,
        )

    def call(self, inputs, state):
        """Decode token ids starting from ``state``.

        Args:
            inputs: Decoder input token ids.
            state: Initial LSTM state, passed as ``initial_state``.

        Returns:
            ``(decoder_output, [state_h, state_c])``: the LSTM output
            sequence and its final hidden and cell states.
        """
        embedded = self.embedding_layer(inputs)
        sequence_output, final_h, final_c = self.lstm_layer(
            embedded, initial_state=state
        )
        return sequence_output, [final_h, final_c]
    
    

        
        

In [14]:
# Variable-length id sequences for the encoder and the decoder.
encoder_inputs = keras.layers.Input(shape=(None,), name='encoder_inputs')
decoder_inputs = keras.layers.Input(shape=(None,), name='decoder_inputs')

# The encoder's final state is used as the decoder's initial state.
encoder_output, encoder_state = Encoder(
    vocab_size=len(words_to_ids)
)(encoder_inputs)
decoder_output, decoder_state = Decoder(
    vocab_size=len(words_to_ids)
)(decoder_inputs, encoder_state)

# Per-step softmax over the vocabulary.
output = keras.layers.Dense(
    len(words_to_ids),
    activation='softmax',
    name='dense',
)(decoder_output)

model = keras.models.Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=[output],
)

In [None]:
# Targets are integer ids, hence the sparse categorical cross-entropy loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
# Teacher forcing: the decoder is fed train_y1 and trained to predict train_y2.
model.fit(x=[train_x, train_y1], y=train_y2, epochs=20)