# Neural Chinese Poetry to English Translation

## 1. Data Cleaning
- Read the data
- Save them as inputs and outputs

In [None]:
# Read the tab-separated parallel corpus (one "<chinese>\t<english>" pair per line).
with open('poet-with-translation3.txt', 'r', encoding='utf8') as corpus_file:
    all_lines = corpus_file.readlines()

# Hold out the lines at positions 25, 50, 75, ... of the file as the test set;
# every other line stays in `data` for training.
held_out_positions = range(25, len(all_lines), 25)
data_test = [all_lines[pos] for pos in held_out_positions]
data = [line for pos, line in enumerate(all_lines) if pos not in held_out_positions]
print(len(data_test))


125


In [None]:
from tqdm import tqdm
import re
# Patterns used to normalise the text before tokenisation.
punctuation_re = re.compile(r"([?.!,¿;:])")   # sentence punctuation -> padded with spaces
joiner_re = re.compile(r"(['-])")             # apostrophes / hyphens -> replaced by a space
blank_run_re = re.compile(r'[" "]+')          # runs of spaces (and double quotes) -> one space
cjk_quote_re = re.compile(r"([“”‘’])")        # curly quotes in the Chinese text -> space

inputs = []
outputs = []
for raw_line in tqdm(data):
    # Every line must contain exactly one tab: "<chinese>\t<english>".
    zh_text, en_text = raw_line.strip('\n').split('\t')
    en_text = punctuation_re.sub(r" \1 ", en_text.strip().lower())
    en_text = joiner_re.sub(" ", en_text)
    en_text = blank_run_re.sub(" ", en_text)
    inputs.append(cjk_quote_re.sub(" ", zh_text.strip()))
    outputs.append(en_text.strip().lower())


100%|██████████| 3005/3005 [00:00<00:00, 66717.57it/s]


In [None]:
# Preview the first ten cleaned Chinese lines.
print(inputs[:10])

['孤鸿海上来，', '池潢不敢顾；', '侧见双翠鸟，', '巢在三珠树。', '矫矫珍木巅，', '得无金丸惧？', '美服患人指，', '高明逼神恶。', '今我游冥冥，', '弋者何所慕？']


In [None]:
# Preview the first ten cleaned, lower-cased English lines.
print(outputs[:10])

['a lonely swan from the sea flies ,', 'to alight on puddles it does not deign .', 'nesting in the poplar of pearls', 'it spies and questions green birds twain :', 'don t you fear the threat of slings ,', 'perched on top of branches so high ?', 'nice clothes invite pointing fingers ,', 'high climbers god s good will defy .', 'bird hunters will crave me in vain ,', 'for i roam the limitless sky .']


### 1.1 English word splitter
我们将英文用空格隔开即可，但是需要稍微修改一下，将大写字母全部用小写字母代替。在上文中使用`.lower`进行了替代。

```py
for line in tqdm(data):
    [ch, en] = line.strip('\n').split('\t')
    inputs.append(ch.strip())
    outputs.append(en.strip().lower())

```
此处我们只需要将英文用空格分开即可。

In [None]:
# Tokenise every cleaned English sentence on single spaces (punctuation was
# already padded with spaces during cleaning, so it becomes its own token).
outputs = [sentence.split(' ') for sentence in outputs]

In [None]:
# Preview the first ten tokenised English sentences.
print(outputs[:10])

[['a', 'lonely', 'swan', 'from', 'the', 'sea', 'flies', ','], ['to', 'alight', 'on', 'puddles', 'it', 'does', 'not', 'deign', '.'], ['nesting', 'in', 'the', 'poplar', 'of', 'pearls'], ['it', 'spies', 'and', 'questions', 'green', 'birds', 'twain', ':'], ['don', 't', 'you', 'fear', 'the', 'threat', 'of', 'slings', ','], ['perched', 'on', 'top', 'of', 'branches', 'so', 'high', '?'], ['nice', 'clothes', 'invite', 'pointing', 'fingers', ','], ['high', 'climbers', 'god', 's', 'good', 'will', 'defy', '.'], ['bird', 'hunters', 'will', 'crave', 'me', 'in', 'vain', ','], ['for', 'i', 'roam', 'the', 'limitless', 'sky', '.']]


### 1.2 Chinese word splitter
- Split each line into single characters (Jieba is imported, but it is not used for segmentation here)


In [None]:
import jieba
# Try the character-level split on the first ten lines only: every character
# except the plain space becomes one token.
jieba_inputs = []
for sentence in inputs[:10]:
    jieba_inputs.append([symbol for symbol in sentence if symbol != ' '])
print(jieba_inputs)

[['孤', '鸿', '海', '上', '来', '，'], ['池', '潢', '不', '敢', '顾', '；'], ['侧', '见', '双', '翠', '鸟', '，'], ['巢', '在', '三', '珠', '树', '。'], ['矫', '矫', '珍', '木', '巅', '，'], ['得', '无', '金', '丸', '惧', '？'], ['美', '服', '患', '人', '指', '，'], ['高', '明', '逼', '神', '恶', '。'], ['今', '我', '游', '冥', '冥', '，'], ['弋', '者', '何', '所', '慕', '？']]


In [None]:
# Apply the character-level split to the whole corpus: each Chinese line
# becomes a list of its characters with plain spaces dropped.
inputs = [[symbol for symbol in sentence if symbol != ' '] for sentence in tqdm(inputs)]
print(inputs[:10])

100%|██████████| 3005/3005 [00:00<00:00, 219564.55it/s]

[['孤', '鸿', '海', '上', '来', '，'], ['池', '潢', '不', '敢', '顾', '；'], ['侧', '见', '双', '翠', '鸟', '，'], ['巢', '在', '三', '珠', '树', '。'], ['矫', '矫', '珍', '木', '巅', '，'], ['得', '无', '金', '丸', '惧', '？'], ['美', '服', '患', '人', '指', '，'], ['高', '明', '逼', '神', '恶', '。'], ['今', '我', '游', '冥', '冥', '，'], ['弋', '者', '何', '所', '慕', '？']]





### 1.3 Generate dictionaries


In [None]:
def get_vocab(data, init=['<PAD>']):
    """Build an ordered vocabulary from tokenised sentences.

    Args:
      data: An iterable of sentences, each an iterable of hashable tokens
        (strings in this notebook).
      init: Tokens that must occupy the first positions of the vocabulary
        (e.g. the padding / start / end markers). The list is copied, so
        neither the caller's list nor the shared default is modified.

    Returns:
      A new list: the `init` tokens followed by every other token of `data`
      in order of first appearance, without duplicates.
    """
    # Work on a copy: appending to `init` itself would leak words into the
    # caller's list and, for the default argument, into every later call.
    vocab = list(init)
    # Set mirror of `vocab` so membership tests do not rescan the whole list.
    seen = set(vocab)
    for line in tqdm(data):
        for word in line:
            if word not in seen:
                seen.add(word)
                vocab.append(word)
    return vocab

# Reserved tokens come first in each vocabulary, so '<PAD>' has id 0 on both
# sides; the decoder side additionally reserves '<GO>' and '<EOS>'.
SOURCE_CODES = ['<PAD>']
TARGET_CODES = ['<PAD>', '<GO>', '<EOS>']
encoder_vocab = get_vocab(inputs, init=SOURCE_CODES)
decoder_vocab = get_vocab(outputs, init=TARGET_CODES)

100%|██████████| 3005/3005 [00:00<00:00, 14386.78it/s]
100%|██████████| 3005/3005 [00:00<00:00, 10150.15it/s]


In [None]:
# Inspect the vocabularies: the whole source vocabulary, the first ten target
# entries, and the size of the target vocabulary.
print(encoder_vocab)
print(decoder_vocab[:10])
print(len(decoder_vocab))

['<PAD>', '孤', '鸿', '海', '上', '来', '，', '池', '潢', '不', '敢', '顾', '；', '侧', '见', '双', '翠', '鸟', '巢', '在', '三', '珠', '树', '。', '矫', '珍', '木', '巅', '得', '无', '金', '丸', '惧', '？', '美', '服', '患', '人', '指', '高', '明', '逼', '神', '恶', '今', '我', '游', '冥', '弋', '者', '何', '所', '慕', '兰', '叶', '春', '葳', '蕤', '桂', '华', '秋', '皎', '洁', '欣', '此', '生', '意', '自', '尔', '为', '佳', '节', '谁', '知', '林', '栖', '闻', '风', '坐', '相', '悦', '草', '有', '本', '心', '求', '折', '幽', '归', '独', '卧', '滞', '虑', '洗', '清', '持', '谢', '因', '之', '传', '远', '情', '日', '夕', '怀', '空', '感', '至', '精', '飞', '沈', '理', '隔', '江', '南', '丹', '橘', '经', '冬', '犹', '绿', '岂', '伊', '地', '气', '暖', '岁', '寒', '可', '以', '荐', '嘉', '客', '奈', '阻', '重', '深', '运', '命', '惟', '遇', '循', '环', '寻', '徒', '言', '桃', '李', '阴', '暮', '从', '碧', '山', '下', '月', '随', '却', '径', '苍', '横', '微', '携', '及', '田', '家', '童', '稚', '开', '荆', '扉', '竹', '入', '青', '萝', '拂', '行', '衣', '欢', '憩', '酒', '聊', '共', '挥', '长', '歌', '吟', '松', '曲', '尽', '河', '星', '稀', '醉', '君', '复', '乐', '陶', '然', '忘', 

### 1.4 Generate encode and decode data

In [None]:
# Token -> id tables (the id is the token's position in the vocabulary list).
encoder_ids = {token: idx for idx, token in enumerate(encoder_vocab)}
decoder_ids = {token: idx for idx, token in enumerate(decoder_vocab)}
go_id = decoder_ids['<GO>']
eos_id = decoder_ids['<EOS>']

# Source sentences as id lists; decoder inputs are the target ids prefixed
# with <GO>, decoder targets are the same ids suffixed with <EOS>.
encoder_inputs = [[encoder_ids[token] for token in sentence] for sentence in inputs]
decoder_inputs = [[go_id] + [decoder_ids[token] for token in sentence] for sentence in outputs]
decoder_targets = [[decoder_ids[token] for token in sentence] + [eos_id] for sentence in outputs]

In [None]:
# Show the first four decoder input rows next to their target rows.
print(decoder_inputs[:4])
print(decoder_targets[:4])

[[1, 3, 4], [1, 3, 4], [1, 5, 4], [1, 6, 7]]
[[3, 4, 2], [3, 4, 2], [5, 4, 2], [6, 7, 2]]


In [None]:
import numpy as np

def get_batch(encoder_inputs, decoder_inputs, decoder_targets, batch_size=4):
    """Yield zero-padded mini-batches of (encoder input, decoder input, decoder target).

    Args:
      encoder_inputs: List of source id lists.
      decoder_inputs: List of target id lists, aligned with `encoder_inputs`.
      decoder_targets: List of target id lists, aligned with `decoder_inputs`.
      batch_size: Number of sentence pairs per batch.

    Yields:
      Three numpy arrays per batch. Rows are right-padded with 0: encoder rows
      to the longest encoder row of the batch, both decoder arrays to the
      longest decoder-input row of the batch. Only full batches are produced;
      a trailing remainder smaller than `batch_size` is not yielded.
    """
    def pad_rows(rows, width):
        # Right-pad every row with the padding id 0 up to `width`.
        return np.array([row + [0] * (width - len(row)) for row in rows])

    full_batches = len(encoder_inputs) // batch_size
    for start in range(0, full_batches * batch_size, batch_size):
        window = slice(start, start + batch_size)
        source_rows = encoder_inputs[window]
        dec_in_rows = decoder_inputs[window]
        dec_out_rows = decoder_targets[window]
        source_width = max(map(len, source_rows))
        # Decoder inputs and targets share one width, taken from the inputs.
        decoder_width = max(map(len, dec_in_rows))
        yield (pad_rows(source_rows, source_width),
               pad_rows(dec_in_rows, decoder_width),
               pad_rows(dec_out_rows, decoder_width))


In [None]:
# Smoke-test the generator: pull the first padded batch of four sentence pairs.
batch = get_batch(encoder_inputs, decoder_inputs, decoder_targets, batch_size=4)
next(batch)

(array([[1, 2, 0, 0, 0],
        [3, 4, 2, 0, 0],
        [3, 5, 6, 7, 2],
        [8, 8, 9, 0, 0]]), array([[1, 3, 4],
        [1, 3, 4],
        [1, 5, 4],
        [1, 6, 7]]), array([[3, 4, 2],
        [3, 4, 2],
        [5, 4, 2],
        [6, 7, 2]]))

## 2. Model construction

In [None]:
import tensorflow as tf

### 2.1 Build up components of model

In [None]:
def normalize(inputs, 
              epsilon = 1e-8,
              scope="ln",
              reuse=None):
    '''Layer normalization over the last axis.

    Each feature vector (last dimension) is shifted to zero mean and scaled
    to unit variance, then passed through a learned per-feature scale
    (`gamma`, initialised to ones) and offset (`beta`, initialised to zeros).

    Args:
      inputs: A tensor with 2 or more dimensions; the first one is the batch.
      epsilon: Small constant added to the variance before the square root
        so the division cannot hit zero.
      scope: Name of the enclosing `variable_scope`.
      reuse: Passed through to `variable_scope`.
        NOTE(review): beta/gamma are created with `tf.Variable`, not
        `tf.get_variable`, so `reuse` presumably does not share them
        between calls -- confirm before relying on weight sharing.

    Returns:
      A tensor with the same shape and dtype as `inputs`.
    '''
    with tf.variable_scope(scope, reuse=reuse):
        feature_shape = inputs.get_shape()[-1:]

        mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
        # beta is created before gamma; keep this order, it determines the
        # auto-generated variable names.
        beta = tf.Variable(tf.zeros(feature_shape))
        gamma = tf.Variable(tf.ones(feature_shape))

        centered = inputs - mean
        stddev = (variance + epsilon) ** (.5)
        outputs = gamma * (centered / stddev) + beta

    return outputs

#### embedding layer

In [None]:
def embedding(inputs, 
              vocab_size, 
              num_units, 
              zero_pad=True, 
              scale=True,
              scope="embedding", 
              reuse=None):
    '''Looks up trainable embeddings for a tensor of ids.

    Args:
      inputs: A `Tensor` of type `int32` or `int64` holding the ids to look
        up in the table.
      vocab_size: An int. Number of rows of the lookup table.
      num_units: An int. Size of each embedding vector.
      zero_pad: A boolean. If True, the first row (id 0) is replaced by
        constant zeros, so id 0 always embeds to the zero vector.
      scale: A boolean. If True, the result is multiplied by sqrt(num_units).
      scope: Name of the enclosing `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.

    Returns:
      A `Tensor` with one more rank than `inputs`; the size of the new last
      dimension is `num_units`.

    Example:

    ```
    import tensorflow as tf

    inputs = tf.to_int32(tf.reshape(tf.range(2*3), (2, 3)))
    outputs = embedding(inputs, 6, 2, zero_pad=True)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print(sess.run(outputs))
    >>
    [[[ 0.          0.        ]      <- id 0 is all zeros when zero_pad=True
      [ 0.09754146  0.67385566]
      [ 0.37864095 -0.35689294]]
     [[-1.01329422 -1.09939694]
      [ 0.7521342   0.38203377]
      [-0.04973143 -0.06210355]]]
    ```

    With `zero_pad=False` the row for id 0 is an ordinary trained vector.
    '''
    with tf.variable_scope(scope, reuse=reuse):
        table = tf.get_variable('lookup_table',
                                dtype=tf.float32,
                                shape=[vocab_size, num_units],
                                initializer=tf.contrib.layers.xavier_initializer())
        if zero_pad:
            # Swap row 0 for a constant zero row; the remaining rows stay trainable.
            zero_row = tf.zeros(shape=[1, num_units])
            table = tf.concat((zero_row, table[1:, :]), 0)
        embedded = tf.nn.embedding_lookup(table, inputs)

        if scale:
            embedded = embedded * (num_units ** 0.5)

    return embedded

#### multihead layer
该层实现了下面功能：
![image.png](attachment:image.png)

In [None]:
def multihead_attention(key_emb,
                        que_emb,
                        queries, 
                        keys, 
                        num_units=None, 
                        num_heads=8, 
                        dropout_rate=0,
                        is_training=True,
                        causality=False,
                        scope="multihead_attention", 
                        reuse=None):
    '''Applies multihead attention, followed by a residual connection and
    layer normalization.
    
    Args:
      key_emb: A 3d tensor [N, T_k, C] used only to build the key mask:
        key positions whose vector sums to zero are masked out.
      que_emb: A 3d tensor [N, T_q, C] used only to build the query mask:
        query positions whose vector sums to zero get zero attention weights.
      queries: A 3d tensor with shape of [N, T_q, C_q].
      keys: A 3d tensor with shape of [N, T_k, C_k].
      num_units: A scalar. Attention size. Defaults to the last dimension
        of `queries`.
      num_heads: An int. Number of heads; must divide `num_units`.
      dropout_rate: A floating point number.
      is_training: Boolean. Controller of mechanism for dropout.
      causality: Boolean. If true, query position i may only attend to key
        positions <= i.
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.
        
    Returns
      A 3d tensor with shape of (N, T_q, C)  
    '''
    with tf.variable_scope(scope, reuse=reuse):
        # Set the fall back option for num_units
        if num_units is None:
            # as_list is a method: it must be called before indexing.
            num_units = queries.get_shape().as_list()[-1]
        
        # Linear projections
        Q = tf.layers.dense(queries, num_units, activation=tf.nn.relu) # (N, T_q, C)
        K = tf.layers.dense(keys, num_units, activation=tf.nn.relu) # (N, T_k, C)
        V = tf.layers.dense(keys, num_units, activation=tf.nn.relu) # (N, T_k, C)
        
        # Split and concat: heads are stacked along the batch axis
        Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0) # (h*N, T_q, C/h) 
        K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0) # (h*N, T_k, C/h) 
        V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0) # (h*N, T_k, C/h) 

        # Multiplication
        outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1])) # (h*N, T_q, T_k)
        
        # Scale by sqrt of the per-head depth
        outputs = outputs / (K_.get_shape().as_list()[-1] ** 0.5)
        
        # Key Masking: 1 where the key embedding is non-zero, 0 otherwise
        key_masks = tf.sign(tf.abs(tf.reduce_sum(key_emb, axis=-1))) # (N, T_k)
        key_masks = tf.tile(key_masks, [num_heads, 1]) # (h*N, T_k)
        key_masks = tf.tile(tf.expand_dims(key_masks, 1), [1, tf.shape(queries)[1], 1]) # (h*N, T_q, T_k)
        
        # Masked scores get a very large negative value so softmax sends them to ~0
        paddings = tf.ones_like(outputs)*(-2**32+1)
        outputs = tf.where(tf.equal(key_masks, 0), paddings, outputs) # (h*N, T_q, T_k)
  
        # Causality = Future blinding
        if causality:
            diag_vals = tf.ones_like(outputs[0, :, :]) # (T_q, T_k)
            tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense() # (T_q, T_k)
            masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(outputs)[0], 1, 1]) # (h*N, T_q, T_k)
   
            paddings = tf.ones_like(masks)*(-2**32+1)
            outputs = tf.where(tf.equal(masks, 0), paddings, outputs) # (h*N, T_q, T_k)
  
        # Activation
        outputs = tf.nn.softmax(outputs) # (h*N, T_q, T_k)
         
        # Query Masking: zero the attention rows of query positions whose embedding is zero
        query_masks = tf.sign(tf.abs(tf.reduce_sum(que_emb, axis=-1))) # (N, T_q)
        query_masks = tf.tile(query_masks, [num_heads, 1]) # (h*N, T_q)
        query_masks = tf.tile(tf.expand_dims(query_masks, -1), [1, 1, tf.shape(keys)[1]]) # (h*N, T_q, T_k)
        outputs *= query_masks # (h*N, T_q, T_k)
          
        # Dropouts
        outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=tf.convert_to_tensor(is_training))
               
        # Weighted sum
        outputs = tf.matmul(outputs, V_) # ( h*N, T_q, C/h)
        
        # Restore shape
        outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2 ) # (N, T_q, C)
              
        # Residual connection (requires C_q == num_units)
        outputs += queries
              
        # Normalize
        outputs = normalize(outputs) # (N, T_q, C)
 
    return outputs

#### feedforward layer

In [None]:
def feedforward(inputs, 
                num_units=[2048, 512],
                scope="multihead_attention", 
                reuse=None):
    '''Position-wise feed-forward block with residual connection and layer norm.

    Two kernel-size-1 convolutions are applied along the time axis: the first
    with ReLU, the second linear. The block input is added to the result and
    the sum is layer-normalized.

    Args:
      inputs: A 3d tensor with shape of [N, T, C].
      num_units: A list of two integers: filters of the inner layer and of
        the readout layer. The second must equal C for the residual sum.
      scope: Optional scope for `variable_scope`.
        NOTE(review): the default is "multihead_attention", which looks like
        a copy-paste leftover; it is kept because the scope is part of the
        variable names.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.

    Returns:
      A 3d tensor with the same shape and dtype as inputs
    '''
    with tf.variable_scope(scope, reuse=reuse):
        # Inner layer (expansion, ReLU)
        hidden = tf.layers.conv1d(inputs=inputs,
                                  filters=num_units[0],
                                  kernel_size=1,
                                  activation=tf.nn.relu,
                                  use_bias=True)

        # Readout layer (projection back, linear)
        projected = tf.layers.conv1d(inputs=hidden,
                                     filters=num_units[1],
                                     kernel_size=1,
                                     activation=None,
                                     use_bias=True)

        # Residual connection, then layer normalization
        outputs = normalize(projected + inputs)

    return outputs

#### label smoothing

In [None]:
def label_smoothing(inputs, epsilon=0.1):
    '''Smooths one-hot labels. See https://arxiv.org/abs/1512.00567.

    Every entry is scaled by (1 - epsilon) and then epsilon / V is added,
    where V is the size of the last dimension, so a one-hot row keeps most of
    its mass on the true class and spreads the rest uniformly.

    Args:
      inputs: A 3d tensor with shape of [N, T, V], where V is the number of
        vocabulary entries. The last dimension must be statically known.
      epsilon: Smoothing rate.

    Returns:
      A tensor of the same shape as `inputs`.

    For example,

    ```
    import tensorflow as tf
    inputs = tf.convert_to_tensor([[[0, 0, 1],
       [0, 1, 0],
       [1, 0, 0]],
      [[1, 0, 0],
       [1, 0, 0],
       [0, 1, 0]]], tf.float32)

    outputs = label_smoothing(inputs)

    with tf.Session() as sess:
        print(sess.run([outputs]))

    >>
    [array([[[ 0.03333334,  0.03333334,  0.93333334],
        [ 0.03333334,  0.93333334,  0.03333334],
        [ 0.93333334,  0.03333334,  0.03333334]],
       [[ 0.93333334,  0.03333334,  0.03333334],
        [ 0.93333334,  0.03333334,  0.03333334],
        [ 0.03333334,  0.93333334,  0.03333334]]], dtype=float32)]
    ```
    '''
    num_classes = inputs.get_shape().as_list()[-1]
    kept_mass = 1 - epsilon
    uniform_share = epsilon / num_classes
    return (kept_mass * inputs) + uniform_share

### 2.2 Build Model

In [None]:
class Graph():
    """Encoder/decoder attention model (TF1 graph mode) for the translation task.

    Building an instance resets the default graph and creates the
    placeholders `x` (source ids), `de_inp` (decoder input ids) and `y`
    (target ids), the encoder and decoder stacks, and `logits`, `preds` and
    `acc`. In training mode it additionally creates the label-smoothed loss
    (`mean_loss`), the Adam `train_op`, `global_step` and the merged
    summaries (`merged`).
    """
    def __init__(self, is_training=True):
        """Builds the graph.

        Args:
          is_training: Either the hyperparameter object itself (the notebook
            calls `Graph(arg)`), in which case its `is_training` field selects
            the mode, or a plain boolean, in which case the hyperparameters
            are read from the module-level `arg` and the boolean selects the
            mode.
        """
        tf.reset_default_graph()
        # Graph(arg) passes the hyperparameters positionally. Testing that
        # object for truthiness always yields True, so the mode must be read
        # from its `is_training` field instead.
        if hasattr(is_training, 'hidden_units'):
            hparams = is_training
            training_mode = hparams.is_training
        else:
            hparams = arg
            training_mode = bool(is_training)
        self.is_training = training_mode
        self.hidden_units = hparams.hidden_units
        self.input_vocab_size = hparams.input_vocab_size
        self.label_vocab_size = hparams.label_vocab_size
        self.num_heads = hparams.num_heads
        self.num_blocks = hparams.num_blocks
        self.max_length = hparams.max_length
        self.lr = hparams.lr
        self.dropout_rate = hparams.dropout_rate
        
        # input placeholder
        self.x = tf.placeholder(tf.int32, shape=(None, None))
        self.y = tf.placeholder(tf.int32, shape=(None, None))
        self.de_inp = tf.placeholder(tf.int32, shape=(None, None))
        
        # Encoder
        with tf.variable_scope("encoder"):
            # embedding: token embedding plus a learned positional embedding
            # (positions 0..max_length-1)
            self.en_emb = embedding(self.x, vocab_size=self.input_vocab_size, num_units=self.hidden_units, scale=True, scope="enc_embed")
            self.enc = self.en_emb + embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0), [tf.shape(self.x)[0], 1]),
                                          vocab_size=self.max_length,num_units=self.hidden_units, zero_pad=False, scale=False,scope="enc_pe")
            ## Dropout
            self.enc = tf.layers.dropout(self.enc, 
                                        rate=self.dropout_rate, 
                                        training=tf.convert_to_tensor(self.is_training))

            ## Blocks
            for i in range(self.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    ### Multihead Attention
                    # The raw token embeddings are passed for masking: padded
                    # positions (id 0) embed to zero vectors.
                    self.enc = multihead_attention(key_emb = self.en_emb,
                                                   que_emb = self.en_emb,
                                                   queries=self.enc, 
                                                    keys=self.enc, 
                                                    num_units=self.hidden_units, 
                                                    num_heads=self.num_heads, 
                                                    dropout_rate=self.dropout_rate,
                                                    is_training=self.is_training,
                                                    causality=False)

            ### Feed Forward
            # NOTE(review): this runs once after all attention blocks, not once
            # per block -- confirm that is intended. Moving it into the loop
            # changes the variable set and invalidates existing checkpoints.
            self.enc = feedforward(self.enc, num_units=[4*self.hidden_units, self.hidden_units])
        
        # Decoder
        with tf.variable_scope("decoder"):
            # embedding
            self.de_emb = embedding(self.de_inp, vocab_size=self.label_vocab_size, num_units=self.hidden_units, scale=True, scope="dec_embed")
            self.dec = self.de_emb + embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(self.de_inp)[1]), 0), [tf.shape(self.de_inp)[0], 1]),
                                          vocab_size=self.max_length,num_units=self.hidden_units, zero_pad=False, scale=False,scope="dec_pe")
            ## Dropout
            self.dec = tf.layers.dropout(self.dec, 
                                        rate=self.dropout_rate, 
                                        training=tf.convert_to_tensor(self.is_training))        

            ## Multihead Attention ( self-attention)
            for i in range(self.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    ### Multihead Attention
                    self.dec = multihead_attention(key_emb = self.de_emb,
                                                   que_emb = self.de_emb,
                                                   queries=self.dec, 
                                                    keys=self.dec, 
                                                    num_units=self.hidden_units, 
                                                    num_heads=self.num_heads, 
                                                    dropout_rate=self.dropout_rate,
                                                    is_training=self.is_training,
                                                    causality=True,
                                                    scope='self_attention')

            ## Multihead Attention ( vanilla attention)
            # NOTE(review): all self-attention blocks run before all
            # encoder-decoder attention blocks (they are not interleaved), and
            # causality=True below limits target position t to source
            # positions <= t. Both differ from the usual Transformer decoder --
            # confirm before changing; a change requires retraining.
            for i in range(self.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    ### Multihead Attention
                    self.dec = multihead_attention(key_emb = self.en_emb,
                                                   que_emb = self.de_emb,
                                                   queries=self.dec, 
                                                    keys=self.enc, 
                                                    num_units=self.hidden_units, 
                                                    num_heads=self.num_heads, 
                                                    dropout_rate=self.dropout_rate,
                                                    is_training=self.is_training,
                                                    causality=True,
                                                    scope='vanilla_attention') 

            ### Feed Forward (applied once, after all decoder attention blocks)
            self.outputs = feedforward(self.dec, num_units=[4*self.hidden_units, self.hidden_units])
                
        # Final linear projection
        self.logits = tf.layers.dense(self.outputs, self.label_vocab_size)
        self.preds = tf.to_int32(tf.argmax(self.logits, axis=-1))
        # Padding targets (id 0) are excluded from accuracy and loss
        self.istarget = tf.to_float(tf.not_equal(self.y, 0))
        self.acc = tf.reduce_sum(tf.to_float(tf.equal(self.preds, self.y))*self.istarget)/ (tf.reduce_sum(self.istarget))
        tf.summary.scalar('acc', self.acc)
                
        # Loss and optimizer are only needed (and only built) for training.
        if self.is_training:  
            # Loss
            self.y_smoothed = label_smoothing(tf.one_hot(self.y, depth=self.label_vocab_size))
            self.loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.logits, labels=self.y_smoothed)
            self.mean_loss = tf.reduce_sum(self.loss*self.istarget) / (tf.reduce_sum(self.istarget))
               
            # Training Scheme
            self.global_step = tf.Variable(0, name='global_step', trainable=False)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr, beta1=0.9, beta2=0.98, epsilon=1e-8)
            self.train_op = self.optimizer.minimize(self.mean_loss, global_step=self.global_step)
                   
            # Summary 
            tf.summary.scalar('mean_loss', self.mean_loss)
            self.merged = tf.summary.merge_all()

## 3. Training model


### 3.1 Set hyperparameters

In [None]:
def create_hparams():
    """Returns the default hyperparameters as a `tf.contrib.training.HParams`.

    The two vocabulary sizes are placeholders that the caller is expected to
    overwrite with the real vocabulary sizes.
    """
    defaults = dict(
        # attention layout
        num_heads=8,
        num_blocks=6,
        # vocab
        input_vocab_size=50,
        label_vocab_size=50,
        # embedding size
        max_length=100,
        hidden_units=512,
        # optimisation
        dropout_rate=0.2,
        lr=0.0003,
        is_training=True,
    )
    return tf.contrib.training.HParams(**defaults)

        
# Instantiate the hyperparameters and replace the placeholder vocabulary sizes
# with the sizes of the vocabularies built from the corpus.
arg = create_hparams()
arg.input_vocab_size = len(encoder_vocab)
arg.label_vocab_size = len(decoder_vocab)

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



### 3.2 Model training

In [None]:
import os

epochs = 25
batch_size = 64

g = Graph(arg)

saver =tf.train.Saver()
with tf.Session() as sess:
    merged = tf.summary.merge_all()
    sess.run(tf.global_variables_initializer())
    # Resume from the previous checkpoint when one exists.
    if os.path.exists('logs/model.meta'):
        saver.restore(sess, 'logs/model')
    writer = tf.summary.FileWriter('tensorboard/lm', tf.get_default_graph())

    # Number of full batches per epoch (get_batch yields exactly this many).
    batch_num = len(encoder_inputs) // batch_size
    for epoch in range(epochs):
        total_loss = 0
        batch = get_batch(encoder_inputs, decoder_inputs, decoder_targets, batch_size)
        for step in tqdm(range(batch_num)):
            encoder_input, decoder_input, decoder_target = next(batch)
            feed = {g.x: encoder_input, g.y: decoder_target, g.de_inp:decoder_input}
            cost, _ = sess.run([g.mean_loss, g.train_op], feed_dict=feed)
            total_loss += cost
            # Write TensorBoard summaries every 10th training step.
            global_step = epoch * batch_num + step
            if global_step % 10 == 0:
                writer.add_summary(sess.run(merged, feed_dict=feed), global_step)
        # Report the mean batch loss every fifth epoch.
        if (epoch + 1) % 5 == 0:
            print('epochs', epoch+1, ': average loss = ', total_loss/batch_num)
    saver.save(sess, 'logs/model')
    writer.close()

Instructions for updating:
Use keras.layers.dropout instead.
Instructions for updating:
Please use `layer.__call__` method instead.
Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Instructions for updating:
Use `tf.keras.layers.Conv1D` instead.
Instructions for updating:
Use `tf.cast` instead.
Instructions for updating:
Use `tf.cast` instead.


100%|██████████| 316/316 [00:44<00:00,  4.65it/s]
100%|██████████| 316/316 [00:27<00:00,  7.07it/s]
100%|██████████| 316/316 [00:26<00:00,  7.89it/s]
100%|██████████| 316/316 [00:26<00:00,  7.50it/s]
100%|██████████| 316/316 [00:26<00:00,  7.80it/s]
  1%|          | 2/316 [00:00<00:24, 12.98it/s]

epochs 5 : average loss =  2.4461895170845565


100%|██████████| 316/316 [00:26<00:00,  7.62it/s]
100%|██████████| 316/316 [00:26<00:00,  7.30it/s]
100%|██████████| 316/316 [00:26<00:00,  7.74it/s]
100%|██████████| 316/316 [00:26<00:00,  7.42it/s]
100%|██████████| 316/316 [00:26<00:00,  8.00it/s]
  1%|          | 2/316 [00:00<00:25, 12.20it/s]

epochs 10 : average loss =  1.7457856860341905


100%|██████████| 316/316 [00:26<00:00,  7.74it/s]
100%|██████████| 316/316 [00:26<00:00,  7.30it/s]
100%|██████████| 316/316 [00:26<00:00,  7.79it/s]
100%|██████████| 316/316 [00:26<00:00,  7.54it/s]
100%|██████████| 316/316 [00:26<00:00,  7.90it/s]
  1%|          | 2/316 [00:00<00:24, 13.06it/s]

epochs 15 : average loss =  1.5167816183235072


100%|██████████| 316/316 [00:26<00:00,  7.81it/s]
100%|██████████| 316/316 [00:26<00:00,  7.25it/s]
100%|██████████| 316/316 [00:26<00:00,  7.87it/s]
100%|██████████| 316/316 [00:26<00:00,  7.61it/s]
100%|██████████| 316/316 [00:26<00:00,  7.81it/s]
  1%|          | 2/316 [00:00<00:25, 12.27it/s]

epochs 20 : average loss =  1.43891846190525


100%|██████████| 316/316 [00:26<00:00,  7.74it/s]
100%|██████████| 316/316 [00:26<00:00,  7.11it/s]
100%|██████████| 316/316 [00:26<00:00,  7.77it/s]
100%|██████████| 316/316 [00:26<00:00,  7.51it/s]
100%|██████████| 316/316 [00:26<00:00,  7.86it/s]


epochs 25 : average loss =  1.3995937561687035


### 3.3 Validation and model evaluation

In [None]:
# Rebuild the graph in inference mode and score the held-out lines with BLEU.
arg.is_training = False

g = Graph(arg)

saver =tf.train.Saver()
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction
chencherry = SmoothingFunction()
with tf.Session() as sess:
    saver.restore(sess, 'logs/model')
    go_id = decoder_vocab.index('<GO>')
    eos_id = decoder_vocab.index('<EOS>')
    counter = 0   # sentences that were decoded successfully
    skipped = 0   # sentences that could not be decoded or scored
    score_list = []
    for line in data_test:
        ch_test, en_test = line.strip('\n').split('\t')

        en_test_copy = en_test
        ch_test_copy = ch_test
        # Reference tokens: drop the final character of the reference, lower-case
        # and split on spaces. sentence_bleu expects a list of references.
        en_test = [en_test[:-1].strip().lower().split(' ')]
        ch_test = [char for char in ch_test if char != ' ']
        try:
            # A character missing from the training vocabulary raises ValueError here.
            x = np.array([encoder_vocab.index(pny) for pny in ch_test])
            x = x.reshape(1, -1)

            # Greedy decoding: feed the tokens produced so far and append the
            # prediction for the last position until <EOS> is predicted.
            de_inp = [[go_id]]
            while True:
                # Positional embeddings only exist for max_length positions;
                # stop explicitly instead of looping without bound.
                if len(de_inp[0]) > g.max_length:
                    raise RuntimeError('no <EOS> predicted within max_length decoder steps')
                y = np.array(de_inp)
                preds = sess.run(g.preds, {g.x: x, g.de_inp: y})
                if preds[0][-1] == eos_id:
                    break
                de_inp[0].append(preds[0][-1])

            got = [decoder_vocab[idx] for idx in de_inp[0][1:]]
            got_sentence = ' '.join(got)
            # Drop the last generated token before scoring, mirroring the
            # character dropped from the reference above.
            got = got[:-1]
            counter += 1
            score = sentence_bleu(en_test, got, smoothing_function=chencherry.method4)
            score_list.append(score)
            print(score)
            print("CHINESE INPUT:",ch_test_copy)
            print("MACHINE GOT:",got_sentence)
            print("STANDARD RESULT:",en_test_copy)
        except Exception as err:
            # Best effort: skip sentences that cannot be translated or scored,
            # but say so instead of dropping them silently. KeyboardInterrupt
            # is deliberately not caught.
            skipped += 1
            print("SKIPPED:", ch_test_copy, "-", repr(err))
            continue
    print(counter)
    if score_list:
        print(sum(score_list)/len(score_list))
    else:
        print('no sentence could be scored; skipped:', skipped)

INFO:tensorflow:Restoring parameters from logs/model
0.15948194035504248
CHINESE INPUT: 友善点。
MACHINE GOT: be friendly .
STANDARD RESULT: Be kind.
0
CHINESE INPUT: 来加入我们吧。
MACHINE GOT: come to our party and trouble .
STANDARD RESULT: Join us.
0
CHINESE INPUT: 做得好！
MACHINE GOT: well done !
STANDARD RESULT: Good job!
0.19953087735062713
CHINESE INPUT: 起立。
MACHINE GOT: get up at once right away .
STANDARD RESULT: Stand up.
0
CHINESE INPUT: 他很懒。
MACHINE GOT: he is very important to play soccer .
STANDARD RESULT: He's lazy.
0.19817632389021378
CHINESE INPUT: 很晚了。
MACHINE GOT: it is very late for the party tonight .
STANDARD RESULT: It's late.
0.19879212680993805
CHINESE INPUT: 汤姆累了。
MACHINE GOT: tom was tired of tired .
STANDARD RESULT: Tom tried.
0.2326589746035907
CHINESE INPUT: 他很穷。
MACHINE GOT: he was very poor .
STANDARD RESULT: He is poor.
0.19681030214394876
CHINESE INPUT: 我想是这样的。
MACHINE GOT: i think that it s the same thing .
STANDARD RESULT: I think so.
0.2272851691411099
CHINESE I