### 以下代码用来实现经典的编码器－解码器代码，并使用 cmn_eng　2.2w条中英文翻译数据，作为实例
如果希望更详细的解读，强烈推荐阅读：https://zhuanlan.zhihu.com/p/28054589

In [1]:
!pip install sklearn

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple


In [2]:
import tensorflow as tf
import os

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

In [3]:
print(tf.__version__)

2.0.0


In [4]:
import unicodedata
import re
import numpy as np
import os
import io
import time

In [5]:
with open('./data/cmn-eng.txt', 'r', encoding='utf-8') as f:
    contexts = f.readlines()
contexts[10]

'Got it?\t你懂了吗？\n'

In [6]:
def data_pro(contexts):
    processed_contexts_en = []
    processed_contexts_ch = []
    for line in contexts:
        en, ch = re.split(r'\t', line.strip())
#         print(en, ch)
        en = re.sub(r'([\?\.\!\,¿])', r' \1', en)
#         print(en)
        en = re.sub(r'\s+', ' ', en)
        ch = re.sub(r'\s+', ' ', ch)
        en = re.sub(r'[^a-zA-Z\?\.\!\,。？！，、¿]+', r' ', en)
        ch = re.sub(r'[^a-zA-Z\?\.\!\,。？！，、¿\u4e00-\u9fa5]+', r' ', ch)
        en = '<start> ' + en + ' <end>'
        ch = '<start> ' + ' '.join([i for i in ch]) + ' <end>'
        
        processed_contexts_en.append(en)
        processed_contexts_ch.append(ch)
    
    return processed_contexts_en, processed_contexts_ch

max_len_en = set()
max_len_ch = set()

def get_vocab(processed_contexts_en, processed_contexts_ch):
    en_vocab = set()
    ch_vocab = set()
    for en, ch in zip(processed_contexts_en, processed_contexts_ch):
        len_en = len([w for w in re.split(r'\s', en)])
        len_ch = len([w for w in re.split(r'\s', ch)])
        max_len_en.add(len_en)
        max_len_ch.add(len_ch)
        
        en_vocab.update(set([w for w in re.split(r'\s', en)]))
        ch_vocab.update(set([w for w in re.split(r'\s', ch)]))

    return en_vocab, ch_vocab


processed_contexts_en, processed_contexts_ch = data_pro(contexts)
en_vocab, ch_vocab = get_vocab(processed_contexts_en, processed_contexts_ch)
print(len(en_vocab))
print(len(ch_vocab))

print('max_len_ch: ', max(max_len_ch))
print('max_len_en: ', max(max_len_en))

6927
3424
max_len_ch:  51
max_len_en:  38


In [7]:
en_vocab2id = {i:k for k,i in enumerate(en_vocab)}
ch_vocab2id = {i:k for k,i in enumerate(ch_vocab)}
id2en_vocab = {k:i for k,i in enumerate(en_vocab)}
id2ch_vocab = {k:i for k,i in enumerate(ch_vocab)}


en_vocab2id.get('hello')

6790

In [8]:
# train_data, test_data = train_test_split(pro_data, test_size=0.15,shuffle=True)
train_x, test_x, train_y, test_y = train_test_split(processed_contexts_en, processed_contexts_ch, test_size=0.15)
print('train_data 数据大小：', len(train_x))
print('test_data 数据大小：', len(train_y))

train_y[-1], train_x[-1]

train_data 数据大小： 18763
test_data 数据大小： 18763


('<start> 我 不 歧 视 人 。 <end>', '<start> I don t discriminate . <end>')

In [19]:
def data_to_tensor(x,y):
    x = tf.cast(x,dtype=str)
    y = tf.cast(y,dtype=str)
    print(x, y)
    en = re.split('\s',x)
    en_ = [int(en_vocab2id[i]) for i in en]
    ch = re.split('\s', y)
    ch_ = [int(ch_vocab2id[i]) for i in ch]
    
    return tf.constant(en_),tf.constant(ch_)

train_db = tf.data.Dataset.from_tensor_slices(
    (train_x, train_y)).map(data_to_tensor).batch(64)
next(iter(train_db))

TypeError: in converted code:

    <ipython-input-19-a757b8c673ea>:2 data_to_tensor  *
        x = tf.cast(x,dtype=str)
    C:\Users\Jack\AppData\Local\conda\conda\envs\tf2\lib\site-packages\tensorflow_core\python\util\dispatch.py:180 wrapper
        return target(*args, **kwargs)
    C:\Users\Jack\AppData\Local\conda\conda\envs\tf2\lib\site-packages\tensorflow_core\python\ops\math_ops.py:686 cast
        base_type = dtypes.as_dtype(dtype).base_dtype
    C:\Users\Jack\AppData\Local\conda\conda\envs\tf2\lib\site-packages\tensorflow_core\python\framework\dtypes.py:721 as_dtype
        (type_value,))

    TypeError: Cannot convert value <class 'str'> to a TensorFlow DType.


#### 本编码器采用 Bahdanau 注意力

In [10]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = tf.keras.layers.Embedding(vocab_size=vocab_size,embedding_dim=embedding_dim)
        self.enc_units = enc_units
        self.batch_sz = batch_sz
        # 返回隐状态 和 整个输出序列
        self.gru = tf.keras.layers.GRU(enc_units, return_sequences=True, return_state=True, recurrent_initializer='glorot_uniform')
    
    def call(self, x, hidden):
        em = self.embedding_dim(x)
        output, state = self.gru(x, initial_state = hidden)
        
        return output, state
    
    def initialize_hidden_state(self):
        return tf.zeros(self.batch_sz, self.enc_units)

In [11]:
class BahdanauAttention(tf.keras.layers):
    def __init__(self):
        pass
    
    def call(self):
        pass


TypeError: module.__init__() takes at most 2 arguments (3 given)

In [None]:
class Decoder(tf.keras.Model):
    def __init__(self):
        pass
    
    def call(self):
        passs
        
