### 以下代码实现经典的编码器－解码器（Encoder-Decoder）模型，并以 cmn-eng 数据集中约 2.2 万条中英文翻译句对作为示例
如果希望更详细的解读，强烈推荐阅读：https://zhuanlan.zhihu.com/p/28054589

In [1]:
!pip install sklearn

Looking in indexes: https://mirrors.aliyun.com/pypi/simple


In [2]:
import tensorflow as tf
import os

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

In [3]:
# Show which TensorFlow release this notebook is running against.
print(tf.__version__)

2.0.0


In [4]:
import unicodedata
import re
import numpy as np
import os
import io
import time

In [5]:
# Load the raw parallel corpus; each line holds an English sentence and its
# Chinese translation separated by a tab.
with open('./data/cmn-eng.txt', 'r', encoding='utf-8') as f:
    contexts = f.readlines()
# Peek at one raw line (shown as the cell result).
contexts[10]

'Got it?\t你懂了吗？\n'

In [6]:
def data_pro(contexts):
    """Clean tab-separated English/Chinese sentence pairs for tokenization.

    Every non-blank line must contain the English sentence and the Chinese
    sentence as its first two tab-separated fields; any further fields
    (e.g. an attribution column) are ignored. Blank lines are skipped.

    English: a space is inserted before ``? . ! , ¿``, whitespace is
    collapsed, and every run of characters outside letters and the allowed
    punctuation is replaced by one space.
    Chinese: whitespace is collapsed, runs of characters outside letters,
    the allowed punctuation and U+4E00..U+9FA5 are replaced by one space,
    and then every remaining character is separated by a space.
    Both results are wrapped in ``<start>`` / ``<end>`` markers.

    Args:
        contexts: Iterable of raw text lines.

    Returns:
        A tuple ``(processed_contexts_en, processed_contexts_ch)`` of two
        equally long lists of cleaned sentences.

    Raises:
        ValueError: If a non-blank line has fewer than two tab-separated
            fields.
    """
    # Compile once instead of re-resolving the patterns for every line.
    punctuation = re.compile(r'([\?\.\!\,¿])')
    whitespace = re.compile(r'\s+')
    en_disallowed = re.compile(r'[^a-zA-Z\?\.\!\,。？！，、¿]+')
    ch_disallowed = re.compile(r'[^a-zA-Z\?\.\!\,。？！，、¿\u4e00-\u9fa5]+')

    processed_contexts_en = []
    processed_contexts_ch = []
    for line in contexts:
        line = line.strip()
        if not line:
            # Tolerate empty lines (e.g. a trailing newline at end of file).
            continue

        fields = line.split('\t')
        if len(fields) < 2:
            raise ValueError(
                'expected "<english>\\t<chinese>" but got: %r' % line)
        # Only the first two columns are sentences; extra columns are ignored.
        en, ch = fields[0], fields[1]

        # Detach punctuation from the preceding word so it becomes a token.
        en = punctuation.sub(r' \1', en)
        en = whitespace.sub(' ', en)
        ch = whitespace.sub(' ', ch)
        en = en_disallowed.sub(' ', en)
        ch = ch_disallowed.sub(' ', ch)

        processed_contexts_en.append('<start> ' + en + ' <end>')
        # Chinese is tokenized per character.
        processed_contexts_ch.append('<start> ' + ' '.join(ch) + ' <end>')

    return processed_contexts_en, processed_contexts_ch

# Distinct sentence lengths (in whitespace-separated tokens) seen so far.
# get_vocab() adds to these as a side effect so that the longest sentence
# of each language can be reported with max() afterwards.
max_len_en = set()
max_len_ch = set()


def get_vocab(processed_contexts_en, processed_contexts_ch):
    """Collect the token vocabulary of both languages.

    Sentences are split on single whitespace characters. As a side effect
    the token count of every sentence is added to the module-level sets
    ``max_len_en`` and ``max_len_ch``.

    Args:
        processed_contexts_en: Cleaned English sentences.
        processed_contexts_ch: Cleaned Chinese sentences, paired
            positionally with the English ones.

    Returns:
        A tuple ``(en_vocab, ch_vocab)`` of sets of tokens.
    """
    en_vocab = set()
    ch_vocab = set()
    for en, ch in zip(processed_contexts_en, processed_contexts_ch):
        # Split each sentence once and reuse the tokens for both the length
        # statistics and the vocabulary.
        en_tokens = re.split(r'\s', en)
        ch_tokens = re.split(r'\s', ch)

        max_len_en.add(len(en_tokens))
        max_len_ch.add(len(ch_tokens))

        en_vocab.update(en_tokens)
        ch_vocab.update(ch_tokens)

    return en_vocab, ch_vocab


# Clean the corpus, then collect vocabularies and sentence-length statistics.
processed_contexts_en, processed_contexts_ch = data_pro(contexts)
en_vocab, ch_vocab = get_vocab(processed_contexts_en, processed_contexts_ch)
# Vocabulary sizes: English first, then Chinese.
print(len(en_vocab))
print(len(ch_vocab))
# Spot-check one cleaned sentence pair.
print(processed_contexts_en[10])
print(processed_contexts_ch[10])

# Longest sentence per language, counted in whitespace-separated tokens
# (the <start>/<end> markers are included in the count).
print('max_len_ch: ', max(max_len_ch))
print('max_len_en: ', max(max_len_en))

6927
3424
<start> Got it ? <end>
<start> 你 懂 了 吗 ？ <end>
max_len_ch:  51
max_len_en:  38


In [7]:
def max_length(tensor):
    """Return the length of the longest element of ``tensor``.

    ``tensor`` is any non-empty iterable of sized items; an empty iterable
    makes ``max`` raise ``ValueError``.
    """
    return max(map(len, tensor))

In [8]:
def tokenize(lang):
    """Fit a Keras tokenizer on ``lang`` and encode it as padded id sequences.

    The tokenizer is created with ``filters=''`` so that no characters are
    stripped (punctuation and the ``<start>``/``<end>`` markers stay intact).
    Sequences are padded at the end (``padding='post'``) to a common length.

    Args:
        lang: List of cleaned sentences of one language.

    Returns:
        A tuple ``(tensor, lang_tokenizer)``: the padded id array and the
        fitted tokenizer.
    """
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)

    id_sequences = lang_tokenizer.texts_to_sequences(lang)
    padded = tf.keras.preprocessing.sequence.pad_sequences(
        id_sequences, padding='post')

    return padded, lang_tokenizer

In [9]:
def load_dataset():
    """Tokenize the module-level cleaned sentence lists.

    Reads ``processed_contexts_en`` (input language) and
    ``processed_contexts_ch`` (target language) and runs each through
    ``tokenize``.

    Returns:
        ``(input_tensor, target_tensor, inp_lang_tokenizer,
        targ_lang_tokenizer)``.
    """
    # English is the input side, Chinese the target side.
    en_ids, en_tokenizer = tokenize(processed_contexts_en)
    ch_ids, ch_tokenizer = tokenize(processed_contexts_ch)

    return en_ids, ch_ids, en_tokenizer, ch_tokenizer

In [10]:
# Build the padded id tensors and the fitted tokenizers for both languages.
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset()
# Sanity checks. In a notebook only the value of the last expression is
# displayed; the two expressions below are evaluated and then discarded.
input_tensor[-1], target_tensor[-1]
inp_lang.texts_to_sequences(['<start> hello world <end>', 'head of'])
targ_lang.texts_to_sequences(['<start> 你 好 世 界 <end>', '你 好']), targ_lang.index_word[1]

([[1, 7, 34, 352, 515, 2], [7, 34]], '<start>')

In [11]:
# Compute the maximum sequence length of the target and the input tensors
# (via max_length).
max_length_targ, max_length_inp = max_length(target_tensor), max_length(input_tensor)

In [12]:
# Split into training and validation sets. test_size=0.15 holds out 15% of
# the pairs for validation, i.e. an 85/15 split (not 80/20).
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.15)

# Show the sizes of the resulting sets
print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

18763 18763 3312 3312


In [13]:
def convert(lang, tensor):
    """Print one ``id ----> word`` line per non-zero id in ``tensor``.

    Args:
        lang: Object exposing an ``index_word`` mapping from id to word.
        tensor: Iterable of integer token ids; zeros are skipped.
    """
    for token_id in tensor:
        if token_id == 0:
            continue
        word = lang.index_word[token_id]
        print("%d ----> %s" % (token_id, word))

In [14]:
# Print the id -> word mapping of the first training pair, first for the
# input language and then for the target language.
print ("Input Language; index to word mapping")
convert(inp_lang, input_tensor_train[0])
print ()
print ("Target Language; index to word mapping")
convert(targ_lang, target_tensor_train[0])

Input Language; index to word mapping
1 ----> <start>
4 ----> i
66 ----> ve
230 ----> heard
23 ----> this
420 ----> story
166 ----> before
3 ----> .
2 ----> <end>

Target Language; index to word mapping
1 ----> <start>
4 ----> 我
55 ----> 以
103 ----> 前
280 ----> 听
131 ----> 过
25 ----> 这
32 ----> 个
351 ----> 故
64 ----> 事
3 ----> 。
2 ----> <end>


In [15]:
# Training hyperparameters and sizes derived from the data.
BUFFER_SIZE = len(input_tensor_train)  # shuffle buffer covers the whole training set
BATCH_SIZE = 64
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 256
units = 1024
# +1 because tokenizer word ids start at 1; id 0 is the padding value.
vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1

# Shuffle the (input, target) pairs and group them into batches;
# drop_remainder=True discards a final batch smaller than BATCH_SIZE.
dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [16]:
# Pull a single batch from the dataset and display the shapes of its input
# and target tensors.
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 38]), TensorShape([64, 46]))

#### 本编码器采用 Bahdanau 注意力

In [22]:
class Encoder(tf.keras.Model):
    """Sequence encoder: an embedding lookup followed by a single GRU.

    Args:
        vocab_size: Size of the input vocabulary.
        embedding_dim: Dimension of the token embeddings.
        enc_units: Number of GRU units.
        batch_sz: Batch size used when creating the initial hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super().__init__()
        self.vocab_size = vocab_size
        self.enc_units = enc_units
        self.batch_sz = batch_sz
        # NOTE: despite its name, this attribute stores the Embedding layer
        # itself, not the integer embedding dimension.
        self.embedding_dim = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # The GRU returns both the full output sequence and its final state.
        self.gru = tf.keras.layers.GRU(
            enc_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )

    def call(self, x, hidden):
        """Encode a batch of token ids.

        Args:
            x: Token ids, shape (batch_size, seq_len).
            hidden: Initial GRU state, shape (batch_size, enc_units).

        Returns:
            ``(output, state)``: the GRU output sequence and its final state.
        """
        embedded = self.embedding_dim(x)
        sequence_output, final_state = self.gru(embedded, initial_state=hidden)

        return sequence_output, final_state

    def initialize_hidden_state(self):
        """Return an all-zero initial state of shape (batch_sz, enc_units)."""
        state_shape = (self.batch_sz, self.enc_units)
        return tf.zeros(state_shape)

In [23]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

# Create an all-zero initial hidden state, then run the example batch through
# the encoder to check the output shapes.
sample_hidden = encoder.initialize_hidden_state()
print(sample_hidden)

sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

tf.Tensor(
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]], shape=(64, 1024), dtype=float32)
Encoder output shape: (batch size, sequence length, units) (64, 38, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)


In [None]:
# Attention takes two inputs and produces a context vector plus the attention
# weights (the weights are kept mainly for later visualisation).
# For Bahdanau attention:
#   e_ij = v . tanh(Ws . s + Wh . h)
#   a_ij = exp(e_ij) / reduce_sum(exp(e_ij))
#   C_i  = reduce_sum(a * Hs)
class BahdanauAttention(tf.keras.layers.Layer):
    """Bahdanau (additive) attention with explicitly created weight matrices.

    Args:
        units: Size of the hidden projection used to compute the scores.
    """

    def __init__(self, units):
        # The base initializer must run before any attribute is set,
        # otherwise Keras cannot track the layer's state.
        super(BahdanauAttention, self).__init__()
        self.units = units

    def build(self, input_shape):
        """Create the trainable weights once the input shape is known.

        ``input_shape`` is expected to be the shape of ``query``, the first
        argument of ``call``. NOTE(review): ``Wh`` reuses the same feature
        size, so this assumes ``values`` has the same last dimension as
        ``query`` - confirm before using with differing sizes.
        """
        feature_dim = input_shape[-1]
        init = tf.keras.initializers.RandomNormal(mean=0.0, stddev=0.01)
        self.Ws = self.add_weight(
            name='Ws', shape=(feature_dim, self.units),
            initializer=init, trainable=True)
        self.Wh = self.add_weight(
            name='Wh', shape=(feature_dim, self.units),
            initializer=init, trainable=True)
        # V maps the tanh-activated projection (size `units`) to one score.
        self.V = self.add_weight(
            name='V', shape=(self.units, 1),
            initializer=init, trainable=True)

    def call(self, query, values):
        """Compute the context vector and the attention weights.

        Args:
            query: Previous decoder state S_{t-1}; assumed to have shape
                (batch, hidden).
            values: Encoder outputs for all time steps; assumed to have
                shape (batch, seq_len, hidden).

        Returns:
            ``(context_vec, attention_weights)`` with shapes
            (batch, hidden) and (batch, seq_len, 1) under the assumptions
            above.
        """
        # Add a time axis so the query broadcasts over every encoder step.
        query = tf.expand_dims(query, axis=1)
        projected = tf.tanh(
            tf.tensordot(query, self.Ws, axes=1)
            + tf.tensordot(values, self.Wh, axes=1))
        a_score = tf.tensordot(projected, self.V, axes=1)

        # Normalise the scores over the time axis.
        attention_weights = tf.nn.softmax(a_score, axis=1)
        context_vec = tf.reduce_sum(attention_weights * values, axis=1)

        return context_vec, attention_weights

In [None]:
# Attention takes two inputs and produces a context vector plus the attention
# weights (the weights are kept mainly for later visualisation).
# For Bahdanau attention:
#   e_ij = v . tanh(Ws . s + Wh . h)
#   a_ij = exp(e_ij) / reduce_sum(exp(e_ij))
#   C_i  = reduce_sum(a * Hs)
class BahdanauAttention(tf.keras.layers.Layer):
    """Bahdanau (additive) attention built from Dense layers.

    Args:
        units: Size of the hidden projection used to compute the scores.
    """

    def __init__(self, units):
        # The base initializer must run before sub-layers are assigned,
        # otherwise Keras cannot track them.
        super(BahdanauAttention, self).__init__()
        # Trainable parts of e_ij = v . tanh(Ws . s + Wh . h); each
        # multiplication is realised as a Dense layer.
        self.Ws = tf.keras.layers.Dense(units)
        self.Wh = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        """Compute the context vector and the attention weights.

        Args:
            query: Previous decoder state S_{t-1}; assumed to have shape
                (batch, hidden).
            values: Encoder outputs for all time steps; assumed to have
                shape (batch, seq_len, hidden).

        Returns:
            ``(context_vec, attention_weights)`` with shapes
            (batch, hidden) and (batch, seq_len, 1) under the assumptions
            above.
        """
        # Add a time axis so the query broadcasts over every encoder step.
        query = tf.expand_dims(query, axis=1)

        # One unnormalised score per encoder time step.
        score = self.V(tf.nn.tanh(self.Ws(query) + self.Wh(values)))

        # Normalise over the time axis, then take the weighted sum of values.
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vec = tf.reduce_sum(attention_weights * values, axis=1)

        return context_vec, attention_weights

In [None]:
class Decoder(tf.keras.Model):
    """Placeholder for the decoder; the model itself is not implemented yet."""

    def __init__(self):
        # A Keras Model must run the base initializer before it can be used.
        super(Decoder, self).__init__()

    def call(self):
        # TODO: implement the decoding step. Fail explicitly until then
        # instead of tripping over an undefined name.
        raise NotImplementedError('Decoder.call is not implemented yet')
        