In [2]:
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time

In [3]:
# Download the Spanish-English sentence-pair archive and extract it.
path_to_zip = tf.keras.utils.get_file(
    'spa-eng.zip',
    origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True,
)

# Build the path with os.path.join rather than string concatenation so the
# separator is right on every platform.
# NOTE(review): assumes the archive unpacks into a "spa-eng" folder located
# next to the downloaded zip — confirm for the installed Keras version.
path_to_file = os.path.join(os.path.dirname(path_to_zip), "spa-eng", "spa.txt")

In [4]:
# Strip accents from a unicode string
def unicode_to_ascii(s):
    """Return ``s`` with combining marks removed.

    The string is NFD-normalised, which splits accented letters into a base
    character plus combining marks; every character whose Unicode category is
    'Mn' is then dropped. All other characters are kept unchanged.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept_chars)


def preprocess_sentence(w):
    """Normalise one sentence and wrap it in ``<start>`` / ``<end>`` markers.

    The sentence is lower-cased, stripped and passed through
    ``unicode_to_ascii``; the punctuation marks ``? . ! , ¿`` are padded with
    spaces; every run of other non-letter characters collapses to one space.

    Args:
        w: The raw sentence.

    Returns:
        The cleaned sentence as ``'<start> ... <end>'``.
    """
    cleaned = unicode_to_ascii(w.lower().strip())

    # Put a space on each side of every punctuation mark,
    # e.g. "he is a boy." => "he is a boy ."
    # Reference: https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    cleaned = re.sub(r"([?.!,¿])", r" \1 ", cleaned)
    # Collapse runs of spaces (and double quotes) into a single space.
    cleaned = re.sub(r'[" "]+', " ", cleaned)

    # Replace everything except (a-z, A-Z, ".", "?", "!", ",", "¿") with a space.
    cleaned = re.sub(r"[^a-zA-Z?.!,¿]+", " ", cleaned)

    # Add the start and end tokens so the model knows when to begin and when
    # to stop predicting.
    return '<start> ' + cleaned.strip() + ' <end>'

In [5]:
# Sanity-check the preprocessing on one English and one Spanish sentence.
en_sentence = u"May I borrow this book?"
sp_sentence = u"¿Puedo tomar prestado este libro?"
print(preprocess_sentence(en_sentence))
# The Spanish result is printed as UTF-8 bytes, so "¿" appears as \xc2\xbf.
print(preprocess_sentence(sp_sentence).encode('utf-8'))

<start> may i borrow this book ? <end>
b'<start> \xc2\xbf puedo tomar prestado este libro ? <end>'


In [6]:
# 1. Remove accents
# 2. Clean the sentences
# 3. Return word pairs in the format: [ENGLISH, SPANISH]
def create_dataset(path, num_examples):
    """Read tab-separated sentence pairs from ``path`` and preprocess them.

    Args:
        path: UTF-8 text file with one example per line, fields separated by
            tab characters.
        num_examples: How many leading lines to use; ``None`` uses every line.

    Returns:
        A ``zip`` iterator that yields one tuple per tab-separated column,
        each holding the preprocessed sentences of that column.
    """
    # Read inside a context manager so the file handle is closed as soon as
    # the contents are loaded, instead of being left to the garbage collector.
    with io.open(path, encoding='UTF-8') as f:
        lines = f.read().strip().split('\n')

    word_pairs = [
        [preprocess_sentence(field) for field in line.split('\t')]
        for line in lines[:num_examples]
    ]

    return zip(*word_pairs)

In [7]:
# Preprocess the whole file (num_examples=None) and show the last sentence pair.
en, sp = create_dataset(path_to_file, None)
print(en[-1])
print(sp[-1])

<start> if you want to sound like a native speaker , you must be willing to practice saying the same sentence over and over in the same way that banjo players practice the same phrase over and over until they can play it correctly and at the desired tempo . <end>
<start> si quieres sonar como un hablante nativo , debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un musico de banjo practica el mismo fraseo una y otra vez hasta que lo puedan tocar correctamente y en el tiempo esperado . <end>


In [8]:
def max_length(tensor):
    """Return the length of the longest element of ``tensor``."""
    return max(map(len, tensor))

In [9]:
def tokenize(lang):
    """Fit a Keras tokenizer on ``lang`` and encode it as padded id sequences.

    Args:
        lang: The sentences of one language.

    Returns:
        ``(tensor, lang_tokenizer)``: the post-padded id sequences and the
        fitted tokenizer.
    """
    # filters='' means the tokenizer filters out no characters.
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)

    sequences = lang_tokenizer.texts_to_sequences(lang)

    # pad_sequences returns a NumPy ndarray; shorter sequences are padded at
    # the end.
    padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post')
    return padded, lang_tokenizer

In [10]:
def load_dataset(path,num_examples=None):
    """Load sentence pairs from ``path`` and tokenize both languages.

    Args:
        path: Path of the tab-separated sentence-pair file.
        num_examples: Number of leading lines to use; ``None`` uses all.

    Returns:
        ``(input_tensor, inp_lang_tokenizer, target_tensor,
        targ_lang_tokenizer)``.
    """
    # The first column returned by create_dataset is used as the target
    # language and the second column as the input language.
    target_sentences, input_sentences = create_dataset(path, num_examples)

    input_ids, input_tok = tokenize(input_sentences)
    target_ids, target_tok = tokenize(target_sentences)

    return input_ids, input_tok, target_ids, target_tok

In [11]:
# Only the first 30000 sentence pairs are used.
num_examples = 30000
(
    input_tensor,
    inp_lang_tokenizer,
    target_tensor,
    targ_lang_tokenizer,
) = load_dataset(path_to_file, num_examples)

# Longest sequence on the target side and on the input side.
max_length_targ = max_length(target_tensor)
max_length_inp = max_length(input_tensor)

In [12]:
# Split into training and validation sets with an 80/20 ratio
(
    input_tensor_train,
    input_tensor_val,
    target_tensor_train,
    target_tensor_val,
) = train_test_split(input_tensor, target_tensor, test_size=0.2)

# Show the sizes of the four splits
print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

24000 24000 6000 6000


In [13]:
def convert(lang,tensor):
    """Print every non-zero id in ``tensor`` next to its word.

    Args:
        lang: Object whose ``index_word`` mapping translates ids to words.
        tensor: Iterable of token ids; ids equal to 0 are skipped.
    """
    for token_id in tensor:
        if token_id == 0:
            continue
        word = lang.index_word[token_id]
        print(f'{token_id}------->{word}')

In [14]:
# Show the id -> word mapping for the first training example: the input
# sentence first, then a separator line, then the target sentence.
convert(inp_lang_tokenizer,input_tensor_train[0])
print('#########################################')
convert(targ_lang_tokenizer,target_tensor_train[0])

1-------><start>
6------->¿
66------->puede
13------->la
327------->gente
1398------->cambiar
5------->?
2-------><end>
#########################################
1-------><start>
25------->can
329------->people
541------->change
7------->?
2-------><end>


In [15]:
# Training hyper-parameters
BUFFER_SIZE = len(input_tensor_train)  # shuffle buffer as large as the training set
BATCH_SIZE = 64
steps_per_epoch = BUFFER_SIZE // BATCH_SIZE
embedding_dim = 256
units = 1024
# Vocabulary sizes are one larger than the number of words in each tokenizer.
vocab_inp_size = len(inp_lang_tokenizer.word_index) + 1
vocab_tar_size = len(targ_lang_tokenizer.word_index) + 1

# Shuffle the (input, target) pairs, then batch them; drop_remainder=True
# discards the final incomplete batch so every batch has BATCH_SIZE rows.
dataset = (
    tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

2022-04-13 14:56:53.164898: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-04-13 14:56:53.317580: I tensorflow/core/platform/profile_utils/cpu_utils.cc:104] CPU Frequency: 2899885000 Hz
2022-04-13 14:56:53.317931: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x5568df5b6670 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2022-04-13 14:56:53.317943: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
2022-04-13 14:56:53.324057: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [16]:
# Pull one batch from the dataset and display the input and target shapes.
example_input_batch,example_target_batch = next(iter(dataset))
example_input_batch.shape,example_target_batch.shape

(TensorShape([64, 16]), TensorShape([64, 11]))

encoder基本内容
1. __init__
2. call
3. initialize_hidden_state（生成全零的初始隐藏状态）

In [17]:
class Encoder(tf.keras.Model):
    """Encoder made of an embedding layer followed by a GRU.

    Args:
        vocab_sz: Size of the input vocabulary (embedding ``input_dim``).
        embedding_dim: Size of each embedding vector.
        units: Number of GRU units.
        batch_sz: Batch size used when creating the initial hidden state.
    """

    def __init__(self,vocab_sz,embedding_dim,units,batch_sz,):
        super().__init__()
        self.batch_sz = batch_sz
        self.enc_units = units
        self.embedding = tf.keras.layers.Embedding(
            input_dim=vocab_sz, output_dim=embedding_dim
        )
        # return_sequences / return_state: the GRU returns both its output at
        # every time step and its final state.
        self.gru = tf.keras.layers.GRU(
            units=units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )

    def call(self, x, hidden_state):
        """Embed ``x`` and run the GRU starting from ``hidden_state``.

        Returns:
            ``(outputs, enc_hiddens)``: the GRU output sequence and the final
            GRU state.
        """
        embedded = self.embedding(x)
        outputs, enc_hiddens = self.gru(embedded, initial_state=hidden_state)
        return outputs, enc_hiddens

    # Determines the size of the initial state
    def initialize_hidden_state(self):
        """Return an all-zero state of shape ``(batch_sz, enc_units)``."""
        state_shape = (self.batch_sz, self.enc_units)
        return tf.zeros(state_shape)

In [18]:
# Build the encoder and pass the example batch through it to check the
# shapes of the output sequence and of the final hidden state.
encoder = Encoder(vocab_inp_size,embedding_dim,units,BATCH_SIZE)

sample_hidden = encoder.initialize_hidden_state()
sample_output,sample_hidden = encoder(example_input_batch,sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (64, 16, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)


注意力机制模块：
    inputs：query(batch_sz*dec_units)、values(batch_sz*seq_length*enc_units)
    outputs:context_vector(batch_sz*enc_units)、attention_weights(batch_sz*seq_length*1)
计算步骤：
   scores = V*tanh(W1*values+W2*query)   batch_sz*seq_length*1
   attention_weights = softmax(scores, axis=1)   batch_sz*seq_length*1
   context_vector = reduce_sum(attention_weights*values, axis=1)  batch_sz*enc_units
具体函数
    1. __init__
    2. call

In [19]:
class BahdanauAttention(tf.keras.layers.Layer):
    """Additive attention: ``scores = V(tanh(W1(values) + W2(query)))``.

    Args:
        units: Output width of the ``W1`` and ``W2`` projections.
    """

    def __init__(self, units):
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self,query,values):
        """Compute the context vector and the attention weights.

        Shapes as used in this notebook: ``query`` is
        (batch_sz, hidden_units) and ``values`` is
        (batch_sz, seq_length, hidden_units).

        Returns:
            ``(context_vector, attention_weights)`` with shapes
            (batch_sz, hidden_units) and (batch_sz, seq_length, 1).
        """
        # Add a length-1 axis so the query broadcasts over the sequence axis.
        query_expanded = tf.expand_dims(query, 1)  # batch_sz*1*hidden_units
        projected = self.W1(values) + self.W2(query_expanded)
        scores = self.V(tf.nn.tanh(projected))  # batch_sz*seq_length*1
        # Normalise over the sequence axis.
        attention_weights = tf.nn.softmax(scores, axis=1)  # batch_sz*seq_length*1
        weighted_values = attention_weights * values
        context_vector = tf.reduce_sum(weighted_values, axis=1)
        return context_vector, attention_weights

In [20]:
# Try the attention layer on the encoder results: the encoder's final hidden
# state is the query and the encoder output sequence provides the values.
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)
print(f'sample_hidden s shape is {sample_hidden.shape}')
print(f'sample_out s shape is {sample_output.shape}')
print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

sample_hidden s shape is (64, 1024)
sample_out s shape is (64, 16, 1024)
Attention result shape: (batch size, units) (64, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (64, 16, 1)


解码模块：
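    inputs:x(batch_sz*1)、hidden_states(batch_sz*units)、enc_output(batch_sz*seq_length*units)
    outputs:output(batch_sz*vocab_sz)、state(batch_sz*units)、attention_weights(batch_sz*seq_length*1)
计算思路：
    1. x 经过 embedding 得到 batch_sz*1*embedding_dim
    2. 以 hidden_states 为 query、enc_output 为 values 计算 context_vector 和 attention_weights
    3. 将 embedding 结果与 context_vector 在最后一维拼接后送入 GRU
    4. GRU 的输出经过全连接层得到词表大小的 logits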

In [21]:
class Decoder(tf.keras.Model):
    """Single-step GRU decoder with Bahdanau attention over the encoder output.

    Args:
        vocab_size: Size of the target vocabulary; used for the embedding
            ``input_dim`` and for the width of the final dense layer.
        embedding_dim: Size of each embedding vector.
        dec_units: Number of units of the attention projections and the GRU.
        batch_sz: Accepted for signature compatibility; not used by this class.
    """

    def __init__(self,vocab_size,embedding_dim,dec_units,batch_sz):
        super(Decoder, self).__init__()
        # Attribute name (including its spelling) is kept for compatibility.
        self.vacab_sz = vocab_size
        self.embedding = tf.keras.layers.Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim
        )
        # Size the attention and GRU layers from the dec_units argument rather
        # than from a notebook-level global, so the constructor argument is
        # actually honoured.
        self.attention = BahdanauAttention(dec_units)
        self.gru = tf.keras.layers.GRU(
            units=dec_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform'
        )
        self.fc = tf.keras.layers.Dense(vocab_size)

    def call(self, x, hidden_states, enc_output):
        """Decode one time step.

        Args:
            x: Token ids for the current step, one per batch row.
            hidden_states: Query passed to the attention layer.
            enc_output: Encoder output sequence attended over.

        Returns:
            ``(output, state, attention_weights)``: the dense-layer logits,
            the GRU state after this step, and the attention weights.
        """
        x = self.embedding(x)                                   # batch_sz*1*embedding_dim
        context_vector, attention_weights = self.attention(hidden_states, enc_output)
        # Append the context vector to the embedded token along the last axis.
        x = tf.concat([x, tf.expand_dims(context_vector, 1)], -1)
        # NOTE(review): the GRU is called without an initial_state, so
        # hidden_states only feeds the attention query — confirm this is intended.
        output, state = self.gru(x)
        # Drop the length-1 time axis before the projection onto the vocabulary.
        output = tf.squeeze(output, axis=1)
        output = self.fc(output)
        return output, state, attention_weights

In [35]:
# Build the decoder and run a single step on dummy input to check the shape
# of the logits.
decoder = Decoder(vocab_tar_size,embedding_dim,units,BATCH_SIZE)

# The dummy input uses BATCH_SIZE (not a literal 64) so its batch dimension
# always matches sample_hidden / sample_output. Only the first returned value
# is kept, which also avoids assigning to `_`.
sample_decoder_output = decoder(tf.random.uniform((BATCH_SIZE,1)),sample_hidden,sample_output)[0]

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (64, 4935)


定义优化器和损失函数
1. 优化器采用Adam
2. 损失函数用SparseCategoricalCrossentropy
3. 计算loss时需要通过mask屏蔽padding位置（标签为0），使其损失为零


In [97]:
# Adam optimizer with its default arguments.
optimizer = tf.keras.optimizers.Adam()
# from_logits=True: predictions are passed in as raw logits.
# reduction='none': the loss is returned per example rather than reduced, so
# it can be multiplied by a mask afterwards.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
    reduction='none'
)

def loss_function(y_true,y_pred):
    """Masked sparse cross-entropy, averaged over all entries.

    Entries whose label is 0 contribute a loss of zero; they are still counted
    in the denominator of the mean.

    Args:
        y_true: Integer labels.
        y_pred: Logits for each label.

    Returns:
        The mean of the masked per-example losses.
    """
    per_example = loss_object(y_true, y_pred)
    keep = tf.cast(tf.not_equal(y_true, 0), dtype=per_example.dtype)
    return tf.reduce_mean(per_example * keep)

检查点（基于对象保存）

In [28]:
# Checkpoints are written under ./training_checkpoints with the prefix "ckpt".
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir,'ckpt')
# Track the optimizer together with both models in one checkpoint object.
checkpoint = tf.train.Checkpoint(
    optimizer=optimizer,
    encoder=encoder,
    decoder=decoder
)

训练：（示教模式）
1. 将输入喂入编码器，编码器返回编码器输出和编码器隐藏层状态
2. 编码器输出、编码器隐藏层状态和解码器输入喂入解码器，解码器返回预测和解码器隐藏层状态
3. 预测用于计算损失，解码器隐藏层状态传送回模型
4. 使用teacher forcing决定解码器的下一个输入（将目标词作为下一个输入）
5. 计算梯度，并通过优化器将梯度应用到模型参数上（反向传播）

In [98]:
def train_step(inp, targ ,enc_hidden):
    """Run one teacher-forced training step on a batch and update the weights.

    Args:
        inp: Batch of input token ids.
        targ: Batch of target token ids; column ``t`` is the label for
            decoding step ``t`` (steps start at 1).
        enc_hidden: Initial hidden state passed to the encoder.

    Returns:
        The summed per-step loss divided by the target sequence length.
    """
    loss = 0
    # Take the batch size from the data instead of the BATCH_SIZE global so
    # the step works for any batch it is given.
    batch_size = int(targ.shape[0])
    start_id = targ_lang_tokenizer.word_index['<start>']
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        dec_hidden = enc_hidden
        # The first decoder input is the <start> token for every row. The ids
        # stay integers, like the target slices fed in on later steps.
        dec_input = tf.expand_dims([start_id] * batch_size, 1)
        for t in range(1, targ.shape[1]):
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            y_true = targ[:, t]
            loss += loss_function(y_true, predictions)
            # Teacher forcing: the true token becomes the next decoder input.
            dec_input = tf.expand_dims(y_true, 1)
    batch_loss = loss / int(targ.shape[1])
    # Differentiate only with respect to trainable weights; a non-trainable
    # variable would produce a None gradient.
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss

In [99]:
EPOCHS = 10
for epoch in range(EPOCHS):
    start = time.time()

    # Every epoch starts from a fresh all-zero encoder state.
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for batch, (inp, targ) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss
        # Report the loss of every 100th batch.
        if batch % 100 == 0:
            print(f'Epoch {epoch+1} Batch {batch} Loss {batch_loss.numpy()}')

    # Save a checkpoint after every second epoch.
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix=checkpoint_prefix)

    mean_epoch_loss = total_loss / steps_per_epoch
    print('Epoch {} Loss {:.4f}'.format(epoch + 1, mean_epoch_loss))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 1.497164249420166
Epoch 1 Batch 100 Loss 1.5411759614944458
Epoch 1 Batch 200 Loss 1.499068021774292
Epoch 1 Batch 300 Loss 1.3822156190872192
Epoch 1 Loss 1.4581
Time taken for 1 epoch 348.3586993217468 sec

Epoch 2 Batch 0 Loss 1.0842732191085815
Epoch 2 Batch 100 Loss 1.1192883253097534
Epoch 2 Batch 200 Loss 1.1374292373657227
Epoch 2 Batch 300 Loss 0.827667772769928
Epoch 2 Loss 1.0672
Time taken for 1 epoch 340.27195405960083 sec

Epoch 3 Batch 0 Loss 0.8148168921470642
Epoch 3 Batch 100 Loss 0.8648380041122437
Epoch 3 Batch 200 Loss 0.6795008778572083
Epoch 3 Batch 300 Loss 0.6758356690406799
Epoch 3 Loss 0.7507
Time taken for 1 epoch 340.4019286632538 sec

Epoch 4 Batch 0 Loss 0.50692218542099
Epoch 4 Batch 100 Loss 0.5582032203674316
Epoch 4 Batch 200 Loss 0.5427631139755249
Epoch 4 Batch 300 Loss 0.5362839698791504
Epoch 4 Loss 0.5019
Time taken for 1 epoch 379.56403136253357 sec

Epoch 5 Batch 0 Loss 0.2816670536994934
Epoch 5 Batch 100 Loss 0.3401074111