In [1]:
import os
import sys
import tensorflow as tf
from tensorflow.io import gfile
import tensorflow.compat.v1.logging as logging
import pprint
import pickle
import numpy as np
import cv2 as cv
from tensorflow import keras
import time
os.environ["CUDA_VISIBLE_DEVICES"] = "-1" # -1表示使用 cpu 进行训练

In [2]:
# # 设置内存自增长
# tf.debugging.set_log_device_placement(True)
# gpus = tf.config.experimental.list_physical_devices('GPU')
# # 打印物理GPU有几个，就是电脑实际装的个数
# print(len(gpus))
# for gpu in gpus:
#     # 设置 GPU 所占用内存自动增长
#     tf.config.experimental.set_memory_growth(gpu, True)

In [3]:
input_ch_file = "../dataset/data/train.ch"   # 中文训练数据集
input_en_file = "../dataset/data/train.en"   # 英文文训练数据集
input_vocab_ch_file = "../dataset/vocab_ch.txt"
input_vocab_en_file = "../dataset/vocab_en.txt"

<div  align="center">    
    <img src='../image/vocab_ch.png', width= 600, height = 200>
    <img src='../image/vocab_en.png', width= 600, height = 200>
</div>

In [4]:
class Vocab(object):
    def __init__(self, filename, word_num_threshold):
        '''
            filename:文件的路径
            word_num_threshold:如果单词出现的次数太少了,我们用 unk 进行代替
        '''
        # 用于解码， id到词的映射
        self._id_to_word = {}
        # 用于编码， 词到id的映射
        self._word_to_id = {}
        # 未知字符
        self._unk = -1
        # 开始字符
        self._start = -1
        # 结束字符
        self._eos = -1
        
        # 前面我们统计过的词频，太少我们就不要了
        self._word_num_threshold = word_num_threshold
        self._read_dict(filename)

    def _read_dict(self, filename):
        with gfile.GFile(filename, 'r') as f:
            lines = f.readlines()
        for line in lines:
            word, occurence = line.strip('\r\n').split('\t')
            # 词出现的次数
            occurence = int(occurence)
            # 如果词出现的频次太少了，我们就跳过
            if word != '<UNK>' and occurence < self._word_num_threshold:
                continue
            # 从顺序开始，第一次出现的词，对应的id为0， 以此类推
            idx = len(self._id_to_word)
            if word == '<UNK>':
                self._unk = idx
            elif word =='<s>':
                self._start = idx
            elif word == '</s>':
                self._eos = idx
            if idx in self._id_to_word or word in self._word_to_id:
                raise Exception("重复添加！！！")
            self._word_to_id[word] = idx
            self._id_to_word[idx] = word

    @property
    def unk(self):
        return self._unk
    @property
    def start(self):
        return self._start
    @property
    def eos(self):
        return self._eos
    # 单个词到id的转换
    def word_to_id(self, word):
        return self._word_to_id.get(word, self.unk)
    # 单个id到词的转换
    def id_to_word(self, cur_id):
        return self._id_to_word.get(cur_id, '<UNK>')
    # 整个词表的大小
    def size(self):
        return len(self._word_to_id)
    # 编码，把句子转换成id  用于训练模型前的编码
    def encode(self, sentence):
        word_ids = [self.word_to_id(cur_word) for cur_word in sentence.split(' ')]
        return word_ids
    # 解码，把id数组转化成句子， 预测时，模型输出的是id， 用于解码
    def decode(self, sentence_id):
        words = [self.id_to_word(word_id) for word_id in sentence_id]
        return ' '.join(words)

In [5]:
# 把描述的句子转换成id
def convert_token_to_id(filename, vocab):
    '''
        filename:输入的文件名字
        vocab:词表,是上面实现的类
    '''
    word_to_token_ids = []
    with gfile.GFile(filename, 'r') as f:
        lines = f.readlines()
    for line in lines:
        # 进行编码
        token_ids = vocab.encode(line)
        word_to_token_ids.append(token_ids)
    return word_to_token_ids

In [6]:
vocab_ch = Vocab(input_vocab_ch_file, 5)
vocab_en = Vocab(input_vocab_en_file, 5)
vocab_ch_size = vocab_ch.size()
vocab_en_size = vocab_en.size()
logging.info("vocab_ch_size: %d" % vocab_ch_size)
logging.info("vocab_en_size: %d" % vocab_en_size)

word_ch_to_token_ids = convert_token_to_id(input_ch_file, vocab_ch)
word_en_to_token_ids = convert_token_to_id(input_en_file, vocab_en)


logging.info("num of all ch: %d" % len(word_ch_to_token_ids))
logging.info("num of all en: %d" % len(word_en_to_token_ids))

logging.info("ch")
pprint.pprint(word_ch_to_token_ids[0])
logging.info("en")
pprint.pprint(word_en_to_token_ids[0])

INFO:tensorflow:vocab_ch_size: 18807
INFO:tensorflow:vocab_en_size: 14252
INFO:tensorflow:num of all ch: 100000
INFO:tensorflow:num of all en: 100000
INFO:tensorflow:ch
[1614,
 3,
 594,
 122,
 436,
 3,
 6,
 3,
 54,
 87,
 1140,
 1134,
 278,
 5390,
 44,
 6149,
 967,
 4,
 695,
 3,
 430,
 1767,
 3,
 3201,
 6,
 3,
 14171,
 15368,
 3,
 83,
 1552,
 245,
 48,
 6423,
 6149,
 0]
INFO:tensorflow:en
[9,
 1042,
 4,
 3,
 0,
 945,
 3533,
 190,
 90,
 5,
 3,
 0,
 3533,
 190,
 90,
 93,
 1502,
 1342,
 5,
 1065,
 17,
 11,
 1571,
 363,
 449,
 1131,
 3815,
 948,
 108,
 3,
 1207,
 4,
 2707,
 82,
 20,
 4,
 5,
 463,
 8,
 340,
 54,
 654,
 12,
 8,
 12,
 654,
 1131,
 290,
 3,
 29,
 6,
 47,
 1243,
 0]


<div  align="center">    
    <img src='../image/train_ch.png', width= 600, height = 200>
    <img src='../image/vocab_ch.png', width= 600, height = 200>
</div>

产生批量数据

In [7]:
class TranslateData(object):
    def __init__(self,
                 word_ch_to_token_ids,
                 word_en_to_token_ids,
                 num_timesteps,
                 vocab_ch,
                 vocab_en,
                 deterministic = False):
        '''
            word_ch_to_token_ids:
            word_en_to_token_ids:  句子到id的映射
            num_timesteps:固定句子的长度,因为我们知道有些句子很长,但是出现的次数是很少的
            vocab_ch:
            vocab_en: 词表
            deterministic:是否进行shuffle, 默认是进行shuffle
        '''
        self._vocab_ch = vocab_ch  
        self._vocab_en = vocab_en
    
        self._word_ch_to_token_ids = word_ch_to_token_ids
        self._word_en_to_token_ids = word_en_to_token_ids
        # 固定一个句子的长度
        self._num_timesteps = num_timesteps
        # 其时批次的下标
        self._indicator = 0
        # 是否进行shuffle
        self._deterministic = deterministic
    # 有多少个训练样本,英文和中文的长度是一样
    def size(self):
        assert len(self._word_ch_to_token_ids) == len(self._word_en_to_token_ids)
        return len(self._word_ch_to_token_ids)
    # 进行下标打乱
    def _random_shuffle(self):
        p = np.random.permutation(self.size())
        self._word_ch_to_token_ids = self._word_ch_to_token_ids[p]
        self._word_en_to_token_ids = self._word_en_to_token_ids[p]
    # 转成id
    def _sentence_ids(self, word_to_token_ids, vocab):
        batch_sentence_ids = []
        batch_weights = []
        for i in range(len(word_to_token_ids)):
            chosen_token_ids = word_to_token_ids[i]
            # 拿到当前句子的长度
            chosen_token_length = len(chosen_token_ids)
            # 为什么会有weight， 因为描述的长度小于我们固定的长度，那么我们不做长句的惩罚.
            # 假设我们选取的句子长度为5
            # 我 爱 你 . ..  真实: i love you 模型：i love you very much.   weight:[1, 1, 1, 0, 0]
            weight = [1 for i in range(chosen_token_length)]
            # 如果句子长度大于我们固定的长度
            if chosen_token_length >= self._num_timesteps:
                chosen_token_ids = chosen_token_ids[0:self._num_timesteps]
                weight = weight[0:self._num_timesteps]
            else:
                remaining_length = self._num_timesteps - chosen_token_length
                chosen_token_ids += [vocab.eos for i in range(remaining_length)]
                weight += [0 for i in range(remaining_length)]
            # 对于句子,我们需要在最开始加上开始符号,最后面加上符号
            chosen_token_ids.insert(0, 1)
            chosen_token_ids.append(2)
            # 对于这两个地方,我们就不计算权重
            weight.insert(0, 0)
            weight.append(0)
            
            batch_sentence_ids.append(chosen_token_ids)
            batch_weights.append(weight)
        batch_sentence_ids = np.asarray(batch_sentence_ids)
        batch_weights = np.asarray(batch_weights)
        return batch_sentence_ids, batch_weights
    # 下一个批次
    def next(self, batch_size):
        end_indicator = self._indicator + batch_size
        # 如果已经取到文件末尾了，是否进行shuffle，还有就是把其实的index置为0
        if end_indicator > self.size():
            if not self._deterministic:
                self._random_shuffle()
            self._indicator = 0
            end_indicator = self._indicator + batch_size
        assert end_indicator <= self.size()
        # 采用切片,进行一个批次数据的截取
        batch_word_ch_token_ids = self._word_ch_to_token_ids[self._indicator: end_indicator]
        batch_word_en_token_ids = self._word_en_to_token_ids[self._indicator: end_indicator]
        # 产生对应长度的句子和权重
        batch_ch_sentence_ids, batch_ch_weights = self._sentence_ids(batch_word_ch_token_ids, self._vocab_ch)
        batch_en_sentence_ids, batch_en_weights = self._sentence_ids(batch_word_en_token_ids, self._vocab_en)

        self._indicator = end_indicator
        return batch_ch_sentence_ids, batch_ch_weights, batch_en_sentence_ids, batch_en_weights


translate_data = TranslateData(word_ch_to_token_ids, word_en_to_token_ids, \
                                20, vocab_ch, vocab_en)
translate_data_size = translate_data.size()
logging.info("translate_data_size: %d" % translate_data_size)

batch_ch_sentence_ids, batch_ch_weights, batch_en_sentence_ids, batch_en_weights = translate_data.next(5)
logging.info("中文取一个批次的训练数据集")
pprint.pprint(batch_ch_sentence_ids)
pprint.pprint(batch_ch_weights)
logging.info("英文文取一个批次的训练数据集")
pprint.pprint(batch_en_sentence_ids)
pprint.pprint(batch_en_weights)

INFO:tensorflow:translate_data_size: 100000
INFO:tensorflow:中文取一个批次的训练数据集
array([[    1,  1614,     3,   594,   122,   436,     3,     6,     3,
           54,    87,  1140,  1134,   278,  5390,    44,  6149,   967,
            4,   695,     3,     2],
       [    1,     9,   843,  5576,     3,   245,     9, 11060,     3,
            0,     3,  6046,     6,    88,     6,     3,   346,     6,
            3,  9036,  1474,     2],
       [    1,     6,     6,     6,    19,   847,  5478,  2830,  2572,
          511,  6149,   967,   564,     4,  2133,     3,    97,    89,
         6285,   239,    25,     2],
       [    1,     6,  1519,    32,  9741,  3873, 11061,  9742,  1780,
          333,     6,  4059,    23,    42,     7,    33,    28,     8,
           33,  1186,  1231,     2],
       [    1,   278,  2869,    94,     3,   116,   751,  3201,  2036,
            3,  1642,   751,   509,    32,  4579, 16880,    49,  1015,
         4452,     0,     2,     2]])
array([[0, 1, 1, 1, 1, 1, 1, 1

decoder网络的搭建

<div  align="center">    
    <img src='../image/框架图.png', width= 600, height = 200>
</div>

In [8]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, encoding_units, batch_size):
        super(Encoder, self).__init__()
        '''
            vocab_size:词表的个数
            embedding_dim: 单词的embedding的维度
            encoding_units:循环神经网络的单元个数
            batch_size:每个批次数据的大小
        '''
        self.batch_size = batch_size
        self.encoding_units = encoding_units
        self.embedding_layer = keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = keras.layers.GRU(self.encoding_units,
                                    return_sequences=True,
                                    return_state=True,
                                    recurrent_initializer='glorot_uniform')
    @tf.function
    def call(self, x, hidden):
        # x shape == (batch_size, num_timesteps+2), 句子的长度 + 2 (开始和结束字符)
        # after embedding layer x shape == (batch, num_timesteps+2, embedding_dim)
        x = self.embedding_layer(x)
        # output shape == (batch_size, num_timesteps+2, encoding_units)
        # state shape == (batch_size, encoding_units)
        output, state = self.gru(x, initial_state = hidden)
        return output, state
    
    # 为什么要有这个初始化的hidden呢, 因为后面进行预测的时候,我们需要一样的hidden, 所以全为零,才一致
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_size, self.encoding_units))

inputs = batch_ch_sentence_ids
encoder = Encoder(vocab_ch.size(), 64, 64, 5)
encoder_hidden = encoder.initialize_hidden_state()
encoder_output, encoder_state = encoder(inputs, encoder_hidden)
print("encoder_output size is :", encoder_output.shape)
print("encoder_state size is :", encoder_state.shape)

encoder_output size is : (5, 22, 64)
encoder_state size is : (5, 64)


In [9]:
class BahdanauAttention(tf.keras.Model):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, hidden_state, encoder_output):
        '''
            hidden_state:隐层 
            encoder_output: encoder 的输出
            
        '''
        # hidden shape == (batch_size, encoding_units)
        # encoder_output == (batch_size, num_timesteps+2, encoding_units)
        
        # hidden_with_time_axis shape == (batch_size, 1, hidden size)
        hidden_with_time_axis = tf.expand_dims(hidden_state, 1)

        # score shape == (batch_size, num_timesteps+2, 1)
        # we get 1 at the last axis because we are applying score to self.V
        # the shape of the tensor before applying self.V is (batch_size,  num_timesteps+2, units)
        score = self.V(tf.nn.tanh(self.W1(encoder_output) + self.W2(hidden_with_time_axis)))

        # attention_weights shape == (batch_size, num_timesteps+2, 1)
        # 这边就是求词的权重,所以是第二个维度
        attention_weights = tf.nn.softmax(score, axis=1)
        
        # values shape ==           (batch_size, num_timesteps+2, encoding_units)
        # attention_weights shape == (batch_size, num_timesteps+2, 1)
        # context_vector shape ==   (batch_size, encoding_units)
        context_vector = attention_weights * encoder_output
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights
    
attention = BahdanauAttention(64)
attention_result, attention_weights = attention(encoder_hidden, encoder_output)
print("attention_result shape is :", attention_result.shape)
print("attention_weights shape is :", attention_weights.shape)

attention_result shape is : (5, 64)
attention_weights shape is : (5, 22, 1)


In [10]:
w = tf.ones((5, 1))
v = tf.ones((5, 20))
w * v

<tf.Tensor: shape=(5, 20), dtype=float32, numpy=
array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1.]], dtype=float32)>

In [11]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, decoding_units, batch_size):
        '''
            vocab_size:英文词表的大小(因为 decoder 本来就是对英文做的)
            embedding_dim: 词embedding的维度
            decoding_units: decoder的维度
            batch_size:批次数据集的大小
        '''
        super(Decoder, self).__init__()
        self.batch_size = batch_size
        self.decoding_units = decoding_units
        self.embedding_layer = keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = keras.layers.GRU(self.decoding_units,
                                    return_sequences=True,
                                    return_state=True,
                                    recurrent_initializer='glorot_uniform')
        self.fc = keras.layers.Dense(vocab_size, activation='softmax')

        # 初始化 attention
        self.attention = BahdanauAttention(self.decoding_units)

    def call(self, x, hidden_state, encoding_output):
        '''
            encoding_output: 上一步的输出
            hidden_state   : 上一步的状态
            x              : 当前步的输入
        '''
        # enc_output shape == (batch_size, num_timesteps+2, hidden_size)
        context_vector, attention_weights = self.attention(hidden_state, encoding_output)
        # x shape == (batch_size, 1), 
        # after embedding layer x shape == (batch, embedding_dim)
        x = self.embedding_layer(x)
        x = tf.expand_dims(x, 1)
        # x shape == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # 经过 GRU 网络 # output shape == (batch_size, 1, hidden_size)
        output, state = self.gru(x)

        # output shape == (batch_size*1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size, vocab_size)
        # 也就是每个词的概率是多少
        x = self.fc(output)

        return x, state, attention_weights
    
decoder = Decoder(vocab_en.size(), 64, 64, 5)
decoder_output, _, _ = decoder(batch_en_sentence_ids[:, 0],
                               encoder_hidden, 
                               encoder_output)
print ('Decoder output shape: ', decoder_output.shape)

Decoder output shape:  (5, 14252)


训练流程

In [12]:
# 自定义学习率
class CustomizedSchedule(
    keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomizedSchedule, self).__init__()
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** (-1.5))
        arg3 = tf.math.rsqrt(self.d_model)
        return arg3 * tf.math.minimum(arg1, arg2)


# 定义损失函数,因为我们的输出是经过激活函数的,所以from_logits=False
# reduction='none' 表示我们要自己求和,因为有权重
loss_object = keras.losses.SparseCategoricalCrossentropy(from_logits=False, reduction='none')

batch_size = 100
embedding_dim = 64
encoding_units, decoding_units = 128, 128
num_timesteps = 20


# 初始化我们的网络
# encoder对中文编码，所以要输入中文的词表大小
encoder = Encoder(vocab_ch.size(), embedding_dim, encoding_units, batch_size)
# decocder是针对英文的
decoder = Decoder(vocab_en.size(), embedding_dim, decoding_units, batch_size)
# 定义自适应学习率
learning_rate = CustomizedSchedule(128)
# 定义优化器
optimizer = keras.optimizers.Adam(learning_rate,
                                  beta_1=0.9,
                                  beta_2=0.98,
                                  epsilon=1e-9)

@tf.function
def train_step(batch_ch_sentence_ids, batch_ch_weights, batch_en_sentence_ids, batch_en_weights, encoder_hidden):
    loss = 0
    acc = 0
    with tf.GradientTape() as tape:
        # batch_sentence_ids == (batch_size, num_timesteps + 2) 因为有开始字符和结束字符
        # batch_weights == (batch_size, num_timesteps + 2) 因为有开始字符和结束字符
        
        # encoding_hidden shape == [batch_size, encoder_units]
        encoding_output, encoding_hidden = encoder(batch_ch_sentence_ids, encoder_hidden)            
        decoding_hidden = encoding_hidden
        # decoder循环神经网络的工作流程
        #    <s> +  encoding_hidden    => 第一个词
        # 第一个词 + context_vector    => 生成第二个词
        # 第二个词     => 生成第三个词
        #  .......   => </s>
        for t in range(num_timesteps + 2 - 1):
            # decoding_input shape == (batch_size, 1) 
            decoding_input = batch_en_sentence_ids[:, t]
            # predictions shape == (batch_size, vocab_size) 
            predictions, decoding_hidden, _ = decoder(decoding_input, decoding_hidden, encoding_output)
            # labels hsape == (batch_size, 1)
            # labels_weight shape == (batch_size, 1)
            # 这边想一想为什么是 t+1 :<s> i love you . -> <s> i love -> i love you .
            # 解码这边是对英文做的，所以拿的都是 en
            labels = batch_en_sentence_ids[:, t+1]
            labels_weight = batch_en_weights[:, t+1]
            loss_ = loss_object(labels, predictions)
            labels_weight = tf.cast(labels_weight, loss_.dtype)
            loss_ *= labels_weight
            # 求和平均一下
            loss_ = tf.reduce_mean(loss_)
            loss += loss_
            ######## 计算准确度，我们使用简单的方法，就是看对应位置，单词预测正确
            pred_word_id = tf.argmax(predictions, 1, output_type = tf.int32)
            correct_pred = tf.equal(pred_word_id, labels)
            correct_prediction_with_mask = tf.multiply(tf.cast(correct_pred, tf.float32), 
                                                       labels_weight)
            acc_ = tf.reduce_sum(correct_prediction_with_mask)
            acc += acc_
        batch_loss = (loss) / (batch_en_sentence_ids.shape[1])
        batch_acc = (acc) / (batch_en_sentence_ids.shape[1])

    trainable_variables =encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(batch_loss, trainable_variables)
    optimizer.apply_gradients(zip(gradients, trainable_variables))
    return batch_loss, batch_acc
# 循环10次
epoch_size = 50
for epoch in range(epoch_size):
    start = time.time() 
    # 生成我们的批训练数据集, 我们规定句子的长度为 num_timesteps
    translate_data = TranslateData(word_ch_to_token_ids, word_en_to_token_ids, \
                                num_timesteps, vocab_ch, vocab_en)

    # 初始化hidden_state
    encoding_hidden = encoder.initialize_hidden_state()
    total_loss = 0
    total_acc = 0
    # 看我们整个数据集，能跑几个batch
    batchs_per_epoch = translate_data.size() // batch_size
    for batch in range(batchs_per_epoch):
        batch_ch_sentence_ids, batch_ch_weights, batch_en_sentence_ids, batch_en_weights \
                                                            = translate_data.next(batch_size)

        # 喂进去神经网络
        batch_loss, batch_acc = train_step(batch_ch_sentence_ids, 
                                           batch_ch_weights, 
                                           batch_en_sentence_ids, 
                                           batch_en_weights, encoding_hidden)
        total_loss += batch_loss
        total_acc += batch_acc
        # 打印我们所关心的值
#         print('Batch [{}]  Loss {:.10f} Acc {:.10f}'.format(batch, batch_loss.numpy(), batch_acc.numpy()))

    print('Epoch [{}/{}]  Loss {:.10f}, Acc {:.10f}'.format(epoch + 1, epoch_size, 
                                                            total_loss.numpy()/batchs_per_epoch , 
                                                            total_acc.numpy()/batchs_per_epoch))
    print('Time take for 1 epoch: {} secs\n'.format(time.time() - start))

Epoch [1/50]  Loss 6.3368046875, Acc 6.0101333008
Time take for 1 epoch: 607.8969831466675 secs

Epoch [2/50]  Loss 5.4098110352, Acc 13.3182216797
Time take for 1 epoch: 645.712822675705 secs

Epoch [3/50]  Loss 4.7380170898, Acc 20.4872031250
Time take for 1 epoch: 614.4036946296692 secs

Epoch [4/50]  Loss 4.3464741211, Acc 23.7130468750
Time take for 1 epoch: 666.1944766044617 secs

Epoch [5/50]  Loss 4.0523605957, Acc 26.1925039062
Time take for 1 epoch: 796.5499594211578 secs

Epoch [6/50]  Loss 3.8465437012, Acc 27.9809179687
Time take for 1 epoch: 647.4665360450745 secs

Epoch [7/50]  Loss 3.6935634766, Acc 29.3933847656
Time take for 1 epoch: 625.9186623096466 secs

Epoch [8/50]  Loss 3.5671892090, Acc 30.6027304687
Time take for 1 epoch: 612.6435477733612 secs

Epoch [9/50]  Loss 3.4627448730, Acc 31.6496718750
Time take for 1 epoch: 600.4327054023743 secs

Epoch [10/50]  Loss 3.3737534180, Acc 32.5703320312
Time take for 1 epoch: 631.5030176639557 secs

Epoch [11/50]  Loss 3

KeyboardInterrupt: 

对我们模型进行测试，为了方便，我们使用训练数据集中的数据就好

In [64]:
# 推理的时候就是
# <s>         => 第一个词
# 第一个词     => 生成第二个词
# 第二个词     => 生成第三个词
#    ...      => </s>

word_test_ch_to_token_ids_one = word_ch_to_token_ids[4]
word_test_ch_to_token_ids_one = np.asarray(word_test_ch_to_token_ids_one)
word_test_ch_to_token_ids_one = word_test_ch_to_token_ids_one.reshape((1, -1))

# 因为我们在 encode hidden shape == [batch_size, units]
hidden = [tf.zeros((1, encoding_units))]
encoding_out, encoding_hidden = encoder(word_test_ch_to_token_ids_one, hidden)
print(encoding_out.shape)

decoding_hidden = encoding_hidden
decoding_input = np.array([vocab_en.start])
result = []
for t in range(num_timesteps):
    predictions, decoding_hidden, attention_weights = decoder(
        decoding_input, decoding_hidden, encoding_out)
    
#     predict_idx = tf.random.categorical(predictions, 1)[0][0].numpy()
    predict_idx = tf.argmax(predictions[0]).numpy()
    # 已经到了结束字符,那么我们就不应在预测了
    if predict_idx == vocab_en.eos:
        break
    
    result.append(predict_idx)
    # 然后扩充1个维度
    decoding_input  = tf.expand_dims(predict_idx , 0)

print("预测的结果: ", result)

(1, 22, 128)
预测的结果:  [124, 195, 4, 195, 610, 14, 3, 29, 6, 3, 29, 8, 3, 20, 8, 3, 20, 8, 3, 20]


In [65]:
print("原始的id:", word_en_to_token_ids[4][:num_timesteps])

原始的id: [124, 195, 102, 906, 486, 4, 195, 1134, 14, 27, 146, 28, 680, 16, 42, 8, 4228, 3, 2707, 626]


In [66]:
print("预测的句子", vocab_en.decode(result))
print("原始的句子", vocab_en.decode(word_en_to_token_ids[4][:num_timesteps]))

预测的句子 when i , i believe that the people of the people to the <unk> to the <unk> to the <unk>
原始的句子 when i first came here , i thought that it would be difficult for us to gather the villagers together


我们来测试一下比较简单的句子

In [53]:
# 推理的时候就是
# <s>         => 第一个词
# 第一个词     => 生成第二个词
# 第二个词     => 生成第三个词
#    ...      => </s>

test_sentence = "我 爱 你 中国"
word_test_ch_to_token_ids_one = vocab_ch.encode(test_sentence)
word_test_ch_to_token_ids_one = np.asarray(word_test_ch_to_token_ids_one)
word_test_ch_to_token_ids_one = word_test_ch_to_token_ids_one.reshape((1, -1))

# 因为我们在 encode hidden shape == [batch_size, units]
hidden = [tf.zeros((1, encoding_units))]
encoding_out, encoding_hidden = encoder(word_test_ch_to_token_ids_one, hidden)
print(encoding_out.shape)

decoding_hidden = encoding_hidden
decoding_input = np.array([vocab_en.start])
result = []
for t in range(word_test_ch_to_token_ids_one.shape[1]):
    predictions, decoding_hidden, attention_weights = decoder(
        decoding_input, decoding_hidden, encoding_out)
    
    predict_idx = tf.argmax(predictions[0]).numpy()
    # 已经到了结束字符,那么我们就不应在预测了
    if predict_idx == vocab_en.eos:
        break
    
    result.append(predict_idx)
    # 然后扩充1个维度
    decoding_input  = tf.expand_dims(predict_idx , 0)

print("预测的结果: ", result)

(1, 5, 128)
预测的结果:  [195, 2262, 789, 39, 29]


In [54]:
print("预测的句子", vocab_en.decode(result))

预测的句子 i love your chinese people
