### 导入项目环境

In [None]:
import os
import numpy as np
import re
import sys
import re
import random
import unicodedata # 使用unicodedata模块先将文本标准化
import argparse
import math

from mindspore import Tensor, nn, Model, context, Parameter, DatasetHelper
from mindspore import dataset as ds
from mindspore.mindrecord import FileWriter
from mindspore.train.callback import LossMonitor, CheckpointConfig, ModelCheckpoint, TimeMonitor
from mindspore.communication.management import init, get_rank
from mindspore.context import ParallelMode
from mindspore.train.serialization import load_param_into_net, load_checkpoint
import mindspore.nn as nn
import mindspore.ops.operations as P
import mindspore.common.dtype as mstype
from mindspore.nn.loss.loss import _Loss
from mindspore.ops import functional as F

from easydict import EasyDict as edict # 以属性的方式去访问字典的值

context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target='Ascend') # Run in graph mode on an Ascend device, without saving graphs

### 实验的参数设定表

In [None]:
# Experiment hyper-parameters and paths, accessed as attributes (cfg.batch_size, ...).
cfg = edict({
    'en_vocab_size': 1154, # size of the English vocabulary (rows of the encoder embedding)
    'ch_vocab_size': 1116, # size of the Chinese vocabulary (decoder embedding rows / output classes)
    'max_seq_length': 10, # sequence length, in tokens, used by the model
    'hidden_size': 1024, # number of hidden units (also used as the embedding size)
    'batch_size': 16, # training batch size
    'eval_batch_size': 1, # evaluation batch size
    'learning_rate': 0.001, # learning rate passed to the optimizer
    'momentum': 0.9, # momentum value; not referenced by the code visible in this file
    'num_epochs': 15,# number of passes over the training dataset
    'save_checkpoint_steps': 125, # save a checkpoint every this many steps
    'keep_checkpoint_max': 10, # maximum number of checkpoint files to keep (forwarded to CheckpointConfig)
    'dataset_path':'./preprocess', # directory holding the preprocessed MindRecord and vocabulary files
    'ckpt_save_path':'./ckpt', # directory where training checkpoints are written
    'checkpoint_path':'./ckpt/gru-15_125.ckpt' # checkpoint file loaded for evaluation
})


### 数据集

In [None]:
# Build the (encoder input, decoder input, target) triple for one training sample.
def target_operation(encoder_data, decoder_data):
    """Derive the training inputs and the target from one raw sample.

    Args:
        encoder_data: Sliceable id sequence for the source sentence.
        decoder_data: Sliceable id sequence for the target sentence.

    Returns:
        A tuple ``(encoder_input, decoder_input, target)`` where the encoder
        input drops the first element of ``encoder_data``, the decoder input
        drops the last element of ``decoder_data`` and the target drops its
        first element (i.e. the decoder input shifted by one position).
    """
    decoder_input, target = decoder_data[:-1], decoder_data[1:]
    return encoder_data[1:], decoder_input, target
# Build the (encoder input, decoder input) pair for one evaluation sample.
def eval_operation(encoder_data, decoder_data):
    """Derive the evaluation inputs from one raw sample.

    Returns:
        A tuple ``(encoder_input, decoder_input)``: ``encoder_data`` without
        its first element and ``decoder_data`` without its last element.
    """
    return encoder_data[1:], decoder_data[:-1]
# Build the batched training or evaluation dataset from the MindRecord files.
def create_dataset(data_home, batch_size, repeat_num=1, is_training=True, device_num=1, rank=0):
    """Load, transform, shuffle, batch and repeat a MindRecord dataset.

    Args:
        data_home: Directory containing ``gru_train.mindrecord`` / ``gru_eval.mindrecord``.
        batch_size: Number of samples per batch; an incomplete last batch is dropped.
        repeat_num: Value passed to ``repeat(count=...)``.
        is_training: Selects the training file and ``target_operation`` when true,
            otherwise the evaluation file and ``eval_operation``.
        device_num: Number of shards the dataset is split into.
        rank: Shard id read by this process.

    Returns:
        The resulting dataset object.
    """
    file_name = "gru_train.mindrecord" if is_training else "gru_eval.mindrecord"
    data_set = ds.MindDataset(os.path.join(data_home, file_name),
                              columns_list=["encoder_data", "decoder_data"],
                              num_parallel_workers=4,
                              num_shards=device_num, shard_id=rank)

    # Pick the per-sample transform and the columns it produces.
    if is_training:
        transform = target_operation
        result_columns = ["encoder_data", "decoder_data", "target_data"]
    else:
        transform = eval_operation
        result_columns = ["encoder_data", "decoder_data"]

    data_set = data_set.map(operations=transform,
                            input_columns=["encoder_data", "decoder_data"],
                            output_columns=list(result_columns),
                            column_order=list(result_columns))

    # Shuffle over the whole dataset, then batch and repeat.
    total_samples = data_set.get_dataset_size()
    data_set = data_set.shuffle(buffer_size=total_samples)
    data_set = data_set.batch(batch_size=batch_size, drop_remainder=True)
    return data_set.repeat(count=repeat_num)


### 处理过程

In [None]:
# Special tokens placed first in both vocabularies: <eos> gets id 0 and <sos> gets id 1.
# Every encoded sentence starts with the <sos> id and ends with the <eos> id.
EOS = "<eos>"
SOS = "<sos>"
# Sequence length passed to convert_to_mindrecord when preprocessing the corpus.
MAX_SEQ_LEN=10

# Remove accents: decompose the text (NFD) so that a base character and its
# combining mark are stored separately, then drop the combining marks.
def unicodeToAscii(s):
    """Return ``s`` with all combining marks (Unicode category ``Mn``) removed.

    The string is first normalised to NFD. Characters that have no
    decomposition (including non-ASCII ones) are kept unchanged.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)
# Normalise an English sentence before tokenising it on spaces.
def normalizeString(s):
    """Lower-case, strip, de-accent and clean up punctuation in ``s``.

    Returns:
        The cleaned string, in which '.', '!' and '?' are preceded by a space
        and every run of other non-letter characters is a single space.
    """
    text = unicodeToAscii(s.lower().strip())
    # Put a space in front of each '.', '!' or '?' so it becomes its own token.
    text = re.sub(r"([.!?])", r" \1", text)
    # Collapse every run of characters other than letters and .!? into one space.
    return re.sub(r"[^a-zA-Z.!?]+", r" ", text)


def _fit_to_length(ids, max_seq_len):
    """Return ``ids`` padded or truncated to exactly ``max_seq_len + 1`` entries.

    Short sequences are right-padded with 0 (the <eos> id); longer ones are cut
    to ``max_seq_len`` entries and terminated with a single 0.
    """
    missing = max_seq_len + 1 - len(ids)
    if missing >= 0:
        return ids + [0] * missing
    return ids[:max_seq_len] + [0]


def prepare_data(data_path, vocab_save_path, max_seq_len, num_lines=2000):
    """Read a tab-separated parallel corpus and encode it as fixed-length id rows.

    Each input line is expected to be ``<english>\\t<chinese>[\\t...]``. English
    is normalised with ``normalizeString`` and tokenised on spaces; Chinese is
    tokenised per character. Both vocabularies are written to
    ``en_vocab.txt`` / ``ch_vocab.txt`` (one token per line, UTF-8) inside
    ``vocab_save_path``, with <eos> at id 0 and <sos> at id 1.

    Args:
        data_path: Path of the UTF-8 corpus file.
        vocab_save_path: Directory that receives the two vocabulary files
            (created if missing).
        max_seq_len: Every encoded sentence has ``max_seq_len + 1`` ids.
        num_lines: Only the first ``num_lines`` lines of the file are used.

    Returns:
        ``(en_num_data, ch_num_data, en_vocab_size, ch_vocab_size)`` where the
        first two are int32 arrays with one row of ids per sentence, laid out
        as ``<sos> ids... <eos>`` followed by 0-padding.
    """
    with open(data_path, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')[:num_lines]

    # Lines without a tab (e.g. the empty string after a trailing newline)
    # carry no sentence pair and would otherwise raise IndexError.
    pairs = [line.split('\t') for line in lines if '\t' in line]
    en_data = [normalizeString(pair[0]) for pair in pairs]
    ch_data = [pair[1] for pair in pairs]

    if vocab_save_path:
        os.makedirs(vocab_save_path, exist_ok=True)

    # Vocabularies are sorted so that token ids are identical from run to run
    # (set iteration order of strings changes between interpreter sessions).
    id2en = [EOS, SOS] + sorted(set(' '.join(en_data).split(' ')))
    en2id = {token: idx for idx, token in enumerate(id2en)}
    en_vocab_size = len(id2en)
    # Written as UTF-8 explicitly because the vocab files are read back as UTF-8.
    np.savetxt(os.path.join(vocab_save_path, 'en_vocab.txt'), np.array(id2en),
               fmt='%s', encoding='utf-8')

    id2ch = [EOS, SOS] + sorted(set(''.join(ch_data)))
    ch2id = {token: idx for idx, token in enumerate(id2ch)}
    ch_vocab_size = len(id2ch)
    np.savetxt(os.path.join(vocab_save_path, 'ch_vocab.txt'), np.array(id2ch),
               fmt='%s', encoding='utf-8')

    # Encode as <sos> + ids + <eos>, then pad/truncate while the rows are still
    # plain lists: building a NumPy array from ragged lists is an error on
    # recent NumPy versions.
    en_rows = [
        _fit_to_length([1] + [en2id[token] for token in line.split(' ')] + [0], max_seq_len)
        for line in en_data
    ]
    ch_rows = [
        _fit_to_length([1] + [ch2id[char] for char in line] + [0], max_seq_len)
        for line in ch_data
    ]

    en_num_data = np.array(en_rows, dtype=np.int32)
    ch_num_data = np.array(ch_rows, dtype=np.int32)
    return en_num_data, ch_num_data, en_vocab_size, ch_vocab_size


# Convert the parallel corpus into MindRecord train/eval files plus vocabulary files.
def _write_mindrecord(file_path, schema_json, records):
    """Write ``records`` to a MindRecord file at ``file_path`` using ``schema_json``."""
    writer = FileWriter(file_path)
    writer.add_schema(schema_json, "gru_schema")
    writer.write_raw_data(records)
    writer.commit()


def convert_to_mindrecord(data_path, mindrecord_save_path, max_seq_len, eval_size=20):
    """Preprocess the corpus and store it as MindRecord files.

    Calls ``prepare_data`` (which also writes the vocabulary files into
    ``mindrecord_save_path``), then writes every encoded sentence pair to
    ``gru_train.mindrecord`` and a random subset of those same pairs to
    ``gru_eval.mindrecord``.

    Args:
        data_path: Path of the tab-separated corpus file.
        mindrecord_save_path: Output directory for MindRecord and vocab files.
        max_seq_len: Sequence length forwarded to ``prepare_data``.
        eval_size: Number of pairs sampled for the evaluation file; capped at
            the number of available pairs.

    Returns:
        ``(en_vocab_size, ch_vocab_size)``.
    """
    en_num_data, ch_num_data, en_vocab_size, ch_vocab_size = prepare_data(
        data_path, mindrecord_save_path, max_seq_len)

    # English ids feed the encoder, Chinese ids feed the decoder.
    data_list_train = [
        {"encoder_data": np.array(en).astype(np.int32).reshape(-1),
         "decoder_data": np.array(de).astype(np.int32).reshape(-1)}
        for en, de in zip(en_num_data, ch_num_data)
    ]
    # random.sample raises ValueError when asked for more items than exist.
    data_list_eval = random.sample(data_list_train, min(eval_size, len(data_list_train)))

    schema_json = {"encoder_data": {"type": "int32", "shape": [-1]},
                   "decoder_data": {"type": "int32", "shape": [-1]}}
    _write_mindrecord(os.path.join(mindrecord_save_path, "gru_train.mindrecord"),
                      schema_json, data_list_train)
    _write_mindrecord(os.path.join(mindrecord_save_path, "gru_eval.mindrecord"),
                      schema_json, data_list_eval)

    print("en_vocab_size: ", en_vocab_size)
    print("ch_vocab_size: ", ch_vocab_size)

    return en_vocab_size, ch_vocab_size

### Seq2Seq构建

In [None]:
# Create the randomly initialised weights and biases used by the GRU layer.
def gru_default_state(batch_size, input_size, hidden_size, num_layers=1, bidirectional=False):
    '''Build the four GRU parameters, each sampled uniformly from [-stdv, stdv).

    ``stdv`` is ``1 / sqrt(hidden_size)``. ``batch_size``, ``num_layers`` and
    ``bidirectional`` are accepted but not used by this function.

    Returns:
        ``(weight_i, weight_h, bias_i, bias_h)`` as float32 Parameters with
        shapes ``(input_size, 3*hidden_size)``, ``(hidden_size, 3*hidden_size)``,
        ``(3*hidden_size,)`` and ``(3*hidden_size,)``.
    '''
    stdv = 1 / math.sqrt(hidden_size)
    gate_width = 3*hidden_size

    def _uniform_parameter(shape, name):
        # One float32 draw from the uniform distribution, wrapped as a Parameter.
        values = np.random.uniform(-stdv, stdv, shape).astype(np.float32)
        return Parameter(Tensor(values), name=name)

    # The draw order (input weight, hidden weight, input bias, hidden bias)
    # determines which random numbers each parameter receives.
    weight_i = _uniform_parameter((input_size, gate_width), 'weight_i')
    weight_h = _uniform_parameter((hidden_size, gate_width), 'weight_h')
    bias_i = _uniform_parameter(gate_width, 'bias_i')
    bias_h = _uniform_parameter(gate_width, 'bias_h')
    return weight_i, weight_h, bias_i, bias_h
# GRU layer built on the DynamicGRUV2 primitive.
class GRU(nn.Cell):
    """Single GRU layer whose weights come from ``gru_default_state``.

    Args:
        config: Object providing ``batch_size``, ``eval_batch_size`` and ``hidden_size``.
        is_training: Selects ``config.batch_size`` when true, otherwise
            ``config.eval_batch_size``.
    """
    def __init__(self, config, is_training=True):
        super().__init__()
        self.batch_size = config.batch_size if is_training else config.eval_batch_size
        self.hidden_size = config.hidden_size
        # Input size and hidden size are both config.hidden_size.
        initial_params = gru_default_state(self.batch_size, self.hidden_size, self.hidden_size)
        self.weight_i, self.weight_h, self.bias_i, self.bias_h = initial_params
        self.rnn = P.DynamicGRUV2()
        self.cast = P.Cast()

    def construct(self, x, hidden):
        # The input is cast to float16 before it reaches the primitive.
        x = self.cast(x, mstype.float16)
        # Only the first two of the six results are returned to the caller.
        step_outputs, step_hidden, _, _, _, _ = self.rnn(
            x, self.weight_i, self.weight_h, self.bias_i, self.bias_h, None, hidden)
        return step_outputs, step_hidden

# Encoder: embeds the source token ids and runs them through a GRU.
class Encoder(nn.Cell):
    """Embedding followed by a GRU, started from an all-zero hidden state.

    Args:
        config: Object providing ``en_vocab_size``, ``hidden_size``,
            ``batch_size`` and ``eval_batch_size``.
        is_training: Selects which batch size sizes the initial hidden state.
    """
    def __init__(self, config, is_training=True):
        super().__init__()
        self.vocab_size = config.en_vocab_size
        self.hidden_size = config.hidden_size
        self.batch_size = config.batch_size if is_training else config.eval_batch_size

        self.trans = P.Transpose()
        self.perm = (1, 0, 2)  # swaps the first two axes of the embedded input
        self.embedding = nn.Embedding(self.vocab_size, self.hidden_size)
        self.gru = GRU(config, is_training=is_training).to_float(mstype.float16)
        # Constant zero hidden state of shape (batch_size, hidden_size), float16.
        self.h = Tensor(np.zeros((self.batch_size, self.hidden_size), dtype=np.float16))

    def construct(self, encoder_input):
        embedded = self.embedding(encoder_input)
        embedded = self.trans(embedded, self.perm)
        output, hidden = self.gru(embedded, self.h)
        return output, hidden
# Decoder: GRU plus a dense output layer that scores every target-vocabulary entry.
class Decoder(nn.Cell):
    """Embedding, GRU, dense projection and log-softmax over the Chinese vocabulary.

    Args:
        config: Object providing ``ch_vocab_size`` and ``hidden_size`` (plus the
            fields read by ``GRU``).
        is_training: Forwarded to ``GRU``.
    """
    def __init__(self, config, is_training=True):
        super().__init__()

        self.vocab_size = config.ch_vocab_size
        self.hidden_size = config.hidden_size

        self.trans = P.Transpose()
        self.perm = (1, 0, 2)  # swaps the first two axes of the embedded input
        self.embedding = nn.Embedding(self.vocab_size, self.hidden_size)
        self.gru = GRU(config, is_training=is_training).to_float(mstype.float16)
        self.dense = nn.Dense(self.hidden_size, self.vocab_size)
        self.softmax = nn.LogSoftmax(axis=2)  # log-probabilities along axis 2
        self.cast = P.Cast()

    def construct(self, decoder_input, hidden):
        embedded = self.embedding(decoder_input)
        embedded = self.trans(embedded, self.perm)
        gru_output, hidden = self.gru(embedded, hidden)
        # Cast the GRU output to float32 before the dense layer.
        scores = self.dense(self.cast(gru_output, mstype.float32))
        log_probs = self.softmax(scores)

        return log_probs, hidden
# Seq2Seq model: Encoder + Decoder, with separate training and inference paths.
class Seq2Seq(nn.Cell):
    """Encoder-decoder network.

    With ``is_train`` true, ``construct`` runs the decoder once over the whole
    ``dst`` input and returns the decoder output. Otherwise it decodes step by
    step for ``max_len`` steps, feeding each step's arg-max result back as the
    next decoder input, and returns the per-step results concatenated on axis 1.
    """
    def __init__(self, config, is_train=True):
        super(Seq2Seq, self).__init__()
        self.max_len = config.max_seq_length # sequence length; also the number of decoding steps at inference
        self.is_train = is_train # selects the training or the inference path in construct

        self.encoder = Encoder(config, is_train) # source-side network
        self.decoder = Decoder(config, is_train) # target-side network
        self.expanddims = P.ExpandDims() # not used in construct
        self.squeeze = P.Squeeze(axis=0) # removes axis 0
        self.argmax = P.ArgMaxWithValue(axis=int(2), keep_dims=True) # arg-max along axis 2, keeping that axis
        self.concat = P.Concat(axis=1) # concatenation along axis 1
        self.concat2 = P.Concat(axis=0) # concatenation along axis 0; not used in construct
        self.select = P.Select() # not used in construct

    def construct(self, src, dst):
        encoder_output, hidden = self.encoder(src) # run the encoder; `hidden` is not used below
        # Initial decoder state: the slice of encoder_output at index max_len-2 along axis 0, with that axis removed.
        # NOTE(review): this is index max_len-2, not the last index (max_len-1) — confirm this is intended.
        decoder_hidden = self.squeeze(encoder_output[self.max_len-2:self.max_len-1:1, ::, ::])
        if self.is_train:
            # Training: the decoder consumes the full dst input in one call.
            outputs, _ = self.decoder(dst, decoder_hidden)
        else:
            # Inference: start from the first column of dst and decode one step at a time.
            decoder_input = dst[::,0:1:1]
            decoder_outputs = ()
            for i in range(0, self.max_len):
                decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden)
                # Remove the leading axis of the returned hidden state so it can be fed back in
                # (presumably [seq_length, batch_size, hidden_size] -> [batch_size, hidden_size] — TODO confirm).
                decoder_hidden = self.squeeze(decoder_hidden)
                decoder_output, _ = self.argmax(decoder_output) # keep the first result of ArgMaxWithValue
                decoder_output = self.squeeze(decoder_output) # remove the leading axis
                decoder_outputs += (decoder_output,)  # collect this step's prediction
                decoder_input = decoder_output # the prediction becomes the next decoder input
            outputs = self.concat(decoder_outputs) # join the per-step predictions along axis 1
        return outputs

class WithLossCell(nn.Cell):
    """Wrap a backbone network so that calling it returns the training loss.

    ``construct`` runs the backbone on ``(src, dst)``, applies ``NLLLoss`` to
    each of the ``config.batch_size`` samples separately and returns the sum of
    those losses divided by the batch size.
    """
    def __init__(self, backbone, config):
        super(WithLossCell, self).__init__(auto_prefix=False)
        self._backbone = backbone
        self.batch_size = config.batch_size
        self.onehot = nn.OneHot(depth=config.ch_vocab_size) # one-hot layer sized to the Chinese vocabulary; not used in construct
        self._loss_fn = NLLLoss() # per-sample loss function
        self.max_len = config.max_seq_length # not used in construct
        self.squeeze = P.Squeeze() # removes all size-1 axes
        self.cast = P.Cast() # not used in construct
        self.argmax = P.ArgMaxWithValue(axis=1, keep_dims=True) # not used in construct
        self.print = P.Print() # not used in construct

    def construct(self, src, dst, label):
        out = self._backbone(src, dst)
        loss_total = 0 # running sum of the per-sample losses
        for i in range(self.batch_size):
            # Sample i is a slice along axis 1 of `out` and along axis 0 of `label`;
            # the size-1 axis left by the slice is squeezed away before the loss call.
            loss = self._loss_fn(self.squeeze(out[::,i:i+1:1,::]), self.squeeze(label[i:i+1:1, ::]))
            loss_total += loss
        loss = loss_total / self.batch_size # average over the batch
        return loss

class InferCell(nn.Cell):
    """Thin inference wrapper that forwards ``(src, dst)`` to the wrapped network.

    ``config`` is accepted but not read.
    """
    def __init__(self, network, config):
        super().__init__(auto_prefix=False)
        self.expanddims = P.ExpandDims()  # kept as an attribute; not used in construct
        self.network = network

    def construct(self, src, dst):
        return self.network(src, dst)

### 损失函数

In [None]:
# Negative log-likelihood loss.
class NLLLoss(_Loss):
    '''
       Negative log-likelihood loss: takes log-probabilities and integer target
       labels and returns the reduced negative log-probability of the targets.
    '''
    def __init__(self, reduction='mean'):
        super().__init__(reduction)
        self.one_hot = P.OneHot()
        self.reduce_sum = P.ReduceSum()

    def construct(self, logits, label):
        # One-hot encode the labels over the size of the last axis of `logits`.
        num_classes = F.shape(logits)[-1]
        on_value = F.scalar_to_array(1.0)
        off_value = F.scalar_to_array(0.0)
        label_one_hot = self.one_hot(label, num_classes, on_value, off_value)
        # Multiplying by the one-hot mask keeps only the entry of the target
        # class; summing over axis 1 gives one negated value per row.
        masked = -1.0 * logits * label_one_hot
        loss = self.reduce_sum(masked, (1,))
        return self.get_loss(loss)

### 训练函数

In [None]:
def train():
    """Train the GRU Seq2Seq model with paths taken from the command line.

    Options:
        --dataset_path: Directory containing ``gru_train.mindrecord``
            (default ``./preprocess``).
        --ckpt_save_path: Directory where checkpoints are written (default ``./``).

    All other hyper-parameters come from the module-level ``cfg``.
    """
    parser = argparse.ArgumentParser(description='MindSpore GRU Example')
    parser.add_argument('--dataset_path', type=str, default='./preprocess', help='dataset path.')
    parser.add_argument('--ckpt_save_path', type=str, default='./', help='checkpoint save path.')
    # parse_known_args ignores arguments this parser does not define, such as
    # the "-f <connection file>" a Jupyter kernel adds to sys.argv, which makes
    # parse_args() exit with an error.
    args, _ = parser.parse_known_args()

    # Run in graph mode on an Ascend device.
    context.set_context(
        mode=context.GRAPH_MODE,
        save_graphs=False,
        device_target='Ascend')

    ds_train = create_dataset(args.dataset_path, cfg.batch_size)

    network = Seq2Seq(cfg)
    network = WithLossCell(network, cfg)  # the wrapped network returns the loss
    optimizer = nn.Adam(network.trainable_params(), learning_rate=cfg.learning_rate, beta1=0.9, beta2=0.98)
    model = Model(network, optimizer=optimizer)

    # Callbacks: step timing, periodic checkpoints and loss reporting.
    loss_cb = LossMonitor()
    config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps,
                                 keep_checkpoint_max=cfg.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix="gru", directory=args.ckpt_save_path, config=config_ck)
    time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
    callbacks = [time_cb, ckpoint_cb, loss_cb]

    model.train(cfg.num_epochs, ds_train, callbacks=callbacks, dataset_sink_mode=False)


### 评估函数

In [None]:
def _decode_ids(ids, vocab, suffix='', skip_ids=()):
    """Join the ``vocab`` entries for ``ids``, stopping at the first id 0.

    Ids listed in ``skip_ids`` are left out; ``suffix`` is appended after
    every emitted token.
    """
    pieces = []
    for token_id in ids:
        token_id = int(token_id)  # plain int so it can index a Python list
        if token_id == 0:  # id 0 is <eos> (also used as padding)
            break
        if token_id in skip_ids:
            continue
        pieces.append(vocab[token_id] + suffix)
    return ''.join(pieces)


def eval():
    """Translate the evaluation set and print source, reference and prediction.

    Options:
        --dataset_path: Directory containing ``gru_eval.mindrecord`` and the two
            vocabulary files (default ``./preprocess``).
        --checkpoint_path: Checkpoint to load; when empty, ``cfg.checkpoint_path``
            is used.
    """
    parser = argparse.ArgumentParser(description='MindSpore GRU Example')
    parser.add_argument('--dataset_path', type=str, default='./preprocess', help='dataset path.')
    parser.add_argument('--checkpoint_path', type=str, default='', help='checkpoint path.')
    # parse_known_args ignores arguments this parser does not define, such as
    # the "-f <connection file>" a Jupyter kernel adds to sys.argv.
    args, _ = parser.parse_known_args()

    # Run in graph mode on an Ascend device.
    context.set_context(
        mode=context.GRAPH_MODE,
        save_graphs=False,
        device_target='Ascend')

    ds_eval = create_dataset(args.dataset_path, cfg.eval_batch_size, is_training=False)

    network = Seq2Seq(cfg, is_train=False)
    network = InferCell(network, cfg)
    network.set_train(False)
    # An empty path can never be loaded, so fall back to the configured checkpoint.
    checkpoint_path = args.checkpoint_path or cfg.checkpoint_path
    parameter_dict = load_checkpoint(checkpoint_path)
    load_param_into_net(network, parameter_dict)
    model = Model(network)  # kept from the original code; not used below

    # Vocabulary files hold one token per line; the line index is the token id.
    with open(os.path.join(args.dataset_path, "en_vocab.txt"), 'r', encoding='utf-8') as f:
        en_vocab = f.read().split('\n')
    with open(os.path.join(args.dataset_path, "ch_vocab.txt"), 'r', encoding='utf-8') as f:
        ch_vocab = f.read().split('\n')

    for data in ds_eval.create_dict_iterator():
        # The iterator yields tensors; convert them to NumPy before comparing
        # ids and indexing the vocab lists (as the notebook-level evaluation
        # code in this file does).
        en_data = _decode_ids(data['encoder_data'][0].asnumpy(), en_vocab, suffix=' ')
        # Id 1 is <sos>, which is not part of the reference text.
        ch_data = _decode_ids(data['decoder_data'][0].asnumpy(), ch_vocab, skip_ids=(1,))
        output = network(data['encoder_data'], data['decoder_data'])
        print('English:', en_data)
        print('expect Chinese:', ch_data)
        out = _decode_ids(output[0].asnumpy(), ch_vocab)
        print('predict Chinese:', out)
        print(' ')

### 运行处理过程

In [None]:
# Write the vocabulary and MindRecord files into cfg.dataset_path, the
# directory the training and evaluation code below reads them from.
os.makedirs(cfg.dataset_path, exist_ok=True)
convert_to_mindrecord("cmn_zhsim.txt", cfg.dataset_path, MAX_SEQ_LEN)

### 模型训练

In [None]:
# Notebook-level training run, configured entirely from cfg.
ds_train = create_dataset(cfg.dataset_path, cfg.batch_size) # load the batched training dataset
network = Seq2Seq(cfg) # build the Seq2Seq network in training mode
network = WithLossCell(network, cfg) # wrap it so that it returns the loss
optimizer = nn.Adam(network.trainable_params(), learning_rate=cfg.learning_rate, beta1=0.9, beta2=0.98) # Adam optimizer
model = Model(network, optimizer=optimizer) # bundle network and optimizer for training

loss_cb = LossMonitor() # loss-monitoring callback
config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps, keep_checkpoint_max=cfg.keep_checkpoint_max) # checkpoint frequency and retention
ckpoint_cb = ModelCheckpoint(prefix="gru", directory=cfg.ckpt_save_path, config=config_ck) # writes "gru"-prefixed checkpoints into cfg.ckpt_save_path
time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) # timing callback
callbacks = [time_cb, ckpoint_cb, loss_cb] # callbacks passed to model.train

model.train(cfg.num_epochs, ds_train, callbacks=callbacks, dataset_sink_mode=True) # train for cfg.num_epochs epochs with dataset sink mode enabled

### 模型验证

In [None]:
# Notebook-level evaluation run: load a checkpoint, translate the evaluation
# samples and print source, reference and prediction for each one.
rank = 0 # not used below
device_num = 1 # number of devices; not used below
ds_eval= create_dataset(cfg.dataset_path, cfg.eval_batch_size, is_training=False) # load the evaluation dataset
network = Seq2Seq(cfg,is_train=False) # build the Seq2Seq network in inference mode
network = InferCell(network, cfg) # wrap it for inference
network.set_train(False) # switch to evaluation mode
parameter_dict = load_checkpoint(cfg.checkpoint_path) # read the parameters from the checkpoint file
load_param_into_net(network, parameter_dict) # copy the parameters into the network
model = Model(network) # not used below

# Vocabulary files hold one token per line; the line index is the token id.
with open(os.path.join(cfg.dataset_path,"en_vocab.txt"), 'r', encoding='utf-8') as f:
    data = f.read() # read the English vocabulary
en_vocab = list(data.split('\n'))

with open(os.path.join(cfg.dataset_path,"ch_vocab.txt"), 'r', encoding='utf-8') as f:
    data = f.read() # read the Chinese vocabulary
ch_vocab = list(data.split('\n'))

for data in ds_eval.create_dict_iterator():
    en_data='' # English source sentence being rebuilt
    ch_data='' # Chinese reference sentence being rebuilt
    for x in data['encoder_data'][0].asnumpy(): # ids of the first (only) sample in the batch
        if x == 0:
            break # id 0 (<eos>, also used as padding) ends the sentence
        en_data += en_vocab[x] # append the English token for this id
        en_data += ' '
    for x in data['decoder_data'][0].asnumpy():
        if x == 0:
            break # id 0 (<eos>, also used as padding) ends the sentence
        if x == 1:
            continue # id 1 is <sos>; skip it
        ch_data += ch_vocab[x] # append the Chinese token for this id
    output = network(data['encoder_data'],data['decoder_data']) # run the network to get the predicted ids
    print('English:', en_data) # print the English source
    print('expect Chinese:', ch_data) # print the reference translation
    out ='' # predicted sentence being rebuilt
    for x in output[0].asnumpy():
        if x == 0:
            break # stop at the first predicted id 0
        out += ch_vocab[x]
    print('predict Chinese:', out) # print the predicted translation
    print(' ')