# 完整问答系统实现

### 导入一些包

In [1]:
import os
import jieba
from zhon.hanzi import punctuation # 中文的一些符号
import re
import sys
import time
import tensorflow as tf
import io

import warnings
warnings.filterwarnings("ignore")

In [2]:
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # 这一行注释掉就是使用gpu，不注释就是使用cpu

- punctuation:
- ＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､\u3000、〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏﹑﹔·！？｡。

### 读取数据

In [3]:
conv_path = './data/xiaohuangji50w_nofenci.conv'

In [4]:
# 用于存储对话的列表
convs = []  
with open(conv_path, encoding='utf-8') as f:
    # 存储一次完整对话
    one_conv = []  
    for line in f:
        
        # 去除换行符，并将原文件中已经分词的标记去掉
        line = line.strip('\n').replace('?', '')  
        line = re.sub(r"[%s]+" % punctuation, "", line)
        
        if line == '':
            continue
        if line[0] == 'E':
            if one_conv:
                convs.append(one_conv)
            one_conv = []
        elif line[0] == 'M':
            
            # 将一次完整的对话存储下来
            one_conv.append(line.split(' ')[1])  

### 查看下数据

In [5]:
convs[:5]

[['呵呵', '是王若猫的'],
 ['不是', '那是什么'],
 ['怎么了', '我很难过安慰我~'],
 ['开心点哈,一切都会好起来', '嗯'],
 ['我还喜欢她,怎么办', '我帮你告诉她发短信还是打电话']]

### 中文分词

In [6]:
# 把对话分成问与答两个部分
seq = []

for conv in convs:
    
    if len(conv) == 1:
        continue
        
    if len(conv) % 2 != 0: 
        # 因为默认是一问一答的，所以需要进行数据的粗裁剪，对话行数要是偶数的
        conv = conv[:-1]
        
    for i in range(len(conv)):
        if i % 2 == 0:
            
            # 使用jieba分词器进行分词
            conv[i] = " ".join(jieba.cut(conv[i]))  
            conv[i + 1] = " ".join(jieba.cut(conv[i + 1]))
            
            # 因为i是从0开始的，因此偶数行为发问的语句，奇数行为回答的语句
            seq.append(conv[i] + '\t' + conv[i + 1])  

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Dumping model to file cache /tmp/jieba.cache
Dump cache file failed.
Traceback (most recent call last):
  File "/root/miniconda3/envs/dl/lib/python3.8/site-packages/jieba/__init__.py", line 154, in initialize
    _replace_file(fpath, cache_file)
PermissionError: [Errno 1] Operation not permitted: '/tmp/tmpchj81sl6' -> '/tmp/jieba.cache'
Loading model cost 0.685 seconds.
Prefix dict has been built successfully.


### 查看分完词后的QA 对应语料数据

In [7]:
seq[:10]

['呵呵\t是 王若 猫 的',
 '不是\t那 是 什么',
 '怎么 了\t我 很 难过 安慰 我 ~',
 '开心 点哈 , 一切 都 会 好 起来\t嗯',
 '我 还 喜欢 她 , 怎么办\t我 帮 你 告诉 她 发短信 还是 打电话',
 '短信\t嗯 嗯 我 也 相信',
 '你 知道 谁 么\t肯定 不是 我 是 阮德培',
 '许兵 是 谁\t吴院 四班 小帅哥',
 '这么 假\t三鹿 奶粉 也 假 不 一样 的 卖 啊',
 '许兵 是 傻 逼\t被 你 发现 了']

### 存储结果

In [8]:
seq_train = open('train_data/seq.data', 'w', encoding='utf-8')

for i in range(len(seq)):
    seq_train.write(seq[i] + '\n')

    if i % 1000 == 0:
        print(len(range(len(seq))), '处理进度：', i)

seq_train.close()

454130 处理进度： 0
454130 处理进度： 1000
454130 处理进度： 2000
454130 处理进度： 3000
454130 处理进度： 4000
454130 处理进度： 5000
454130 处理进度： 6000
454130 处理进度： 7000
454130 处理进度： 8000
454130 处理进度： 9000
454130 处理进度： 10000
454130 处理进度： 11000
454130 处理进度： 12000
454130 处理进度： 13000
454130 处理进度： 14000
454130 处理进度： 15000
454130 处理进度： 16000
454130 处理进度： 17000
454130 处理进度： 18000
454130 处理进度： 19000
454130 处理进度： 20000
454130 处理进度： 21000
454130 处理进度： 22000
454130 处理进度： 23000
454130 处理进度： 24000
454130 处理进度： 25000
454130 处理进度： 26000
454130 处理进度： 27000
454130 处理进度： 28000
454130 处理进度： 29000
454130 处理进度： 30000
454130 处理进度： 31000
454130 处理进度： 32000
454130 处理进度： 33000
454130 处理进度： 34000
454130 处理进度： 35000
454130 处理进度： 36000
454130 处理进度： 37000
454130 处理进度： 38000
454130 处理进度： 39000
454130 处理进度： 40000
454130 处理进度： 41000
454130 处理进度： 42000
454130 处理进度： 43000
454130 处理进度： 44000
454130 处理进度： 45000
454130 处理进度： 46000
454130 处理进度： 47000
454130 处理进度： 48000
454130 处理进度： 49000
454130 处理进度： 50000
454130 处理进度： 51000
454130 处理进度： 52000
454130

In [9]:
len(seq)

454130

In [10]:
train_src = 'train_data/seq.data'
max_train_data_size = 50000
vocab_inp_size = 20000
enc_vocab_size = 20000
vocab_tar_size = 20000
embedding_dim = 128
units = 256
BATCH_SIZE = 32
max_length_inp, max_length_tar = 20, 20


# 预处理所有数据，给所有数据补全开头和结尾，主要目的在序列生成的时候有个统一的向量开始，也有一个统一的向量结束
def preprocess_sentence(w):
    w = 'start ' + w + ' end'
    return w

# 创建数据集合
def create_dataset(path, num_examples):
    lines = io.open(path, encoding='UTF-8').read().strip().split('\n')
    
    # 对所有数据进行创建
    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')] for l in lines[:num_examples]]

    return zip(*word_pairs)


# 获取最大的长度
def max_length(tensor):
    return max(len(t) for t in tensor)


# 读取所有sample数据
def read_data(path, num_examples):
    input_lang, target_lang = create_dataset(path, num_examples)

    # 组建输入词表
    input_tensor, input_token = tokenize(input_lang)
    
    # 组建输出词表
    target_tensor, target_token = tokenize(target_lang)

    return input_tensor, input_token, target_tensor, target_token


def tokenize(lang):
    # 用来自动构建词表
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=enc_vocab_size, oov_token='unk')
    lang_tokenizer.fit_on_texts(lang)

    # 将构建的词表映射到所有的词上面转化成序列
    tensor = lang_tokenizer.texts_to_sequences(lang)

    # 向后自动补全
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, maxlen=max_length_inp, padding='post')

    return tensor, lang_tokenizer


input_tensor, input_token, target_tensor, target_token = read_data(train_src, max_train_data_size)

In [11]:
# max_length_inp

In [12]:
target_token.index_word

{1: 'unk',
 2: 'start',
 3: 'end',
 4: '我',
 5: '你',
 6: '的',
 7: '了',
 8: '是',
 9: '啊',
 10: '不',
 11: '啦',
 12: '好',
 13: '就',
 14: '他',
 15: '吧',
 16: '在',
 17: '有',
 18: '都',
 19: '说',
 20: '主人',
 21: '去',
 22: '那',
 23: '吃',
 24: '爱',
 25: '也',
 26: '么',
 27: '呢',
 28: '就是',
 29: '不要',
 30: '知道',
 31: '什么',
 32: '想',
 33: '她',
 34: '喜欢',
 35: '嗯',
 36: '会',
 37: '给',
 38: '嘛',
 39: '要',
 40: '很',
 41: '吗',
 42: '呀',
 43: '哦',
 44: '人',
 45: '还',
 46: '最',
 47: '怎么',
 48: '没',
 49: '不是',
 50: '才',
 51: '对',
 52: '一个',
 53: '当然',
 54: '小通',
 55: '人家',
 56: '大',
 57: '和',
 58: '来',
 59: '小',
 60: '个',
 61: '鸡',
 62: '没有',
 63: "'",
 64: '这',
 65: '叫',
 66: '谁',
 67: '着',
 68: '又',
 69: '看',
 70: '傻',
 71: '跟',
 72: '让',
 73: '可爱',
 74: '不会',
 75: '猜',
 76: '被',
 77: 'o',
 78: '多',
 79: '因为',
 80: '把',
 81: '逼',
 82: '自己',
 83: '上',
 84: '死',
 85: '可以',
 86: '呵呵',
 87: '我们',
 88: '别',
 89: '做',
 90: '一',
 91: '到',
 92: '还是',
 93: '这样',
 94: '陪',
 95: '老公',
 96: '等',
 97: '好吃',
 98: '睡

In [13]:
input_token.index_word

{1: 'unk',
 2: 'start',
 3: 'end',
 4: '你',
 5: '我',
 6: '了',
 7: '是',
 8: '的',
 9: '不',
 10: '吗',
 11: '什么',
 12: '说',
 13: '谁',
 14: '好',
 15: '吃',
 16: '鸡',
 17: '那',
 18: '么',
 19: '有',
 20: '怎么',
 21: '去',
 22: '给',
 23: '喜欢',
 24: '就',
 25: '想',
 26: '在',
 27: '知道',
 28: '会',
 29: '都',
 30: '啊',
 31: '还',
 32: '个',
 33: '不是',
 34: '没',
 35: '笑话',
 36: '爱',
 37: '他',
 38: '也',
 39: '小通',
 40: '人',
 41: '和',
 42: '来',
 43: '又',
 44: '傻',
 45: '还是',
 46: '一个',
 47: '吧',
 48: '叫',
 49: '小',
 50: '呵呵',
 51: '鸡鸡',
 52: '要',
 53: '就是',
 54: '啥',
 55: '跟',
 56: '怎么办',
 57: '嗯',
 58: '这',
 59: '死',
 60: '逼',
 61: '对',
 62: '为什么',
 63: '这么',
 64: '睡觉',
 65: '嘛',
 66: '看',
 67: '我要',
 68: '能',
 69: '没有',
 70: '很',
 71: '睡',
 72: '快',
 73: '讲个',
 74: '她',
 75: '才',
 76: '不要',
 77: '不会',
 78: '多少',
 79: '多',
 80: '真',
 81: '哪',
 82: '今天',
 83: '认识',
 84: '把',
 85: '找',
 86: '大',
 87: '妹',
 88: '无聊',
 89: '猜',
 90: '问',
 91: '你妹',
 92: '自己',
 93: '干嘛',
 94: '运势',
 95: '回复',
 96: '主人',
 97: '让'

In [14]:
input_token.word_index.get('end')

3

In [15]:
input_token.word_index.get('start', 3)

2

In [16]:
input_tensor[0]

array([ 2, 50,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0], dtype=int32)

In [17]:
input_token.word_index.get('你', 3)

4

In [18]:
input_token.word_index.get('你好', 3)

101

### GRU模型概要
<img src="./img/42.PNG" alt="FAO" width="500">
<img src="./img/43.PNG" alt="FAO" width="300">

- h t 表示 t 时刻的 hidden state
- y t 表示 t 时刻的 encode ouput

### 构建encoder模型

In [19]:
# 继承tf模型
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        
        # 隐藏层数量
        self.enc_units = enc_units
        
        # 根据词表数量和维度建立嵌入层
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        
        # 设置GRU模型  
        # Glorot均匀分布初始化方法, 参数从[-limit, limit]的均匀分布产生，其中limit为sqrt(6 / (fan_in + fan_out))。
        # fan_in为权值张量的输入单元数，fan_out是权重张量的输出单元数
        self.gru = tf.keras.layers.GRU(self.enc_units, return_sequences=True, return_state=True,
                                       recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        
        # 正向传播的embedding
        x = self.embedding(x)
        
        # 输出结果， 输出hidden state
        output, state = self.gru(x, initial_state=hidden)
        return output, state

    # 初始化权重
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

### 实例化encoder

In [20]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

### 构建attention 模型

In [21]:
class Attention(tf.keras.Model):
    def __init__(self, units):
        super(Attention, self).__init__()
        
        # 权重矩阵 和 hidden 维度必须保持一致
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        
        # V矩阵为了输出一个值
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):  ##values z query y
        # hidden shape == (batch_size, hidden size)
        # hidden_with_time_axis shape == (batch_size=32, 1, hidden size=256)
        # 将向量增加一个维度 
        
        # query --> hidden
        hidden_with_time_axis = tf.expand_dims(query, 1)

        # score shape == (batch_size=32, max_length=20, 1)
        # values --> enc_output (batch_size=32, max_length=20, hidden_size=256)
        score = self.V(tf.nn.tanh(
            self.W1(values) + self.W2(hidden_with_time_axis)))
        
        # attention_weights shape == (batch_size=32, max_length=20, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size=32, hidden_size=256)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [22]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        
        # decode维度数量
        self.dec_units = dec_units
        
        # 建立词向量
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        
        # 设置GRU为解码模型
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        
        # 创建全连接
        self.fc = tf.keras.layers.Dense(vocab_size)

        # 创建Attention
        self.attention = Attention(self.dec_units)

    def call(self, x, hidden, enc_output):
        
        # x --> dec_input
        # hidden --> dec_hidden
        # enc_output --> enc_output
        # context_vector (batch_size=32, hidden_size=256), attention_weights (batch_size=32, max_length=20, 1)
        context_vector, attention_weights = self.attention(hidden, enc_output)

        # x shape (batch_size=32, 1, hidden_size=128)
        x = self.embedding(x)
        
        # x shape (batch_size=32, 1, hidden_size= 384 =128 + 256)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        
        # output shape (32, 1, 256)
        # output state (32, 1, 256)
        output, state = self.gru(x)

        # output shape (32, 256)
        output = tf.reshape(output, (-1, output.shape[2]))

        # x shape (32, 20000)
        x = self.fc(output)
        
        # 对应结果
        # x -> predictions, 
        # state -> dec_hidden
        # attention_weights -> _
        return x, state, attention_weights

### 实例化Decoder

In [23]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

### 优化器和loss函数定义

In [24]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

### 自定义loss

In [25]:
def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

In [26]:
checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)


@tf.function
def train_step(inp, targ, targ_lang, enc_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)

        dec_hidden = enc_hidden

        dec_input = tf.expand_dims([targ_lang.word_index['start']] * BATCH_SIZE, 1)

        for t in range(1, targ.shape[1]):
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

            loss += loss_function(targ[:, t], predictions)

            dec_input = tf.expand_dims(targ[:, t], 1)

    batch_loss = (loss / int(targ.shape[1]))

    variables = encoder.trainable_variables + decoder.trainable_variables

    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss


In [27]:
checkpoint_dir = 'checkpoint'

### 构建训练函数

In [28]:
def train(save_dir):
    checkpoint_dir = save_dir
    print("Preparing data in %s" % train_src)
    steps_per_epoch = len(input_tensor) // BATCH_SIZE
    print(steps_per_epoch)
    
    # enc_hidden 一个初始化权重
    enc_hidden = encoder.initialize_hidden_state()
    
    # 获取最新的一次结果
    ckpt = tf.io.gfile.listdir(checkpoint_dir)
    if ckpt:
        # 如果存在则获取最新预训练结果
        print("reload pretrained model")
        checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
        
    # 缓存大小
    BUFFER_SIZE = len(input_tensor)
    
    # 以最小为BUFFER_SIZE的方式打乱数据
    dataset = tf.data.Dataset.from_tensor_slices((input_tensor, target_tensor)).shuffle(BUFFER_SIZE)
    
    # 设置 BATCH_SIZE大小
    dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
    
    # 设置 checkpoint_dir 文件目录
    checkpoint_dir = save_dir
    checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
    start_time = time.time()

    while True:
        start_time_epoch = time.time()
        
        # 设置全局loss
        total_loss = 0
        
        
        # 按batch 获取数据
        for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
            
            # 获取一个batch的loss
            batch_loss = train_step(inp, targ, target_token, enc_hidden)
            
            # 获取全部的loss
            total_loss += batch_loss
            
#             print('每次batch 的loss:', batch_loss.numpy())

        # 获取最新每步耗时
        step_time_epoch = (time.time() - start_time_epoch) / steps_per_epoch
        
        # 获取每步的loss
        step_loss = total_loss / steps_per_epoch
        
        # 获取最近的epoch
        current_steps =+ steps_per_epoch
        
        # 获取每步耗时
        step_time_total = (time.time() - start_time) / current_steps

        # 为了观测数据
        print('训练总步数: {} 每步耗时: {}  最新每步耗时: {} 最新每步loss {:.4f}'.format(current_steps, step_time_total, step_time_epoch,
                                                                      step_loss.numpy()))
        
        # 存下最新的checkpoint
        checkpoint.save(file_prefix=checkpoint_prefix)

        sys.stdout.flush()

In [29]:
input_token.word_index

{'unk': 1,
 'start': 2,
 'end': 3,
 '你': 4,
 '我': 5,
 '了': 6,
 '是': 7,
 '的': 8,
 '不': 9,
 '吗': 10,
 '什么': 11,
 '说': 12,
 '谁': 13,
 '好': 14,
 '吃': 15,
 '鸡': 16,
 '那': 17,
 '么': 18,
 '有': 19,
 '怎么': 20,
 '去': 21,
 '给': 22,
 '喜欢': 23,
 '就': 24,
 '想': 25,
 '在': 26,
 '知道': 27,
 '会': 28,
 '都': 29,
 '啊': 30,
 '还': 31,
 '个': 32,
 '不是': 33,
 '没': 34,
 '笑话': 35,
 '爱': 36,
 '他': 37,
 '也': 38,
 '小通': 39,
 '人': 40,
 '和': 41,
 '来': 42,
 '又': 43,
 '傻': 44,
 '还是': 45,
 '一个': 46,
 '吧': 47,
 '叫': 48,
 '小': 49,
 '呵呵': 50,
 '鸡鸡': 51,
 '要': 52,
 '就是': 53,
 '啥': 54,
 '跟': 55,
 '怎么办': 56,
 '嗯': 57,
 '这': 58,
 '死': 59,
 '逼': 60,
 '对': 61,
 '为什么': 62,
 '这么': 63,
 '睡觉': 64,
 '嘛': 65,
 '看': 66,
 '我要': 67,
 '能': 68,
 '没有': 69,
 '很': 70,
 '睡': 71,
 '快': 72,
 '讲个': 73,
 '她': 74,
 '才': 75,
 '不要': 76,
 '不会': 77,
 '多少': 78,
 '多': 79,
 '真': 80,
 '哪': 81,
 '今天': 82,
 '认识': 83,
 '把': 84,
 '找': 85,
 '大': 86,
 '妹': 87,
 '无聊': 88,
 '猜': 89,
 '问': 90,
 '你妹': 91,
 '自己': 92,
 '干嘛': 93,
 '运势': 94,
 '回复': 95,
 '主人': 96,
 '让': 97

### 预测函数

In [30]:
tf.expand_dims([target_token.word_index['start']], 1)

<tf.Tensor: shape=(1, 1), dtype=int32, numpy=array([[2]], dtype=int32)>

In [31]:
def predict(sentence, model_path):
    
    # 获取模型checkpoint_dir目录
    checkpoint_dir = model_path
    
    # 获取最后一次训练的模型
    checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

    # 预处理数据
    sentence = preprocess_sentence(sentence)
    
    # 将所有的文字转换成数值, 未登入词设置为1
    inputs = [input_token.word_index.get(i, 1) for i in sentence.split(' ')]

    # 将所有词自动往后最大补全
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs], maxlen=max_length_inp, padding='post')
    
    # 转换成tf的向量
    inputs = tf.convert_to_tensor(inputs)

    # 设置默认结果为空
    result = ''

    # 初始化hidden向量
    hidden = [tf.zeros((1, units))]
    
    # 获取encoder后的结果, 之后会全局使用
    enc_out, enc_hidden = encoder(inputs, hidden)

    # 讲enc_hidden 结果传递给 dec_hidden
    dec_hidden = enc_hidden
    
    # 初始化一个input值 [[2]], 0表示维度坐标系
    dec_input = tf.expand_dims([target_token.word_index['start']], 0)

    # 循环最大长度次
    for t in range(max_length_tar):
        
        # 第一次 dec_input = 'start'
        # dec_input 第二次开始就是自己预测的结果做为 dec_input
        
        # 第一次 dec_hidden = enc_hidden
        # dec_hidden 第二次开始就是自己预测的结果做为 dec_hidden
        predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)

        # predictions 为输出的概率序列, predicted_id是序列中概率最大的索引结果
        predicted_id = tf.argmax(predictions[0]).numpy()

        # 如果预测的结果为 end 那么就停止
        if target_token.index_word[predicted_id] == 'end':
            break
            
        # 输出结果从词表中不断拼接
        result += target_token.index_word[predicted_id] + ' '

        # 结果重置为dec_input
        dec_input = tf.expand_dims([predicted_id], 0)

    return result

### 开始训练

In [32]:
train(checkpoint_dir)

Preparing data in train_data/seq.data
1562
reload pretrained model
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
训练总步数: 1562 每步耗时: 0.1446536066590137  最新每步耗时: 0.14465349752374862 最新每步loss 0.3277
训练总步数: 1562 每步耗时: 0.2814886

KeyboardInterrupt: 

### 运行app
- Windows 可以直接运行
- Linux jupyter 执行 ssh -p 22 -L 16006:127.0.0.1:8809 root@47.95.198.64
- http://127.0.0.1:16006

In [None]:
from flask import Flask, render_template, request, jsonify
import execute
import time
import threading
import jieba

"""
定义心跳检测函数
"""


def heartbeat():
    print(time.strftime('%Y-%m-%d %H:%M:%S - heartbeat', time.localtime(time.time())))
    timer = threading.Timer(60, heartbeat)
    timer.start()


timer = threading.Timer(60, heartbeat)
timer.start()


app = Flask(__name__, static_url_path="/static")


@app.route('/message', methods=['POST'])
# """定义应答函数，用于获取输入信息并返回相应的答案"""
def reply():
    # 从请求中获取参数信息
    req_msg = request.form['msg']
    # 将语句使用结巴分词进行分词
    req_msg = " ".join(jieba.cut(req_msg))

    # 调用decode_line对生成回答信息
    model_path = 'model_data'
    res_msg = execute.predict(req_msg, model_path)
    # 将unk值的词用微笑符号袋贴
    res_msg = res_msg.replace('_UNK', '^_^')
    res_msg = res_msg.strip()
    print(res_msg)
    # 如果接受到的内容为空，则给出相应的回复
    if res_msg == ' ':
        res_msg = '请与我聊聊天吧'

    return jsonify({'text': res_msg})


"""
jsonify:是用于处理序列化json数据的函数，就是将数据组装成json格式返回

http://flask.pocoo.org/docs/0.12/api/#module-flask.json
"""


@app.route("/")
def index():
    return render_template("index.html")


'''
'''
# 启动APP
if (__name__ == "__main__"):
    app.run(host='0.0.0.0', port=8824)


 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://0.0.0.0:22488/ (Press CTRL+C to quit)


2021-08-06 14:15:09 - heartbeat
2021-08-06 14:16:09 - heartbeat
2021-08-06 14:17:09 - heartbeat
2021-08-06 14:18:09 - heartbeat
2021-08-06 14:19:09 - heartbeat
2021-08-06 14:20:09 - heartbeat
2021-08-06 14:21:09 - heartbeat
2021-08-06 14:22:09 - heartbeat
2021-08-06 14:23:09 - heartbeat
2021-08-06 14:24:09 - heartbeat
2021-08-06 14:25:09 - heartbeat
2021-08-06 14:26:09 - heartbeat
2021-08-06 14:27:09 - heartbeat
2021-08-06 14:28:09 - heartbeat
2021-08-06 14:29:09 - heartbeat
2021-08-06 14:30:09 - heartbeat
2021-08-06 14:31:09 - heartbeat
2021-08-06 14:32:09 - heartbeat
2021-08-06 14:33:09 - heartbeat
2021-08-06 14:34:09 - heartbeat
2021-08-06 14:35:09 - heartbeat
2021-08-06 14:36:09 - heartbeat
2021-08-06 14:37:09 - heartbeat
2021-08-06 14:38:09 - heartbeat
2021-08-06 14:39:09 - heartbeat
2021-08-06 14:40:09 - heartbeat
2021-08-06 14:41:09 - heartbeat
2021-08-06 14:42:09 - heartbeat
2021-08-06 14:43:09 - heartbeat
2021-08-06 14:44:09 - heartbeat
2021-08-06 14:45:09 - heartbeat
2021-08-

2021-08-06 18:36:09 - heartbeat
2021-08-06 18:37:09 - heartbeat
2021-08-06 18:38:09 - heartbeat
2021-08-06 18:39:09 - heartbeat
2021-08-06 18:40:09 - heartbeat
2021-08-06 18:41:09 - heartbeat
2021-08-06 18:42:09 - heartbeat
2021-08-06 18:43:09 - heartbeat
2021-08-06 18:44:09 - heartbeat
2021-08-06 18:45:09 - heartbeat
2021-08-06 18:46:09 - heartbeat
2021-08-06 18:47:09 - heartbeat
2021-08-06 18:48:09 - heartbeat
2021-08-06 18:49:09 - heartbeat
2021-08-06 18:50:09 - heartbeat
2021-08-06 18:51:09 - heartbeat
2021-08-06 18:52:09 - heartbeat
2021-08-06 18:53:09 - heartbeat
2021-08-06 18:54:09 - heartbeat
2021-08-06 18:55:09 - heartbeat
2021-08-06 18:56:09 - heartbeat
2021-08-06 18:57:09 - heartbeat
2021-08-06 18:58:09 - heartbeat
2021-08-06 18:59:09 - heartbeat
2021-08-06 19:00:09 - heartbeat
2021-08-06 19:01:09 - heartbeat
2021-08-06 19:02:09 - heartbeat
2021-08-06 19:03:09 - heartbeat
2021-08-06 19:04:09 - heartbeat
2021-08-06 19:05:09 - heartbeat
2021-08-06 19:06:09 - heartbeat
2021-08-