项目参考:[https://github.com/zhaoyingjun/chatbot](https://github.com/zhaoyingjun/chatbot)


语言模型：
- 统计语言模型：根据贝叶斯公司计算每个单词出现的频率
- n-gram语言模型：根据马尔科夫假设，每个词仅与前面n个词有关系，缩短概率依赖的词的数量。比如2-gram，表示每个词仅与前面一个词有关系；3-gram表示每个词仅与前面2个词有关系。
- 神经网络语言模型：one-hot后输入神经网络，输出对应的向量作为表示

[](https://www.bilibili.com/video/BV1o4411R7B1?from=search&seid=9886358575655357837)

# RNN模型

# LSTM模型

# GRU模型

# Seq2Seq模型

# EncoderDecoder

# Attention机制


In [3]:
import tensorflow as tf
import io

In [35]:
# 数据准备，总共454130条
lines = io.open('./2-train_data/seq.data', encoding='UTF-8').read().strip().split('\n')
# 如：继续\t没有 继续 了
preprocess_lines = [['start '+w+' end' for w in l.split('\t')] for l in lines[:50000]]
# 如：['start 继续 end', 'start 没有 继续 了 end']
input_lang, target_lang = zip(*preprocess_lines)

In [45]:
"""
tf.keras.preprocessing.text.Tokenizer

num_words: 保留词频最高的词的数量
filters: 词典中需要过滤的词
lower: 是否转换成小写
split: 分词分隔符
char_level: 如果为true，每个字符作为一个词
oov_token: 用来代替没有出现在词典中的词(TODO 没弄明白这个是什么意思)
"""
# 分词，并转换成词典索引
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=20000)
tokenizer.fit_on_texts(input_lang + target_lang)
x_tensor = tokenizer.texts_to_sequences(input_lang)
y_tensor = tokenizer.texts_to_sequences(target_lang)
# 序列填充成相同维度
x_pad_tensor = tf.keras.preprocessing.sequence.pad_sequences(x_tensor, maxlen=20, padding='post')
y_pad_tensor = tf.keras.preprocessing.sequence.pad_sequences(y_tensor, maxlen=20, padding='post')
# 针对数据进行shuffle混排
dataset = tf.data.Dataset.from_tensor_slices((x_pad_tensor, y_pad_tensor)).shuffle(len(x_pad_tensor))
# 把数据切分成128大小
dataset = datasets.batch(128, drop_remainder=True)

In [44]:
dataset

<BatchDataset shapes: ((128, 128, 20), (128, 128, 20)), types: (tf.int32, tf.int32)>

In [3]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_size):
        super(Encoder, self).__init__()
        self.batch_size = batch_size
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
        self.gru = tf.keras.layers.GRU(self.enc_units, 
                                       return_sequence=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, inital_state=hidden)
        return output, state
    
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_size, self.enc_units))
    
class BahdanauAttention(tf.keras.Model):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)
        
    def call(self, query, values):
        hidden_with_time_axis = tf.expand_dims(query, 1)
        score = self.V(tf.nn.tanh(self.W1(values) + self.W2(hidden_with_time_axis)))
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights

class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_size):
        super(Decoder, self).__init__()
        self.batch_size = batch_size
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                      return_sequences=True,
                                      return_state=True,
                                      recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)
        self.attention = BahdanauAttention(self.dec_units)
    
    def call(self, x, hidden, enc_output):
        context_vector, attention_weights = self.attention(hidden, enc_output)
        x = self.embedding(x)
        x = tf.concat([tf.expand_dims(context_vector,1), x], axis=-1)
        output, state = self.gru(x)
        output = tf.reshape(output, (-1, output.shape[2]))
        outputs = self.fc(output)
        return outputs, state, attention_weights