In [1]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense
from keras import callbacks
import numpy as np

Using TensorFlow backend.


In [2]:
# Basic hyperparameters and configuration
batch_size = 64
epochs = 100
latent_dim = 256 # number of LSTM units
num_samples = 10000 # upper bound on the number of sentence pairs used for training


# Path to the corpus file, relative to the notebook's working directory
data_path = '../data/cmn.txt'

In [3]:
# Containers for the sentence pairs and the two character vocabularies
input_texts, target_texts = [], []
input_characters, target_characters = set(), set()

# Read the whole corpus at once and break it into lines
with open(data_path, mode='r', encoding='utf-8') as corpus_file:
    lines = corpus_file.read().split('\n')
# Preview the first few raw lines
lines[:20]


['Hi.\t嗨。',
 'Hi.\t你好。',
 'Run.\t你用跑的。',
 'Wait!\t等等！',
 'Hello!\t你好。',
 'I try.\t让我来。',
 'I won!\t我赢了。',
 'Oh no!\t不会吧。',
 'Cheers!\t乾杯!',
 'He ran.\t他跑了。',
 'Hop in.\t跳进来。',
 'I lost.\t我迷失了。',
 'I quit.\t我退出。',
 "I'm OK.\t我沒事。",
 'Listen.\t听着。',
 'No way!\t不可能！',
 'No way!\t没门！',
 'Really?\t你确定？',
 'Try it.\t试试吧。',
 'We try.\t我们来试试。']

In [4]:
# Build the sentence-pair lists and the character vocabularies from `lines`.
#
# The accumulators are (re)created here so that this cell can be re-run safely:
# it neither appends duplicates to lists filled by a previous run nor calls
# .add() on the sorted lists that this same cell produces at the end.
input_texts = []
target_texts = []
input_char_set = set()
target_char_set = set()

for line in lines[: min(num_samples, len(lines) - 1)]:
    # Split into source and target sentence. Blank or malformed lines are
    # skipped, and any extra tab-separated columns are ignored, instead of
    # crashing the tuple unpacking with a ValueError.
    fields = line.split('\t')
    if len(fields) < 2:
        continue
    input_text, target_text = fields[0], fields[1]

    # '\t' marks the start of a target sequence,
    # '\n' marks its end
    target_text = '\t' + target_text + '\n'

    input_texts.append(input_text)
    target_texts.append(target_text)

    # Collect the distinct characters (tokens) on each side
    input_char_set.update(input_text)
    target_char_set.update(target_text)

# Sorted vocabularies give every character a stable index
input_characters = sorted(input_char_set)
target_characters = sorted(target_char_set)
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
max_encoder_seq_length = max(len(txt) for txt in input_texts)
max_decoder_seq_length = max(len(txt) for txt in target_texts)

print('Nunmber of samples:', len(input_texts))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length of input:', max_encoder_seq_length)
print('Max sequence length of outputs:', max_decoder_seq_length)


Nunmber of samples: 10000
Number of unique input tokens: 73
Number of unique output tokens: 2622
Max sequence length of input: 30
Max sequence length of outputs: 22


In [5]:
# Character -> integer index lookup tables, used to vectorize the text
input_token_index = {char: idx for idx, char in enumerate(input_characters)}
target_token_index = {char: idx for idx, char in enumerate(target_characters)}


In [6]:
# Allocate the (samples, time steps, vocabulary) tensors, initially all zeros
num_pairs = len(input_texts)
encoder_shape = (num_pairs, max_encoder_seq_length, num_encoder_tokens)
decoder_shape = (num_pairs, max_decoder_seq_length, num_decoder_tokens)

encoder_input_data = np.zeros(encoder_shape, dtype=np.float32)
decoder_input_data = np.zeros(decoder_shape, dtype=np.float32)
decoder_target_data = np.zeros(decoder_shape, dtype=np.float32)

# Fill the tensors: every character becomes a one-hot vector
for sample_idx, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    # Encoder input
    for step, char in enumerate(input_text):
        encoder_input_data[sample_idx, step, input_token_index[char]] = 1.0

    # Decoder input and decoder target
    for step, char in enumerate(target_text):
        token = target_token_index[char]
        decoder_input_data[sample_idx, step, token] = 1.0

        # The target omits the start character and therefore runs one
        # time step ahead of the decoder input
        if step >= 1:
            decoder_target_data[sample_idx, step - 1, token] = 1.0


In [7]:
# Encoder input.
# Shape (None, num_encoder_tokens): None leaves the time dimension open, so sequences of any length are accepted
encoder_inputs = Input(shape=(None, num_encoder_tokens))

# Encoder LSTM, asked to return its states as well
encoder = LSTM(latent_dim, return_state=True)

# Call the encoder: yields its output (not actually needed) plus the states state_h and state_c
encoder_outpus, state_h, state_c = encoder(encoder_inputs)

# The encoder output is discarded; only the states are kept
encoder_state = [state_h, state_c]


In [8]:
# Decoder input.
# As for the encoder, None means sequences of any length are accepted
decoder_inputs = Input(shape=(None, num_decoder_tokens))

# Decoder LSTM: returns the full output sequence
# and also its states; the states are not used during training but are needed at inference time
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)

# The encoder states serve as the decoder's initial state
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_state)

# Fully connected softmax layer over the target vocabulary
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)


In [9]:
# Full training model: (encoder input, decoder input) -> decoder output
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

# Callback: stop training early, with a patience of 5 epochs
early_stopping = callbacks.EarlyStopping(patience=5)
callback_list = [early_stopping]

# Train; a validation_split of 0.1 holds out 10% of the samples for validation
model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
    callbacks=callback_list,
)


Train on 9000 samples, validate on 1000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100


<keras.callbacks.History at 0x7f09f1dfb208>

In [10]:
# Sampling (inference) models, built from the layers trained above.
# Encoder model: maps an input sequence to the encoder states
encoder_model = Model(encoder_inputs, encoder_state)

# Explicit inputs through which the decoder states are fed in at each inference step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Decoder output and updated states.
# NOTE: this rebinds decoder_outputs, state_h and state_c, which earlier cells used for the training graph
decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

# Decoder model: (decoder input, state h, state c) -> (token probabilities, new state h, new state c)
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs]+decoder_states)


In [11]:
# Integer index -> character lookup tables, used to turn predictions back into text
reverse_input_char_index = {idx: char for char, idx in input_token_index.items()}
reverse_target_char_index = {idx: char for char, idx in target_token_index.items()}


In [12]:
def decode_sequence(input_seq):
    """Greedily decode one input sequence, one character per decoder call.

    The input is encoded once with ``encoder_model``. The decoder is then
    called repeatedly with only the most recently sampled character and the
    states returned by the previous call.

    Args:
        input_seq: encoder input passed to ``encoder_model.predict``; the
            code assumes a batch of size 1 (only batch element 0 is read).

    Returns:
        The decoded string. It ends with ``'\\n'`` when the end character was
        produced; otherwise decoding stops once the string is longer than
        ``max_decoder_seq_length``.
    """
    def one_hot_step(token_index):
        # A (1, 1, vocabulary) tensor holding a single one-hot character
        step = np.zeros((1, 1, num_decoder_tokens))
        step[0, 0, token_index] = 1.
        return step

    # Encode the input; the resulting states seed the decoder
    states_value = encoder_model.predict(input_seq)

    # Decoding starts from the start-of-sequence character
    target_seq = one_hot_step(target_token_index['\t'])

    decoded_sentence = ''
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Greedy choice: most probable token at the last time step
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Stop on the end character or when the output grows too long
        if sampled_char == '\n' or len(decoded_sentence) > max_decoder_seq_length:
            break

        # Next step: feed the sampled character and the updated states
        target_seq = one_hot_step(sampled_token_index)
        states_value = [h, c]

    return decoded_sentence


In [27]:
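# decode_sequence_new: at every step, feed ALL previously predicted characters back into the decoder.
# decode_sequence:     at every step, feed only the most recent prediction and carry the updated hidden state.
#
# Both approaches produce exactly the same output.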

def decode_sequence_new(input_seq):
    """Greedily decode one input sequence by re-feeding the whole prefix.

    Unlike ``decode_sequence``, the decoder states returned by each call are
    discarded. Every step restarts from the encoder states and feeds the
    start character followed by all characters decoded so far; the
    prediction at the last time step is the next character.

    Args:
        input_seq: encoder input passed to ``encoder_model.predict``; the
            code assumes a batch of size 1 (only batch element 0 is read).

    Returns:
        The decoded string. It ends with ``'\\n'`` when the end character was
        produced; otherwise decoding stops once the string is longer than
        ``max_decoder_seq_length``.
    """
    # Encode the input; these states are reused unchanged at every step
    states_value = encoder_model.predict(input_seq)
    start_index = target_token_index['\t']

    # Initial decoder input: just the start character
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, start_index] = 1.

    decoded_sentence = ''
    while True:
        output_tokens, _, _ = decoder_model.predict([target_seq] + states_value)

        # Greedy choice: most probable token at the last time step
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Stop on the end character or when the output grows too long.
        # Leaving the loop here avoids rebuilding a prefix that would never be used.
        if sampled_char == '\n' or len(decoded_sentence) > max_decoder_seq_length:
            break

        # Rebuild the decoder input: start character + everything decoded so far
        target_seq = np.zeros((1, 1 + len(decoded_sentence), num_decoder_tokens))
        target_seq[0, 0, start_index] = 1.
        for step, prev_char in enumerate(decoded_sentence, start=1):
            target_seq[0, step, target_token_index[prev_char]] = 1.

    return decoded_sentence


In [29]:
# Time to check the results: decode a slice of the training set with decode_sequence.
# The output looks reasonable, which is no surprise since these sentences were seen during training.
first_index, stop_index = 1000, 1100
for seq_index in range(first_index, stop_index):
    # Slicing (instead of indexing) keeps a batch dimension of size 1
    input_seq = encoder_input_data[seq_index:seq_index + 1]
    source_sentence = input_texts[seq_index]
    decoded_sentence = decode_sequence(input_seq)

    print('-')
    print('Input sentence:', source_sentence)
    print('Decoded sentence:', decoded_sentence)


-
Input sentence: I can swim well.
Decoded sentence: 我不能在这里。

-
Input sentence: I can't see you.
Decoded sentence: 我不能幫忙。

-
Input sentence: I can't undo it.
Decoded sentence: 我不能忍受。

-
Input sentence: I don't want it.
Decoded sentence: 我不想要那個。

-
Input sentence: I feel relieved.
Decoded sentence: 我有一個好見。

-
Input sentence: I get up at six.
Decoded sentence: 我有一個好主意。

-
Input sentence: I had no choice.
Decoded sentence: 我有一个好。

-
Input sentence: I hate studying.
Decoded sentence: 我有一个好。

-
Input sentence: I have brothers.
Decoded sentence: 我有一個意見。

-
Input sentence: I have ten pens.
Decoded sentence: 我有一個意見。

-
Input sentence: I have to hurry!
Decoded sentence: 我可以幫忙嗎？

-
Input sentence: I have two cats.
Decoded sentence: 我有一個意見。

-
Input sentence: I have two sons.
Decoded sentence: 我有一個好。

-
Input sentence: I just threw up.
Decoded sentence: 我不能在這裡。

-
Input sentence: I lent him a CD.
Decoded sentence: 我想要去。

-
Input sentence: I like Tom, too.
Decoded sentence: 我喜欢汤姆。

-
Input sentenc

In [30]:
# Same check as for decode_sequence, this time using decode_sequence_new.
# The sentences again come from the training set, so reasonable output is expected.
first_index, stop_index = 1000, 1100
for seq_index in range(first_index, stop_index):
    # Slicing (instead of indexing) keeps a batch dimension of size 1
    input_seq = encoder_input_data[seq_index:seq_index + 1]
    source_sentence = input_texts[seq_index]
    decoded_sentence = decode_sequence_new(input_seq)

    print('-')
    print('Input sentence:', source_sentence)
    print('Decoded sentence:', decoded_sentence)

-
Input sentence: I can swim well.
Decoded sentence: 我不能在这里。

-
Input sentence: I can't see you.
Decoded sentence: 我不能幫忙。

-
Input sentence: I can't undo it.
Decoded sentence: 我不能忍受。

-
Input sentence: I don't want it.
Decoded sentence: 我不想要那個。

-
Input sentence: I feel relieved.
Decoded sentence: 我有一個好見。

-
Input sentence: I get up at six.
Decoded sentence: 我有一個好主意。

-
Input sentence: I had no choice.
Decoded sentence: 我有一个好。

-
Input sentence: I hate studying.
Decoded sentence: 我有一个好。

-
Input sentence: I have brothers.
Decoded sentence: 我有一個意見。

-
Input sentence: I have ten pens.
Decoded sentence: 我有一個意見。

-
Input sentence: I have to hurry!
Decoded sentence: 我可以幫忙嗎？

-
Input sentence: I have two cats.
Decoded sentence: 我有一個意見。

-
Input sentence: I have two sons.
Decoded sentence: 我有一個好。

-
Input sentence: I just threw up.
Decoded sentence: 我不能在這裡。

-
Input sentence: I lent him a CD.
Decoded sentence: 我想要去。

-
Input sentence: I like Tom, too.
Decoded sentence: 我喜欢汤姆。

-
Input sentenc

# Analyzing the decoder step by step

In [14]:
# Print the layer-by-layer structure of the inference decoder
decoder_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, None, 2622)   0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 256)          0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 256)          0                                            
__________________________________________________________________________________________________
lstm_2 (LSTM)                   [(None, None, 256),  2948096     input_2[0][0]                    
                                                                 input_3[0][0]                    
          

In [15]:
# Pick one training sample; slicing keeps a batch dimension of size 1
seq_index = 1000
input_seq = encoder_input_data[seq_index:seq_index+1]
print(input_texts[seq_index])
input_seq.shape

I can swim well.


(1, 30, 73)

In [16]:
# Encode the selected sample; the result is the list of states that seeds the decoder
states_value = encoder_model.predict(input_seq)

# Create an empty target sequence of length 1
target_seq = np.zeros((1, 1, num_decoder_tokens))
# Set its content to the start character
target_seq[0, 0, target_token_index['\t']] = 1.

# Character recovery state.
# For simplicity, batch_size = 1 is assumed
stop_condition = False
decoded_sentence = ''

print('status h shape: {}'.format(states_value[0].shape))
print('status c shape: {}'.format(states_value[1].shape))

status h shape: (1, 256)
status c shape: (1, 256)


### round 1

In [17]:
# One decoder step: feed the current character together with the current states
output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
print(output_tokens.shape)

(1, 1, 2622)


In [18]:
# Sample a token greedily: the most probable one at the last time step.
# NOTE: not idempotent; re-running this cell appends the character again.
sampled_token_index = np.argmax(output_tokens[0, -1, :])
sampled_char = reverse_target_char_index[sampled_token_index]
decoded_sentence += sampled_char
print('sampled char is: {}' . format(sampled_char))
print('decoded sentence is: {}' . format(decoded_sentence))

sampled char is: 我
decoded sentence is: 我


In [19]:
 # Next decoder input: one-hot encoding of the character just sampled
target_seq = np.zeros((1, 1, num_decoder_tokens))
target_seq[0, 0, sampled_token_index] = 1.

# Carry the decoder states over to the next step
states_value = [h, c]

### round 2

In [20]:
# One full decoding step (predict, sample, update) in a single cell.
# NOTE: not idempotent; every run of this cell advances the decoding by one character.
output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
print(output_tokens.shape)

# Sample a token greedily: the most probable one at the last time step
sampled_token_index = np.argmax(output_tokens[0, -1, :])
sampled_char = reverse_target_char_index[sampled_token_index]
decoded_sentence += sampled_char
print('sampled char is: {}' . format(sampled_char))
print('decoded sentence is: {}' . format(decoded_sentence))

 # Next decoder input: one-hot encoding of the character just sampled
target_seq = np.zeros((1, 1, num_decoder_tokens))
target_seq[0, 0, sampled_token_index] = 1.

# Carry the decoder states over to the next step
states_value = [h, c]

(1, 1, 2622)
sampled char is: 不
decoded sentence is: 我不


### round 3

In [21]:
# One full decoding step (predict, sample, update) in a single cell.
# NOTE: not idempotent; every run of this cell advances the decoding by one character.
output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
print(output_tokens.shape)

# Sample a token greedily: the most probable one at the last time step
sampled_token_index = np.argmax(output_tokens[0, -1, :])
sampled_char = reverse_target_char_index[sampled_token_index]
decoded_sentence += sampled_char
print('sampled char is: {}' . format(sampled_char))
print('decoded sentence is: {}' . format(decoded_sentence))

 # Next decoder input: one-hot encoding of the character just sampled
target_seq = np.zeros((1, 1, num_decoder_tokens))
target_seq[0, 0, sampled_token_index] = 1.

# Carry the decoder states over to the next step
states_value = [h, c]

(1, 1, 2622)
sampled char is: 能
decoded sentence is: 我不能


### round 4

In [22]:
# One full decoding step (predict, sample, update) in a single cell.
# NOTE: not idempotent; every run of this cell advances the decoding by one character.
output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
print(output_tokens.shape)

# Sample a token greedily: the most probable one at the last time step
sampled_token_index = np.argmax(output_tokens[0, -1, :])
sampled_char = reverse_target_char_index[sampled_token_index]
decoded_sentence += sampled_char
print('sampled char is: {}' . format(sampled_char))
print('decoded sentence is: {}' . format(decoded_sentence))

 # Next decoder input: one-hot encoding of the character just sampled
target_seq = np.zeros((1, 1, num_decoder_tokens))
target_seq[0, 0, sampled_token_index] = 1.

# Carry the decoder states over to the next step
states_value = [h, c]

(1, 1, 2622)
sampled char is: 在
decoded sentence is: 我不能在


### round 5

In [23]:
# One full decoding step (predict, sample, update) in a single cell.
# NOTE: not idempotent; every run of this cell advances the decoding by one character.
output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
print(output_tokens.shape)

# Sample a token greedily: the most probable one at the last time step
sampled_token_index = np.argmax(output_tokens[0, -1, :])
sampled_char = reverse_target_char_index[sampled_token_index]
decoded_sentence += sampled_char
print('sampled char is: {}' . format(sampled_char))
print('decoded sentence is: {}' . format(decoded_sentence))

 # Next decoder input: one-hot encoding of the character just sampled
target_seq = np.zeros((1, 1, num_decoder_tokens))
target_seq[0, 0, sampled_token_index] = 1.

# Carry the decoder states over to the next step
states_value = [h, c]

(1, 1, 2622)
sampled char is: 这
decoded sentence is: 我不能在这


### round 6

In [24]:
# One full decoding step (predict, sample, update) in a single cell.
# NOTE: not idempotent; every run of this cell advances the decoding by one character.
output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
print(output_tokens.shape)

# Sample a token greedily: the most probable one at the last time step
sampled_token_index = np.argmax(output_tokens[0, -1, :])
sampled_char = reverse_target_char_index[sampled_token_index]
decoded_sentence += sampled_char
print('sampled char is: {}' . format(sampled_char))
print('decoded sentence is: {}' . format(decoded_sentence))

 # Next decoder input: one-hot encoding of the character just sampled
target_seq = np.zeros((1, 1, num_decoder_tokens))
target_seq[0, 0, sampled_token_index] = 1.

# Carry the decoder states over to the next step
states_value = [h, c]

(1, 1, 2622)
sampled char is: 里
decoded sentence is: 我不能在这里


### round 7

In [25]:
# One full decoding step (predict, sample, update) in a single cell.
# NOTE: not idempotent; every run of this cell advances the decoding by one character.
output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
print(output_tokens.shape)

# Sample a token greedily: the most probable one at the last time step
sampled_token_index = np.argmax(output_tokens[0, -1, :])
sampled_char = reverse_target_char_index[sampled_token_index]
decoded_sentence += sampled_char
print('sampled char is: {}' . format(sampled_char))
print('decoded sentence is: {}' . format(decoded_sentence))

 # Next decoder input: one-hot encoding of the character just sampled
target_seq = np.zeros((1, 1, num_decoder_tokens))
target_seq[0, 0, sampled_token_index] = 1.

# Carry the decoder states over to the next step
states_value = [h, c]

(1, 1, 2622)
sampled char is: 。
decoded sentence is: 我不能在这里。


In [26]:
# Display the character sampled in the last executed round
sampled_char

'。'