In [2]:
import pandas as pd
import numpy as np
from keras.layers import Input,LSTM,Dense,CuDNNLSTM,Bidirectional
from keras.models import Model
from keras.optimizers import Adam,rmsprop

In [3]:
data_path = '../data/cmn.txt'
# Tab-separated sentence pairs: column 0 is English, column 1 is Chinese.
# Only the first two columns are read and named here, so a file that carries
# extra trailing columns (e.g. an attribution column) still loads into the
# same two-column frame instead of failing on the column assignment.
df = pd.read_table(data_path,header=None,usecols=[0,1],names=['inputs','targets'])

# Prefix every Chinese sentence with '\t' as the start marker and append '\n' as the end marker.
df['targets'] = df['targets'].apply(lambda x: '\t'+x+'\n')
df.head(5)

Unnamed: 0,inputs,targets
0,Hi.,\t嗨。\n
1,Hi.,\t你好。\n
2,Run.,\t你用跑的。\n
3,Wait!,\t等等！\n
4,Hello!,\t你好。\n


In [4]:
input_texts = df.inputs.values.tolist()    # list of English sentences
target_texts = df.targets.values.tolist()  # list of Chinese sentences
print('input texts samples" {}'.format(len(input_texts)))

# Character vocabularies of each language: join the distinct sentences into one
# long string and keep the sorted set of its characters. ''.join is linear in
# the total text length, whereas summing an object array concatenates pairwise.
input_characters = sorted(set(''.join(df.inputs.unique())))
target_characters = sorted(set(''.join(df.targets.unique())))

print('last 5 character of input text: {}'.format(input_characters[-5:]))
print('last 5 character of target text: {}'.format(target_characters[-5:]))



input texts samples" 20294
last 5 character of input text: ['x', 'y', 'z', 'é', '’']
last 5 character of target text: ['，', '－', '：', '？', '𡡡']


## 每条句子的字符经过 one-hot 编码后，生成 LSTM 需要的三维输入 [n_samples, timesteps, one-hot feature]

In [5]:
# Tensor dimensions used by the rest of the notebook:
#   NUM_SAMPLES            number of sentence pairs
#   INPUT_LENGTH           encoder time steps = length of the longest English sentence
#   OUTPUT_LENGTH          decoder time steps = length of the longest Chinese sentence
#   INPUT_FEATURE_LENGTH   size of the vector x_t fed to the encoder LSTM at each step
#                          = number of distinct English characters
#   OUTPUT_FEATURE_LENGTH  size of the vector x_t fed to the decoder LSTM at each step
#                          = number of distinct Chinese characters
NUM_SAMPLES = len(target_texts)
INPUT_LENGTH = max(map(len, df.inputs))
OUTPUT_LENGTH = max(map(len, df.targets))
INPUT_FEATURE_LENGTH = len(input_characters)
OUTPUT_FEATURE_LENGTH = len(target_characters)
print('NUM_SAMPLES: {}'.format(NUM_SAMPLES))
print('INPUT_LENGTH: {}'.format(INPUT_LENGTH))
print('OUTPUT_LENGTH: {}'.format(OUTPUT_LENGTH))
print('INPUT_FEATURE_LENGTH: {}'.format(INPUT_FEATURE_LENGTH))
print('OUTPUT_FEATURE_LENGTH: {}'.format(OUTPUT_FEATURE_LENGTH))

# Encoder input and decoder input/output start as all-zero 3-D tensors
# (samples, time steps, one-hot features). They are allocated as float32
# instead of NumPy's float64 default: each decoder tensor holds
# NUM_SAMPLES * OUTPUT_LENGTH * OUTPUT_FEATURE_LENGTH elements, so halving the
# element size halves a very large allocation.
encoder_input = np.zeros((NUM_SAMPLES,INPUT_LENGTH,INPUT_FEATURE_LENGTH),dtype='float32')
decoder_input = np.zeros((NUM_SAMPLES,OUTPUT_LENGTH,OUTPUT_FEATURE_LENGTH),dtype='float32')
decoder_output = np.zeros((NUM_SAMPLES,OUTPUT_LENGTH,OUTPUT_FEATURE_LENGTH),dtype='float32')

# Character <-> index lookup tables for both languages.
input_dict = {char:index for index,char in enumerate(input_characters)}
input_dict_reverse = dict(enumerate(input_characters))
target_dict = {char:index for index,char in enumerate(target_characters)}
target_dict_reverse = dict(enumerate(target_characters))




NUM_SAMPLES: 20294
INPUT_LENGTH: 163
OUTPUT_LENGTH: 46
INPUT_FEATURE_LENGTH: 75
OUTPUT_FEATURE_LENGTH: 3427


# 对句子进行字符级one-hot编码，将输入输出数据向量化：

# Question 1: should padding go at the end of a sentence or at the beginning of a sentence?

In [6]:
# One-hot encode the encoder input: for every sentence, set the entry
# (sentence, position, character id) to 1 for each of its characters.
for row, sentence in enumerate(input_texts):
    char_ids = np.fromiter((input_dict[ch] for ch in sentence), dtype=np.intp)
    encoder_input[row, np.arange(char_ids.size), char_ids] = 1

# Question 2: decoder_input's char_index starts from 0 — shouldn't it start from 1?

In [7]:
# One-hot encode the decoder input and output. The output is the input shifted
# one time step to the left (it omits the first character), so that at every
# step the decoder is supervised with the character that follows its input.
for row, sentence in enumerate(target_texts):
    char_ids = [target_dict[ch] for ch in sentence]
    for step, char_id in enumerate(char_ids):
        decoder_input[row, step, char_id] = 1.0
    for step, char_id in enumerate(char_ids[1:]):
        decoder_output[row, step, char_id] = 1.0

# MODELING

In [8]:
def create_model(n_input,n_output,n_units):
    """Build the seq2seq training model plus the two models used for inference.

    All three models share the same encoder LSTM, decoder LSTM and softmax
    layer objects, so training the first one also updates the other two.

    Args:
        n_input: size of each encoder time-step vector (number of one-hot
            encoded English characters).
        n_output: size of each decoder time-step vector and of the softmax
            output (number of Chinese characters).
        n_units: number of units of both the encoder and the decoder LSTM.

    Returns:
        A tuple ``(model, encoder_infer, decoder_infer)``:
        ``model`` maps ``[encoder_input, decoder_input]`` to per-step softmax
        outputs; ``encoder_infer`` maps an encoder input to the state pair
        ``[h, c]``; ``decoder_infer`` maps ``[decoder_input, h, c]`` to
        ``[softmax output, new h, new c]``.
    """
    # ---- training graph: encoder ----
    enc_in = Input(shape=(None, n_input))
    enc_lstm = CuDNNLSTM(n_units, return_state=True)
    # The encoder's output is discarded; only its final states (h, c) are kept.
    enc_states = list(enc_lstm(enc_in)[1:])

    # ---- training graph: decoder, initialised with the encoder states ----
    dec_in = Input(shape=(None, n_output))
    dec_lstm = CuDNNLSTM(n_units, return_sequences=True, return_state=True)
    dec_seq = dec_lstm(dec_in, initial_state=enc_states)[0]
    softmax = Dense(n_output, activation='softmax')

    train_model = Model([enc_in, dec_in], softmax(dec_seq))

    # ---- inference graph, used at prediction time ----
    enc_infer = Model(enc_in, enc_states)

    # States of the previous decoding step are supplied from outside.
    prev_h = Input(shape=(n_units,))
    prev_c = Input(shape=(n_units,))

    step_seq, next_h, next_c = dec_lstm(dec_in, initial_state=[prev_h, prev_c])
    dec_infer = Model([dec_in, prev_h, prev_c],
                      [softmax(step_seq), next_h, next_c])

    return train_model, enc_infer, dec_infer


In [65]:
def predict_chinese(source,encoder_inference, decoder_inference, n_steps, features):
    """Greedily decode a target sentence, one character per decoder call.

    Args:
        source: encoder input passed unchanged to ``encoder_inference.predict``.
        encoder_inference: object whose ``predict`` returns the initial
            decoder state as a list ``[h, c]``.
        decoder_inference: object whose ``predict([seq, h, c])`` returns
            ``(yhat, h, c)``; ``yhat`` is the output after the Dense layer.
        n_steps: maximum number of characters to generate.
        features: length of the one-hot vector fed to the decoder.

    Returns:
        The decoded string. When the end marker '\\n' is predicted it is
        included as the last character and decoding stops.
    """
    def _one_hot(index):
        # (1, 1, features) tensor holding a single one-hot encoded character.
        vector = np.zeros((1,1,features))
        vector[0,0,index] = 1
        return vector

    # Encode the source sentence; its states seed the decoder.
    state = encoder_inference.predict(source)
    # Decoding always starts from the '\t' start marker.
    predict_seq = _one_hot(target_dict['\t'])

    decoded = []
    for _ in range(n_steps):
        # Feed the previous character together with the previous (h, c) states.
        yhat, h, c = decoder_inference.predict([predict_seq]+state)
        best = np.argmax(yhat[0,-1,:])
        symbol = target_dict_reverse[best]
        decoded.append(symbol)
        if symbol == '\n':
            # End marker reached: nothing further to decode.
            break
        # Carry the new states and the predicted character into the next step.
        state = [h,c]
        predict_seq = _one_hot(best)
    return ''.join(decoded)


In [66]:
# Build the training model and the encoder/decoder inference models.
model, encoder_infer, decoder_infer = create_model(
    n_input=INPUT_FEATURE_LENGTH,
    n_output=OUTPUT_FEATURE_LENGTH,
    n_units=1024,
)

In [67]:
# Print the training model's layers with their output shapes, parameter counts and connections.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_33 (InputLayer)           (None, None, 75)     0                                            
__________________________________________________________________________________________________
input_34 (InputLayer)           (None, None, 3427)   0                                            
__________________________________________________________________________________________________
cu_dnnlstm_17 (CuDNNLSTM)       [(None, 1024), (None 4509696     input_33[0][0]                   
__________________________________________________________________________________________________
cu_dnnlstm_18 (CuDNNLSTM)       [(None, None, 1024), 18239488    input_34[0][0]                   
                                                                 cu_dnnlstm_17[0][1]              
          

In [69]:
# Configure training: RMSprop optimizer, categorical cross-entropy over the
# softmax output, and accuracy reported as a metric.
model.compile(optimizer=rmsprop(),
              loss='categorical_crossentropy',
              metrics=['accuracy'])


In [30]:
# Train for 100 epochs; the last 20% of the samples are held out for validation.
model.fit(
    [encoder_input, decoder_input],  # x: encoder input + decoder input
    decoder_output,                  # y: decoder input shifted by one step
    batch_size=64,
    epochs=100,
    validation_split=0.2,
)

Train on 16235 samples, validate on 4059 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100


Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x7f455a646f28>

In [70]:

# Keep training one epoch at a time and, after each epoch, translate one
# randomly chosen training sentence to eyeball progress.
for j in range(300):
    print(j)
    model.fit(x=[encoder_input,decoder_input],y=decoder_output,batch_size=None)
    # Draw from the whole dataset rather than a hard-coded upper bound, so the
    # index is always valid and every sentence can be picked.
    i = np.random.randint(len(input_texts))
    print(input_texts[i])
    test = encoder_input[i:i+1,:,:]  # i:i+1 keeps the array 3-D
    out = predict_chinese(test,encoder_infer,decoder_infer,OUTPUT_LENGTH,OUTPUT_FEATURE_LENGTH)
    print(out)
    print('=============================================')

0
Epoch 1/1
I saw him playing baseball.
我們為你的时候，我不知道。

1
Epoch 1/1
Why did you open the box?
我們的父親很有趣。

2
Epoch 1/1
I come from England.
我不知道他是什么。

3
Epoch 1/1
I've seen that.
我不想要你。

4
Epoch 1/1
Where is the nearest station?
我們在這裡很多次了。

5
Epoch 1/1
My hands and legs are swollen.
我們的父母是一個很大的問題。

6
Epoch 1/1
Tom is a normal teenager.
我不知道他的名字。

7
Epoch 1/1
Hurry up.
我們在這裡的車子很得很快。

8
Epoch 1/1
Wait and see.
我不知道我们在哪里。

9
Epoch 1/1
These are called shoes.
我們在一起很快。

10
Epoch 1/1
Is he afraid of death?
我不知道他是什麼意思。

11
Epoch 1/1
That's life.
我們的車很有趣。

12
Epoch 1/1
They'll kill you.
我不知道他的名字。

13
Epoch 1/1
These pants fit me well.
我們不知道要做什麼。

14
Epoch 1/1
There's no way to know.
我不想去那儿，他也不想。

15
Epoch 1/1
I can only speak for myself.
我們在這裡做了什麼？

16
Epoch 1/1
I thought I was dreaming.
我們的老師喜歡他的新車。

17
Epoch 1/1
Please put your shoes on.
我們不能在這裡待很長時間。

18
Epoch 1/1
Can you solve this problem?
我們在這裡花了太多時間。

19
Epoch 1/1
My father grew old.
我們在那裡有一個不愉快的經歷。

20
Epoch 1/1
He didn't listen to music.

The boy is eating bread.
我們在那裡抓了一些大魚。

45
Epoch 1/1
You look very tired.
我不知道他是否愛我。

46
Epoch 1/1
Tom isn't dumb.
我們在那裡上了公車汽車。

47
Epoch 1/1
Dead men tell no tales.
我們不能在這裡待很長時間。

48
Epoch 1/1
I've got a lot more to learn.
我不想再听到其他借口了。

49
Epoch 1/1
How would you like your steak?
我們在那裡有一個不愉快的。

50
Epoch 1/1
Which book is better?
我不想吃早餐。

51
Epoch 1/1
Here is a letter for you.
我不知道他是否愛我。

52
Epoch 1/1
I feel very sick.
我不知道他是否愛我。

53
Epoch 1/1
Mary closed the door quietly.
我不知道他是否已经为我做好。

54
Epoch 1/1
Tom was on the list.
我們在吃晚餐。

55
Epoch 1/1
I'll think it over.
我們在吃晚餐。

56
Epoch 1/1
Her skin is smooth.
我不知道他是谁。

57
Epoch 1/1
It was a very stupid decision.
我不知道我是否有时间做。

58
Epoch 1/1
I'll bring wine.
我不知道他是日本人。

59
Epoch 1/1
I want an MP3 player!
我不知道他是什么时候从法国回来的。

60
Epoch 1/1
I wish we had won the game.
我不知道他是日本人。

61
Epoch 1/1
How about a sandwich?
我們在學校前面見面了。

62
Epoch 1/1
Tom gave me a pen.
我不知道他是否已经为我做好了。

63
Epoch 1/1
The letter was written by Tom.
我們在吃晚餐。

64
Epoch 1/1
He knows 

Take it.
我不知道我的手錶在哪裡。

89
Epoch 1/1
The price is not reasonable.
我不知道他是什么时候从法国回来的。

90
Epoch 1/1
Tom is a clever kid.
我們在那裡上了公共汽車。

91
Epoch 1/1
He is a friendly person.
我們在那裡上了太茶。

92
Epoch 1/1
I'm innocent.
我不知道他是否會來。

93
Epoch 1/1
She sat on the bench.
我不知道我們為怎麼做。

94
Epoch 1/1
Don't forget about me.
我不知道他是否會來。

95
Epoch 1/1
He seems to be ill.
我不知道他是什么时候我不会回来的。

96
Epoch 1/1
I'm ready to leave.
我不知道他是否已经为我做好。

97
Epoch 1/1
We'll come back tomorrow.
我們在這裡花了太晚時間。

98
Epoch 1/1
I don't want to fail my exams.
我不知道我是否有时间做。

99
Epoch 1/1
Please allow me to go.
我不知道他是否會來。

100
Epoch 1/1
Please turn off the light.
我們在吃晚餐。

101
Epoch 1/1
Tom is more active.
我不知道他是否愛我。

102
Epoch 1/1
I'm the best.
我不知道我們不能能做什麼。

103
Epoch 1/1
I saw her enter the room.
我們在這裡花了太多時間。

104
Epoch 1/1
Tom drank some juice.
我不知道他是否愛我。

105
Epoch 1/1

KeyboardInterrupt: 

In [44]:
# Translate one hand-picked training sentence and show it next to its source.
i = 2
test = encoder_input[i:i+1]  # slicing (not indexing) keeps the batch axis
out = predict_chinese(source=test,
                      encoder_inference=encoder_infer,
                      decoder_inference=decoder_infer,
                      n_steps=OUTPUT_LENGTH,
                      features=OUTPUT_FEATURE_LENGTH)
print(input_texts[i], out, sep='\n')

Come in.
我們在這裡花了太多時間。



In [None]:
def decode_sequence(input_seq):
    """Greedily decode one input sequence with this notebook's inference models.

    Uses the module-level ``encoder_infer`` / ``decoder_infer`` models, the
    ``target_dict`` / ``target_dict_reverse`` lookup tables and the
    ``OUTPUT_FEATURE_LENGTH`` / ``OUTPUT_LENGTH`` constants.

    Args:
        input_seq: encoder input for a single sentence (batch size 1).

    Returns:
        The decoded string. Decoding stops once '\\n' has been generated or
        the string has grown longer than ``OUTPUT_LENGTH``.
    """
    # Encode the input sequence into the initial decoder states.
    states_value = encoder_infer.predict(input_seq)

    # Build a target sequence of length 1 ...
    target_seq = np.zeros((1, 1, OUTPUT_FEATURE_LENGTH))
    # ... holding the start character.
    target_seq[0, 0, target_dict['\t']] = 1.

    # Recover the characters one by one.
    # For simplicity, batch_size = 1 is assumed.
    stop_condition = False
    decoded_sentence = ''

    while not stop_condition:
        output_tokens, h, c = decoder_infer.predict([target_seq] + states_value)

        # Pick the most probable token.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = target_dict_reverse[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: '\n' was generated or the maximum length was exceeded.
        if sampled_char == '\n' or len(decoded_sentence) > OUTPUT_LENGTH:
            stop_condition = True

        # Feed the sampled token back in as the next decoder input.
        target_seq = np.zeros((1, 1, OUTPUT_FEATURE_LENGTH))
        target_seq[0, 0, sampled_token_index] = 1.

        # Carry the decoder states over to the next step.
        states_value = [h, c]

    return decoded_sentence
# ---------------------
# Author: 芥末的无奈
# Source: CSDN
# Original article: https://blog.csdn.net/weiwei9363/article/details/79464789
# Copyright notice: this is an original article by the blogger; please include
# a link to the post when reproducing it.
# (decode_sequence was adapted from the article to use this notebook's names.)

# out

In [67]:
# Fetch the encoder LSTM of the training model. Layer names such as
# 'cu_dnnlstm_5' are auto-generated and change every time the model is rebuilt,
# so the layer is looked up by type instead: the encoder LSTM is the first
# CuDNNLSTM listed in the training model (see the model.summary() output).
train_lstm = next(layer for layer in model.layers if isinstance(layer, CuDNNLSTM))

In [72]:
# Print the first weight variable of the training model's LSTM layer (its
# kernel), then display the current values of that weight as the cell output.
print(train_lstm.weights[0])
train_lstm.get_weights()[0]

<tf.Variable 'cu_dnnlstm_5/kernel:0' shape=(75, 256) dtype=float32_ref>


array([[ 0.00048403, -0.10057795,  0.09402207, ...,  0.06421671,
         0.04817443, -0.04124532],
       [ 0.02149276,  0.00541093,  0.11336602, ...,  0.12082654,
        -0.11954425, -0.08038449],
       [ 0.09851375, -0.12272102, -0.11307565, ..., -0.08199231,
        -0.02517138, -0.02650847],
       ...,
       [ 0.06733374,  0.02784984,  0.02314961, ...,  0.09500158,
         0.12246257,  0.00699327],
       [-0.05950085, -0.10947321,  0.0575029 , ..., -0.01508953,
        -0.04106094, -0.06283302],
       [-0.11229177,  0.06529395, -0.0673771 , ..., -0.0839394 ,
        -0.08219849,  0.09753321]], dtype=float32)

In [77]:
# Fetch the LSTM of the encoder inference model by type; auto-generated names
# such as 'cu_dnnlstm_5' change on every rebuild. Its kernel is shown to
# compare it against the training model's encoder kernel.
infer_lstm = next(layer for layer in encoder_infer.layers if isinstance(layer, CuDNNLSTM))
print(infer_lstm.weights[0])
infer_lstm.get_weights()[0]

<tf.Variable 'cu_dnnlstm_5/kernel:0' shape=(75, 256) dtype=float32_ref>


array([[ 0.00048403, -0.10057795,  0.09402207, ...,  0.06421671,
         0.04817443, -0.04124532],
       [ 0.02149276,  0.00541093,  0.11336602, ...,  0.12082654,
        -0.11954425, -0.08038449],
       [ 0.09851375, -0.12272102, -0.11307565, ..., -0.08199231,
        -0.02517138, -0.02650847],
       ...,
       [ 0.06733374,  0.02784984,  0.02314961, ...,  0.09500158,
         0.12246257,  0.00699327],
       [-0.05950085, -0.10947321,  0.0575029 , ..., -0.01508953,
        -0.04106094, -0.06283302],
       [-0.11229177,  0.06529395, -0.0673771 , ..., -0.0839394 ,
        -0.08219849,  0.09753321]], dtype=float32)

In [79]:
# Same check as before, repeated to see whether the inference encoder's kernel
# values have changed. The layer is looked up by type because auto-generated
# names such as 'cu_dnnlstm_5' change on every rebuild.
infer_lstm = next(layer for layer in encoder_infer.layers if isinstance(layer, CuDNNLSTM))
print(infer_lstm.weights[0])
infer_lstm.get_weights()[0]

<tf.Variable 'cu_dnnlstm_5/kernel:0' shape=(75, 256) dtype=float32_ref>


array([[ 0.00048537, -0.10057795,  0.09402207, ...,  0.06423962,
         0.04818527, -0.04124521],
       [ 0.02149243,  0.00541081,  0.11336623, ...,  0.12081587,
        -0.11976563, -0.0803909 ],
       [ 0.09846777, -0.12273928, -0.11309011, ..., -0.08199231,
        -0.02517122, -0.02650827],
       ...,
       [ 0.06733384,  0.02784988,  0.02314961, ...,  0.09500158,
         0.12246257,  0.00699364],
       [-0.05950077, -0.10947321,  0.05750281, ..., -0.01508927,
        -0.04100636, -0.06283302],
       [-0.11229177,  0.06529395, -0.0673771 , ..., -0.0839394 ,
        -0.08219849,  0.09753321]], dtype=float32)

In [23]:
# One extra training epoch with a larger batch. The decoder is fed
# decoder_input and supervised with decoder_output (the same sequence shifted
# one step ahead), matching the other fit calls in this notebook; swapping the
# two would train the model to predict the previous character.
model.fit(x=[encoder_input,decoder_input],y=decoder_output,batch_size=512)

Epoch 1/1


<keras.callbacks.History at 0x7f00aff7ce10>

In [None]:
# Re-compile with the Adam optimizer. The model ends in a softmax over all
# target characters (one class per time step), so the loss stays categorical
# cross-entropy; binary cross-entropy would score every output unit as an
# independent yes/no decision and make the accuracy metric misleading.
model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'])

In [None]:
# Translate training sentences 1000..1209 and print each source sentence
# followed by the model's output.
for i in range(1000, 1210):
    test = encoder_input[i:i+1]  # slicing with i:i+1 keeps the array 3-D
    out = predict_chinese(source=test,
                          encoder_inference=encoder_infer,
                          decoder_inference=decoder_infer,
                          n_steps=OUTPUT_LENGTH,
                          features=OUTPUT_FEATURE_LENGTH)
    print(input_texts[i], out, sep='\n')
