In [1]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense
from keras.optimizers import Adam
import numpy as np

# ---------------------------------------------------------------------------
# Configuration and corpus loading.
#
# Reads tab-separated "source<TAB>target" sentence pairs from `data_path`,
# keeps at most `num_samples` of them, and collects the character vocabulary
# of each side together with the longest sequence length on each side.
# ---------------------------------------------------------------------------
batch_size = 100  # Batch size for training.
epochs = 100  # Number of epochs to train for.
latent_dim = 512  # Latent dimensionality of the encoding space.
num_samples = 100  # Number of samples to train on.
# Path to the data txt file on disk.
data_path = 'cmn.txt'

# Vectorize the data.
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()

with open(data_path, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')

for line in lines:
    if len(input_texts) >= num_samples:
        break
    fields = line.split('\t')
    if len(fields) < 2:
        # Blank line (e.g. the empty string after a trailing newline) or a
        # row without a translation: nothing to learn from, skip it instead
        # of failing on tuple unpacking.
        continue
    # Only the first two columns are used; any further tab-separated
    # columns on the line are ignored.
    input_text, target_text = fields[0], fields[1]
    # We use "tab" as the "start sequence" character
    # for the targets, and "\n" as "end sequence" character.
    target_text = '\t' + target_text + '\n'
    input_texts.append(input_text)
    target_texts.append(target_text)
    input_characters.update(input_text)
    target_characters.update(target_text)

if not input_texts:
    # Without this check the max() calls below fail with an unhelpful
    # "max() arg is an empty sequence" message.
    raise ValueError(
        'No tab-separated sentence pairs found in %r' % data_path)

# Sorted so that token indices are deterministic across runs.
input_characters = sorted(input_characters)
target_characters = sorted(target_characters)
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
max_encoder_seq_length = max(len(txt) for txt in input_texts)
# Target lengths include the added start ('\t') and end ('\n') characters.
max_decoder_seq_length = max(len(txt) for txt in target_texts)

print('Number of samples:', len(input_texts))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

Using TensorFlow backend.


Number of samples: 100
Number of unique input tokens: 46
Number of unique output tokens: 149
Max sequence length for inputs: 9
Max sequence length for outputs: 11


In [2]:
# Character -> one-hot column index, one table per language side.
input_token_index = {ch: idx for idx, ch in enumerate(input_characters)}
target_token_index = {ch: idx for idx, ch in enumerate(target_characters)}

# One-hot tensors indexed as [sample, timestep, token]. Timesteps beyond the
# end of a sentence are left as all-zero rows.
encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
    dtype='float32')
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')
# Same shape and dtype as the decoder input; filled one step ahead below.
decoder_target_data = np.zeros_like(decoder_input_data)

for sample, (source, target) in enumerate(zip(input_texts, target_texts)):
    for step, ch in enumerate(source):
        encoder_input_data[sample, step, input_token_index[ch]] = 1.
    for step, ch in enumerate(target):
        token = target_token_index[ch]
        decoder_input_data[sample, step, token] = 1.
        if step == 0:
            # The start character is never a prediction target.
            continue
        # The target at step - 1 is the decoder input at `step`, i.e. the
        # target sequence runs one timestep ahead of the input sequence.
        decoder_target_data[sample, step - 1, token] = 1.

print(target_token_index)


{'\t': 0, '\n': 1, '!': 2, '。': 3, '一': 4, '上': 5, '下': 6, '不': 7, '世': 8, '个': 9, '为': 10, '么': 11, '乾': 12, '了': 13, '事': 14, '人': 15, '什': 16, '他': 17, '付': 18, '们': 19, '会': 20, '住': 21, '你': 22, '來': 23, '信': 24, '們': 25, '儿': 26, '入': 27, '公': 28, '关': 29, '再': 30, '冷': 31, '出': 32, '别': 33, '到': 34, '前': 35, '力': 36, '加': 37, '动': 38, '努': 39, '去': 40, '友': 41, '可': 42, '吃': 43, '同': 44, '后': 45, '吧': 46, '听': 47, '吻': 48, '呆': 49, '告': 50, '和': 51, '善': 52, '嗨': 53, '嘴': 54, '坚': 55, '失': 56, '她': 57, '好': 58, '姆': 59, '始': 60, '它': 61, '完': 62, '定': 63, '就': 64, '帮': 65, '干': 66, '平': 67, '开': 68, '弃': 69, '当': 70, '往': 71, '很': 72, '得': 73, '心': 74, '忘': 75, '忙': 76, '快': 77, '意': 78, '我': 79, '找': 80, '把': 81, '抓': 82, '抱': 83, '拿': 84, '持': 85, '放': 86, '是': 87, '来': 88, '杯': 89, '欢': 90, '气': 91, '汤': 92, '沒': 93, '没': 94, '泳': 95, '洗': 96, '清': 97, '游': 98, '滾': 99, '点': 100, '玩': 101, '生': 102, '用': 103, '留': 104, '病': 105, '的': 106, '相': 107, '着': 108, '知': 109, '确': 11

In [3]:
# Build and train the encoder-decoder training model.
#
# The encoder LSTM reads the one-hot source sequence; only its final hidden
# and cell states (state_h, state_c) are passed on. The decoder LSTM starts
# from those states and predicts, at every timestep, a softmax distribution
# over the target characters.

# Define an input sequence and process it.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)
# `encoder_outputs` is not used below; only the two state tensors are.
encoder_outputs, state_h, state_c = encoder(encoder_inputs)

# Set up the decoder, using the encoder's final states
# [state_h, state_c] as its initial state.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=[state_h, state_c])
# Per-timestep projection of the LSTM output onto the target vocabulary.
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile and run training
opt = Adam(lr=0.003, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()
# validation_split=0.1 asks Keras to hold out a 0.1 fraction of the samples
# for validation rather than training on them.
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.1)
# Save model
model.save('s2s.h5')

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None, 46)     0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None, 149)    0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, 512), (None, 1144832     input_1[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LSTM)                   [(None, None, 512),  1355776     input_2[0][0]                    
                                                                 lstm_1[0][1]                     
          

Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


  '. They will not be included '


In [4]:
# Build the two inference-time models from the trained layers.
#
# The tensors produced here get their own names instead of rebinding
# `state_h`, `state_c` and `decoder_outputs`. Those names must keep referring
# to the training graph: if they were overwritten with decoder tensors,
# re-running this cell would build `encoder_model` from tensors that are not
# connected to `encoder_inputs`.

# Encoder: source sequence -> final LSTM states [h, c].
encoder_model = Model(encoder_inputs, [state_h, state_c])

# Decoder: takes the previous states as explicit inputs so that decoding can
# be driven one step at a time from outside the model.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Reuse the trained decoder layers (shared weights) on the new state inputs.
inference_outputs, inference_state_h, inference_state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [inference_state_h, inference_state_c]
inference_outputs = decoder_dense(inference_outputs)

# Inputs:  [one-hot decoder input, h, c]
# Outputs: [token probabilities, updated h, updated c]
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [inference_outputs] + decoder_states)

In [5]:
# Sanity check: turn the one-hot tensors of sample 1 back into text.
# All-zero (padding) rows have argmax 0, so they show up as the character
# stored at index 0 of the corresponding vocabulary.
print(''.join([input_characters[np.argmax(i)] for i in encoder_input_data[1]]))
print(''.join([target_characters[np.argmax(i)] for i in decoder_input_data[1]]))
print(''.join([target_characters[np.argmax(i)] for i in decoder_target_data[1]]))

# Greedy decoding of the first samples.
max_decoded_samples = 100  # Upper bound on how many samples to decode.
max_decoded_tokens = 20  # Give up once more than this many tokens are out.
start_token = target_token_index['\t']
stop_token = target_token_index['\n']

# Never index past the encoded data: slicing beyond the end yields an empty
# batch, and the ValueError recorded for this cell ("not enough values to
# unpack (expected 2, got 0)") is consistent with predicting on such a batch.
for k in range(min(max_decoded_samples, len(encoder_input_data))):
    test_data = encoder_input_data[k:k+1]
    # Encode the source sentence into the decoder's initial states.
    h1, c1 = encoder_model.predict(test_data)
    # Start decoding from the start-of-sequence character.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, start_token] = 1
    outputs = []
    while True:
        output_tokens, h1, c1 = decoder_model.predict([target_seq, h1, c1])
        # Greedy choice: most probable token at the last timestep.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        outputs.append(sampled_token_index)
        if sampled_token_index == stop_token or len(outputs) > max_decoded_tokens:
            break
        # Feed the sampled token back in as the next decoder input.
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1

    print(input_texts[k])
    # `outputs` includes the end-of-sequence '\n' when one was produced.
    print(''.join([target_characters[i] for i in outputs]))

Hi.      
	你好。
						
你好。
							
Hi.
嗨。

Hi.
嗨。

Run.
你用跑的。

Wait!
等等！

Hello!
你好。

I try.
让我来。

I won!
我赢了。

Oh no!
不会吧。

Cheers!
乾杯!

He ran.
他跑了。

Hop in.
跳进来。

I lost.
我迷失了。

I quit.
我退出。

I'm OK.
我沒事。

Listen.
听着。

No way!
不可能！

No way!
不可能！

Really?
你确定？

Try it.
试试吧。

We try.
我们来试试。

Why me?
为什么是我？

Ask Tom.
去问汤姆。

Be calm.
冷静点。

Be fair.
公平点。

Be kind.
友善点。

Be nice.
和气点。

Call me.
联系我。

Call us.
联系我们。

Come in.
进来。

Get Tom.
找到汤姆。

Get out!
滾出去！

Go away!
走開！

Go away!
走開！

Go away.
走開！

Goodbye!
再见！

Goodbye!
再见！

Hang on!
等一下！

He came.
他来了。

He runs.
他跑。

Help me.
帮我一下。

Hold on.
坚持。

Hug Tom.
抱抱汤姆！

I agree.
我同意。

I'm ill.
我生病了。

I'm old.
我老了。

It's OK.
没关系。

It's me.
是我。

Join us.
来加入我们吧。

Keep it.
留着吧。

Kiss me.
吻我。

Perfect!
完美！

See you.
再见！

Shut up!
閉嘴！

Skip it.
不管它。

Take it.
拿走吧。

Wake up!
醒醒！

Wash up.
去清洗一下。

We know.
我们知道。

Welcome.
欢迎。

Who won?
谁赢了？

Why not?
为什么不？

You run.
你跑。

Back off.
往后退点。

Be still.
静静的，别动。

Cuff him.
把他铐上。

Drive on.
往前开。

Get away!


ValueError: not enough values to unpack (expected 2, got 0)