In [19]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense
import numpy as np
import json
from keras.models import load_model

## Train

In [20]:
def preprocess(train_path, test_path, num):
    input_texts = []
    target_texts = []
    
    test_input_texts = []
    test_target_texts = []
    
    num_lines_train_file = sum(1 for line in open(train_path, encoding='utf-8'))
    num_lines_test_file = sum(1 for line in open(test_path, encoding='utf-8'))
                    
    print("Read",train_path,"...")
    counter = 0
    with open(train_path,  encoding='utf-8') as fp:
        for json_str in fp:
            counter = counter + 1
            data = json.loads(json_str)
            input_texts.append(data["english"])
            target_texts.append(data["chinese"])
            
            '''
            if counter%1000000==0:
                print("Now processing {}/{} rows...".format(counter, num_lines_train_file))
            '''
            if counter==num:
                break
            
    print("Read",train_path,"finished!")
    
    print("\nRead",test_path,"...")
    counter = 0
    with open(test_path,  encoding='utf-8') as fp:
        for json_str in fp:
            counter = counter + 1
            data = json.loads(json_str)
            
            if counter <= num_lines_test_file-100:
                #input_texts.append(data["english"])
                #target_texts.append(data["chinese"])
                pass
            else:
                test_input_texts.append(data["english"])
                test_target_texts.append(data["chinese"])
    print("Read",test_path,"finished!")      
          
    return input_texts, target_texts, test_input_texts, test_target_texts

In [21]:
def getInputTargetChars(input_texts, target_texts):
    print("\nProcessing chars...")
    input_characters = set()
    target_characters = set()
    for input_text, target_text in zip(input_texts, target_texts):
        target_text = '\t' + target_text + '\n'
        for char in input_text:
            if char not in input_characters:
                input_characters.add(char)
        for char in target_text:
            if char not in target_characters:
                target_characters.add(char)
    print("Processing chars finished!")
    return input_characters, target_characters

In [22]:
def getEncoderDecoderData(input_texts, target_texts, encoder_input_data, decoder_input_data, decoder_target_data, input_token_index, target_token_index):
    for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
        for t, char in enumerate(input_text):
            #print(char)
            encoder_input_data[i, t, input_token_index[char]] = 1.
        encoder_input_data[i, t + 1:, input_token_index[' ']] = 1.
        for t, char in enumerate(target_text):
            # decoder_target_data 领先 decoder_input_data by 一个时间步。
            decoder_input_data[i, t, target_token_index[char]] = 1.
            if t > 0:
                # decoder_target_data 将提前一个时间步，并且将不包含开始字符。
                decoder_target_data[i, t - 1, target_token_index[char]] = 1.
        decoder_input_data[i, t + 1:, target_token_index[' ']] = 1.
        decoder_target_data[i, t:, target_token_index[' ']] = 1.
        
    return  encoder_input_data, decoder_input_data, decoder_target_data

In [23]:
def genModel(latent_dim, num_encoder_tokens, num_decoder_tokens):
    encoder_inputs = Input(shape=(None, num_encoder_tokens))
    encoder = LSTM(latent_dim, return_state=True)
    encoder_outputs, state_h, state_c = encoder(encoder_inputs)
    
    encoder_states = [state_h, state_c]

    
    decoder_inputs = Input(shape=(None, num_decoder_tokens))

    decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                         initial_state=encoder_states)
    decoder_dense = Dense(num_decoder_tokens, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)

    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    
    encoder_model = Model(encoder_inputs, encoder_states)

    decoder_state_input_h = Input(shape=(latent_dim,))
    decoder_state_input_c = Input(shape=(latent_dim,))
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    decoder_outputs, state_h, state_c = decoder_lstm(
        decoder_inputs, initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = Model(
        [decoder_inputs] + decoder_states_inputs,
        [decoder_outputs] + decoder_states)
    
    return model,encoder_model,decoder_model

In [24]:
def trainSaveModel(model, model_path, encoder_input_data, decoder_input_data, decoder_target_data, batch_size,epochs):
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.2)
    
    model.save(model_path)

In [25]:
train_path = 'translation2019zh/translation2019zh_train.json'
test_path = 'translation2019zh/translation2019zh_valid.json'
model_path = 'e2c_ep100.h5'
num_line_read = 4000

batch_size = 64  
epochs = 100
latent_dim = 256 

input_texts, target_texts, test_input_texts, test_target_texts = preprocess(train_path,test_path, num_line_read)
input_characters, target_characters = getInputTargetChars(input_texts, target_texts)
input_characters = sorted(list(set(input_characters)))
target_characters = sorted(list(set(target_characters)))

num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
max_encoder_seq_length = max([len(txt) for txt in input_texts])
max_decoder_seq_length = max([len(txt) for txt in target_texts])

print('\nNumber of samples:', len(input_texts))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

input_token_index = dict(
    [(char, i) for i, char in enumerate(input_characters)])
target_token_index = dict(
    [(char, i) for i, char in enumerate(target_characters)])

encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
    dtype='float32')
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')

Read translation2019zh/translation2019zh_train.json ...
Read translation2019zh/translation2019zh_train.json finished!

Read translation2019zh/translation2019zh_valid.json ...
Read translation2019zh/translation2019zh_valid.json finished!

Processing chars...
Processing chars finished!

Number of samples: 4000
Number of unique input tokens: 259
Number of unique output tokens: 3374
Max sequence length for inputs: 256
Max sequence length for outputs: 142


In [27]:
encoder_input_data, decoder_input_data, decoder_target_data = getEncoderDecoderData(input_texts, target_texts, encoder_input_data, decoder_input_data, decoder_target_data, input_token_index, target_token_index)

In [28]:
 model,encoder_model,decoder_model = genModel(latent_dim, num_encoder_tokens, num_decoder_tokens)

In [29]:
trainSaveModel(model, model_path, encoder_input_data, decoder_input_data, decoder_target_data, batch_size,epochs)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100


Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [30]:
trainSaveModel(model, model_path, encoder_input_data, decoder_input_data, decoder_target_data, batch_size,epochs)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100


Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


## Test

In [31]:
def decode_sequence(input_seq):
    # 将输入编码为状态向量。
    states_value = encoder_model.predict(input_seq)

    # 生成长度为 1 的空目标序列。
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    # 用起始字符填充目标序列的第一个字符。
    target_seq[0, 0, target_token_index['\t']] = 1.

    # 一批序列的采样循环
    # (为了简化，这里我们假设一批大小为 1)。
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # 采样一个 token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # 退出条件：达到最大长度或找到停止符。
        if (sampled_char == '\n' or
           len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        # 更新目标序列（长度为 1）。
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        # 更新状态
        states_value = [h, c]

    return decoded_sentence

In [32]:
# 反向查询 token 索引可将序列解码回可读的内容。
reverse_input_char_index = dict(
    (i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict(
    (i, char) for char, i in target_token_index.items())

In [33]:
for seq_index in range(100):
    # 抽取一个序列（训练集的一部分）进行解码。
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', input_texts[seq_index])
    print('Decoded sentence:', decoded_sentence)
    print('Correct sentence:', target_texts[seq_index])

-
Input sentence: For greater sharpness, but with a slight increase in graininess, you can use a 1:1 dilution of this developer.
Decoded sentence: 大鼠的文章节目才分别 在一种 合头boremten--命因是对无节目的信息。                                                                                                         
Correct sentence: 为了更好的锐度，但是附带的会多一些颗粒度，可以使用这个显影剂的1：1稀释液。
-
Input sentence: He calls the Green Book, his book of teachings, “the new gospel.
Decoded sentence: 大鼠的文章节目才分别 在 1 2 中                                                                                                                             
Correct sentence: 他还把宣扬自己思想的所谓《绿皮书》称作“新福音书”。
-
Input sentence: And the light breeze moves me to caress her long ear
Decoded sentence: 大鼠的文章节目才分别 在 1 2 中                                                                                                                             
Correct sentence: 微风推着我去爱抚它的长耳朵
-
Input sentence: They have the blood of martyrs is the White to flow …
Decoded sentence: 大鼠的文章节目

-
Input sentence: Fuler is one of 253 schools have credited by the Sociation of Phiological schools in the United States and Canada.
Decoded sentence: 大鼠的文章节目才分别 在一种 合头boremten--命因是对无节目的信息。                                                                                                         
Correct sentence: 富勒是由美国和加拿大神学院联盟授权的253 家学院中的一员。
-
Input sentence: It shows that vertical stiffener's spaces have some effects on pure-shearing ulti…
Decoded sentence: 大鼠的文章节目才分别 在 1 2 中                                                                                                                             
Correct sentence: 而腹板鼓曲对纯弯和纯剪极限承载力的影响则可不予考虑。
-
Input sentence: "People are embarrassed to admit that's why they're giving up their pets, " said Betsy McFarland, the Humane Society's director of communications for companion animals.
Decoded sentence: 大鼠的文章节目才分别用 2 19.2%；                                                                                                                           

-
Input sentence: He had formerly been in business at Bristol, but failed in debt to a number of people, compounded, and went to America.
Decoded sentence: 大鼠的文章节目才分别 在一种 合头boremten--命因是过节中国一节来已经发明。                                                                                                     
Correct sentence: 在那里他专心一志地做生意，在几年中就赚到许多钱。
-
Input sentence: Basing on the on site tests of anchor, authors found that anchors have obvious pre stress loss problem during stretching and locking, analyzed and proposed several solutions.
Decoded sentence: 大鼠的文章节目才分别用 2 19.2%；                                                                                                                           
Correct sentence: 根据对锚杆的现场测试，发现锚杆在张拉及锁定时存在显著的预应力损失问题，并对此进行了分析，提出了解决问题的几个办法。
-
Input sentence: From hair tip first began gradually, after all, through from downward, nodular comb.
Decoded sentence: 大鼠的文章节目才分别 在一种 合头boremten--负-20节会-1.-tETmartimalord antiterare说。说没说“无法与行最佳的策略uri。                        

-
Input sentence: The idea of flipping from one entry to another, following a line of inquiry (especially etymological inquiry) from one page to another, even one volume to another, is a sensual experience.
Decoded sentence: 大鼠的文章节目才分别用 2 19.2%；                                                                                                                           
Correct sentence: 想象一下从一个词条翻到另外一个词条，顺着线索（尤其是词源的查询）从一页翻到另一页，从这一卷翻到另外一卷，（绝对）是一种感官体验。
-
Input sentence: Further Practice for Pairs ·Add a third speaker and create your own lines.
Decoded sentence: 大鼠的文章节目才分别 在 1 2 中                                                                                                                             
Correct sentence: 加进圈外人，创作你们自己的对话。
-
Input sentence: Still, Brasier asserts that the light carbon enrichments may well be able to form through lifeless chemical reactions—much as Fedo and others have argued could have occurred at Akilia.
Decoded sentence: 大鼠的文章节目才分别用 2 19.2%；                 

-
Input sentence: Unlike many of the other pirate-radio operators, who were in it mostly for money or adventure, Smedley saw his broadcasts as part of a wider moral crusade.
Decoded sentence: 大鼠的文章节目才分别用 2 19.24；                                                                                                                           
Correct sentence: 许多海盗电台经营者常常都是为了金钱或冒险而入行，斯梅德利却与之不同，他将其广播事业视为广泛道德运动的一部分。
-
Input sentence: Ran Hua (1961 ~), female, associate professor, PhD. candidate , School of Journalism & Communication, Wuhan University, majoring in communication theories.
Decoded sentence: 大鼠的文章节目才分别用 2 19.2%；                                                                                                                           
Correct sentence: 冉华（1961～），女，武汉大学新闻与传播学院副教授，在职博士生，主要从事传播理论研究。
-
Input sentence: Others include shrouding Earth in sun-reflecting aerosol particles, manufacturing CO2-absorbing artificial trees, and pumping CO2 into underground reservoirs.
Decoded sentenc

<keras.engine.functional.Functional at 0x2c26818e2b0>

In [13]:
target_token_index['\n']

1