In [1]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense
import numpy as np
import json

## Train

In [2]:
def preprocess(train_path, test_path, num):
    """Load English/Chinese sentence pairs from two JSON-lines files.

    Every line that is used is parsed with ``json.loads`` and must be an
    object with an ``"english"`` and a ``"chinese"`` key.

    Args:
        train_path: Path of the UTF-8 training file, one JSON object per line.
        test_path: Path of the UTF-8 held-out file in the same format.
        num: Reading of the training file stops right after the ``num``-th
            line. If the line counter never equals ``num`` (``num <= 0`` or
            ``num`` larger than the file) the whole file is read.

    Returns:
        A 4-tuple of lists
        ``(input_texts, target_texts, test_input_texts, test_target_texts)``.
        The first two hold the ``"english"`` / ``"chinese"`` values of the
        training lines that were read; the last two hold the same values for
        the final 100 lines of ``test_path`` (all lines if it has fewer).
    """
    # Only this many trailing lines of the test file are kept.
    test_tail_size = 100

    input_texts = []
    target_texts = []

    test_input_texts = []
    test_target_texts = []

    # Count the test lines up front so the start of the tail is known.
    # The context manager closes the handle (the previous bare open() calls
    # leaked it), and the unused full scan of the training file is gone.
    with open(test_path, encoding='utf-8') as fp:
        num_lines_test_file = sum(1 for _ in fp)

    print("Read",train_path,"...")
    with open(train_path, encoding='utf-8') as fp:
        for counter, json_str in enumerate(fp, start=1):
            data = json.loads(json_str)
            input_texts.append(data["english"])
            target_texts.append(data["chinese"])

            if counter == num:
                break

    print("Read",train_path,"finished!")

    print("\nRead",test_path,"...")
    last_skipped_line = num_lines_test_file - test_tail_size
    with open(test_path, encoding='utf-8') as fp:
        for counter, json_str in enumerate(fp, start=1):
            if counter <= last_skipped_line:
                # Lines before the tail are discarded, so they are not parsed.
                continue
            data = json.loads(json_str)
            test_input_texts.append(data["english"])
            test_target_texts.append(data["chinese"])
    print("Read",test_path,"finished!")

    return input_texts, target_texts, test_input_texts, test_target_texts

In [3]:
def getInputTargetChars(input_texts, target_texts):
    """Collect the distinct characters used on each side of the corpus.

    The two sequences are walked in lockstep with ``zip``, so only the first
    ``min(len(input_texts), len(target_texts))`` pairs contribute.

    Args:
        input_texts: Sequence of source-side strings.
        target_texts: Sequence of target-side strings.

    Returns:
        ``(input_characters, target_characters)`` as two sets of characters.
    """
    print("\nProcessing chars...")
    source_vocab, target_vocab = set(), set()
    for source_sentence, target_sentence in zip(input_texts, target_texts):
        # set.update adds every character and ignores ones already present.
        source_vocab.update(source_sentence)
        target_vocab.update(target_sentence)
    print("Processing chars finished!")
    return source_vocab, target_vocab

In [4]:
def getEncoderDecoderData(input_texts, target_texts, encoder_input_data, decoder_input_data, decoder_target_data, input_token_index, target_token_index):
    """One-hot encode the sentence pairs into the three pre-allocated arrays.

    The arrays are indexed as ``[sample, timestep, token]`` and are written in
    place. Every timestep after the end of a text is filled with the one-hot
    vector of the space character, which therefore acts as padding.

    ``decoder_target_data`` is ``decoder_input_data`` shifted one timestep to
    the left: position ``t - 1`` of the target holds character ``t`` of the
    text, so the first character never appears in the target.

    Args:
        input_texts: Source strings, one per sample.
        target_texts: Target strings, one per sample (zipped with the inputs).
        encoder_input_data: Array receiving the encoded source texts.
        decoder_input_data: Array receiving the encoded target texts.
        decoder_target_data: Array receiving the shifted target texts.
        input_token_index: Mapping source character -> token index; must
            contain ``' '``.
        target_token_index: Mapping target character -> token index; must
            contain ``' '``.

    Returns:
        ``(encoder_input_data, decoder_input_data, decoder_target_data)`` —
        the same array objects that were passed in.

    Raises:
        KeyError: If a character (or the padding space) is missing from the
            corresponding token index.
    """
    pad_char = ' '
    for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
        for t, char in enumerate(input_text):
            encoder_input_data[i, t, input_token_index[char]] = 1.
        # Padding offsets come from len(), not from the loop variable: with an
        # empty text the loop never runs, so `t` would be undefined or stale.
        encoder_input_data[i, len(input_text):, input_token_index[pad_char]] = 1.

        for t, char in enumerate(target_text):
            # decoder_target_data is ahead of decoder_input_data by one timestep.
            decoder_input_data[i, t, target_token_index[char]] = 1.
            if t > 0:
                # The target is shifted one step earlier and omits the first character.
                decoder_target_data[i, t - 1, target_token_index[char]] = 1.
        decoder_input_data[i, len(target_text):, target_token_index[pad_char]] = 1.
        # The shifted target holds len - 1 characters; clamp at 0 for empty texts.
        decoder_target_data[i, max(len(target_text) - 1, 0):, target_token_index[pad_char]] = 1.

    return  encoder_input_data, decoder_input_data, decoder_target_data

In [10]:
def genModel(latent_dim, num_encoder_tokens, num_decoder_tokens):
    """Build the encoder-decoder LSTM training model.

    The encoder LSTM reads a sequence of ``num_encoder_tokens``-wide vectors
    and only its two returned states are kept. The decoder LSTM reads a
    sequence of ``num_decoder_tokens``-wide vectors, starts from those encoder
    states, and feeds its full output sequence into a softmax ``Dense`` layer
    of width ``num_decoder_tokens``.

    Args:
        latent_dim: Number of units of both LSTM layers.
        num_encoder_tokens: Size of the last axis of the encoder input.
        num_decoder_tokens: Size of the last axis of the decoder input and of
            the softmax output.

    Returns:
        An uncompiled ``Model`` taking ``[encoder_inputs, decoder_inputs]``
        and producing the decoder softmax output.
    """
    # Encoder: the sequence output is discarded, only the states are used.
    encoder_inputs = Input(shape=(None, num_encoder_tokens))
    encoder_lstm = LSTM(latent_dim, return_state=True)
    _, hidden_state, cell_state = encoder_lstm(encoder_inputs)

    # Decoder: seeded with the encoder states; its own states are not needed here.
    decoder_inputs = Input(shape=(None, num_decoder_tokens))
    decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
    decoder_sequence = decoder_lstm(decoder_inputs,
                                    initial_state=[hidden_state, cell_state])[0]

    output_layer = Dense(num_decoder_tokens, activation='softmax')
    token_probabilities = output_layer(decoder_sequence)

    return Model([encoder_inputs, decoder_inputs], token_probabilities)

In [12]:
def trainSaveModel(model, model_path, encoder_input_data, decoder_input_data, decoder_target_data, batch_size,epochs):
    """Compile, fit and save the given model.

    The model is compiled with the ``rmsprop`` optimizer, categorical
    cross-entropy loss and an accuracy metric, fitted on the encoder/decoder
    inputs against the decoder targets with 20% of the data passed as
    ``validation_split``, and finally written to ``model_path``.

    Args:
        model: Object exposing ``compile``, ``fit`` and ``save``.
        model_path: Destination handed to ``model.save``.
        encoder_input_data: First model input.
        decoder_input_data: Second model input.
        decoder_target_data: Training targets.
        batch_size: Batch size forwarded to ``fit``.
        epochs: Number of epochs forwarded to ``fit``.

    Returns:
        None.
    """
    model.compile(
        optimizer='rmsprop',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    model_inputs = [encoder_input_data, decoder_input_data]
    model.fit(
        model_inputs,
        decoder_target_data,
        batch_size=batch_size,
        epochs=epochs,
        validation_split=0.2,
    )

    model.save(model_path)

In [6]:
# --- Configuration -----------------------------------------------------------
train_path = 'translation2019zh/translation2019zh_train.json'
test_path = 'translation2019zh/translation2019zh_valid.json'
model_path = 'e2c.h5'
num_line_read = 4000  # number of training lines handed to preprocess()

batch_size = 64
epochs = 100
latent_dim = 256  # unit count passed to genModel() for both LSTM layers

# --- Load the sentence pairs and build the character vocabularies -------------
input_texts, target_texts, test_input_texts, test_target_texts = preprocess(train_path,test_path, num_line_read)
input_characters, target_characters = getInputTargetChars(input_texts, target_texts)
# getEncoderDecoderData() pads every sequence with ' ' and looks that character
# up unconditionally, so make sure both vocabularies contain it even when the
# loaded sample happens not to (otherwise the encoding step raises KeyError).
# When a space is already present this leaves the vocabularies unchanged.
input_characters = sorted(set(input_characters) | {' '})
target_characters = sorted(set(target_characters) | {' '})

num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
max_encoder_seq_length = max(len(txt) for txt in input_texts)
max_decoder_seq_length = max(len(txt) for txt in target_texts)

print('\nNumber of samples:', len(input_texts))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

# Character -> index lookups; indices follow the sorted character order.
input_token_index = {char: i for i, char in enumerate(input_characters)}
target_token_index = {char: i for i, char in enumerate(target_characters)}

# --- Allocate the one-hot tensors, indexed [sample, timestep, token] ----------
encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
    dtype='float32')
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')

Read translation2019zh/translation2019zh_train.json ...
Read translation2019zh/translation2019zh_train.json finished!

Read translation2019zh/translation2019zh_valid.json ...
Read translation2019zh/translation2019zh_valid.json finished!

Processing chars...
Processing chars finished!

Number of samples: 4000
Number of unique input tokens: 259
Number of unique output tokens: 3372
Max sequence length for inputs: 256
Max sequence length for outputs: 142


In [7]:
# Fill the pre-allocated one-hot arrays from the loaded sentence pairs.
encoder_input_data, decoder_input_data, decoder_target_data = getEncoderDecoderData(
    input_texts,
    target_texts,
    encoder_input_data,
    decoder_input_data,
    decoder_target_data,
    input_token_index,
    target_token_index,
)

In [11]:
# Build the (still uncompiled) encoder-decoder model for the vocabulary sizes above.
model = genModel(
    latent_dim=latent_dim,
    num_encoder_tokens=num_encoder_tokens,
    num_decoder_tokens=num_decoder_tokens,
)

In [None]:
# Compile and fit the model, then save it to model_path.
trainSaveModel(
    model,
    model_path,
    encoder_input_data,
    decoder_input_data,
    decoder_target_data,
    batch_size,
    epochs,
)

Epoch 1/100
Epoch 2/100
 8/50 [===>..........................] - ETA: 1:40 - loss: 1.8050 - accuracy: 0.7574

## Test

In [6]:
# Display the source-side character -> token index mapping for inspection.
input_token_index

{' ': 0,
 '!': 1,
 '"': 2,
 '$': 3,
 '%': 4,
 '&': 5,
 "'": 6,
 '(': 7,
 ')': 8,
 '*': 9,
 '+': 10,
 ',': 11,
 '-': 12,
 '.': 13,
 '/': 14,
 '0': 15,
 '1': 16,
 '2': 17,
 '3': 18,
 '4': 19,
 '5': 20,
 '6': 21,
 '7': 22,
 '8': 23,
 '9': 24,
 ':': 25,
 ';': 26,
 '<': 27,
 '=': 28,
 '>': 29,
 '?': 30,
 'A': 31,
 'B': 32,
 'C': 33,
 'D': 34,
 'E': 35,
 'F': 36,
 'G': 37,
 'H': 38,
 'I': 39,
 'J': 40,
 'K': 41,
 'L': 42,
 'M': 43,
 'N': 44,
 'O': 45,
 'P': 46,
 'Q': 47,
 'R': 48,
 'S': 49,
 'T': 50,
 'U': 51,
 'V': 52,
 'W': 53,
 'X': 54,
 'Y': 55,
 'Z': 56,
 '[': 57,
 '\\': 58,
 ']': 59,
 '_': 60,
 '`': 61,
 'a': 62,
 'b': 63,
 'c': 64,
 'd': 65,
 'e': 66,
 'f': 67,
 'g': 68,
 'h': 69,
 'i': 70,
 'j': 71,
 'k': 72,
 'l': 73,
 'm': 74,
 'n': 75,
 'o': 76,
 'p': 77,
 'q': 78,
 'r': 79,
 's': 80,
 't': 81,
 'u': 82,
 'v': 83,
 'w': 84,
 'x': 85,
 'y': 86,
 'z': 87,
 '~': 88,
 '´': 89,
 '·': 90,
 'Â': 91,
 'á': 92,
 'â': 93,
 'ä': 94,
 'è': 95,
 'é': 96,
 'ñ': 97,
 'ó': 98,
 'ö': 99,
 'ü': 100