In [72]:
import json
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.optimizers import Adam

In [73]:
def is_chinese(string):
    for ch in string:
        if u'\u4e00' <= ch <= u'\u9fff':
            return True

    return False

def init():
    lists = []
    # reading the json file
    with open('./translation2019zh/translation2019zh_train.json','r', encoding='utf-8') as dat_f:
        data = []
        for i,line in enumerate(dat_f):
            data = json.loads(line)

            if is_chinese(data['chinese']) == True:
                #data['chinese'] = ch.convert(data['chinese'])
                if len(data['chinese'])<6:
                    lists.append(data)
                if (len(lists)+1)%100 == 0:
                    #print(len(lists)+1)
                    break

    # creating the dataframe
    df = pd.DataFrame(lists)
    # converted a file to csv
    df.to_csv('datafile.csv', encoding='utf-8', index=False)

init()

In [74]:
import pandas as pd
df = pd.read_csv('datafile.csv')
df['chinese'] = df['chinese'].apply(lambda x: '@' + x + '。')
en_data = df.english.values.tolist()#英文句子列表
ch_data = df.chinese.values.tolist()#中文句子列表

In [75]:
en_vocab = set(''.join(en_data))# 分別生成中英文字典
id2en = list(en_vocab)
en2id = {c:i for i,c in enumerate(id2en)}
ch_vocab = set(''.join(ch_data))# 分別生成中英文字典
id2ch = list(ch_vocab)
ch2id = {c:i for i,c in enumerate(id2ch)}
print('\n英文字典:\n', en2id)
print('\n中文字典共計\n:', ch2id)


英文字典:
 {'d': 0, '（': 1, 'E': 2, '8': 3, 'O': 4, 'e': 5, '!': 6, '"': 7, 'u': 8, 'z': 9, 'T': 10, 's': 11, 'J': 12, 'm': 13, 'h': 14, 'R': 15, '…': 16, 'b': 17, 'v': 18, 'c': 19, '）': 20, 'G': 21, 'k': 22, 'y': 23, ' ': 24, 'g': 25, 'I': 26, "'": 27, 'U': 28, '=': 29, 'X': 30, 'x': 31, 'f': 32, 'V': 33, '.': 34, '?': 35, 'w': 36, 'N': 37, 't': 38, '(': 39, 'n': 40, 'o': 41, 'H': 42, '-': 43, 'a': 44, 'j': 45, 'A': 46, ';': 47, 'P': 48, 'F': 49, ',': 50, 'C': 51, ')': 52, 'K': 53, 'p': 54, ':': 55, 'i': 56, 'r': 57, 'q': 58, 'l': 59, 'L': 60, 'W': 61, 'D': 62, 'B': 63, 'M': 64, 'S': 65, 'Y': 66}

中文字典共計
: {'该': 0, '热': 1, '遍': 2, '船': 3, '，': 4, '爱': 5, '不': 6, '浴': 7, '汤': 8, '！': 9, '什': 10, '应': 11, '铜': 12, '咱': 13, '犯': 14, '草': 15, '东': 16, '核': 17, '知': 18, '错': 19, '数': 20, '茄': 21, '鱼': 22, '…': 23, '目': 24, '魂': 25, '着': 26, '洱': 27, '百': 28, '问': 29, '官': 30, '耳': 31, '专': 32, '在': 33, '慧': 34, '扣': 35, '.': 36, '阒': 37, '差': 38, '很': 39, '功': 40, '颇': 41, '进': 42, '做': 43, '

In [76]:
def num_data(en2id,ch2id,en_data):
    # 利用字典，映射數據
    en_num_data = [[en2id[en] for en in line ] for line in en_data]
    ch_num_data = [[ch2id[ch] for ch in line] for line in ch_data]
    de_num_data = [[ch2id[ch] for ch in line][1:] for line in ch_data]
    print('char:', en_data[1])
    print('index:', en_num_data[1])
    # 獲取輸入輸出端的最大長度
    max_encoder_seq_length = max([len(txt) for txt in en_num_data])
    max_decoder_seq_length = max([len(txt) for txt in ch_num_data])
    print('max encoder length:', max_encoder_seq_length)
    print('max decoder length:', max_decoder_seq_length)

    # 將數據進行onehot處理
    encoder_input_data = np.zeros((len(en_num_data), max_encoder_seq_length, len(en2id)), dtype='float32')
    decoder_input_data = np.zeros((len(ch_num_data), max_decoder_seq_length, len(ch2id)), dtype='float32')
    decoder_target_data = np.zeros((len(ch_num_data), max_decoder_seq_length, len(ch2id)), dtype='float32')

    for i in range(len(ch_num_data)):
        for t, j in enumerate(en_num_data[i]):
            encoder_input_data[i, t, j] = 1.
        for t, j in enumerate(ch_num_data[i]):
            decoder_input_data[i, t, j] = 1.
        for t, j in enumerate(de_num_data[i]):
            decoder_target_data[i, t, j] = 1.

    print('index data:\n', en_num_data[1])
    print('one hot data:\n', encoder_input_data[1])
    return encoder_input_data,decoder_input_data,decoder_target_data

In [77]:
nd = num_data(en2id,ch2id,en_data)

char: She was possessed by a devil.
index: [65, 14, 5, 24, 36, 44, 11, 24, 54, 41, 11, 11, 5, 11, 11, 5, 0, 24, 17, 23, 24, 44, 24, 0, 5, 18, 56, 59, 34]
max encoder length: 75
max decoder length: 7
index data:
 [65, 14, 5, 24, 36, 44, 11, 24, 54, 41, 11, 11, 5, 11, 11, 5, 0, 24, 17, 23, 24, 44, 24, 0, 5, 18, 56, 59, 34]
one hot data:
 [[0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [78]:
# =======預定義模型參數========
EN_VOCAB_SIZE = len(en2id)
CH_VOCAB_SIZE = len(ch2id)
HIDDEN_SIZE = 256
LEARNING_RATE = 0.01
BATCH_SIZE = 20
EPOCHS = 250

In [79]:
def train():
    encoder_inputs = Input(shape=(None, EN_VOCAB_SIZE))
    encoder_h1, encoder_state_h1, encoder_state_c1 = LSTM(HIDDEN_SIZE, return_sequences=True, return_state=True)(encoder_inputs)
    encoder_h2, encoder_state_h2, encoder_state_c2 = LSTM(HIDDEN_SIZE, return_state=True)(encoder_h1)
    decoder_inputs = Input(shape=(None, CH_VOCAB_SIZE))
    lstm1 = LSTM(HIDDEN_SIZE, return_sequences=True, return_state=True)
    lstm2 = LSTM(HIDDEN_SIZE, return_sequences=True, return_state=True)
    decoder_dense = Dense(CH_VOCAB_SIZE, activation='softmax')

    decoder_h1, _, _ = lstm1(decoder_inputs, initial_state=[encoder_state_h1, encoder_state_c1])
    decoder_h2, _, _ = lstm2(decoder_h1, initial_state=[encoder_state_h2, encoder_state_c2])
    decoder_outputs = decoder_dense(decoder_h2)
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    opt = Adam(lr=LEARNING_RATE, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
    model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
    model.summary()
    model.fit([nd[0], nd[1]], nd[2],batch_size=BATCH_SIZE,epochs=EPOCHS,validation_split=0.)
    model.save('s2s.h5')# Save model
    encoder_model = Model(encoder_inputs, [encoder_state_h1, encoder_state_c1, encoder_state_h2, encoder_state_c2])# encoder模型和訓練相同
    decoder_state_input_h1 = Input(shape=(HIDDEN_SIZE,))# 預測模型中的decoder的初始化狀態需要傳入新的狀態
    decoder_state_input_c1 = Input(shape=(HIDDEN_SIZE,))
    decoder_state_input_h2 = Input(shape=(HIDDEN_SIZE,))
    decoder_state_input_c2 = Input(shape=(HIDDEN_SIZE,))
    decoder_h1, state_h1, state_c1 = lstm1(decoder_inputs, initial_state=[decoder_state_input_h1, decoder_state_input_c1])# 使用傳入的值來初始化當前模型的輸入狀態
    decoder_h2, state_h2, state_c2 = lstm2(decoder_h1, initial_state=[decoder_state_input_h2, decoder_state_input_c2])
    decoder_outputs = decoder_dense(decoder_h2)
    decoder_model = Model([decoder_inputs, decoder_state_input_h1, decoder_state_input_c1, decoder_state_input_h2, decoder_state_input_c2], 
                        [decoder_outputs, state_h1, state_c1, state_h2, state_c2])
    return encoder_model,decoder_model


In [80]:
model = train()

Model: "model_18"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_37 (InputLayer)           [(None, None, 67)]   0                                            
__________________________________________________________________________________________________
input_38 (InputLayer)           [(None, None, 281)]  0                                            
__________________________________________________________________________________________________
lstm_24 (LSTM)                  [(None, None, 256),  331776      input_37[0][0]                   
__________________________________________________________________________________________________
lstm_26 (LSTM)                  [(None, None, 256),  550912      input_38[0][0]                   
                                                                 lstm_24[0][1]             

In [81]:
def result():    
    for k in range(0,99):
        test_data = nd[0][k:k+1]
        h1, c1, h2, c2 = model[0].predict(test_data)
        target_seq = np.zeros((1, 1, CH_VOCAB_SIZE))
        target_seq[0, 0, ch2id['@']] = 1
        outputs = []
        while True:
            output_tokens, h1, c1, h2, c2 = model[1].predict([target_seq, h1, c1, h2, c2])
            sampled_token_index = np.argmax(output_tokens[0, -1, :])
            outputs.append(sampled_token_index)
            target_seq = np.zeros((1, 1, CH_VOCAB_SIZE))
            target_seq[0, 0, sampled_token_index] = 1
            if sampled_token_index == ch2id['。'] or len(outputs) > 10:
                break
        
        print(en_data[k])
        print(''.join([id2ch[i] for i in outputs]))

In [82]:
result()

Side-to-Side Movements.
侧向运动。
She was possessed by a devil.
她着了魔。
The majority was wrong last time.
方是错的。
Great talents flower late.
大器晚成。
Erhai Lake （in Yunnan Province）
洱海。
It strikes one as very strange.
我怕暴死。
Did I divide right?
我算得对吗。
I promise".
我是康康。
All was quiet and not a soul was to be seen.
阒无一人。
Folder wire glass;
夹丝玻璃；。
Steam bath;
蒸气浴；。
Package sealing.
包密封。
Mud on salt.
泥在盐上。
What street?
多么讽刺！。
You must be very quiet. Hold tight to me.
抱紧我。
Frozen ocean shrimp;
海虾；。
It was he.
我怕暴死。
Target Line;
目标线；。
Chemical potential.
化学势。
He ate no soup.
他不喝汤。
What is to be done?
多么讽刺！。
Front wheel drive.
前轮驱动。
All streams flow into the Huanghe River.
百川灌河。
A:It's on the seccond floor.
在二楼.。
All the boats and carts started off at the same time.
车船齐发。
The Univ.
该大学。
The printing plant.
托架松了。
It looks like rain.
我怕暴死。
Straight tool holder .
直车刀架。
I'm afraid of dying a violent death.
我怕暴死。
How do they channel it?
如何引导？。
You love me, then?
您爱我吗？。
The fragrance of flowers assails one's n