In [36]:
import json
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.optimizers import Adam

In [37]:
def is_chinese(string):
    for ch in string:
        if u'\u4e00' <= ch <= u'\u9fff':
            return True

    return False

def init():
    lists = []
    # reading the json file
    with open('./translation2019zh/translation2019zh_train.json','r', encoding='utf-8') as dat_f:
        data = []
        for i,line in enumerate(dat_f):
            data = json.loads(line)

            if is_chinese(data['chinese']) == True:
                #data['chinese'] = ch.convert(data['chinese'])
                if len(data['chinese'])<10:
                    lists.append(data)
                if (len(lists)+1)%100 == 0:
                    #print(len(lists)+1)
                    break

    # creating the dataframe
    df = pd.DataFrame(lists)
    # converted a file to csv
    df.to_csv('datafile.csv', encoding='utf-8', index=False)

init()

In [38]:
import pandas as pd
df = pd.read_csv('datafile.csv')
df['chinese'] = df['chinese'].apply(lambda x: '@' + x + '。')
en_data = df.english.values.tolist()#英文句子列表
ch_data = df.chinese.values.tolist()#中文句子列表

In [39]:
en_vocab = set(''.join(en_data))# 分別生成中英文字典
id2en = list(en_vocab)
en2id = {c:i for i,c in enumerate(id2en)}
ch_vocab = set(''.join(ch_data))# 分別生成中英文字典
id2ch = list(ch_vocab)
ch2id = {c:i for i,c in enumerate(id2ch)}
print('\n英文字典:\n', en2id)
print('\n中文字典共計\n:', ch2id)


英文字典:
 {'d': 0, '（': 1, 'E': 2, '8': 3, 'O': 4, 'e': 5, '!': 6, '"': 7, 'u': 8, 'z': 9, 'T': 10, 's': 11, '：': 12, 'J': 13, 'h': 14, 'm': 15, 'Y': 16, '…': 17, 'b': 18, '’': 19, 'v': 20, 'c': 21, '）': 22, 'k': 23, 'I': 24, "'": 25, ' ': 26, 'g': 27, 'y': 28, 'G': 29, 'U': 30, 'x': 31, 'f': 32, '？': 33, '.': 34, '0': 35, '?': 36, 'w': 37, 'N': 38, 't': 39, '(': 40, 'n': 41, 'o': 42, 'H': 43, '-': 44, 'a': 45, 'j': 46, 'A': 47, ';': 48, 'P': 49, 'F': 50, 'C': 51, ')': 52, 'p': 53, ':': 54, 'r': 55, 'i': 56, 'q': 57, 'l': 58, 'L': 59, 'W': 60, 'D': 61, 'B': 62, 'M': 63, 'S': 64, ',': 65}

中文字典共計
: {'啊': 0, '刻': 1, '不': 2, '性': 3, '住': 4, '艏': 5, '整': 6, '妮': 7, '十': 8, '见': 9, '受': 10, '露': 11, '回': 12, '裤': 13, '-': 14, '边': 15, '摇': 16, '绕': 17, '飞': 18, '疼': 19, '督': 20, '号': 21, '者': 22, '全': 23, '定': 24, '吹': 25, '独': 26, '巴': 27, '维': 28, '洞': 29, '石': 30, '过': 31, '冠': 32, '所': 33, '德': 34, '骤': 35, '卖': 36, '她': 37, '象': 38, '丫': 39, '左': 40, '已': 41, '屎': 42, '感': 43, '杆': 44, '

In [40]:
def num_data(en2id,ch2id,en_data):
    # 利用字典，映射數據
    en_num_data = [[en2id[en] for en in line ] for line in en_data]
    ch_num_data = [[ch2id[ch] for ch in line] for line in ch_data]
    de_num_data = [[ch2id[ch] for ch in line][1:] for line in ch_data]
    print('char:', en_data[1])
    print('index:', en_num_data[1])
    # 獲取輸入輸出端的最大長度
    max_encoder_seq_length = max([len(txt) for txt in en_num_data])
    max_decoder_seq_length = max([len(txt) for txt in ch_num_data])
    print('max encoder length:', max_encoder_seq_length)
    print('max decoder length:', max_decoder_seq_length)

    # 將數據進行onehot處理
    encoder_input_data = np.zeros((len(en_num_data), max_encoder_seq_length, len(en2id)), dtype='float32')
    decoder_input_data = np.zeros((len(ch_num_data), max_decoder_seq_length, len(ch2id)), dtype='float32')
    decoder_target_data = np.zeros((len(ch_num_data), max_decoder_seq_length, len(ch2id)), dtype='float32')

    for i in range(len(ch_num_data)):
        for t, j in enumerate(en_num_data[i]):
            encoder_input_data[i, t, j] = 1.
        for t, j in enumerate(ch_num_data[i]):
            decoder_input_data[i, t, j] = 1.
        for t, j in enumerate(de_num_data[i]):
            decoder_target_data[i, t, j] = 1.

    print('index data:\n', en_num_data[1])
    print('one hot data:\n', encoder_input_data[1])
    return encoder_input_data,decoder_input_data,decoder_target_data

In [41]:
nd = num_data(en2id,ch2id,en_data)

char: Choose a recorder.
index: [51, 14, 42, 42, 11, 5, 26, 45, 26, 55, 5, 21, 42, 55, 0, 5, 55, 34]
max encoder length: 49
max decoder length: 11
index data:
 [51, 14, 42, 42, 11, 5, 26, 45, 26, 55, 5, 21, 42, 55, 0, 5, 55, 34]
one hot data:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [42]:
# =======預定義模型參數========
EN_VOCAB_SIZE = len(en2id)
CH_VOCAB_SIZE = len(ch2id)
HIDDEN_SIZE = 256
LEARNING_RATE = 0.01
BATCH_SIZE = 30
EPOCHS = 250

In [43]:
def train():
    encoder_inputs = Input(shape=(None, EN_VOCAB_SIZE))
    encoder_h1, encoder_state_h1, encoder_state_c1 = LSTM(HIDDEN_SIZE, return_sequences=True, return_state=True)(encoder_inputs)
    encoder_h2, encoder_state_h2, encoder_state_c2 = LSTM(HIDDEN_SIZE, return_state=True)(encoder_h1)
    decoder_inputs = Input(shape=(None, CH_VOCAB_SIZE))
    lstm1 = LSTM(HIDDEN_SIZE, return_sequences=True, return_state=True)
    lstm2 = LSTM(HIDDEN_SIZE, return_sequences=True, return_state=True)
    decoder_dense = Dense(CH_VOCAB_SIZE, activation='softmax')

    decoder_h1, _, _ = lstm1(decoder_inputs, initial_state=[encoder_state_h1, encoder_state_c1])
    decoder_h2, _, _ = lstm2(decoder_h1, initial_state=[encoder_state_h2, encoder_state_c2])
    decoder_outputs = decoder_dense(decoder_h2)
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    opt = Adam(lr=LEARNING_RATE, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
    model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
    model.summary()
    model.fit([nd[0], nd[1]], nd[2],batch_size=BATCH_SIZE,epochs=EPOCHS,validation_split=0.)
    model.save('s2s.h5')# Save model
    encoder_model = Model(encoder_inputs, [encoder_state_h1, encoder_state_c1, encoder_state_h2, encoder_state_c2])# encoder模型和訓練相同
    decoder_state_input_h1 = Input(shape=(HIDDEN_SIZE,))# 預測模型中的decoder的初始化狀態需要傳入新的狀態
    decoder_state_input_c1 = Input(shape=(HIDDEN_SIZE,))
    decoder_state_input_h2 = Input(shape=(HIDDEN_SIZE,))
    decoder_state_input_c2 = Input(shape=(HIDDEN_SIZE,))
    decoder_h1, state_h1, state_c1 = lstm1(decoder_inputs, initial_state=[decoder_state_input_h1, decoder_state_input_c1])# 使用傳入的值來初始化當前模型的輸入狀態
    decoder_h2, state_h2, state_c2 = lstm2(decoder_h1, initial_state=[decoder_state_input_h2, decoder_state_input_c2])
    decoder_outputs = decoder_dense(decoder_h2)
    decoder_model = Model([decoder_inputs, decoder_state_input_h1, decoder_state_input_c1, decoder_state_input_h2, decoder_state_input_c2], 
                        [decoder_outputs, state_h1, state_c1, state_h2, state_c2])
    return encoder_model,decoder_model


In [44]:
model = train()

Model: "model_9"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_19 (InputLayer)           [(None, None, 66)]   0                                            
__________________________________________________________________________________________________
input_20 (InputLayer)           [(None, None, 402)]  0                                            
__________________________________________________________________________________________________
lstm_12 (LSTM)                  [(None, None, 256),  330752      input_19[0][0]                   
__________________________________________________________________________________________________
lstm_14 (LSTM)                  [(None, None, 256),  674816      input_20[0][0]                   
                                                                 lstm_12[0][1]              

In [45]:
def result():    
    for k in range(0,99):
        test_data = nd[0][k:k+1]
        h1, c1, h2, c2 = model[0].predict(test_data)
        target_seq = np.zeros((1, 1, CH_VOCAB_SIZE))
        target_seq[0, 0, ch2id['@']] = 1
        outputs = []
        while True:
            output_tokens, h1, c1, h2, c2 = model[1].predict([target_seq, h1, c1, h2, c2])
            sampled_token_index = np.argmax(output_tokens[0, -1, :])
            outputs.append(sampled_token_index)
            target_seq = np.zeros((1, 1, CH_VOCAB_SIZE))
            target_seq[0, 0, sampled_token_index] = 1
            if sampled_token_index == ch2id['。'] or len(outputs) > 15:
                break
        
        print(en_data[k])
        print(''.join([id2ch[i] for i in outputs]))

In [46]:
result()

Look at these coasters over here.
我想要些水解渴。
Choose a recorder.
C：我姓姜。
I hadn't paid the telephone bill.
我还没交电话费。
That's easier said than done, of course.
这可是件大事儿啊！。
Side-to-Side Movements.
他因谋杀罪而受审。
about like 80 degrees.
你这性感的家伙。
We all are from Shandong.
他的文体很明畅。
She was possessed by a devil.
你还有未竟的事业？。
This wool knits up well.
这可是件大事儿啊！。
The majority was wrong last time.
这可是件大事儿啊！。
Stone Soup Stories to Go!
你还有未竟的事业？。
Done. See you tomorrow.
我想要些水解渴。
He eased some of the strains on the poor.
我应该到屋顶上吗？。
Could it be that it was written wrongly?
C：我姓姜。
What a terrible temper!
我的巴士在哪里？。
Great talents flower late.
你的汤及蔬菜。
I forbid you to make a sortie today.
你今天不许出击。
C：My surname is Jiang.
C：我姓姜。
Well, if it was greater .
他的文体很明畅。
They looked over to the left.
这可是件大事儿啊！。
To supervise the management of printing industry.
监督管理印刷业。
no one else can see you shake your head.
我给你指路。
All photos dials.
你还有未竟的事业？。
Stained glass window panels;
你还有未竟的事业？。
The murderer was caught red-handed.
这可是件大事儿啊！