In [89]:
import json
import pandas as pd
from tensorflow.keras import optimizers
import os
import time
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from pprint import pprint
from IPython.display import clear_output

In [90]:
def is_chinese(string):
    for ch in string:
        if u'\u4e00' <= ch <= u'\u9fff':
            return True

    return False

def init():
    lists = []
    # reading the json file
    with open('./translation2019zh/translation2019zh_train.json','r', encoding='utf-8') as dat_f:
        data = []
        for i,line in enumerate(dat_f):
            data = json.loads(line)

            if is_chinese(data['chinese']) == True:
                #data['chinese'] = ch.convert(data['chinese'])
                if len(data['chinese'])<5:
                    lists.append(data)
                if (len(lists)+1)%30 == 0:
                    print(len(lists)+1)
                    break

    # creating the dataframe
    df = pd.DataFrame(lists)
    # converted a file to csv
    df.to_csv('datafile.csv', encoding='utf-8', index=False)

init()


30


In [91]:
import pandas as pd
df = pd.read_csv('datafile.csv')
df['chinese'] = df['chinese'].apply(lambda x: '@' + x + '。')
print(df[:1])

                           english chinese
0  Erhai Lake （in Yunnan Province）    @洱海。


In [92]:
en_data = df.english.values.tolist()#英文句子列表
ch_data = df.chinese.values.tolist()#中文句子列表

#确定中英文各自包含的字符。df.unique()直接取sum可将unique数组中的各个句子拼接成一个长句子
# 分別生成中英文字典
en_vocab = set(''.join(en_data))
id2en = list(en_vocab)
en2id = {c:i for i,c in enumerate(id2en)}

# 分別生成中英文字典
ch_vocab = set(''.join(ch_data))
id2ch = list(ch_vocab)
ch2id = {c:i for i,c in enumerate(id2ch)}

print('\n英文字典:\n', en2id)
print('\n中文字典共計\n:', ch2id)



英文字典:
 {'o': 0, 'j': 1, 'S': 2, 'W': 3, 'w': 4, ':': 5, 'K': 6, 'D': 7, 'B': 8, 'f': 9, 'C': 10, 'F': 11, ' ': 12, 'g': 13, 'T': 14, 'x': 15, 'n': 16, '.': 17, 'u': 18, 'L': 19, 'z': 20, 'r': 21, 'd': 22, "'": 23, 'k': 24, 'l': 25, '=': 26, 'E': 27, 'e': 28, 'a': 29, 'A': 30, 'U': 31, 'I': 32, 'q': 33, ',': 34, 'm': 35, 't': 36, '（': 37, 'V': 38, 'N': 39, 'i': 40, '!': 41, 'Y': 42, 'v': 43, 'h': 44, 'p': 45, 'y': 46, '"': 47, '-': 48, '）': 49, ';': 50, '?': 51, 'H': 52, 's': 53, 'c': 54, 'b': 55, 'M': 56, 'P': 57}

中文字典共計
: {'切': 0, '东': 1, '盒': 2, '虾': 3, '核': 4, '鲜': 5, '你': 6, '尼': 7, '京': 8, '楼': 9, '@': 10, '什': 11, '剪': 12, '街': 13, '证': 14, '抱': 15, '拼': 16, '线': 17, '二': 18, '了': 19, '密': 20, '目': 21, '请': 22, '片': 23, '是': 24, '该': 25, '印': 26, '手': 27, '机': 28, '刷': 29, '篮': 30, '蒸': 31, '于': 32, '保': 33, '？': 34, '海': 35, '紧': 36, '名': 37, '；': 38, '包': 39, '骨': 40, '偶': 41, '学': 42, '灌': 43, '坐': 44, '吧': 45, '毛': 46, '洱': 47, '思': 48, '厂': 49, '换': 50, '结': 51, '喝': 52, '

In [93]:
# 利用字典，映射數據
en_num_data = [[en2id[en] for en in line ] for line in en_data]
ch_num_data = [[ch2id[ch] for ch in line] for line in ch_data]
de_num_data = [[ch2id[ch] for ch in line][1:] for line in ch_data]

print('char:', en_data[1])
print('index:', en_num_data[1])

char: I promise".
index: [32, 12, 45, 21, 0, 35, 40, 53, 28, 47, 17]


In [94]:
import numpy as np

# 獲取輸入輸出端的最大長度
max_encoder_seq_length = max([len(txt) for txt in en_num_data])
max_decoder_seq_length = max([len(txt) for txt in ch_num_data])
print('max encoder length:', max_encoder_seq_length)
print('max decoder length:', max_decoder_seq_length)

# 將數據進行onehot處理
encoder_input_data = np.zeros((len(en_num_data), max_encoder_seq_length, len(en2id)), dtype='float32')
decoder_input_data = np.zeros((len(ch_num_data), max_decoder_seq_length, len(ch2id)), dtype='float32')
decoder_target_data = np.zeros((len(ch_num_data), max_decoder_seq_length, len(ch2id)), dtype='float32')

for i in range(len(ch_num_data)):
    for t, j in enumerate(en_num_data[i]):
        encoder_input_data[i, t, j] = 1.
    for t, j in enumerate(ch_num_data[i]):
        decoder_input_data[i, t, j] = 1.
    for t, j in enumerate(de_num_data[i]):
        decoder_target_data[i, t, j] = 1.

print('index data:\n', en_num_data[1])
print('one hot data:\n', encoder_input_data[1])

max encoder length: 41
max decoder length: 6
index data:
 [32, 12, 45, 21, 0, 35, 40, 53, 28, 47, 17]
one hot data:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [95]:
# =======預定義模型參數========
EN_VOCAB_SIZE = len(en2id)
CH_VOCAB_SIZE = len(ch2id)
HIDDEN_SIZE = 256

LEARNING_RATE = 0.0249
BATCH_SIZE = 100
EPOCHS = 250

In [96]:
# ======================================keras model==================================
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.optimizers import Adam
import numpy as np

# ==============encoder=============
encoder_inputs = Input(shape=(None, EN_VOCAB_SIZE))
#emb_inp = Embedding(output_dim=HIDDEN_SIZE, input_dim=EN_VOCAB_SIZE)(encoder_inputs)
encoder_h1, encoder_state_h1, encoder_state_c1 = LSTM(HIDDEN_SIZE, return_sequences=True, return_state=True)(encoder_inputs)
encoder_h2, encoder_state_h2, encoder_state_c2 = LSTM(HIDDEN_SIZE, return_state=True)(encoder_h1)


In [97]:
# ==============decoder=============
decoder_inputs = Input(shape=(None, CH_VOCAB_SIZE))

#emb_target = Embedding(output_dim=HIDDEN_SIZE, input_dim=CH_VOCAB_SIZE, mask_zero=True)(decoder_inputs)
lstm1 = LSTM(HIDDEN_SIZE, return_sequences=True, return_state=True)
lstm2 = LSTM(HIDDEN_SIZE, return_sequences=True, return_state=True)
decoder_dense = Dense(CH_VOCAB_SIZE, activation='softmax')

decoder_h1, _, _ = lstm1(decoder_inputs, initial_state=[encoder_state_h1, encoder_state_c1])
decoder_h2, _, _ = lstm2(decoder_h1, initial_state=[encoder_state_h2, encoder_state_c2])
decoder_outputs = decoder_dense(decoder_h2)

In [98]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
opt = Adam(lr=LEARNING_RATE, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          validation_split=0.)

# Save model
model.save('s2s.h5')

Model: "model_20"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_33 (InputLayer)           [(None, None, 58)]   0                                            
__________________________________________________________________________________________________
input_34 (InputLayer)           [(None, None, 80)]   0                                            
__________________________________________________________________________________________________
lstm_20 (LSTM)                  [(None, None, 256),  322560      input_33[0][0]                   
__________________________________________________________________________________________________
lstm_22 (LSTM)                  [(None, None, 256),  345088      input_34[0][0]                   
                                                                 lstm_20[0][1]             

In [99]:
# encoder模型和訓練相同
encoder_model = Model(encoder_inputs, [encoder_state_h1, encoder_state_c1, encoder_state_h2, encoder_state_c2])

# 預測模型中的decoder的初始化狀態需要傳入新的狀態
decoder_state_input_h1 = Input(shape=(HIDDEN_SIZE,))
decoder_state_input_c1 = Input(shape=(HIDDEN_SIZE,))
decoder_state_input_h2 = Input(shape=(HIDDEN_SIZE,))
decoder_state_input_c2 = Input(shape=(HIDDEN_SIZE,))

# 使用傳入的值來初始化當前模型的輸入狀態
decoder_h1, state_h1, state_c1 = lstm1(decoder_inputs, initial_state=[decoder_state_input_h1, decoder_state_input_c1])
decoder_h2, state_h2, state_c2 = lstm2(decoder_h1, initial_state=[decoder_state_input_h2, decoder_state_input_c2])
decoder_outputs = decoder_dense(decoder_h2)

decoder_model = Model([decoder_inputs, decoder_state_input_h1, decoder_state_input_c1, decoder_state_input_h2, decoder_state_input_c2], 
                      [decoder_outputs, state_h1, state_c1, state_h2, state_c2])

In [100]:
for k in range(0,10):
    test_data = encoder_input_data[k:k+1]
    h1, c1, h2, c2 = encoder_model.predict(test_data)
    target_seq = np.zeros((1, 1, CH_VOCAB_SIZE))
    target_seq[0, 0, ch2id['@']] = 1
    outputs = []
    while True:
        output_tokens, h1, c1, h2, c2 = decoder_model.predict([target_seq, h1, c1, h2, c2])
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        outputs.append(sampled_token_index)
        target_seq = np.zeros((1, 1, CH_VOCAB_SIZE))
        target_seq[0, 0, sampled_token_index] = 1
        if sampled_token_index == ch2id['。'] or len(outputs) > 15:
            break
    
    print(en_data[k])
    print(''.join([id2ch[i] for i in outputs]))

Erhai Lake （in Yunnan Province）
洱海。
I promise".
我保证。
Steam bath;
蒸气浴；。
Package sealing.
包密封。
What street?
什么街？。
You must be very quiet. Hold tight to me.
抱紧我。
Frozen ocean shrimp;
海虾；。
Target Line;
目标线；。
Chemical potential.
化学势。
A:It's on the seccond floor.
在二楼.。
