In [229]:
import json
import pandas as pd
from tensorflow.keras import optimizers
import os
import time
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from pprint import pprint
from IPython.display import clear_output

In [230]:
def is_chinese(string):
    for ch in string:
        if u'\u4e00' <= ch <= u'\u9fff':
            return True

    return False

def init():
    lists = []
    # reading the json file
    with open('./translation2019zh/translation2019zh_train.json','r', encoding='utf-8') as dat_f:
        data = []
        for i,line in enumerate(dat_f):
            data = json.loads(line)

            if is_chinese(data['chinese']) == True:
                #data['chinese'] = ch.convert(data['chinese'])
                if len(data['chinese'])<10:
                    lists.append(data)
                if (len(lists)+1)%1000 == 0:
                    print(len(lists)+1)
                    break

    # creating the dataframe
    df = pd.DataFrame(lists)
    # converted a file to csv
    df.to_csv('datafile.csv', encoding='utf-8', index=False)

init()


1000


In [231]:
import pandas as pd
df = pd.read_csv('datafile.csv')
df['chinese'] = df['chinese'].apply(lambda x: '@' + x + '。')
print(df[:1])

                             english     chinese
0  Look at these coasters over here.  @看看这边的杯垫。。


In [232]:
en_data = df.english.values.tolist()#英文句子列表
ch_data = df.chinese.values.tolist()#中文句子列表

#确定中英文各自包含的字符。df.unique()直接取sum可将unique数组中的各个句子拼接成一个长句子
# 分別生成中英文字典
en_vocab = set(''.join(en_data))
id2en = list(en_vocab)
en2id = {c:i for i,c in enumerate(id2en)}

# 分別生成中英文字典
ch_vocab = set(''.join(ch_data))
id2ch = list(ch_vocab)
ch2id = {c:i for i,c in enumerate(id2ch)}

print('\n英文字典:\n', en2id)
print('\n中文字典共計\n:', ch2id)



英文字典:
 {'o': 0, 'j': 1, 'B': 2, 'x': 3, 'J': 4, 'L': 5, "'": 6, 'r': 7, 'l': 8, 'a': 9, 'q': 10, 'm': 11, ']': 12, '!': 13, '”': 14, '…': 15, 'R': 16, 'h': 17, '-': 18, '）': 19, 's': 20, 'Q': 21, 'b': 22, 'M': 23, '/': 24, 'P': 25, '3': 26, '[': 27, 'G': 28, ':': 29, 'K': 30, 'n': 31, '：': 32, '？': 33, 'A': 34, '1': 35, 'I': 36, '（': 37, 'N': 38, ')': 39, 'p': 40, '"': 41, '(': 42, '4': 43, 'H': 44, 'W': 45, 'c': 46, 'Z': 47, 'S': 48, '0': 49, ' ': 50, 'T': 51, '·': 52, 'z': 53, '=': 54, 'e': 55, 'E': 56, ',': 57, 't': 58, 'V': 59, 'i': 60, 'Y': 61, 'y': 62, '9': 63, '8': 64, '?': 65, 'w': 66, 'X': 67, '2': 68, 'D': 69, 'f': 70, 'C': 71, '‘': 72, 'F': 73, 'g': 74, '.': 75, 'u': 76, '’': 77, 'd': 78, 'k': 79, 'U': 80, '7': 81, '6': 82, 'O': 83, '“': 84, 'v': 85, '。': 86, ';': 87, '_': 88, '5': 89}

中文字典共計
: {'载': 0, '猫': 1, '蚤': 2, '跑': 3, '七': 4, '李': 5, '②': 6, '淹': 7, '早': 8, '嘟': 9, '碌': 10, '石': 11, '走': 12, '笼': 13, '也': 14, '魏': 15, '@': 16, '要': 17, '奎': 18, '俑': 19, '出': 20, '

In [233]:
# 利用字典，映射數據
en_num_data = [[en2id[en] for en in line ] for line in en_data]
ch_num_data = [[ch2id[ch] for ch in line] for line in ch_data]
de_num_data = [[ch2id[ch] for ch in line][1:] for line in ch_data]

print('char:', en_data[1])
print('index:', en_num_data[1])

char: Choose a recorder.
index: [71, 17, 0, 0, 20, 55, 50, 9, 50, 7, 55, 46, 0, 7, 78, 55, 7, 75]


In [234]:
import numpy as np

# 獲取輸入輸出端的最大長度
max_encoder_seq_length = max([len(txt) for txt in en_num_data])
max_decoder_seq_length = max([len(txt) for txt in ch_num_data])
print('max encoder length:', max_encoder_seq_length)
print('max decoder length:', max_decoder_seq_length)

# 將數據進行onehot處理
encoder_input_data = np.zeros((len(en_num_data), max_encoder_seq_length, len(en2id)), dtype='float32')
decoder_input_data = np.zeros((len(ch_num_data), max_decoder_seq_length, len(ch2id)), dtype='float32')
decoder_target_data = np.zeros((len(ch_num_data), max_decoder_seq_length, len(ch2id)), dtype='float32')

for i in range(len(ch_num_data)):
    for t, j in enumerate(en_num_data[i]):
        encoder_input_data[i, t, j] = 1.
    for t, j in enumerate(ch_num_data[i]):
        decoder_input_data[i, t, j] = 1.
    for t, j in enumerate(de_num_data[i]):
        decoder_target_data[i, t, j] = 1.

print('index data:\n', en_num_data[1])
print('one hot data:\n', encoder_input_data[1])

max encoder length: 108
max decoder length: 11
index data:
 [71, 17, 0, 0, 20, 55, 50, 9, 50, 7, 55, 46, 0, 7, 78, 55, 7, 75]
one hot data:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [235]:
# =======預定義模型參數========
EN_VOCAB_SIZE = len(en2id)
CH_VOCAB_SIZE = len(ch2id)
HIDDEN_SIZE = 256

LEARNING_RATE = 0.01
BATCH_SIZE = 200
EPOCHS = 175

In [236]:
# ======================================keras model==================================
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.optimizers import Adam
import numpy as np

# ==============encoder=============
encoder_inputs = Input(shape=(None, EN_VOCAB_SIZE))
#emb_inp = Embedding(output_dim=HIDDEN_SIZE, input_dim=EN_VOCAB_SIZE)(encoder_inputs)
encoder_h1, encoder_state_h1, encoder_state_c1 = LSTM(HIDDEN_SIZE, return_sequences=True, return_state=True)(encoder_inputs)
encoder_h2, encoder_state_h2, encoder_state_c2 = LSTM(HIDDEN_SIZE, return_state=True)(encoder_h1)


In [237]:
# ==============decoder=============
decoder_inputs = Input(shape=(None, CH_VOCAB_SIZE))

#emb_target = Embedding(output_dim=HIDDEN_SIZE, input_dim=CH_VOCAB_SIZE, mask_zero=True)(decoder_inputs)
lstm1 = LSTM(HIDDEN_SIZE, return_sequences=True, return_state=True)
lstm2 = LSTM(HIDDEN_SIZE, return_sequences=True, return_state=True)
decoder_dense = Dense(CH_VOCAB_SIZE, activation='softmax')

decoder_h1, _, _ = lstm1(decoder_inputs, initial_state=[encoder_state_h1, encoder_state_c1])
decoder_h2, _, _ = lstm2(decoder_h1, initial_state=[encoder_state_h2, encoder_state_c2])
decoder_outputs = decoder_dense(decoder_h2)

In [238]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
opt = Adam(lr=LEARNING_RATE, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          validation_split=0.)

# Save model
model.save('s2s.h5')

Model: "model_50"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_93 (InputLayer)           [(None, None, 90)]   0                                            
__________________________________________________________________________________________________
input_94 (InputLayer)           [(None, None, 1608)] 0                                            
__________________________________________________________________________________________________
lstm_68 (LSTM)                  [(None, None, 256),  355328      input_93[0][0]                   
__________________________________________________________________________________________________
lstm_70 (LSTM)                  [(None, None, 256),  1909760     input_94[0][0]                   
                                                                 lstm_68[0][1]             

In [239]:
# encoder模型和訓練相同
encoder_model = Model(encoder_inputs, [encoder_state_h1, encoder_state_c1, encoder_state_h2, encoder_state_c2])

# 預測模型中的decoder的初始化狀態需要傳入新的狀態
decoder_state_input_h1 = Input(shape=(HIDDEN_SIZE,))
decoder_state_input_c1 = Input(shape=(HIDDEN_SIZE,))
decoder_state_input_h2 = Input(shape=(HIDDEN_SIZE,))
decoder_state_input_c2 = Input(shape=(HIDDEN_SIZE,))

# 使用傳入的值來初始化當前模型的輸入狀態
decoder_h1, state_h1, state_c1 = lstm1(decoder_inputs, initial_state=[decoder_state_input_h1, decoder_state_input_c1])
decoder_h2, state_h2, state_c2 = lstm2(decoder_h1, initial_state=[decoder_state_input_h2, decoder_state_input_c2])
decoder_outputs = decoder_dense(decoder_h2)

decoder_model = Model([decoder_inputs, decoder_state_input_h1, decoder_state_input_c1, decoder_state_input_h2, decoder_state_input_c2], 
                      [decoder_outputs, state_h1, state_c1, state_h2, state_c2])

In [241]:
for k in range(0,100):
    test_data = encoder_input_data[k:k+1]
    h1, c1, h2, c2 = encoder_model.predict(test_data)
    target_seq = np.zeros((1, 1, CH_VOCAB_SIZE))
    target_seq[0, 0, ch2id['@']] = 1
    outputs = []
    while True:
        output_tokens, h1, c1, h2, c2 = decoder_model.predict([target_seq, h1, c1, h2, c2])
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        outputs.append(sampled_token_index)
        target_seq = np.zeros((1, 1, CH_VOCAB_SIZE))
        target_seq[0, 0, sampled_token_index] = 1
        if sampled_token_index == ch2id['。'] or len(outputs) > 15:
            break
    
    print(en_data[k])
    print(''.join([id2ch[i] for i in outputs]))

Look at these coasters over here.
你常打篮球吗？。
Choose a recorder.
没有诗人可以抒怀。
I hadn't paid the telephone bill.
我是一个勇敢的人。
That's easier said than done, of course.
这是体育馆。
Side-to-Side Movements.
在湖中畅泳。
about like 80 degrees.
你需要填写登记卡。
We all are from Shandong.
我们可以看晚场。
She was possessed by a devil.
她对照歧视妇女。
This wool knits up well.
这是体育馆。
The majority was wrong last time.
这是体育馆。
Stone Soup Stories to Go!
她也同样祝福他。
Done. See you tomorrow.
你需要填写登记卡。
He eased some of the strains on the poor.
他的名声传遍全国。
Could it be that it was written wrongly?
没有促销代码需要。
What a terrible temper!
我们可以看晚场。
Great talents flower late.
这样他们有重叠了。
I forbid you to make a sortie today.
我是一个勇敢的人。
C：My surname is Jiang.
没有磨里，出饭吃。
Well, if it was greater .
我们可以看晚场。
They looked over to the left.
这是体育馆。
To supervise the management of printing industry.
这是这是捕蝇草。
no one else can see you shake your head.
你需要填写登记卡。
All photos dials.
这样他们有重叠了。
Stained glass window panels;
我载过很多名人。
The murderer was caught red-handed.
这是体育馆。
You don’t lo