In [None]:
def is_chinese(string):
    """Return True if `string` contains at least one character in U+4E00..U+9FFF.

    Args:
        string: The text to scan, character by character.

    Returns:
        True as soon as one character falls inside the inclusive range
        U+4E00..U+9FFF; False otherwise (including for an empty string).
    """
    return any(u'\u4e00' <= char <= u'\u9fff' for char in string)


In [None]:
import json
import pandas as pd
from opencc import OpenCC
# Module-level OpenCC converter; init() calls ch.convert() on the 'chinese'
# field of every record it keeps.
# NOTE(review): 's2twp' is presumably OpenCC's Simplified -> Traditional
# (Taiwan standard, with phrase conversion) configuration — confirm against
# the OpenCC documentation.
ch = OpenCC('s2twp')
def init(src_path='./translation2019zh/translation2019zh_train.json',
         out_path='datafile.csv',
         progress_every=1000000):
    """Convert the JSON-lines translation corpus into a CSV file.

    Reads `src_path` one JSON object per line, keeps only the records whose
    'chinese' field contains at least one Chinese character (as decided by
    `is_chinese`), passes that field through the module-level OpenCC
    converter `ch`, and writes the kept records to `out_path`.

    Args:
        src_path: Path of the UTF-8 JSON-lines input file.
        out_path: Path of the CSV file to write (UTF-8, no index column).
        progress_every: Positive integer; the running line count is printed
            each time it reaches a multiple of this value.

    Returns:
        None. The result is the CSV file written to `out_path`.

    Raises:
        KeyError: If a record has no 'chinese' field.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(src_path, 'r', encoding='utf-8') as dat_f:
        for line_no, line in enumerate(dat_f, start=1):
            # Blank lines carry no record; skipping them avoids a
            # JSONDecodeError on stray empty lines in the input.
            if line.strip():
                record = json.loads(line)
                if is_chinese(record['chinese']):
                    record['chinese'] = ch.convert(record['chinese'])
                    records.append(record)

            # Progress is reported for every line read, not only for lines
            # that pass the filter; otherwise a milestone is silently skipped
            # whenever that particular line happens to be filtered out.
            if line_no % progress_every == 0:
                print(line_no)

    # Build the frame once from the collected records and save it as CSV.
    df = pd.DataFrame(records)
    df.to_csv(out_path, encoding='utf-8', index=False)

In [None]:
# Build datafile.csv from the raw corpus. This re-reads and re-converts the
# whole input file every time the cell is run.
init()

In [None]:
# Load the CSV produced by init(). Every column is read as text and pandas'
# default NA parsing is switched off, so sentences such as "NA", "null" or
# "nan" (or an empty field) stay strings instead of turning into float NaN,
# which would break the character-level processing in the following cells.
df = pd.read_csv('datafile.csv', dtype=str, keep_default_na=False)

In [None]:
# Pull the source (English) and target (Chinese) columns out of the frame
# as plain Python lists.
x, y = (list(df[column]) for column in ('english', 'chinese'))

In [None]:
# Build separate character-level dictionaries for English and Chinese.
# The characters are sorted before ids are assigned: iterating a set of
# strings directly can yield a different order in each Python process
# (string hashing is randomized), which would change every id between kernel
# restarts. Collecting characters sentence by sentence also avoids building
# one giant joined string for the whole corpus.
en_vocab = {c for line in x for c in line}
id2en = sorted(en_vocab)
en2id = {c: i for i, c in enumerate(id2en)}
print('\n英文字典:\n', en2id)

ch_vocab = {c for line in y for c in line}
id2ch = sorted(ch_vocab)
ch2id = {c: i for i, c in enumerate(id2ch)}

print('\n中文字典共計\n:', ch2id)

In [None]:
# Encode every sentence as a list of character ids via the lookup dictionaries.
en_num_data = [[en2id[char] for char in sentence] for sentence in x]
ch_num_data = [[ch2id[char] for char in sentence] for sentence in y]
# Decoder targets: each Chinese id sequence with its first id dropped.
de_num_data = [ids[1:] for ids in ch_num_data]

# Show the sample at index 1 both as text and as ids.
print('char:', x[1])
print('index:', en_num_data[1])

In [None]:
import numpy as np

# Longest id sequence on the encoder (English) and decoder (Chinese) side.
max_encoder_seq_length = max(len(ids) for ids in en_num_data)
max_decoder_seq_length = max(len(ids) for ids in ch_num_data)
print('max encoder length:', max_encoder_seq_length)
print('max decoder length:', max_decoder_seq_length)

# Allocate zero-filled one-hot tensors shaped
# (number of samples, max sequence length, vocabulary size).
# NOTE(review): these are dense float32 arrays, so memory grows with
# samples * max length * vocabulary size.
encoder_input_data = np.zeros(
    (len(en_num_data), max_encoder_seq_length, len(en2id)),
    dtype='float32',
)
decoder_input_data = np.zeros(
    (len(ch_num_data), max_decoder_seq_length, len(ch2id)),
    dtype='float32',
)
decoder_target_data = np.zeros(
    (len(ch_num_data), max_decoder_seq_length, len(ch2id)),
    dtype='float32',
)

# Set a 1 at [sample, time step, character id] for every character of every
# sequence; positions past the end of a sequence stay all-zero.
for tensor, sequences in (
    (encoder_input_data, en_num_data),
    (decoder_input_data, ch_num_data),
    (decoder_target_data, de_num_data),
):
    for sample_idx in range(len(ch_num_data)):
        for step, token_id in enumerate(sequences[sample_idx]):
            tensor[sample_idx, step, token_id] = 1.

# Show the sample at index 1 as ids and as its one-hot matrix.
print('index data:\n', en_num_data[1])
print('one hot data:\n', encoder_input_data[1])

In [None]:
# Vocabulary sizes, taken from the character-to-id dictionaries.
EN_VOCAB_SIZE, CH_VOCAB_SIZE = len(en2id), len(ch2id)

# NOTE(review): presumably the model/training hyperparameters used by later
# cells — confirm against the cells that consume them.
HIDDEN_SIZE = 256
LEARNING_RATE = 3e-3
BATCH_SIZE = 100
EPOCHS = 200