In [1]:
def is_chinese(string):
    for ch in string:
        if u'\u4e00' <= ch <= u'\u9fff':
            return True

    return False

In [4]:
import json
import pandas as pd
from opencc import OpenCC
ch = OpenCC('s2twp')
def init():
    lists = []
    # reading the json file
    with open('./translation2019zh/translation2019zh_train.json','r', encoding='utf-8') as dat_f:
        data = []
        for i,line in enumerate(dat_f):
            data = json.loads(line)

            if is_chinese(data['chinese']) == True:
                data['chinese'] = ch.convert(data['chinese'])
                lists.append(data)
                if (i+1)%1000000 == 0:
                    print(i+1)
                if (i+1)%2000000 == 0:
                    break
        
    # creating the dataframe
    df = pd.DataFrame(lists)
    # converted a file to csv
    df.to_csv('datafile.csv', encoding='utf-8', index=False)

In [5]:
init()

1000000
2000000


In [6]:
import pandas as pd
df = pd.read_csv('datafile.csv')

In [7]:
# 分別生成中英文字典
def en2id():
    x = list(df['english'][:2000000])
    en_vocab = set(''.join(x))
    id2en = list(en_vocab)
    en2id = {c:i for i,c in enumerate(id2en)}
    #print('\n英文字典:\n', en2id)
    return en2id

def ch2id():
    y = list(df['chinese'][:2000000])
    ch_vocab = set(''.join(y))
    id2ch = list(ch_vocab)
    ch2id = {c:i for i,c in enumerate(id2ch)}
    #print('\n中文字典共計\n:', ch2id)
    return ch2id


In [8]:
en2id = en2id()
ch2id = ch2id()

In [9]:
import numpy as np
def encoder_input_data(en2id,ch2id):
    x = list(df['english'][:2000000])
    y = list(df['chinese'][:2000000])

    en_num_data = [[en2id[en] for en in line ] for line in x]
    ch_num_data = [[ch2id[ch] for ch in line] for line in y]

    print('char:', x[1])
    print('index:', en_num_data[1])

    # 獲取輸入輸出端的最大長度
    max_encoder_seq_length = max([len(txt) for txt in en_num_data])
    print('max encoder length:', max_encoder_seq_length)

    # 將數據進行onehot處理
    encoder_input_data = np.zeros((len(en_num_data), max_encoder_seq_length, len(en2id)), dtype='float32')

    for i in range(len(ch_num_data)):
        for t, j in enumerate(en_num_data[i]):
            encoder_input_data[i, t, j] = 1.
            
    print('index data:\n', en_num_data[1])
    print('one hot data:\n', encoder_input_data[1])
    return encoder_input_data

In [10]:

def decoder_input_data(ch2id):
    y = list(df['chinese'][:2000000])

    ch_num_data = [[ch2id[ch] for ch in line] for line in y]

    print('char:', y[1])
    print('index:', ch_num_data[1])

    # 獲取輸入輸出端的最大長度
    max_decoder_seq_length = max([len(txt) for txt in ch_num_data])
    print('max decoder length:', max_decoder_seq_length)

    # 將數據進行onehot處理
    decoder_input_data = np.zeros((len(ch_num_data), max_decoder_seq_length, len(ch2id)), dtype='float32')

    for i in range(len(ch_num_data)):
        for t, j in enumerate(ch_num_data[i]):
            decoder_input_data[i, t, j] = 1.
            
    print('index data:\n', ch_num_data[1])
    print('one hot data:\n', decoder_input_data[1])
    return decoder_input_data

In [11]:
def decoder_target_data(ch2id):
    y = list(df['chinese'][:2000000])
    de_num_data = [[ch2id[ch] for ch in line][1:] for line in y]
    ch_num_data = [[ch2id[ch] for ch in line] for line in y]
    # 獲取輸入輸出端的最大長度
    max_decoder_seq_length = max([len(txt) for txt in ch_num_data])
    print('max decoder length:', max_decoder_seq_length)
    decoder_target_data = np.zeros((len(ch_num_data), max_decoder_seq_length, len(ch2id)), dtype='float32')

    for i in range(len(ch_num_data)):
        for t, j in enumerate(de_num_data[i]):
            decoder_target_data[i, t, j] = 1.
            
    return decoder_target_data


In [None]:
encoder_input_data = encoder_input_data(en2id,ch2id)
decoder_input_data = decoder_input_data(ch2id)
decoder_target_data = decoder_target_data(ch2id)

char: He calls the Green Book, his book of teachings, “the new gospel.
index: [197, 182, 235, 98, 247, 94, 94, 141, 235, 213, 168, 182, 235, 276, 304, 182, 182, 130, 235, 236, 119, 119, 169, 301, 235, 168, 29, 141, 235, 84, 119, 119, 169, 235, 119, 95, 235, 213, 182, 247, 98, 168, 29, 130, 125, 141, 301, 235, 248, 213, 168, 182, 235, 130, 182, 147, 235, 125, 119, 141, 52, 182, 94, 155]
max encoder length: 419


In [None]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense
EN_VOCAB_SIZE = len(en2id)
CH_VOCAB_SIZE = len(ch2id)
HIDDEN_SIZE = 256

LEARNING_RATE = 0.003
BATCH_SIZE = 100
EPOCHS = 200