In [6]:
def is_chinese(string):
    for ch in string:
        if u'\u4e00' <= ch <= u'\u9fff':
            return True

    return False


In [3]:
import json
import pandas as pd
from opencc import OpenCC
ch = OpenCC('s2twp')
def init():
    lists = []
    # reading the json file
    with open('./translation2019zh/translation2019zh_train.json','r', encoding='utf-8') as dat_f:
        data = []
        for i,line in enumerate(dat_f):
            data = json.loads(line)

            if is_chinese(data['chinese']) == True:
                data['chinese'] = ch.convert(data['chinese'])
                lists.append(data)
                if (i+1)%1000000 == 0:
                    print(i+1)
        
    # creating the dataframe
    df = pd.DataFrame(lists)
    # converted a file to csv
    df.to_csv('datafile.csv', encoding='utf-8', index=False)

In [5]:
init()

1000000
2000000
3000000
4000000
5000000


In [4]:
df = pd.read_csv('datafile.csv')

In [5]:
x = list(df['english'])
y = list(df['chinese'])

In [6]:
# 分別生成中英文字典
en_vocab = set(''.join(x))
id2en = list(en_vocab)
en2id = {c:i for i,c in enumerate(id2en)}
print('\n英文字典:\n', en2id)

ch_vocab = set(''.join(y))
id2ch = list(ch_vocab)
ch2id = {c:i for i,c in enumerate(id2ch)}

print('\n中文字典共計\n:', ch2id)


英文字典:
 {'t': 0, 'M': 1, 'r': 2, 'μ': 3, '™': 4, '�': 5, 'ν': 6, '＊': 7, 'l': 8, 'è': 9, 'G': 10, 'Ⅳ': 11, 'i': 12, '9': 13, 'ã': 14, 'Ò': 15, '⑼': 16, '/': 17, '℃': 18, '\u2009': 19, 'g': 20, '⑵': 21, 'b': 22, '∼': 23, 'o': 24, 'ì': 25, '﹑': 26, '”': 27, 'Z': 28, '√': 29, '¨': 30, 'a': 31, '☆': 32, '⸴': 33, 'N': 34, 'ï': 35, 'Ó': 36, '㎡': 37, 'f': 38, 'h': 39, '［': 40, 'Ⅵ': 41, 'î': 42, 'X': 43, 'β': 44, '⑹': 45, 'ε': 46, '’': 47, '□': 48, '=': 49, '﹖': 50, '\xa0': 51, 'm': 52, 'J': 53, '⑻': 54, '´': 55, 'Ⅶ': 56, '）': 57, ',': 58, 'n': 59, 'Å': 60, '0': 61, '╱': 62, 'Σ': 63, ';': 64, '！': 65, '⑽': 66, '►': 67, 'ê': 68, 'Ⅲ': 69, 'α': 70, 's': 71, '『': 72, '⸳': 73, '＃': 74, '1': 75, 'ρ': 76, '④': 77, 'Î': 78, '￥': 79, '§': 80, '♂': 81, '\u200a': 82, '﹐': 83, 'k': 84, 'H': 85, '。': 86, '■': 87, '】': 88, '⑦': 89, '♥': 90, '０': 91, '…': 92, '᪭': 93, '˚': 94, '；': 95, 'É': 96, '♑': 97, 'Ⅷ': 98, '≡': 99, '≥': 100, 'x': 101, 'T': 102, '\xad': 103, '%': 104, 'V': 105, '（': 106, '﹔': 107, '–': 

In [7]:
en_num_data = [[en2id[en] for en in line ] for line in x]
ch_num_data = [[ch2id[ch] for ch in line] for line in y]
de_num_data = [[ch2id[ch] for ch in line][1:] for line in y]

print('char:', x[1])
print('index:', en_num_data[1])

char: He calls the Green Book, his book of teachings, “the new gospel.
index: [85, 254, 359, 211, 31, 8, 8, 71, 359, 0, 39, 254, 359, 10, 2, 254, 254, 59, 359, 229, 24, 24, 84, 58, 359, 39, 12, 71, 359, 22, 24, 24, 84, 359, 24, 38, 359, 0, 254, 31, 211, 39, 12, 59, 20, 71, 58, 359, 354, 0, 39, 254, 359, 59, 254, 329, 359, 20, 24, 71, 210, 254, 8, 360]


In [None]:
import numpy as np

# 獲取輸入輸出端的最大長度
max_encoder_seq_length = max([len(txt) for txt in en_num_data])
max_decoder_seq_length = max([len(txt) for txt in ch_num_data])
print('max encoder length:', max_encoder_seq_length)
print('max decoder length:', max_decoder_seq_length)

# 將數據進行onehot處理
encoder_input_data = np.zeros((len(en_num_data), max_encoder_seq_length, len(en2id)), dtype='float32')
decoder_input_data = np.zeros((len(ch_num_data), max_decoder_seq_length, len(ch2id)), dtype='float32')
decoder_target_data = np.zeros((len(ch_num_data), max_decoder_seq_length, len(ch2id)), dtype='float32')

for i in range(len(ch_num_data)):
    for t, j in enumerate(en_num_data[i]):
        encoder_input_data[i, t, j] = 1.
    for t, j in enumerate(ch_num_data[i]):
        decoder_input_data[i, t, j] = 1.
    for t, j in enumerate(de_num_data[i]):
        decoder_target_data[i, t, j] = 1.

print('index data:\n', en_num_data[1])
print('one hot data:\n', encoder_input_data[1])

max encoder length: 422
max decoder length: 196


In [1]:
EN_VOCAB_SIZE = len(en2id)
CH_VOCAB_SIZE = len(ch2id)
HIDDEN_SIZE = 256

LEARNING_RATE = 0.003
BATCH_SIZE = 100
EPOCHS = 200

NameError: name 'en2id' is not defined