In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf
from tensorflow import keras

# Record the environment: TensorFlow and interpreter versions, then every core library.
print(tf.__version__, sys.version_info, sep='\n')
for module in (mpl, np, pd, sklearn, tf, keras):
    print(module.__name__, module.__version__)

2.0.0
sys.version_info(major=3, minor=6, micro=10, releaselevel='final', serial=0)
matplotlib 3.1.2
numpy 1.18.1
pandas 0.25.3
sklearn 0.22.1
tensorflow 2.0.0
tensorflow_core.keras 2.2.4-tf


In [22]:
# Path to the parallel corpus consumed by parse_data: one sentence pair per line,
# English first, then Chinese, separated by a tab.
en_cmn_file_path = 'data/cmn_en/cmn_proc.txt'

# Convert Unicode text towards ASCII: decompose composed characters (NFD) and drop the combining accent marks
import unicodedata
def unicode_to_ascii(s):
    """Return ``s`` with combining accent marks removed.

    The text is decomposed with Unicode NFD normalisation, which splits an
    accented letter into its base character plus combining marks. Every
    character whose category is ``Mn`` (nonspacing mark) is then discarded;
    all other characters are kept in their NFD form.

    Args:
        s: Text to normalise.

    Returns:
        The normalised string without ``Mn`` characters.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

# Sanity check on one English and one Chinese sample sentence.
en_sentence = 'Then what?'
cmn_sentence = '他来到了网易杭研大厦？'

print(*map(unicode_to_ascii, (en_sentence, cmn_sentence)), sep='\n')

Then what?
他来到了网易杭研大厦？


In [23]:
# 西方语言常用：标点符号和字母分开
import re
def preprocess_sentence(s):
    """Normalise one sentence and wrap it in ``<start>`` / ``<end>`` markers.

    Steps: lower-case and trim the text, remove accents via
    ``unicode_to_ascii``, isolate ASCII and full-width punctuation as separate
    tokens, collapse runs of spaces, trim, and add the sequence markers.

    Args:
        s: Raw sentence text.

    Returns:
        The cleaned sentence in the form ``'<start> ... <end>'``.
    """
    normalized = unicode_to_ascii(s.lower().strip())

    # Surround every listed punctuation mark with spaces so it becomes its own token.
    spaced = re.sub(r'([?.!,。，！？‘’“”])', r' \1 ', normalized)
    # Collapse each run of spaces into a single space. The character class also
    # contains '"', so ASCII double quotes are replaced by a space as well.
    collapsed = re.sub(r'[" "]+', ' ', spaced)
    # No character whitelist is applied: a Latin-only filter such as
    # [^a-zA-Z?.!,¿] would blank out the Chinese text too.
    # Trim the spaces the punctuation step may have left at either end,
    # then add the sequence boundary markers.
    return '<start> ' + collapsed.strip() + ' <end>'

# Sanity check: run both sample sentences through the full preprocessing.
print(*map(preprocess_sentence, (en_sentence, cmn_sentence)), sep='\n')

<start> then what ? <end>
<start> 他来到了网易杭研大厦 ？ <end>


In [24]:
import jieba

# Segment the Chinese sample with jieba.cut_for_search, join the segments with
# spaces, and run the result through the regular preprocessing.
seg_list = ' '.join(jieba.cut_for_search(cmn_sentence))
print(preprocess_sentence(seg_list))

<start> 他 来到 了 网易 杭研 大厦 ？ <end>


In [28]:
import jieba
# 解析文件
def parse_data(filename):
    """Load a tab-separated English/Chinese corpus and preprocess both sides.

    Every line of ``filename`` must contain exactly two tab-separated fields:
    the English sentence followed by the Chinese sentence. English is passed
    straight to ``preprocess_sentence``; Chinese is first segmented with
    ``jieba.cut_for_search`` and the segments are joined by spaces.

    Args:
        filename: Path of the UTF-8 encoded corpus file.

    Returns:
        A ``zip`` iterator that yields two tuples: all preprocessed English
        sentences, then all preprocessed Chinese sentences.

    Raises:
        ValueError: If a line does not split into exactly two fields.
    """
    # The context manager closes the file handle as soon as the text is read;
    # the previous bare open(...).read() left it open until garbage collection.
    with open(filename, encoding='utf-8') as corpus_file:
        lines = corpus_file.read().strip().split('\n')
    sentence_pairs = [line.split('\t') for line in lines]
    preprocess_sentence_pairs = [
        (preprocess_sentence(en), preprocess_sentence(' '.join(jieba.cut_for_search(cmn))))
        for en, cmn in sentence_pairs
    ]
    # zip(*pairs) transposes the list of (en, cmn) tuples into two parallel tuples.
    return zip(*preprocess_sentence_pairs)

# Unpacking the returned zip gives one tuple of English sentences and one of
# Chinese sentences.
en_dataset, cmn_dataset = parse_data(en_cmn_file_path)
# Show the final sentence pair as a spot check.
print(en_dataset[-1], cmn_dataset[-1], sep='\n')

<start> if a person has not had a chance to acquire his target language by the time he's an adult , he's unlikely to be able to reach native speaker level in that language . <end>
<start> 如果 一個 人 在 成人 前 沒 有 機會習 得 目標 語言 ， 他 對 該 語言 的 認識 達 到 母語者 程度 的 機會 是 相當 小 的 。 <end>


In [29]:
def tokenizer(lang):
    """Build a vocabulary for ``lang`` and encode it as a padded id matrix.

    Args:
        lang: Collection of preprocessed sentences whose tokens are separated
            by single spaces.

    Returns:
        A ``(tensor, lang_tokenizer)`` pair: the id sequences padded at the
        end, and the fitted Keras ``Tokenizer``.
    """
    # filters='' disables character filtering, so punctuation and the
    # <start>/<end> markers stay as tokens; num_words=None leaves the
    # vocabulary size uncapped.
    vocab = keras.preprocessing.text.Tokenizer(num_words=None, filters='', split=' ')
    # Count word frequencies and build the vocabulary.
    vocab.fit_on_texts(lang)
    # Replace every word by its id, then pad at the end of each sequence ('post').
    id_sequences = vocab.texts_to_sequences(lang)
    padded = keras.preprocessing.sequence.pad_sequences(id_sequences, padding='post')
    return padded, vocab

# Chinese sentences form the input side and English sentences the output side;
# each language gets its own fitted tokenizer.
input_tensor, input_tokenizer = tokenizer(cmn_dataset)
output_tensor, output_tokenizer = tokenizer(en_dataset)

def max_length(tensor):
    """Return the length of the longest element of ``tensor``.

    Args:
        tensor: Non-empty iterable of sized items (e.g. rows of id sequences).

    Returns:
        The largest ``len(item)`` found.

    Raises:
        ValueError: If ``tensor`` is empty.
    """
    return max(map(len, tensor))

# Longest sequence on each side (in token ids, padding included).
max_length_input, max_length_output = (
    max_length(input_tensor),
    max_length(output_tensor),
)
print(max_length_input, max_length_output)

33 36


In [30]:
# 训练集和验证集切分
from sklearn.model_selection import train_test_split
# Hold out 20% of the sentence pairs for evaluation. A fixed random_state keeps
# the split identical across kernel restarts, so runs can be compared.
input_train, input_eval, output_train, output_eval = train_test_split(
    input_tensor, output_tensor, test_size=0.2, random_state=42)

len(input_train), len(input_eval), len(output_train), len(output_eval)

(17296, 4325, 17296, 4325)

In [31]:
# 验证tokenizer是否转化正确
def convert(example, tokenizer):
    """Print each non-zero token id of ``example`` next to its word.

    Id 0 is skipped; it has no entry to look up and is what the padded
    positions of a sequence contain.

    Args:
        example: Iterable of integer token ids.
        tokenizer: Object exposing an ``index_word`` mapping from id to word.
    """
    for token_id in example:
        if token_id == 0:
            continue
        print('%d --> %s' % (token_id, tokenizer.index_word[token_id]))
            
# Decode the first training pair on both sides to eyeball the id-to-word mapping.
convert(input_train[0], input_tokenizer)
print()  # blank line between the two listings
convert(output_train[0], output_tokenizer)

1 --> <start>
8 --> 他
738 --> 經常
4851 --> 引用
4162 --> 莎士
4163 --> 比亞
3 --> 。
2 --> <end>

1 --> <start>
11 --> he
221 --> often
3067 --> quotes
66 --> from
2749 --> shakespeare
3 --> .
2 --> <end>


In [32]:
def make_dataset(input_tensor, output_tensor, batch_size, epochs, shuffle):
    """Wrap paired id matrices in a batched ``tf.data.Dataset``.

    Args:
        input_tensor: Input-side examples, sliced along the first axis.
        output_tensor: Output-side examples, sliced in step with ``input_tensor``.
        batch_size: Number of pairs per batch.
        epochs: Repeat count passed to ``Dataset.repeat``.
        shuffle: If true, shuffle with a 10000-element buffer before repeating.

    Returns:
        A dataset of ``(input, output)`` batches; a final incomplete batch is
        dropped.
    """
    pairs = tf.data.Dataset.from_tensor_slices((input_tensor, output_tensor))
    pipeline = pairs.shuffle(10000) if shuffle else pairs
    # Repeat first, then batch, exactly in that order.
    repeated = pipeline.repeat(epochs)
    return repeated.batch(batch_size, drop_remainder=True)

batch_size = 32
epochs = 10

# Training data: shuffled and repeated `epochs` times.
train_dataset = make_dataset(input_train, output_train, batch_size, epochs, True)
# Evaluation data: a single pass, not shuffled.
eval_dataset = make_dataset(input_eval, output_eval, batch_size, 1, False)

In [33]:
# Inspect one training batch: both shapes first, then the raw id tensors.
for x, y in train_dataset.take(1):
    print(x.shape, y.shape, x, y, sep='\n')

(32, 33)
(32, 36)
tf.Tensor(
[[  1  24  40 ...   0   0   0]
 [  1   4  42 ...   0   0   0]
 [  1  14 104 ...   0   0   0]
 ...
 [  1   4 333 ...   0   0   0]
 [  1   7  18 ...   0   0   0]
 [  1  12  73 ...   0   0   0]], shape=(32, 33), dtype=int32)
tf.Tensor(
[[   1   12  168 ...    0    0    0]
 [   1  103  594 ...    0    0    0]
 [   1   12  135 ...    0    0    0]
 ...
 [   1    5  468 ...    0    0    0]
 [   1    7   28 ...    0    0    0]
 [   1 1792  151 ...    0    0    0]], shape=(32, 36), dtype=int32)
