In [0]:
import numpy 
print(numpy.__version__)

1.14.6


In [0]:
import keras 
print(keras.__version__)

2.2.4


In [51]:
with open("fra.txt", 'rt', encoding="utf-8") as f:
    text = f.read()
text[:80]

'Go.\tVa !\nHi.\tSalut !\nRun!\tCours\u202f!\nRun!\tCourez\u202f!\nWow!\tÇa alors\u202f!\nFire!\tAu feu !\nH'

In [52]:
lines = text.strip().split('\n')
lines_pairs = [line.split('\t') for line in  lines]
lines_pairs[:15]

[['Go.', 'Va !'],
 ['Hi.', 'Salut !'],
 ['Run!', 'Cours\u202f!'],
 ['Run!', 'Courez\u202f!'],
 ['Wow!', 'Ça alors\u202f!'],
 ['Fire!', 'Au feu !'],
 ['Help!', "À l'aide\u202f!"],
 ['Jump.', 'Saute.'],
 ['Stop!', 'Ça suffit\u202f!'],
 ['Stop!', 'Stop\u202f!'],
 ['Stop!', 'Arrête-toi !'],
 ['Wait!', 'Attends !'],
 ['Wait!', 'Attendez !'],
 ['Go on.', 'Poursuis.'],
 ['Go on.', 'Continuez.']]

In [53]:
pairs_len = len(lines_pairs)
eng_pair_lens = [len(line_pair[0]) for line_pair in lines_pairs]
fra_pair_lens = [len(line_pair[1]) for line_pair in lines_pairs]
print("一共有{}个法英文语料库对；".format(pairs_len))
print("其中法文语句最短的长度有{}，最长的长度有{}；".format(min(fra_pair_lens), max(fra_pair_lens)))
print("其中英文语句最短的长度有{}，最长的长度有{}。".format(min(eng_pair_lens), max(eng_pair_lens)))

一共有160872个法英文语料库对；
其中法文语句最短的长度有4，最长的长度有349；
其中英文语句最短的长度有3，最长的长度有286。


In [58]:
import re
import numpy as np
from unicodedata import normalize
import string

re_print = re.compile('[^{}]'.format(re.escape(string.printable)))
english_table = str.maketrans('', '', string.punctuation)
cleaned_pairs = list()
for pair in lines_pairs:
    clean_pair = list()
    for i, line in enumerate(pair):
          line = normalize('NFD', line).encode('ascii', 'ignore')
          line = line.decode('UTF-8')
          line = line.split()  
          line = [word.lower() for word in line] 
          line = [word.translate(english_table) for word in line] 
          line = [re_print.sub('', w) for w in line] 
          line = [word for word in line if word.isalpha()] 
          clean_pair.append(' '.join(line))
    cleaned_pairs.append(clean_pair) 
cleaned_pairs = np.array(cleaned_pairs) 
cleaned_pairs[:15]

array([['go', 'va'],
       ['hi', 'salut'],
       ['run', 'cours'],
       ['run', 'courez'],
       ['wow', 'ca alors'],
       ['fire', 'au feu'],
       ['help', 'a laide'],
       ['jump', 'saute'],
       ['stop', 'ca suffit'],
       ['stop', 'stop'],
       ['stop', 'arretetoi'],
       ['wait', 'attends'],
       ['wait', 'attendez'],
       ['go on', 'poursuis'],
       ['go on', 'continuez']], dtype='<U339')

In [61]:
print(string.printable)
print(string.punctuation)

0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ 	

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [0]:
import pickle
with open("french_to_english.pkl", "wb") as f:
    pickle.dump(cleaned_pairs, f)

In [63]:
with open("french_to_english.pkl", "rb") as f:
    raw_dataset = pickle.load(f)

sequence_length = 10000
dataset = raw_dataset[:sequence_length]
np.random.shuffle(dataset)
dataset[:15]

array([['he has a video', 'il detient une video'],
       ['can you help me', 'pourraistu maider'],
       ['be respectful', 'soyez respectueuses'],
       ['youre naive', 'vous etes naif'],
       ['he hardly works', 'il travaille a peine'],
       ['show me', 'montrezmoi'],
       ['i must go', 'je dois y aller'],
       ['what a team', 'quelle equipe'],
       ['we are late', 'nous sommes en retard'],
       ['call security', 'appelle la securite'],
       ['is that love', 'estce de lamour'],
       ['he lay face up', 'il etait etendu le visage visible'],
       ['she helps us', 'elle nous aide'],
       ['she had twins', 'elle a eu des jumeaux'],
       ['you needed me', 'vous aviez besoin de moi']], dtype='<U339')

In [64]:
train_len = sequence_length - 1500
train, test = dataset[:train_len], dataset[train_len:]

def save_dataset(sentences, filename):
    with open(filename, 'wb') as f:
        pickle.dump(sentences, f)

save_dataset(dataset, "french_to_english_dataset_top10000.pkl")
save_dataset(train, "french_to_english_train.pkl")
save_dataset(test, "french_to_english_test.pkl")

print("train.shape={}, test.shape={}".format(train.shape, test.shape))

train.shape=(8500, 2), test.shape=(1500, 2)


In [0]:
print(train)

[['we all laughed' 'nous rimes tous']
 ['is that you' 'estce toi']
 ['i like this' 'je lapprecie']
 ...
 ['i am not happy' 'je ne suis pas content']
 ['break it up' 'arretez']
 ['do it again' 'refaisle']]


In [65]:
from keras.preprocessing.text import Tokenizer

texts = ['I love AI in China', '特拉字节', 'AI 人工智能']
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
print("tokenizer.word_index={}.".format(tokenizer.word_index))
print("tokenizer.texts_to_sequences={}.".format(tokenizer.texts_to_sequences(texts)))

tokenizer.word_index={'ai': 1, 'i': 2, 'love': 3, 'in': 4, 'china': 5, '特拉字节': 6, '人工智能': 7}.
tokenizer.texts_to_sequences=[[2, 3, 1, 4, 5], [6], [1, 7]].


In [72]:
from keras.preprocessing.text import Tokenizer

def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

def max_length(lines):
    return max(len(line.split()) for line in lines)

eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('英文序列单词最大个数{}，单词有{}个。'.format(eng_length, eng_vocab_size))

fra_tokenizer = create_tokenizer(dataset[:, 1])
fra_vocab_size = len(fra_tokenizer.word_index) + 1
fra_length = max_length(dataset[:, 1])
print('法文序列单词最大个数{}，单词有{}个。'.format(fra_length, fra_vocab_size))

英文序列单词最大个数5，单词有2125个。
法文序列单词最大个数10，单词有4397个。


In [0]:
from keras.preprocessing import sequence
from keras import utils

def encode_sequences(tokenizer, length, lines):
    X = tokenizer.texts_to_sequences(lines)
    X = sequence.pad_sequences(X, maxlen=length, padding='post')
    return X

def encode_output(sequences, vocab_size):
    ylist = list()
    for sequence in sequences:
        encoded = utils.to_categorical(sequence, num_classes=vocab_size)
        ylist.append(encoded)
    y = np.array(ylist)
    y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
    return y

X_train = encode_sequences(fra_tokenizer, fra_length, train[:, 1])
y_train = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
y_train = encode_output(y_train, eng_vocab_size)
 
X_test = encode_sequences(fra_tokenizer, fra_length, test[:, 1])
y_test = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
y_test = encode_output(y_test, eng_vocab_size)

In [77]:
from keras import Sequential
from keras.layers import Dense, Embedding, LSTM, RepeatVector, TimeDistributed

def create_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    model = Sequential()
    model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
    model.add(LSTM(n_units))
    model.add(RepeatVector(tar_timesteps))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    return model
 
model = create_model(fra_vocab_size, eng_vocab_size, fra_length, eng_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 10, 256)           1125632   
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 5, 256)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 5, 256)            525312    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 5, 2125)           546125    
Total params: 2,722,381
Trainable params: 2,722,381
Non-trainable params: 0
_________________________________________________________________


In [0]:
from keras import backend as K 
K.clear_session()

In [78]:
from keras.callbacks import ModelCheckpoint, EarlyStopping

callbacks_EarlyStopping = EarlyStopping(monitor='val_loss', 
                                        patience=3)

model_filename = 'translator_weights_model.h5'
checkpoint_ModelCheckpoint = ModelCheckpoint(model_filename, 
                             monitor='val_loss', 
                             verbose=1, 
                             save_best_only=True, 
                             mode='min')
history = model.fit(X_train, 
                    y_train, 
                    epochs=50, 
                    batch_size=64, 
                    validation_data=(X_test, y_test), 
                    callbacks=[checkpoint_ModelCheckpoint, callbacks_EarlyStopping], 
                    verbose=2)

Train on 8500 samples, validate on 1500 samples
Epoch 1/50

Epoch 00001: val_loss improved from inf to 3.39091, saving model to translator_weights_model.h5
Epoch 2/50

Epoch 00002: val_loss improved from 3.39091 to 3.26117, saving model to translator_weights_model.h5
Epoch 3/50

Epoch 00003: val_loss improved from 3.26117 to 3.12938, saving model to translator_weights_model.h5
Epoch 4/50

Epoch 00004: val_loss improved from 3.12938 to 2.99794, saving model to translator_weights_model.h5
Epoch 5/50

Epoch 00005: val_loss improved from 2.99794 to 2.87198, saving model to translator_weights_model.h5
Epoch 6/50

Epoch 00006: val_loss improved from 2.87198 to 2.77288, saving model to translator_weights_model.h5
Epoch 7/50

Epoch 00007: val_loss improved from 2.77288 to 2.67034, saving model to translator_weights_model.h5
Epoch 8/50

Epoch 00008: val_loss improved from 2.67034 to 2.57190, saving model to translator_weights_model.h5
Epoch 9/50

Epoch 00009: val_loss improved from 2.57190 to 2

In [0]:
history.history.keys()

dict_keys(['val_loss', 'loss'])

In [0]:
def load_clean_sentences(filename):
    return pickle.load(open(filename, 'rb'))

dataset = load_clean_sentences('french_to_english_dataset_top10000.pkl')
train_ds = load_clean_sentences('french_to_english_train.pkl')
test_ds = load_clean_sentences('french_to_english_test.pkl')

eng_tokenizer = create_tokenizer(dataset[:, 0])
fra_tokenizer = create_tokenizer(dataset[:, 1])
fra_length = max_length(dataset[:, 1])

X_train = encode_sequences(fra_tokenizer, fra_length, train_ds[:, 1])
X_test = encode_sequences(fra_tokenizer, fra_length, test_ds[:, 1])

In [0]:
from keras import models
model = models.load_model('translator_weights_model.h5')

In [0]:
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None
 
def predict_sequence(model, tokenizer, source):
    prediction = model.predict(source, verbose=0)[0]
    integers = [np.argmax(vector) for vector in prediction]
    target = list()
    for i in integers:
        word = word_for_id(i, tokenizer)
        if word is None:
            break
        target.append(word)
    return ' '.join(target)

In [92]:
from nltk.translate.bleu_score import corpus_bleu

def test_model(model, tokenizer, sources, raw_dataset):
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
        source = source.reshape((1, source.shape[0]))
        translation = predict_sequence(model, tokenizer, source)
        raw_target, raw_src = raw_dataset[i]
        if i < 10:
            print('源语句=[{}], 目标语句=[{}], 预测语句=[{}]'.format(raw_src, raw_target, translation))
        actual.append(raw_target.split())
        predicted.append(translation.split())
    print('BLEU-1: {}'.format(corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0))))
    print('BLEU-2: {}'.format(corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0))))
    print('BLEU-3: {}'.format(corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0))))
    print('BLEU-4: {}'.format(corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25))))

print('训练集：')
test_model(model, eng_tokenizer, X_train, train_ds)
print('测试集：')
test_model(model, eng_tokenizer, X_test, test_ds)

训练集：
源语句=[il detient une video], 目标语句=[he has a video], 预测语句=[he has a video]
源语句=[pourraistu maider], 目标语句=[can you help me], 预测语句=[can you help me]
源语句=[soyez respectueuses], 目标语句=[be respectful], 预测语句=[be respectful]
源语句=[vous etes naif], 目标语句=[youre naive], 预测语句=[youre naive]
源语句=[il travaille a peine], 目标语句=[he hardly works], 预测语句=[he hardly works]
源语句=[montrezmoi], 目标语句=[show me], 预测语句=[show me]
源语句=[je dois y aller], 目标语句=[i must go], 预测语句=[i must to go]
源语句=[quelle equipe], 目标语句=[what a team], 预测语句=[what a team]
源语句=[nous sommes en retard], 目标语句=[we are late], 预测语句=[were late]
源语句=[appelle la securite], 目标语句=[call security], 预测语句=[call security]


Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


BLEU-1: 0.08884628871460039
BLEU-2: 0.28372048975984243
BLEU-3: 0.4514308812293625
BLEU-4: 0.5070098604719333
测试集：
源语句=[venez la], 目标语句=[come here], 预测语句=[come back]
源语句=[laissemoi sortir], 目标语句=[let me out], 预测语句=[let me out]
源语句=[ne soyez pas mesquin], 目标语句=[dont be mean], 预测语句=[dont be mean]
源语句=[vous devez le faire], 目标语句=[you must do it], 预测语句=[you need do it]
源语句=[ty rendrastu], 目标语句=[will you go], 预测语句=[shut you serious]
源语句=[elles ont refuse], 目标语句=[they refused], 预测语句=[they refused]
源语句=[ils nous ont trouvees], 目标语句=[they found us], 预测语句=[they found us]
源语句=[jadore cuisiner], 目标语句=[i love cooking], 预测语句=[i love baking]
源语句=[tom semble perdu], 目标语句=[tom seems lost], 预测语句=[tom lost]
源语句=[avaisje tort], 目标语句=[was i wrong], 预测语句=[am was i]
BLEU-1: 0.07970902428748444
BLEU-2: 0.26403639977318066
BLEU-3: 0.4263107912362802
BLEU-4: 0.48055373499039605
