In [23]:
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array

# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, mode='rt', encoding='utf-8')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

# split a loaded document into sentences
def to_pairs(doc):
    lines = doc.strip().split('\n')
    pairs = [line.split('\t') for line in  lines]
    return pairs

# clean a list of lines
def clean_pairs(lines):
    cleaned = list()
    # prepare regex for char filtering
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    # prepare translation table for removing punctuation
    table = str.maketrans('', '', string.punctuation)
    for pair in lines:
        clean_pair = list()
        for line in pair:
            # normalize unicode characters
            line = normalize('NFD', line).encode('ascii', 'ignore')
            line = line.decode('UTF-8')
            # tokenize on white space
            line = line.split()
            # convert to lowercase
            line = [word.lower() for word in line]
            # remove punctuation from each token
            line = [word.translate(table) for word in line]
            # remove non-printable chars form each token
            line = [re_print.sub('', w) for w in line]
            # remove tokens with numbers in them
            line = [word for word in line if word.isalpha()]
            # store as string
            clean_pair.append(' '.join(line))
        cleaned.append(clean_pair)
    return array(cleaned)

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print('Saved: %s' % filename)

# load dataset
filename = 'deu.txt'
doc = load_doc(filename)
# print(doc)
# split into english-german pairs
pairs = to_pairs(doc)
print(pairs[1])
# # clean sentences
clean_pairs = clean_pairs(pairs)
print(type(clean_pairs))
print(clean_pairs.shape)
# save clean pairs to file
# save_clean_data(clean_pairs, 'english-german.pkl')
# spot check
# for i in range(10):
#     print('[%s] => [%s]' % (clean_pairs[i,0], clean_pairs[i,1]))

['Hi.', 'Grüß Gott!']
<class 'numpy.ndarray'>
(169813, 2)


In [6]:
from pickle import load
from pickle import dump
from numpy.random import rand
from numpy.random import shuffle

# load a clean dataset
def load_clean_sentences(filename):
	return load(open(filename, 'rb'))

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
	dump(sentences, open(filename, 'wb'))
	print('Saved: %s' % filename)

# load dataset
raw_dataset = load_clean_sentences('english-german.pkl')
print(raw_dataset.shape)
# reduce dataset size
n_sentences = 10000
dataset = raw_dataset[:n_sentences, :]
# random shuffle
shuffle(dataset)
# split into train/test
train, test = dataset[:9000], dataset[9000:]
# save
save_clean_data(dataset, 'english-german-both.pkl')
save_clean_data(train, 'english-german-train.pkl')
save_clean_data(test, 'english-german-test.pkl')

(169813, 2)
Saved: english-german-both.pkl
Saved: english-german-train.pkl
Saved: english-german-test.pkl


In [3]:
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

# load a clean dataset
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))

# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

# max sentence length
def max_length(lines):
    return max(len(line.split()) for line in lines)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    # integer encode sequences
    X = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    X = pad_sequences(X, maxlen=length, padding='post')
    return X

# one hot encode target sequence
def encode_output(sequences, vocab_size):
    ylist = list()
    for sequence in sequences:
        encoded = to_categorical(sequence, num_classes=vocab_size)
        ylist.append(encoded)
    y = array(ylist)
    y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
    return y

# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    model = Sequential()
    model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
    model.add(LSTM(n_units))
    model.add(RepeatVector(tar_timesteps))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    return model

# load datasets
dataset = load_clean_sentences('english-german-both.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')
print(dataset)
# # prepare english tokenizer
# eng_tokenizer = create_tokenizer(dataset[:, 0])
# eng_vocab_size = len(eng_tokenizer.word_index) + 1
# eng_length = max_length(dataset[:, 0])
# print('English Vocabulary Size: %d' % eng_vocab_size)
# print('English Max Length: %d' % (eng_length))
# # prepare german tokenizer
# ger_tokenizer = create_tokenizer(dataset[:, 1])
# ger_vocab_size = len(ger_tokenizer.word_index) + 1
# ger_length = max_length(dataset[:, 1])
# print('German Vocabulary Size: %d' % ger_vocab_size)
# print('German Max Length: %d' % (ger_length))

# # prepare training data
# trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
# trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
# trainY = encode_output(trainY, eng_vocab_size)
# # prepare validation data
# testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
# testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
# testY = encode_output(testY, eng_vocab_size)

# # define model
# model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 256)
# model.compile(optimizer='adam', loss='categorical_crossentropy')
# # summarize defined model
# print(model.summary())
# plot_model(model, to_file='model.png', show_shapes=True)
# # fit model
# filename = 'model.h5'
# checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# model.fit(trainX, trainY, epochs=1, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


[['tom is stuffed' 'tom ist vollgegessen']
 ['i am a boy' 'ich bin ein junge']
 ['i am dozing' 'ich bin am dosen']
 ...
 ['is tom autistic' 'ist tom autistisch']
 ['i need coffee' 'ich brauche kaffee']
 ['here you are' 'bitte schon']]


In [2]:
from pickle import load
from numpy import array
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

# load a clean dataset
def load_clean_sentences(filename):
	return load(open(filename, 'rb'))

# fit a tokenizer
def create_tokenizer(lines):
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer

# max sentence length
def max_length(lines):
	return max(len(line.split()) for line in lines)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	# integer encode sequences
	X = tokenizer.texts_to_sequences(lines)
	# pad sequences with 0 values
	X = pad_sequences(X, maxlen=length, padding='post')
	return X

# map an integer to a word
def word_for_id(integer, tokenizer):
	for word, index in tokenizer.word_index.items():
		if index == integer:
			return word
	return None

# generate target given source sequence
def predict_sequence(model, tokenizer, source):
	prediction = model.predict(source, verbose=0)[0]
	integers = [argmax(vector) for vector in prediction]
	target = list()
	for i in integers:
		word = word_for_id(i, tokenizer)
		if word is None:
			break
		target.append(word)
	return ' '.join(target)

# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
	actual, predicted = list(), list()
	for i, source in enumerate(sources):
		# translate encoded source text
		source = source.reshape((1, source.shape[0]))
		translation = predict_sequence(model, eng_tokenizer, source)
		raw_target, raw_src = raw_dataset[i]
		if i < 10:
			print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
		actual.append(raw_target.split())
		predicted.append(translation.split())
	# calculate BLEU score
	print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
	print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
	print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
	print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

# load datasets
dataset = load_clean_sentences('english-german-both.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')
# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
# prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
# prepare data
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])

# load model
model = load_model('model.h5')
# test on some training sequences
print('train')
evaluate_model(model, eng_tokenizer, trainX, train)
# test on some test sequences
print('test')
evaluate_model(model, eng_tokenizer, testX, test)

train
src=[tom schloss die schule ab], target=[tom graduated], predicted=[tom graduated]
src=[bitte nehmen sie sich eins], target=[please take one], predicted=[please take one]
src=[sprich leise], target=[speak softly], predicted=[speak softly]
src=[alle haben gebetet], target=[everyone prayed], predicted=[everyone prayed]
src=[ich habe hunger], target=[i am hungry], predicted=[i hungry hungry]
src=[ich habe gelogen], target=[i was lying], predicted=[i was]
src=[begreifst du jetzt], target=[now do you see], predicted=[now do you see]
src=[tom ist nach hause gekommen], target=[tom came home], predicted=[tom came home]
src=[ich gehe], target=[im going to go], predicted=[im go go]
src=[hatten wir das], target=[did we have it], predicted=[do we have it]


Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


BLEU-1: 0.074887
BLEU-2: 0.264971
BLEU-3: 0.439253
BLEU-4: 0.498419
test
src=[schauen sie mir zu], target=[watch us], predicted=[watch to]
src=[ich habe uberlebt], target=[i survived], predicted=[i survived]
src=[bleib gelassen], target=[keep cool], predicted=[stay cool]
src=[sie sahen tom], target=[they saw tom], predicted=[they saw tom]
src=[ich habe das interesse verloren], target=[i lost interest], predicted=[i made it again]
src=[das kann nicht tom sein], target=[it cant be tom], predicted=[it cant be it]
src=[tom ist taktvoll], target=[tom is tactful], predicted=[tom is selfish]
src=[rufe tom an], target=[call tom], predicted=[give tom a call]
src=[sie sind vorbeigekommen], target=[they called], predicted=[they cheated]
src=[bitte kommen sie mir helfen], target=[come help me], predicted=[please help me]
BLEU-1: 0.075286
BLEU-2: 0.266293
BLEU-3: 0.441384
BLEU-4: 0.500819
