# French to English Translation Demo

In [1]:
from pickle import load
from numpy import array
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

# load a clean dataset
def load_clean_sentences(filename):
	"""Load a pickled dataset from ``filename`` and return the unpickled object.

	The file is opened in binary mode inside a ``with`` block so the handle
	is closed as soon as unpickling finishes or fails.

	NOTE: pickle can execute arbitrary code while loading; only point this
	at files you created yourself or otherwise trust.
	"""
	with open(filename, 'rb') as handle:
		return load(handle)

# fit a tokenizer
def create_tokenizer(lines):
	"""Build a ``Tokenizer`` and fit it on ``lines``; return the fitted tokenizer."""
	fitted = Tokenizer()
	# fit_on_texts works on the tokenizer in place; its return value is not used
	fitted.fit_on_texts(lines)
	return fitted

# max sentence length
def max_length(lines):
	"""Return the largest whitespace-separated word count found in ``lines``."""
	word_counts = [len(sentence.split()) for sentence in lines]
	return max(word_counts)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	"""Integer-encode ``lines`` with ``tokenizer`` and post-pad them to ``length``."""
	# each sentence becomes a list of integers
	encoded = tokenizer.texts_to_sequences(lines)
	# padding is appended after the sentence ('post'), up to maxlen=length
	return pad_sequences(encoded, maxlen=length, padding='post')

# map an integer to a word
def word_for_id(integer, tokenizer):
	"""Return the word whose index is ``integer``, or None if there is none.

	integer: vocabulary index to look up
	tokenizer: object exposing ``word_index`` (word -> index) and, when
		available, ``index_word`` (index -> word)
	"""
	# Prefer the tokenizer's own reverse map: one dict lookup instead of a
	# scan over the whole vocabulary for every decoded token.
	reverse = getattr(tokenizer, 'index_word', None)
	if reverse:
		return reverse.get(integer)
	# Fallback for tokenizers without a (populated) index_word mapping.
	for word, index in tokenizer.word_index.items():
		if index == integer:
			return word
	return None

# generate target given source sequence
def predict_sequence(model, tokenizer, source):
	"""Decode the model's prediction for ``source`` into a space-joined string.

	For each output vector the index of the largest value is mapped to a
	word; decoding stops at the first index that has no word.
	"""
	# predict returns one entry per input row; only the first is decoded
	step_scores = model.predict(source, verbose=0)[0]
	word_ids = [argmax(scores) for scores in step_scores]
	words = []
	for word_id in word_ids:
		token = word_for_id(word_id, tokenizer)
		if token is None:
			# no vocabulary entry for this id: treat it as end of sentence
			break
		words.append(token)
	return ' '.join(words)

# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
	"""Translate every encoded source, print samples and corpus BLEU scores.

	model: model handed to predict_sequence
	tokenizer: tokenizer used to turn predicted indices back into words
	sources: iterable of encoded source sequences; each one is reshaped
		into a batch of one before prediction
	raw_dataset: row i unpacks into (target text, source text) for sources[i]

	Prints the first 10 translations followed by BLEU-1 to BLEU-4.
	Returns None.
	"""
	actual, predicted = list(), list()
	for i, source in enumerate(sources):
		# translate encoded source text
		source = source.reshape((1, source.shape[0]))
		# decode with the tokenizer passed by the caller rather than relying
		# on a module-level global of a particular name
		translation = predict_sequence(model, tokenizer, source)
		raw_target, raw_src = raw_dataset[i]
		if i < 10:
			print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
		# corpus_bleu expects a list of reference token lists per hypothesis
		actual.append([raw_target.split()])
		predicted.append(translation.split())
	# calculate BLEU score
	# NOTE(review): the BLEU-3 weights sum to 0.9 rather than 1; they are kept
	# as-is so the printed scores stay comparable with earlier runs.
	bleu_settings = (
		('BLEU-1', (1.0, 0, 0, 0)),
		('BLEU-2', (0.5, 0.5, 0, 0)),
		('BLEU-3', (0.3, 0.3, 0.3, 0)),
		('BLEU-4', (0.25, 0.25, 0.25, 0.25)),
	)
	for label, weights in bleu_settings:
		print('%s: %.2f' % (label, 100 * corpus_bleu(actual, predicted, weights=weights)))

# load datasets: the 'both', 'train' and 'test' English-French pickles
# (column 0 is treated as English, column 1 as French below)
dataset = load_clean_sentences('english-french-both.pkl')
train = load_clean_sentences('english-french-train.pkl')
test = load_clean_sentences('english-french-test.pkl')
# prepare english tokenizer (fitted on column 0 of the 'both' dataset)
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
# prepare french tokenizer (fitted on column 1); the ger_* names below hold
# the French-side tokenizer, vocabulary size and maximum sentence length
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
# NOTE: eng_vocab_size, eng_length and ger_vocab_size are not used again in this cell
# prepare data: encode the French sentences as padded integer sequences
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])

# load model
model = load_model('model_french.h5')
# test on some training sequences
print('train')
evaluate_model(model, eng_tokenizer, trainX, train)
# test on some test sequences
print('test')
evaluate_model(model, eng_tokenizer, testX, test)

train
src=[jai ete naif], target=[i was naive], predicted=[i was naive]
src=[vous navez pas paye], target=[you didnt pay], predicted=[you didnt pay]
src=[tom a lair triste], target=[tom sounds sad], predicted=[tom looks sad]
src=[tout le monde se taisait], target=[all were silent], predicted=[all were silent]
src=[je pourrais essayer], target=[i could try], predicted=[i could try]
src=[allume le cierge], target=[light the candle], predicted=[light the candle]
src=[tout le monde applaudit], target=[everyone cheered], predicted=[everyone cheered]
src=[je hais les chiens], target=[i hate dogs], predicted=[i hate dogs]
src=[je ten felicite], target=[congratulations], predicted=[congratulations]
src=[estce en vente], target=[is that for sale], predicted=[is that for sale]
BLEU-1: 91.34
BLEU-2: 88.15
BLEU-3: 81.20
BLEU-4: 55.35
test
src=[le potage est froid], target=[the soup is cold], predicted=[the soups cold]
src=[nous sommes enneiges], target=[were snowed in], predicted=[were adaptable]


# German to English Translation Demo

In [2]:
from pickle import load
from numpy import array
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

# load a clean dataset
def load_clean_sentences(filename):
	"""Load a pickled dataset from ``filename`` and return the unpickled object.

	The file is opened in binary mode inside a ``with`` block so the handle
	is closed as soon as unpickling finishes or fails.

	NOTE: pickle can execute arbitrary code while loading; only point this
	at files you created yourself or otherwise trust.
	"""
	with open(filename, 'rb') as handle:
		return load(handle)

# fit a tokenizer
def create_tokenizer(lines):
	"""Build a ``Tokenizer`` and fit it on ``lines``; return the fitted tokenizer."""
	fitted = Tokenizer()
	# fit_on_texts works on the tokenizer in place; its return value is not used
	fitted.fit_on_texts(lines)
	return fitted

# max sentence length
def max_length(lines):
	"""Return the largest whitespace-separated word count found in ``lines``."""
	word_counts = [len(sentence.split()) for sentence in lines]
	return max(word_counts)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	"""Integer-encode ``lines`` with ``tokenizer`` and post-pad them to ``length``."""
	# each sentence becomes a list of integers
	encoded = tokenizer.texts_to_sequences(lines)
	# padding is appended after the sentence ('post'), up to maxlen=length
	return pad_sequences(encoded, maxlen=length, padding='post')

# map an integer to a word
def word_for_id(integer, tokenizer):
	"""Return the word whose index is ``integer``, or None if there is none.

	integer: vocabulary index to look up
	tokenizer: object exposing ``word_index`` (word -> index) and, when
		available, ``index_word`` (index -> word)
	"""
	# Prefer the tokenizer's own reverse map: one dict lookup instead of a
	# scan over the whole vocabulary for every decoded token.
	reverse = getattr(tokenizer, 'index_word', None)
	if reverse:
		return reverse.get(integer)
	# Fallback for tokenizers without a (populated) index_word mapping.
	for word, index in tokenizer.word_index.items():
		if index == integer:
			return word
	return None

# generate target given source sequence
def predict_sequence(model, tokenizer, source):
	"""Decode the model's prediction for ``source`` into a space-joined string.

	For each output vector the index of the largest value is mapped to a
	word; decoding stops at the first index that has no word.
	"""
	# predict returns one entry per input row; only the first is decoded
	step_scores = model.predict(source, verbose=0)[0]
	word_ids = [argmax(scores) for scores in step_scores]
	words = []
	for word_id in word_ids:
		token = word_for_id(word_id, tokenizer)
		if token is None:
			# no vocabulary entry for this id: treat it as end of sentence
			break
		words.append(token)
	return ' '.join(words)

# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
	"""Translate every encoded source, print samples and corpus BLEU scores.

	model: model handed to predict_sequence
	tokenizer: tokenizer used to turn predicted indices back into words
	sources: iterable of encoded source sequences; each one is reshaped
		into a batch of one before prediction
	raw_dataset: row i unpacks into (target text, source text) for sources[i]

	Prints the first 10 translations followed by BLEU-1 to BLEU-4.
	Returns None.
	"""
	actual, predicted = list(), list()
	for i, source in enumerate(sources):
		# translate encoded source text
		source = source.reshape((1, source.shape[0]))
		# decode with the tokenizer passed by the caller rather than relying
		# on a module-level global of a particular name
		translation = predict_sequence(model, tokenizer, source)
		raw_target, raw_src = raw_dataset[i]
		if i < 10:
			print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
		# corpus_bleu expects a list of reference token lists per hypothesis
		actual.append([raw_target.split()])
		predicted.append(translation.split())
	# calculate BLEU score
	# NOTE(review): the BLEU-3 weights sum to 0.9 rather than 1; they are kept
	# as-is so the printed scores stay comparable with earlier runs.
	bleu_settings = (
		('BLEU-1', (1.0, 0, 0, 0)),
		('BLEU-2', (0.5, 0.5, 0, 0)),
		('BLEU-3', (0.3, 0.3, 0.3, 0)),
		('BLEU-4', (0.25, 0.25, 0.25, 0.25)),
	)
	for label, weights in bleu_settings:
		print('%s: %.2f' % (label, 100 * corpus_bleu(actual, predicted, weights=weights)))

# load datasets: the 'both', 'train' and 'test' English-German pickles
# (column 0 is treated as English, column 1 as German below)
dataset = load_clean_sentences('english-german-both.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')
# prepare english tokenizer (fitted on column 0 of the 'both' dataset)
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
# prepare german tokenizer (fitted on column 1 of the 'both' dataset)
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
# NOTE: eng_vocab_size, eng_length and ger_vocab_size are not used again in this cell
# prepare data: encode the German sentences as padded integer sequences
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])

# load model
model = load_model('german_model.h5')
# test on some training sequences
print('train')
evaluate_model(model, eng_tokenizer, trainX, train)
# test on some test sequences
print('test')
evaluate_model(model, eng_tokenizer, testX, test)

train
src=[tu ihm nichts], target=[dont hurt him], predicted=[dont hurt him]
src=[das gibts doch nicht], target=[no way], predicted=[no is me]
src=[schau genauer hin], target=[look closer], predicted=[look closer]
src=[ich hasse sie nicht], target=[i dont hate you], predicted=[i dont hate you]
src=[lass tom in ruhe], target=[leave tom alone], predicted=[leave tom alone]
src=[mir geht es genauso], target=[i feel the same], predicted=[i feel the same]
src=[ihr bezahlt], target=[youre paying], predicted=[youre paying]
src=[sie waren bereit], target=[they were ready], predicted=[they were ready]
src=[ich mag pizza], target=[i like pizza], predicted=[i like pizza]
src=[tom setzte sich], target=[tom sat down], predicted=[tom sat down]
BLEU-1: 86.45
BLEU-2: 80.82
BLEU-3: 72.84
BLEU-4: 47.98
test
src=[wie gro bist du], target=[how tall are you], predicted=[how are you you]
src=[er ist ein exknacki], target=[hes an excon], predicted=[he a a]
src=[das ist wichtig], target=[its important], predic

# Danish to English Translation Demo

In [4]:
from pickle import load
from numpy import array
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

# load a clean dataset
def load_clean_sentences(filename):
	"""Load a pickled dataset from ``filename`` and return the unpickled object.

	The file is opened in binary mode inside a ``with`` block so the handle
	is closed as soon as unpickling finishes or fails.

	NOTE: pickle can execute arbitrary code while loading; only point this
	at files you created yourself or otherwise trust.
	"""
	with open(filename, 'rb') as handle:
		return load(handle)

# fit a tokenizer
def create_tokenizer(lines):
	"""Build a ``Tokenizer`` and fit it on ``lines``; return the fitted tokenizer."""
	fitted = Tokenizer()
	# fit_on_texts works on the tokenizer in place; its return value is not used
	fitted.fit_on_texts(lines)
	return fitted

# max sentence length
def max_length(lines):
	"""Return the largest whitespace-separated word count found in ``lines``."""
	word_counts = [len(sentence.split()) for sentence in lines]
	return max(word_counts)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	"""Integer-encode ``lines`` with ``tokenizer`` and post-pad them to ``length``."""
	# each sentence becomes a list of integers
	encoded = tokenizer.texts_to_sequences(lines)
	# padding is appended after the sentence ('post'), up to maxlen=length
	return pad_sequences(encoded, maxlen=length, padding='post')

# map an integer to a word
def word_for_id(integer, tokenizer):
	"""Return the word whose index is ``integer``, or None if there is none.

	integer: vocabulary index to look up
	tokenizer: object exposing ``word_index`` (word -> index) and, when
		available, ``index_word`` (index -> word)
	"""
	# Prefer the tokenizer's own reverse map: one dict lookup instead of a
	# scan over the whole vocabulary for every decoded token.
	reverse = getattr(tokenizer, 'index_word', None)
	if reverse:
		return reverse.get(integer)
	# Fallback for tokenizers without a (populated) index_word mapping.
	for word, index in tokenizer.word_index.items():
		if index == integer:
			return word
	return None

# generate target given source sequence
def predict_sequence(model, tokenizer, source):
	"""Decode the model's prediction for ``source`` into a space-joined string.

	For each output vector the index of the largest value is mapped to a
	word; decoding stops at the first index that has no word.
	"""
	# predict returns one entry per input row; only the first is decoded
	step_scores = model.predict(source, verbose=0)[0]
	word_ids = [argmax(scores) for scores in step_scores]
	words = []
	for word_id in word_ids:
		token = word_for_id(word_id, tokenizer)
		if token is None:
			# no vocabulary entry for this id: treat it as end of sentence
			break
		words.append(token)
	return ' '.join(words)

# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
	"""Translate every encoded source, print samples and corpus BLEU scores.

	model: model handed to predict_sequence
	tokenizer: tokenizer used to turn predicted indices back into words
	sources: iterable of encoded source sequences; each one is reshaped
		into a batch of one before prediction
	raw_dataset: row i unpacks into (target text, source text) for sources[i]

	Prints the first 10 translations followed by BLEU-1 to BLEU-4.
	Returns None.
	"""
	actual, predicted = list(), list()
	for i, source in enumerate(sources):
		# translate encoded source text
		source = source.reshape((1, source.shape[0]))
		# decode with the tokenizer passed by the caller rather than relying
		# on a module-level global of a particular name
		translation = predict_sequence(model, tokenizer, source)
		raw_target, raw_src = raw_dataset[i]
		if i < 10:
			print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
		# corpus_bleu expects a list of reference token lists per hypothesis
		actual.append([raw_target.split()])
		predicted.append(translation.split())
	# calculate BLEU score
	# NOTE(review): the BLEU-3 weights sum to 0.9 rather than 1; they are kept
	# as-is so the printed scores stay comparable with earlier runs.
	bleu_settings = (
		('BLEU-1', (1.0, 0, 0, 0)),
		('BLEU-2', (0.5, 0.5, 0, 0)),
		('BLEU-3', (0.3, 0.3, 0.3, 0)),
		('BLEU-4', (0.25, 0.25, 0.25, 0.25)),
	)
	for label, weights in bleu_settings:
		print('%s: %.2f' % (label, 100 * corpus_bleu(actual, predicted, weights=weights)))

# load datasets: the 'both', 'train' and 'test' English-Danish pickles
# (column 0 is treated as English, column 1 as Danish below)
dataset = load_clean_sentences('english-danish-both.pkl')
train = load_clean_sentences('english-danish-train.pkl')
test = load_clean_sentences('english-danish-test.pkl')
# prepare english tokenizer (fitted on column 0 of the 'both' dataset)
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
# prepare danish tokenizer (fitted on column 1); the ger_* names below hold
# the Danish-side tokenizer, vocabulary size and maximum sentence length
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
# NOTE: eng_vocab_size, eng_length and ger_vocab_size are not used again in this cell
# prepare data: encode the Danish sentences as padded integer sequences
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])

# load model
model = load_model('danish_model.h5')
# test on some training sequences
print('train')
evaluate_model(model, eng_tokenizer, trainX, train)
# test on some test sequences
print('test')
evaluate_model(model, eng_tokenizer, testX, test)

train
src=[tom blev pakrt af en lastbil], target=[tom got hit by a truck], predicted=[tom was hit by a truck]
src=[tom lo ikke], target=[tom didnt laugh], predicted=[tom didnt laugh]
src=[vi har ingen penge], target=[we have no money], predicted=[we have no money]
src=[jeg elsker at lse bger], target=[i love reading books], predicted=[i love read books books]
src=[jeg hader interviews], target=[i hate interviews], predicted=[i hate interviews]
src=[han kunne ikke lide at vre fattig], target=[he didnt like being poor], predicted=[he didnt like be poor]
src=[tal aldrig med fremmede], target=[never talk to strangers], predicted=[never talk to strangers]
src=[han fangede kyllingen], target=[he caught the chicken], predicted=[he caught the chicken]
src=[vask skeerne], target=[wash the spoons], predicted=[wash the spoons]
src=[jeg har ikke en tatovering], target=[i dont have a tattoo], predicted=[i dont have a tattoo]
BLEU-1: 90.05
BLEU-2: 85.06
BLEU-3: 81.47
BLEU-4: 71.47
test
src=[tom grin