<a href="https://colab.research.google.com/github/yoheshkannan/ML/blob/master/English_to_German_NMT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**English to German Neural Machine Translation**

1) Data Importing and Cleaning

In [1]:
#Function to load the data
def load_doc(filename):
	# open the file as read only
	file = open(filename, mode='rt', encoding='utf-8')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

In [None]:
document = load_doc('deu-eng.txt')

In [None]:
#Split a document into sentences and make it as source and target using '\t' as a delimiter
def to_pairs(doc):
	lines = doc.strip().split('\n')
	pairs = [line.split('\t') for line in  lines]
	return pairs

In [None]:
pairs = to_pairs(document)

In [None]:
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array
import nltk

In [None]:
# Function to clean data
def clean_pairs(lines):
	cleaned = list()
	# prepare regex for char filtering
	re_print = re.compile('[^%s]' % re.escape(string.printable))
	# prepare translation table for removing punctuation
	table = str.maketrans('', '', string.punctuation)
	for pair in lines:
		clean_pair = list()
		for line in pair:
			# normalize unicode characters
			line = normalize('NFD', line).encode('ascii', 'ignore')
			line = line.decode('UTF-8')
			# tokenize on white space
			line = line.split()
			# convert to lowercase
			line = [word.lower() for word in line]
			# remove punctuation from each token
			line = [word.translate(table) for word in line]
			# remove non-printable chars form each token
			line = [re_print.sub('', w) for w in line]
			# remove tokens with numbers in them
			line = [word for word in line if word.isalpha()]
			# store as string
			clean_pair.append(' '.join(line))
		cleaned.append(clean_pair)
	return array(cleaned)

In [None]:
clean_data = clean_pairs(pairs)

In [None]:
# print first 10 cleaned data
for i in range(10):
	print('[%s] => [%s]' % (clean_data[i,0], clean_data[i,1]))

[go] => [geh]
[hi] => [hallo]
[hi] => [gru gott]
[run] => [lauf]
[run] => [lauf]
[wow] => [potzdonner]
[wow] => [donnerwetter]
[fire] => [feuer]
[help] => [hilfe]
[help] => [zu hulf]


In [None]:
# To save our cleaned data in a pickle file
def save_clean_data(sentences, filename):
	dump(sentences, open(filename, 'wb'))
	print('Saved: %s' % filename)
 

In [None]:
save_clean_data(clean_data, 'eng_ger.pkl')

Saved: eng_ger.pkl


In [None]:
from pickle import load
from pickle import dump
from numpy.random import rand
from numpy.random import shuffle
# Function to load pickle file
def load_clean_sentences(filename):
	return load(open(filename, 'rb'))

In [None]:
raw_data = load_clean_sentences('eng_ger.pkl')

2) Split data into Train and Test Set

In [None]:
import numpy as np
# Select number of sentences we need and shuffing it
n_sentences = 20000
dataset = raw_data[:n_sentences, :]
dataset_shuffle = shuffle(dataset)
np.round(20000 * 0.8)

16000.0

In [None]:
# Split data into train and test set
train, test = dataset[:16000], dataset[16000:]

In [None]:
for i in range(10):
	print('[%s] => [%s]' % (train[i,0], train[i,1]))

[it was night] => [es war nacht]
[im fasting] => [ich faste]
[im not yelling] => [ich schreie nicht]
[tom moved away] => [tom ist weggezogen]
[tom didnt sleep] => [tom hat nicht geschlafen]
[is today payday] => [ist heute zahltag]
[i feel the same] => [mir geht es genauso]
[i hope i win] => [ich hoffe ich gewinne]
[i like french] => [ich mag das franzosische]
[eat up] => [iss auf]


In [None]:

def save_clean_data(sentences, filename):
	dump(sentences, open(filename, 'wb'))
	print('Saved: %s' % filename)

In [None]:
save_clean_data(dataset, 'english-german-both.pkl')
save_clean_data(train, 'english-german-train.pkl')
save_clean_data(test, 'english-german-test.pkl')

Saved: english-german-both.pkl
Saved: english-german-train.pkl
Saved: english-german-test.pkl


In [None]:
dataset = load_clean_sentences('english-german-both.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')

3) Prepare Training data.

Each input and output must be encoded to integers and padded to the maximum phrase length. This is because we will use the word embeddings to the input sequence and one hot encode the output sequences.

In [None]:
from keras.preprocessing.text import Tokenizer
#Function to tokenize each word in a sequence
def create_tokenizer(lines):
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer

In [None]:
# Function to find out the maximum length of the sequence
def max_length(lines):
	return max(len(line.split()) for line in lines)

In [None]:
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))

English Vocabulary Size: 3627
English Max Length: 5


In [None]:
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
print('German Vocabulary Size: %d' % ger_vocab_size)
print('German Max Length: %d' % (ger_length))

German Vocabulary Size: 5622
German Max Length: 10


In [None]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [None]:
# Function to encode sequence and apply padding
def encode_sequences(tokenizer, length, lines):
	# integer encode sequences
	X = tokenizer.texts_to_sequences(lines)
	# pad sequences with 0 values
	X = pad_sequences(X, maxlen=length, padding='post')
	return X

In [None]:
# Function to convert sequences to one hot 
def encode_output(sequences, vocab_size):
	ylist = list()
	for sequence in sequences:
		encoded = to_categorical(sequence, num_classes=vocab_size)
		ylist.append(encoded)
	y = array(ylist)
	y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
	return y

In [None]:
# prepare training data
train_x = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
train_y = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
train_y = encode_output(train_y, ger_vocab_size)

In [None]:
test_x = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
test_y = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
test_y = encode_output(test_y, ger_vocab_size)


4) Model

In [None]:
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed

In [None]:
# Encoder and Decoder model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
	model = Sequential()
	model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
	model.add(LSTM(n_units))
	model.add(RepeatVector(tar_timesteps))
	model.add(LSTM(n_units, return_sequences=True))
	model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
	return model

In [None]:
# define model
model = define_model(eng_vocab_size, ger_vocab_size, eng_length, ger_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')
# summarize defined model
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 5, 256)            928512    
_________________________________________________________________
lstm (LSTM)                  (None, 256)               525312    
_________________________________________________________________
repeat_vector (RepeatVector) (None, 10, 256)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 10, 256)           525312    
_________________________________________________________________
time_distributed (TimeDistri (None, 10, 5622)          1444854   
Total params: 3,423,990
Trainable params: 3,423,990
Non-trainable params: 0
_________________________________________________________________
None


5) Fitting a Model

In [None]:
model.fit(train_x, train_y, epochs = 50, batch_size = 64, validation_data=(test_x, test_y))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f82a01b7fd0>

6) Prediction

In [None]:
# Function for reverse mapping
#Model prediction will be a sequence of integers so we can enumerate and look up in the tokenizer to map back to words
def word_for_id(integer, tokenizer):
	for word, index in tokenizer.word_index.items():
		if index == integer:
			return word
	return None

In [None]:
# generate target given source sequence
def predict_sequence(model, tokenizer, source):
	prediction = model.predict(source, verbose=0)[0]
	integers = [np.argmax(vector) for vector in prediction]
	target = list()
	for i in integers:
		word = word_for_id(i, tokenizer)
		if word is None:
			break
		target.append(word)
	return ' '.join(target)

In [None]:
# to generate the translated sentences
def evaluate_model(model, tokenizer, sources, raw_dataset):
	actual, predicted = list(), list()
	for i, source in enumerate(sources):
		# translate encoded source text
		source = source.reshape((1, source.shape[0]))
		translation = predict_sequence(model, ger_tokenizer, source)
		raw_src, raw_target = raw_dataset[i]
		if i < 10:
		  print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
		actual.append([raw_target.split()])
		predicted.append(translation.split())

In [None]:
evaluate_model(model, ger_tokenizer, test_x, test)

src=[who hit you], target=[wer hat euch geschlagen], predicted=[wer hat dich geschlagen]
src=[now youre safe], target=[jetzt sind sie in sicherheit], predicted=[jetzt bist ihr in sicherheit]
src=[everyones going], target=[alle gehen hin], predicted=[alle gehen gerade]
src=[i saw the doctor], target=[ich war beim arzt], predicted=[ich habe einen arzt]
src=[lets play tag], target=[kommen sie wir spielen jetzt fangen], predicted=[komm wir spielen jetzt fangen]
src=[you cant go now], target=[du kannst jetzt nicht weg], predicted=[jetzt kannst jetzt jetzt gehen]
src=[who asked you], target=[wer hat dich denn gefragt], predicted=[wer hat dich gefragt]
src=[i require advice], target=[ich benotige einen rat], predicted=[ich habe auf diat]
src=[start running], target=[fangen sie an zu laufen], predicted=[beginnen sie zu laufen]
src=[lets play ball], target=[lasst uns ball spielen], predicted=[lass uns ball spielen]
