# Neural Machine Translation Example

In [2]:
#! usr/bin/env python3
# -*- coding : utf-8 -*-

'''
author: aggarwal

'''

'\nauthor: aggarwal\n\n'

In [3]:
import sys

In [85]:
!{sys.executable} -m pip install pandas numpy scikit-learn



In [5]:
import pandas as pd

In [44]:
filename='eng_deu.txt'

parallel_corpora = pd.read_csv(filename, sep='\t', header = None, names=['eng', 'deu'], encoding = "UTF-8")

In [7]:
parallel_corpora.head(5)

Unnamed: 0,eng,deu
0,Hi.,Hallo!
1,Hi.,Grüß Gott!
2,Run!,Lauf!
3,Wow!,Potzdonner!
4,Wow!,Donnerwetter!


In [8]:
# remove punctuation -- excercise
import string
parallel_corpora['eng'] = parallel_corpora['eng'].str.replace('[{}]'.format(string.punctuation), '')
parallel_corpora['deu'] = parallel_corpora['deu'].str.replace('[{}]'.format(string.punctuation), '')

In [9]:
parallel_corpora.head(5)

Unnamed: 0,eng,deu
0,Hi,Hallo
1,Hi,Grüß Gott
2,Run,Lauf
3,Wow,Potzdonner
4,Wow,Donnerwetter


In [10]:
limited_parallel_corpora = parallel_corpora[:10000]

In [11]:
# take first 10000 and do train-test

from sklearn.model_selection import train_test_split
train, test = train_test_split(limited_parallel_corpora, test_size=0.2)

In [12]:
train[:5]

Unnamed: 0,eng,deu
267,Have fun,Viel Vergnügen
5760,Were you busy,Wart ihr beschäftigt
7690,Tom felt hated,Tom fühlte sich gehasst
5327,Tom applauded,Tom hat geklatscht
1268,Look at us,Schau uns an


In [13]:
test[:5]

Unnamed: 0,eng,deu
9231,Heres your dog,Hier ist dein Hund
4616,I watched Tom,Ich habe Tom beobachtet
5720,Were fasting,Wir fasten
5200,That ones OK,Der da ist OK
1371,Tom cheats,Tom betrügt


In [14]:
#install keras

!{sys.executable} -m pip install keras



In [15]:
!{sys.executable} -m pip install tensorflow



In [16]:
# apply tokenizer
from keras.preprocessing.text import Tokenizer
tokenizer_eng = Tokenizer()
tokenizer_ger = Tokenizer()
tokenizer_eng.fit_on_texts(list(limited_parallel_corpora['eng'].values))
tokenizer_ger.fit_on_texts(list(limited_parallel_corpora['deu'].values))


Using TensorFlow backend.


In [17]:
tokenizer_eng.word_index.items()

dict_items([('tom', 1), ('i', 2), ('it', 3), ('you', 4), ('is', 5), ('im', 6), ('a', 7), ('me', 8), ('he', 9), ('was', 10), ('its', 11), ('go', 12), ('do', 13), ('we', 14), ('to', 15), ('are', 16), ('this', 17), ('dont', 18), ('that', 19), ('ill', 20), ('can', 21), ('youre', 22), ('the', 23), ('my', 24), ('come', 25), ('get', 26), ('were', 27), ('like', 28), ('be', 29), ('here', 30), ('toms', 31), ('up', 32), ('have', 33), ('she', 34), ('who', 35), ('not', 36), ('they', 37), ('in', 38), ('love', 39), ('am', 40), ('did', 41), ('on', 42), ('well', 43), ('him', 44), ('want', 45), ('thats', 46), ('your', 47), ('know', 48), ('us', 49), ('take', 50), ('need', 51), ('how', 52), ('cant', 53), ('home', 54), ('no', 55), ('hes', 56), ('keep', 57), ('see', 58), ('now', 59), ('has', 60), ('help', 61), ('let', 62), ('saw', 63), ('stop', 64), ('got', 65), ('too', 66), ('will', 67), ('look', 68), ('one', 69), ('out', 70), ('there', 71), ('away', 72), ('may', 73), ('just', 74), ('lost', 75), ('try', 76

In [18]:
# vocab size

'''
+1 needed because if you use the pad_sequence to process the sequence, you will find the 0 is used as the padding value. 
In order to distinguish between PAD and UNKNOWN, keras use word_count+1 as the index of UNKNOWN.
'''
eng_vocab_size = len(tokenizer_eng.word_index) + 1
ger_vocab_size = len(tokenizer_ger.word_index) + 1

In [19]:
print(eng_vocab_size)
print(ger_vocab_size)

2344
3715


In [20]:
# sequencing and padding


from keras.preprocessing.sequence import pad_sequences
max_length = 30

# training set

trainX = tokenizer_ger.texts_to_sequences(train['deu'].values)
trainY = tokenizer_eng.texts_to_sequences(train['eng'].values)

trainX = pad_sequences(trainX, maxlen=max_length, padding='post')
trainY = pad_sequences(trainY, maxlen=max_length, padding='post')


# validation set

testX = tokenizer_ger.texts_to_sequences(test['deu'].values)
testY = tokenizer_eng.texts_to_sequences(test['eng'].values)

testX = pad_sequences(testX, maxlen=max_length, padding='post')
testY = pad_sequences(testY, maxlen=max_length, padding='post')

In [21]:
trainY[:1]

array([[ 33, 134,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0]], dtype=int32)

In [22]:
#label encoding
import numpy as np
from keras.utils import to_categorical

def encode_output(sequences, vocab_size):
	ylist = list()
	for sequence in sequences:
		encoded = to_categorical(sequence, num_classes=vocab_size)
		ylist.append(encoded)
	y = np.array(ylist)
	y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
	return y

trainY = encode_output(trainY, eng_vocab_size)
testY = encode_output(testY, eng_vocab_size)

In [23]:
trainY[:1]

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]]], dtype=float32)

In [24]:

# generate model

from keras.models import Sequential
from keras.models import load_model
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
	model = Sequential()
	model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
	model.add(LSTM(n_units))
	model.add(RepeatVector(tar_timesteps))
	model.add(LSTM(n_units, return_sequences=True))
	model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
	return model



In [25]:
# define model
model = define_model(ger_vocab_size, eng_vocab_size, 30, 30, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')
# summarize defined model
model.summary()






Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 30, 256)           951040    
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 30, 256)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 30, 256)           525312    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 30, 2344)          602408    
Total params: 2,604,072
Trainable params: 2,604,072
Non-trainable params: 0
_________________________________________________________________


In [26]:
# fit model
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(trainX, trainY, epochs=10, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint])


Train on 8000 samples, validate on 2000 samples
Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.68010, saving model to model.h5
Epoch 2/10

Epoch 00002: val_loss improved from 0.68010 to 0.59861, saving model to model.h5
Epoch 3/10

Epoch 00003: val_loss improved from 0.59861 to 0.57610, saving model to model.h5
Epoch 4/10

Epoch 00004: val_loss improved from 0.57610 to 0.57541, saving model to model.h5
Epoch 5/10

Epoch 00005: val_loss improved from 0.57541 to 0.56103, saving model to model.h5
Epoch 6/10

Epoch 00006: val_loss improved from 0.56103 to 0.54501, saving model to model.h5
Epoch 7/10

Epoch 00007: val_loss improved from 0.54501 to 0.53128, saving model to model.h5
Epoch 8/10

Epoch 00008: val_loss improved from 0.53128 to 0.51228, saving model to model.h5
Epoch 9/10

Epoch 00009: val_loss improved from 0.51228 to 0.50380, saving model to model.h5
Epoch 10/10

Epoch 00010: val_loss improved from 0.50380 to 0.48381, saving model to model.h5


<keras.callbacks.History at 0x1a48871550>

In [27]:
model = load_model('model.h5')

In [28]:
testX[:1]

array([[ 31,   3, 173, 139,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0]], dtype=int32)

In [29]:
source_sentence = test['deu'].values[0]

print(source_sentence)

Hier ist dein Hund


In [30]:
# see predictions of first test sentence


pridiction_probabilities = model.predict(testX[:1], verbose=0)[0]
    
print(pridiction_probabilities)

[[8.7220920e-05 8.7283500e-02 4.7288236e-04 ... 3.4610312e-06
  7.1591230e-06 4.4256331e-06]
 [2.5793558e-04 5.3539104e-03 6.0642960e-06 ... 1.2377087e-05
  3.4232817e-05 1.5207303e-05]
 [9.9953348e-03 1.7491162e-02 1.5171657e-06 ... 6.0153667e-05
  2.3145834e-04 1.0286463e-04]
 ...
 [9.9979681e-01 5.2930540e-07 1.9412555e-11 ... 2.9893064e-09
  7.7389067e-10 4.8297233e-09]
 [9.9979681e-01 5.2931046e-07 1.9413296e-11 ... 2.9891867e-09
  7.7386264e-10 4.8295208e-09]
 [9.9979681e-01 5.2931347e-07 1.9413814e-11 ... 2.9890841e-09
  7.7384199e-10 4.8293547e-09]]


In [31]:
# get word encoded intergers out of probability map

integers = [np.argmax(vector) for vector in pridiction_probabilities]

print(integers)

[11, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [32]:
# convert integers into words
eng_sentence = []
for each_int in integers:
	for word, index in tokenizer_eng.word_index.items():
		if index == each_int:
			eng_sentence.append(word)

translated_sentence = ' '.join(eng_sentence)

print(translated_sentence)

its is is


In [33]:
actual_sentence = test['eng'].values[0]

print(actual_sentence)

Heres your dog


In [34]:
# evaluate blue scores


from nltk.translate.bleu_score import corpus_bleu


bleu_score = corpus_bleu([[actual_sentence.split()]], [translated_sentence.split()])

print(bleu_score)

0


In [35]:
input_sentence = input('please write a German source sentence')



please write a German source sentencehallo


In [36]:
input_sentence

'hallo'

In [37]:
input_sentence_encoded = tokenizer_ger.texts_to_sequences([input_sentence])
print(input_sentence_encoded)

input_sentence_encoded = pad_sequences(input_sentence_encoded, maxlen=max_length, padding='post')
print(input_sentence_encoded)

[[251]]
[[251   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0]]


In [38]:
input_sentence_encoded


array([[251,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0]], dtype=int32)

In [39]:
pridiction_probabilities_inp = model.predict(input_sentence_encoded[:1], verbose=0)[0]
    
print(pridiction_probabilities_inp)
integers_inp = [np.argmax(vector) for vector in pridiction_probabilities_inp]

print(integers_inp)

[[1.6252521e-03 2.1578169e-03 1.1236207e-03 ... 3.7360113e-04
  2.5249922e-04 3.7715415e-04]
 [2.2135077e-02 3.4371302e-03 3.1913593e-04 ... 3.2661462e-04
  1.5551026e-04 3.6163579e-04]
 [7.8272641e-01 5.4044829e-04 1.1229779e-06 ... 3.8889884e-05
  1.2211330e-05 4.9388909e-05]
 ...
 [9.9979085e-01 5.4429523e-07 2.0855923e-11 ... 3.0356160e-09
  7.5507994e-10 4.7609241e-09]
 [9.9979085e-01 5.4429626e-07 2.0857514e-11 ... 3.0354133e-09
  7.5502959e-10 4.7607789e-09]
 [9.9979085e-01 5.4429313e-07 2.0858827e-11 ... 3.0352165e-09
  7.5498352e-10 4.7606337e-09]]
[4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [40]:
# convert integers into words
eng_sentence = []
for each_int in integers_inp:
	for word, index in tokenizer_eng.word_index.items():
		if index == each_int:
			eng_sentence.append(word)

translated_sentence = ' '.join(eng_sentence)

print(translated_sentence)

you


In [47]:
#EVALUTATION

eval_file =  pd.read_csv("eng_deu_evaluation.txt", sep='\t', header = None, names=['eng', 'ger'], encoding = "UTF-8")
eval_file['eng'] = eval_file['eng'].str.replace('[{}]'.format(string.punctuation), '')
eval_file['ger'] = eval_file['ger'].str.replace('[{}]'.format(string.punctuation), '')

evalX = tokenizer_ger.texts_to_sequences(eval_file['ger'].values)
evalX = pad_sequences(evalX, maxlen=max_length, padding='post')




In [48]:
prediction_probabilities = model.predict(evalX, verbose=0)

In [49]:
integers = [[np.argmax(vector) for vector in array] for array in prediction_probabilities]

predicted_sentences = []
for array in integers:
    eng_sentence = []
    for each_int in array:
        for word, index in tokenizer_eng.word_index.items():
            if index == each_int:
                eng_sentence.append(word)
                
    translated_sentence = ' '.join(eng_sentence)
    predicted_sentences.append(translated_sentence)
print(predicted_sentences)



['dont', 'dont', 'tom you', 'dont', 'dont', 'dont you', 'dont you', 'dont you', 'dont you', 'tom', 'are you', 'dont you', 'tom you it', 'dont', 'tom', 'its is', 'tom tom', 'i like', 'i was', 'i was a', 'he a busy', 'you', 'dont you', 'dont you', 'tom tom', 'youre the', 'i am', 'dont', 'you', 'tom', 'he was', 'i like to', 'i like to', 'i like a', 'i was', 'im am', 'tom is', 'is is is', 'its is', 'its is', 'tom', 'dont you', 'tom', 'tom', 'dont you', 'tom tom', 'is is is', 'he was', 'are you', 'tom', 'tom you', 'dont', 'dont you', 'tom tom', 'dont you', 'you', 'he was a', 'he was a', 'im am', 'im am', 'i like you', 'i like to', 'i like it', 'i am', 'im am', 'i a a', 'i like it', 'im a a', 'im a', 'its the', 'its is', 'tom tom', 'tom is', 'tom is', 'youre are', 'are is', 'are', 'you you me', 'we you', 'he you a', 'he a a', 'he a a', 'im am', 'i like to', 'i like to', 'i like to', 'i am', 'i like a', 'i like to', 'i like', 'i like it', 'i like to', 'i like to', 'i am', 'im a', 'i am', 'you

In [254]:
#gold sentences
print(eval_file['eng'].values)

['Can I help' 'Can I sing' 'Can I stay' 'Carry this' 'Check that'
 'Check that' 'Check this' 'Choose one' 'Choose one' 'Come again'
 'Come again' 'Come again' 'Come alone' 'Come alone' 'Come alone'
 'Come along' 'Come along' 'Come early' 'Come early' 'Come on in'
 'Come on in' 'Come on in' 'Come to me' 'Come to us' 'Definitely'
 'Destroy it' 'Did Tom go' 'Did Tom go' 'Dig faster' 'Do come in'
 'Do come in' 'Do come in' 'Do men cry' 'Dont brag' 'Dont come'
 'Dont come' 'Dont come' 'Dont jump' 'Dont look' 'Dont move' 'Dont move'
 'Dont peek' 'Dont push' 'Dont sing' 'Dont stop' 'Dont talk' 'Dont talk'
 'Dont wait' 'Dont wait' 'Dont wait' 'Dont yell' 'Dont yell' 'Dont yell'
 'Drink this' 'Drink this' 'Duty calls' 'Eat slowly' 'Examine it'
 'Fill it up' 'Find a job' 'Find a job' 'Find a job' 'Fire burns'
 'Follow Tom' 'Follow Tom' 'Follow Tom' 'Follow him' 'Forget Tom'
 'Forget him' 'Forgive us' 'Get off me' 'Give it up' 'Go on home'
 'Go see Tom' 'Go to work' 'Go to work' 'God exists' 'Goo

In [50]:
bleu_score = corpus_bleu([sentence.split() for sentence in eval_file['eng'].values], [translated_sentence.split() for translated_sentence in predicted_sentences])
print(bleu_score)



1.3335277372413589e-155


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
