<a href="https://colab.research.google.com/github/tohbenghwee/mldds/blob/master/Keras_Eng2Tamil_NMT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Steps
- Load data
- Preprocess, tokenize, pad sequences
- Design Arch
- Fit the model
- Evalute 
- Predict

### Datasets
http://www.manythings.org/anki/  (Download and unzip mar-tam.zip file)

In [0]:
# !wget http://www.manythings.org/anki/tam-eng.zip
# from zipfile import ZipFile
# ZipFile('mar-eng.zip').extractall()

### link to visualize model

In [0]:
# https://lutzroeder.github.io/netron/

#Import Necessary Packages

In [0]:
import pandas as pd
import numpy as np
import string
from string import digits
import matplotlib.pyplot as plt
%matplotlib inline
import re
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from keras.layers import Input, LSTM, Embedding, Dense, CuDNNLSTM
from keras.models import Model

# Load data


In [0]:
lines= pd.read_csv('tam.txt', names=['eng', 'tam'], sep="\t")

In [0]:
# lines = lines.sample(n=10000, replace=True)

#Preprocess, tokenize, pad sequences

In [0]:
lines.shape

(10000, 2)

In [0]:
lines.tail()

Unnamed: 0,eng,tam
159,Do you know when he will come?,அவன் எப்ப வருவான் என்று உனக்குத் தெரியுமா
176,"Because he's sick, he can't come.",அவனுக்கு உடல் நிலை சரியில்லாததனால் அவனால் வர இ...
17,Give it to her.,அவளிடம் கொடு
45,Beware of the dog!,நாய் ஜாக்கிரதை!
180,Tom has been crying all afternoon.,டாம் மதியம் முழுவதும் அழுதுகொண்டேயிருக்கிறான்.


In [0]:
# Lowercase all characters
lines.eng=lines.eng.apply(lambda x: x.lower())
lines.tam=lines.tam.apply(lambda x: x.lower())

In [0]:
# Remove quotes
lines.eng=lines.eng.apply(lambda x: re.sub("'", '', x))
lines.tam=lines.tam.apply(lambda x: re.sub("'", '', x))

In [0]:
exclude = set(string.punctuation) # Set of all special characters
# Remove all the special characters
lines.eng=lines.eng.apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
lines.tam=lines.tam.apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

In [0]:
# Remove all numbers from text
remove_digits = str.maketrans('', '', digits)
lines.eng=lines.eng.apply(lambda x: x.translate(remove_digits))
lines.tam=lines.tam.apply(lambda x: x.translate(remove_digits))
# lines.tam = lines.tam.apply(lambda x: re.sub("[२३०८१५७९४६]", "", x))

In [0]:
# Remove extra spaces
lines.eng=lines.eng.apply(lambda x: x.strip())
lines.tam=lines.tam.apply(lambda x: x.strip())
lines.eng=lines.eng.apply(lambda x: re.sub(" +", " ", x))
lines.tam=lines.tam.apply(lambda x: re.sub(" +", " ", x))

In [0]:
# Add start and end tokens to target sequences
lines.tam = lines.tam.apply(lambda x : 'START_ '+ x + ' _END')

In [0]:
lines.sample(10)

Unnamed: 0,eng,tam
196,its my fault that the cake was burned i was ta...,START_ என்னுடையத் தவறினால் கேக்கானதுக் கருகிப்...
80,im proud of my son,START_ என் மகனைப் பற்றி பெருமைப் படுகிறேன் _END
125,she got married to him,START_ அவள் அவனுக்கு திருமணம் செய்து வைக்கப் ப...
25,dont lie to me,START_ என்னிடம் பொய் சொல்லாதே _END
144,she went out of the room,START_ அவள் அறையை விட்டு வெளியே சென்றாள் _END
36,he is still here,START_ அவன் இன்னும் இருக்கிறான் _END
64,shut up and listen,START_ வாயை மூடி கவனி _END
105,he got a lot of money,START_ அவனுக்கு நிறைய பணம் கிடைத்தது _END
141,i cant find it anywhere,START_ இது எங்கே இருக்கு என்று என்னால் கண்டுபி...
123,it seems she hates you,START_ அவள் உன்னை வெறுக்கிற மாதிரி தெரிகிறது _END


In [0]:
def get_vocab_set(series):
    allwords=set()
    for s in series:
        for word in s.split():
            if word not in allwords:
                allwords.add(word)
    return allwords

# Vocabulary of English
all_eng_words=get_vocab_set(lines.eng)
# Vocabulary of Tam
all_tam_words=get_vocab_set(lines.tam)

In [0]:
def get_max_len(series):
    lenght_list = [ len(l.split(' ')) for l in series ]
    max_length = np.max(lenght_list)
    return max_length

# Max Length of source sequence
max_length_src = get_max_len(lines.eng)
# max_length_src

# Max Length of target sequence
max_length_tar = get_max_len(lines.tam)
# max_length_tar

In [0]:
input_words = sorted(list(all_eng_words))
target_words = sorted(list(all_tam_words))
num_encoder_tokens = len(all_eng_words)
num_decoder_tokens = len(all_tam_words)
num_encoder_tokens, num_decoder_tokens

(374, 537)

In [0]:
num_decoder_tokens += 1 # For zero padding
num_decoder_tokens

538

In [0]:
input_token_index = dict([(word, i+1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])

In [0]:
reverse_input_char_index = dict((i, word) for word, i in input_token_index.items())
reverse_target_char_index = dict((i, word) for word, i in target_token_index.items())

In [0]:
lines = shuffle(lines)
lines.head(10)

Unnamed: 0,eng,tam
93,come home before six,START_ ஆறு மணிக்கு முன்பு வீட் டிற்கு வா _END
173,theres no easy way out of here,START_ இங்கிருந்து வெளியே செல்ல சுலபமான வழியில...
5,she smiled,START_ அவள் சிரித்தாள் _END
115,all of them went there,START_ அவர்கள் எல்லோரும் அங்கே சென்றார்கள் _END
177,friendship requires mutual trust,START_ நட்புக்குத் தேவை பரஸ்பர நம்பிக்கை _END
86,roll the ball to me,START_ பந்தை என்னிடம் உருட்டி விடு _END
97,its a piece of cake,START_ இது ஒரு கேக்கின் துண்டு _END
178,he put the ring on marys finger,START_ அவன் மேரியின் விரலில் மோதிரத்தை அணிவித்...
125,she got married to him,START_ அவள் அவனுக்கு திருமணம் செய்து வைக்கப் ப...
84,its time to get up,START_ தூக்கத்திலிருந்து எழுவதற்கான நேரம் இது ...


In [0]:
# Train - Test Split
X, y = lines.eng, lines.tam
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1)
X_train.shape, X_test.shape

((9000,), (1000,))

#### Save the train and test dataframes for reproducing the results later, as they are shuffled.

In [0]:
X_train.to_pickle('X_train_eng2tam.pkl')
X_test.to_pickle('X_test_eng2tam.pkl')

In [0]:
def generate_batch(X = X_train, y = y_train, batch_size = 128):
    ''' Generate a batch of data '''
    while True:
        for j in range(0, len(X), batch_size):
            encoder_input_data = np.zeros((batch_size, max_length_src),dtype='float32')
            decoder_input_data = np.zeros((batch_size, max_length_tar),dtype='float32')
            decoder_target_data = np.zeros((batch_size, max_length_tar, num_decoder_tokens),dtype='float32')
            for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = input_token_index[word] # encoder input seq
                for t, word in enumerate(target_text.split()):
                    if t<len(target_text.split())-1:
                        decoder_input_data[i, t] = target_token_index[word] # decoder input seq
                    if t>0:
                        # decoder target sequence (one hot encoded)
                        # does not include the START_ token
                        # Offset by one timestep
                        decoder_target_data[i, t - 1, target_token_index[word]] = 1.
            yield([encoder_input_data, decoder_input_data], decoder_target_data)

### Encoder - Decoder Model Architecture

In [0]:
latent_dim = 50

In [0]:
# Encoder
encoder_inputs = Input(shape=(None,))
enc_emb =  Embedding(num_encoder_tokens, latent_dim, mask_zero = True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

In [0]:
# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(num_decoder_tokens, latent_dim, mask_zero = True)
dec_emb = dec_emb_layer(decoder_inputs)
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [0]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

In [0]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_13 (InputLayer)           (None, None)         0                                            
__________________________________________________________________________________________________
input_14 (InputLayer)           (None, None)         0                                            
__________________________________________________________________________________________________
embedding_7 (Embedding)         (None, None, 50)     18700       input_13[0][0]                   
__________________________________________________________________________________________________
embedding_8 (Embedding)         (None, None, 50)     26900       input_14[0][0]                   
__________________________________________________________________________________________________
lstm_7 (LS

In [0]:
model.save("tmp.h5")

  '. They will not be included '


#Fit the model

In [0]:
# TIME TAKEN TO FIT :  1 loop, best of 3: 31.7 s per loop

In [0]:
train_samples = len(X_train)
val_samples = len(X_test)
batch_size = 128 #128
epochs = 10

In [0]:
# # print(batch_size)
# out = generate_batch(X_train, y_train, batch_size = batch_size)
# for some_X, some_Y in out:
#     print(some_X[0].shape)
#     print(some_X[1].shape)
#     print(some_Y[0].shape)
#     break

In [0]:
%%timeit
model.fit_generator(generator = generate_batch(X_train, y_train, batch_size = batch_size),
                    steps_per_epoch = train_samples//batch_size,
                    epochs=epochs)
#                     validation_data = generate_batch(X_test, y_test, batch_size = batch_size),
#                     validation_steps = val_samples//batch_size)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
1 loop, best of 3: 43.7 s per loop


In [0]:
# %%timeit
model.evaluate_generator(generate_batch(X_train, y_train),steps = 3)

[0.014143630241354307, 1.0000000198682149]

In [0]:
model.evaluate_generator(generate_batch(X_test, y_test),steps = 3)

[0.014131749980151653, 1.0]

### Always remember to save the weights

In [0]:
model.save_weights('nmt_eng2tam_weights.h5')

### Load the weights, if you close the application

In [0]:
model.load_weights('nmt_eng2tam_weights.h5')

### Inference Setup

In [0]:
# Encode the input sequence to get the "thought vectors"
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder setup
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2= dec_emb_layer(decoder_inputs) # Get the embeddings of the decoder sequence

# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2) # A dense softmax layer to generate prob dist. over the target vocabulary

# Final decoder model
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)

### Decode sample sequeces

In [0]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0] = target_token_index['START_']

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += ' '+sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '_END' or
           len(decoded_sentence) > 50):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence

# Evalute

### Evaluation on Train Dataset

In [0]:
train_gen = generate_batch(X_train, y_train, batch_size = 1)
k=-1

In [0]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Tamil Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Tamil Translation:', decoded_sentence[:-4])

Input English sentence: beware of the dog
Actual Tamil Translation:  நாய் ஜாக்கிரதை 
Predicted Tamil Translation:  நாய் ஜாக்கிரதை 


In [0]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Tamil Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Tamil Translation:', decoded_sentence[:-4])

Input English sentence: im not sure how to answer this
Actual Tamil Translation:  எப்படி பதில் சொல்வது என்பதில் நான் உறுதியாக இல்லை 
Predicted Tamil Translation:  எப்படி பதில் சொல்வது என்பதில் நான் உறுதியாக இல்லை 


### Evaluation on Validation Dataset

In [0]:
val_gen = generate_batch(X_test, y_test, batch_size = 1)
k=-1

In [0]:
k+=1
(input_seq, actual_output), _ = next(val_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_test[k:k+1].values[0])
print('Actual Tamil Translation:', y_test[k:k+1].values[0][6:-4])
print('Predicted Tamil Translation:', decoded_sentence[:-4])

Input English sentence: be kind to old people
Actual Tamil Translation:  வயோதிகர்களிடம் அன்பாக இரு 
Predicted Tamil Translation:  வயோதிகர்களிடம் அன்பாக இரு 
