<img src="https://raw.githubusercontent.com/teja/Machine_Learning/master/Images/seq_to_seq_lstm_steps.PNG" width="540" height="240" align="left"/>

In [1]:
import tensorflow as tf
from tensorflow import keras
print(tf.__version__)
print(keras.__version__)
import pandas as pd
import numpy as np
import re

2.2.0
2.3.0-tf


In [0]:
## Download the English-Hindi sentence-pair archive and open it for reading.
import io
import os
import urllib
import zipfile
from urllib.request import Request, urlopen, urlretrieve

url = "https://raw.githubusercontent.com/atulpatelDS/Data_Files/master/vocabulary/hin-eng.zip"

## No target path is given, so urlretrieve picks the local file name itself and
## returns it together with the response headers.
local_filename, headers = urlretrieve(url)
## The archive handle is kept open: the next cell reads "hin.txt" directly from it.
zip_file = zipfile.ZipFile(local_filename, 'r')
## To unpack everything into the current working directory instead:
##zip_file.extractall(path = os.getcwd())
## Shell alternative (wget has to be installed separately on Windows):
#!wget https://raw.githubusercontent.com/atulpatelDS/Data_Files/master/vocabulary/hin-eng.zip --quiet

In [0]:
## Read the text file
## The archive member is decoded as UTF-8 and read in a single call; the earlier
## line-by-line `data += line` rebuilt the growing string on every iteration.
with zip_file.open("hin.txt") as textfile:
  data = io.TextIOWrapper(textfile, "utf-8").read()

In [4]:
type(data),len(data)

(str, 401016)

In [5]:
data[400:500]

'.\tCC-BY 2.0 (France) Attribution: tatoeba.org #631038 (Shishir) & #6179123 (fastrizwaan)\nHello!\tनमस्'

Extract Language source and Target pairs

In [0]:
## Every line carries a trailing "\tCC-BY ..." attribution. Capture only the text
## in front of it, i.e. "<source>\t<target>" (`re` is imported in the first cell).
textdata = re.compile(r"^(.*)\tCC-BY", re.M).findall(data)

In [0]:
#Split by newline character
#data =  data.split('\n')

In [0]:
#Show some Data
#data[100:105]

In [0]:
#type(data),len(data)

In [10]:
type(textdata),len(textdata)

(list, 2778)

In [11]:
textdata[10:15]

["I'm OK.\tमैं ठीक हूँ।",
 'Awesome!\tबहुत बढ़िया!',
 'Come in.\tअंदर आ जाओ।',
 'Get out!\tबाहर निकल जाओ!',
 'Go away!\tचले जाओ!']

Separate Source and Target Language Pairs

In [0]:
encoder_text = []  # Source-language sentences
decoder_text = []  # Target-language sentences wrapped in '<start>' ... '<end>'

# Each usable line has the form "<source>\t<target>"
for line in textdata:
    fields = line.split('\t')
    # Skip lines that do not contain exactly one tab. This is the only case the
    # former bare `except: pass` was meant to cover; checking it explicitly no
    # longer hides unrelated errors (or a KeyboardInterrupt).
    if len(fields) != 2:
        continue
    in_txt, out_txt = fields
    encoder_text.append(in_txt)

    # Mark the beginning and the end of every target sentence
    decoder_text.append('<start> ' + out_txt + ' <end>')

In [13]:
encoder_text[100:105]

['I have a car.',
 'I have a dog.',
 'I understand.',
 "I'm a doctor.",
 'It is a book.']

In [14]:
decoder_text[100:105]

['<start> मेरे पास एक गाड़ी है। <end>',
 '<start> मेरे पास एक कुत्ता है। <end>',
 '<start> मैं समझता हूँ। <end>',
 '<start> मैं डॉक्टर हूँ। <end>',
 '<start> यह किताब है। <end>']

Tokenize the Source Language Sentences

In [0]:
encoder_tk = keras.preprocessing.text.Tokenizer()
encoder_tk.fit_on_texts(encoder_text)

In [0]:
#encoder_tk.word_index
#encoder_tk.word_count

In [0]:
## Convert sentences of tokenizer to number
encoder_seq  = encoder_tk.texts_to_sequences(encoder_text)

In [18]:
type(encoder_seq),len(encoder_seq)

(list, 2778)

In [19]:
encoder_seq[100:105]

[[2, 14, 6, 96], [2, 14, 6, 124], [2, 208], [39, 6, 150], [10, 5, 6, 69]]

In [20]:
# Token count of the longest source sentence
max_encoder_seq_length = max(map(len, encoder_seq))
print('Maximum sentence length for Source language: ', max_encoder_seq_length)

# Number of distinct words in the source-language tokenizer
encoder_vocab_size = len(encoder_tk.word_index)
print('Source language vocablury size: ', encoder_vocab_size)

Maximum sentence length for Source language:  22
Source language vocablury size:  2375


Tokenize the Target Language Sentences

In [0]:
# Tokenizer for the target language. The filter string deliberately leaves out
# '<' and '>' so that the '<start>' and '<end>' markers are kept intact.
decoder_tk = keras.preprocessing.text.Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')
decoder_tk.fit_on_texts(decoder_text)  # build the word index from the target sentences
decoder_seq = decoder_tk.texts_to_sequences(decoder_text)  # sentences -> lists of word indices

In [22]:
type(decoder_seq),len(decoder_seq),decoder_seq[100:105]

(list,
 2778,
 [[1, 28, 40, 20, 101, 3, 2],
  [1, 28, 40, 20, 202, 3, 2],
  [1, 6, 762, 22, 2],
  [1, 6, 181, 22, 2],
  [1, 25, 78, 3, 2]])

In [23]:
# Token count of the longest target sentence (includes the <start>/<end> markers)
max_decoder_seq_length = max(map(len, decoder_seq))
print('Maximum sentence length for Target language: ', max_decoder_seq_length)

# Number of distinct words in the target-language tokenizer
decoder_vocab_size = len(decoder_tk.word_index)
print('Target language vocablury size: ', decoder_vocab_size)

Maximum sentence length for Target language:  27
Target language vocablury size:  2973


Compare Encoder and Decoder sentence lengths


In [24]:
#Source Language sentences
print('Length for sentence number 100: ', len(encoder_seq[100]))
print('Length for sentence number 2000: ', len(encoder_seq[2000]))

Length for sentence number 100:  4
Length for sentence number 2000:  5


In [25]:
#Target Language sentences
print('Length for sentence number 100: ', len(decoder_seq[100]))
print('Length for sentence number 2000: ', len(decoder_seq[2000]))

Length for sentence number 100:  7
Length for sentence number 2000:  9


As we can see, the sentences differ in length. All sentences fed to the encoder must share one length, and likewise all sentences fed to the decoder, so we pad each set to its own maximum length.

### Padding in sentences

In [0]:
## Pad every source sequence to the longest source sentence (22 tokens here).
## padding='pre' puts the padding in front, so the real tokens sit at the end
## of each row.
encoder_input_data = keras.preprocessing.sequence.pad_sequences(
    encoder_seq,
    padding='pre',
    maxlen=max_encoder_seq_length,
)

In [0]:
decoder_input_data = tf.keras.preprocessing.sequence.pad_sequences(decoder_seq, 
                                                                   maxlen=max_decoder_seq_length, #27
                                                                   padding='post')
## Padding "post" means data closer to the start point

In [28]:
print('Source data shape: ', encoder_input_data.shape)
print('Target data shape: ', decoder_input_data.shape)

Source data shape:  (2778, 22)
Target data shape:  (2778, 27)


In [29]:
encoder_text[0],type(encoder_input_data)

('Wow!', numpy.ndarray)

In [30]:
encoder_input_data[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 1264],
      dtype=int32)

In [31]:
decoder_text[0],type(decoder_input_data)

('<start> वाह! <end>', numpy.ndarray)

In [32]:
decoder_input_data[0]

array([  1, 750,   2,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0], dtype=int32)

In [0]:
## Reverse lookup (index -> word) for turning decoder predictions back into text
int_to_word_decoder = {index: word for word, index in decoder_tk.word_index.items()}

In [34]:
int_to_word_decoder[15]

'की'

Prepare Decoder Output

In [0]:
#Decoder targets are the decoder inputs shifted one step to the left:
#target[:, t] = input[:, t+1]; the last column stays 0.
#A single slice assignment replaces the element-by-element Python loops.
decoder_target_data = np.zeros(decoder_input_data.shape)
decoder_target_data[:, :-1] = decoder_input_data[:, 1:]

In [36]:
decoder_target_data[0]

array([750.,   2.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.])

In [0]:
#Zero-filled container for the one-hot targets, shaped
#(number of sentences, words per sentence, vocab size + 1)
decoder_target_one_hot = np.zeros(
    decoder_input_data.shape + (len(decoder_tk.word_index) + 1,)
)

In [0]:
#Build one hot encoded array in one vectorized step instead of calling
#to_categorical once per word.
#Clear first so every class vector is fully rewritten, as the loop did.
decoder_target_one_hot[...] = 0.0
#decoder_target_data stores the word indices as floats; cast them to int and
#set a 1 at that position along the last (vocabulary) axis.
np.put_along_axis(decoder_target_one_hot,
                  decoder_target_data.astype(int)[..., np.newaxis],
                  1.0,
                  axis=-1)

In [39]:
decoder_target_one_hot.shape

(2778, 27, 2974)

Training Model

In [0]:
## Model hyper-parameters
encoder_embedding_size = decoder_embedding_size = 50  ## embedding width for both languages
rnn_units = 256  ## the LSTM hidden state (H) and cell state (C) each hold 256 numbers
## The encoder LSTM runs over 22 time steps here because the padded source
## sentences are 22 tokens long; this changes if the sentence length changes.

Build Encoder

<img src="https://raw.githubusercontent.com/atulpatelDS/Machine_Learning/master/Images/seq_to_seq_lstm_encoder.PNG" width="540" height="240" align="left"/>

In [0]:
## Input layer for the encoder.
## The model takes more than one input (encoder and decoder sequences), so the
## functional API is used instead of Sequential.
## shape=(None,) leaves the sequence length open: 22 (the padded source length)
## could be used in place of None, but None accepts sentences of any length.
encoder_inputs = tf.keras.layers.Input(shape=(None,))
## Add embedding layer (its input dimension is vocabulary size + 1)
encoder_embedding = tf.keras.layers.Embedding(encoder_vocab_size+1, encoder_embedding_size)
## Get the embedding layer output by feeding it the encoder input
encoder_embedding_output = encoder_embedding(encoder_inputs)
## Add the LSTM layer (see the image below).
## With return_state=True the call yields three values: the layer output plus
## the hidden state H and the cell state C.
encoder_out,state_h, state_c = tf.keras.layers.LSTM(rnn_units,return_state=True)(encoder_embedding_output)
## Keep H and C together in a list; it is passed to the decoder as its initial state
encoder_states = [state_h,state_c]

In [42]:
state_c,state_h

(<tf.Tensor 'lstm/Identity_2:0' shape=(None, 256) dtype=float32>,
 <tf.Tensor 'lstm/Identity_1:0' shape=(None, 256) dtype=float32>)

Build Decoder

In [0]:
## Decoder input: the padded target sequences.
## shape=(None,) leaves the sequence length open: 27 (the padded target length)
## could be used in place of None, but None accepts sentences of any length.
decoder_inputs = tf.keras.layers.Input(shape=(None,))
decoder_embedding = tf.keras.layers.Embedding(decoder_vocab_size + 1, decoder_embedding_size)
## Embedding layer output
decoder_embedding_output = decoder_embedding(decoder_inputs)
## Decoder LSTM
## return_sequences=True: return the H output of every time step.
## return_state=True: additionally return the last H and C.
decoder_rnn = tf.keras.layers.LSTM(rnn_units, return_sequences=True, return_state=True)
## Decoder LSTM output. The encoder's H and C are passed in as initial_state;
## the two returned states are discarded here.
decoder_output,_,_ = decoder_rnn(decoder_embedding_output, initial_state=encoder_states)
## Dense softmax layer with vocabulary size + 1 outputs
decoder_dense =tf.keras.layers.Dense(decoder_vocab_size + 1, activation='softmax')
## Dense layer output
decoder_outputs = decoder_dense(decoder_output)

In [44]:
decoder_inputs,decoder_embedding,decoder_embedding_output,decoder_rnn,decoder_output,decoder_dense,decoder_outputs

(<tf.Tensor 'input_2:0' shape=(None, None) dtype=float32>,
 <tensorflow.python.keras.layers.embeddings.Embedding at 0x7f89601782b0>,
 <tf.Tensor 'embedding_1/Identity:0' shape=(None, None, 50) dtype=float32>,
 <tensorflow.python.keras.layers.recurrent_v2.LSTM at 0x7f89601786d8>,
 <tf.Tensor 'lstm_1/Identity:0' shape=(None, None, 256) dtype=float32>,
 <tensorflow.python.keras.layers.core.Dense at 0x7f896cbe99e8>,
 <tf.Tensor 'dense/Identity:0' shape=(None, None, 2974) dtype=float32>)

<img src="https://raw.githubusercontent.com/atulpatelDS/Machine_Learning/master/Images/seq_to_seq_lstm_decoder_with_value.PNG" width="700" height="240" align="left"/>

<img src="https://raw.githubusercontent.com/atulpatelDS/Machine_Learning/master/Images/seq_to_seq_lstm_decoder.PNG" width="700" height="240" align="left"/>

Build seq2seq Model using Both Encoder and Decoder

In [0]:
model = tf.keras.models.Model([encoder_inputs, decoder_inputs], #2 Inputs to the model
                              decoder_outputs) #Output of the model

<img src="https://raw.githubusercontent.com/atulpatelDS/Machine_Learning/master/Images/seq_to_seq_lstm.PNG" width="540" height="240" align="left"/>

In [46]:
model.input

[<tf.Tensor 'input_1:0' shape=(None, None) dtype=float32>,
 <tf.Tensor 'input_2:0' shape=(None, None) dtype=float32>]

In [47]:
model.output

<tf.Tensor 'dense/Identity:0' shape=(None, None, 2974) dtype=float32>

In [0]:
model.compile(optimizer="adam",loss="categorical_crossentropy")

Train The Model

In [49]:
## English-Source Lang sentence length = 22
## Hindi-Target Lang Sentence Length = 27
## Total count of sentences pairs (English to Hindi) : 2778
## Target Language Vocabulary size+1-- One Hot Encoding :  2974
encoder_input_data.shape,decoder_input_data.shape,decoder_target_one_hot.shape ## Target Language Vocabulary size+1:  2974

((2778, 22), (2778, 27), (2778, 27, 2974))

In [50]:
model.fit([encoder_input_data,decoder_input_data],decoder_target_one_hot,
          batch_size=64,
          epochs=1000,
          validation_split=0.2)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

<tensorflow.python.keras.callbacks.History at 0x7f896005deb8>

In [51]:
## Save the trained model.
## NOTE: despite the ".hd5" suffix, the logged output below shows assets being
## written under this path, i.e. the model is stored as a directory rather than
## as a single HDF5 file.
model.save("model/seq2seq_language_translation_lstm.hd5")

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: model/seq2seq_language_translation_lstm.hd5/assets


## Prediction Model

During training the decoder receives the Hindi (target) sentence as input, but at prediction time that sentence is not available, so we cannot simply call `model.predict` on the training model. We need to build separate prediction models.

Build the encoder Model to predict the Encoder State

<img src="https://raw.githubusercontent.com/atulpatelDS/Machine_Learning/master/Images/seq_to_seq_lstm_pred_encoder.PNG" width="440" height="190" align="left"/>

In [0]:
encoder_model = tf.keras.models.Model(encoder_inputs, #Padded input sequences
                                      encoder_states) #Hidden state and Cell state at last time step
## The encoder model reuses the encoder input and states from the training graph;
## only its output differs (the [H, C] states instead of the decoder predictions).

**Build the decoder model**
1. Define input for both H and C state 
2. Get Decoder LSTM output along with H and C state
3. Get Decoder Dense layer output
4. Build Model

***Define input for both H and C state***

In [0]:
#Hidden state input
decoder_state_input_h = tf.keras.layers.Input(shape=(rnn_units,))

#Cell state input
decoder_state_input_c = tf.keras.layers.Input(shape=(rnn_units,))

#Putting it together
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

Decoder model inputs and outputs for prediction

<img src="https://raw.githubusercontent.com/atulpatelDS/Machine_Learning/master/Images/seq_to_seq_lstm_pred_input_output_decoder.PNG" width="400" height="200" align="left"/>

***Get Decoder LSTM output along with H and C state***

In [0]:
#Get the embedding layer output (same layer call that produced decoder_embedding_output)
x = decoder_embedding(decoder_inputs)
#Reuse the decoder LSTM layer that was trained earlier
## return_sequences=True: return the H output of every time step.
## return_state=True: additionally return the last H and C.
## NOTE: this rebinds the names state_h / state_c that the encoder cell assigned.
rnn_outputs, state_h, state_c = decoder_rnn(x, initial_state=decoder_states_inputs)
## In the training graph the decoder's returned H and C were discarded. Here they
## are kept, because each prediction step feeds the previous step's H and C back
## in as the initial state (see the image below).
decoder_states = [state_h, state_c]


<img src="https://raw.githubusercontent.com/atulpatelDS/Machine_Learning/master/Images/seq_to_seq_lstm_pred_decoder_H_C.PNG" width="540" height="240" align="left"/>

***Get Decoder Dense layer output***

In [0]:
#decoder_dense = tf.keras.layers.Dense(decoder_vocab_size+1,activation="softmax")
## Dense Layer Output
decoder_outputs = decoder_dense(rnn_outputs)

***Build Decoder Model***

In [56]:
decoder_inputs,decoder_states_inputs,decoder_outputs,decoder_states

(<tf.Tensor 'input_2:0' shape=(None, None) dtype=float32>,
 [<tf.Tensor 'input_3:0' shape=(None, 256) dtype=float32>,
  <tf.Tensor 'input_4:0' shape=(None, 256) dtype=float32>],
 <tf.Tensor 'dense_1/Identity:0' shape=(None, None, 2974) dtype=float32>,
 [<tf.Tensor 'lstm_1_1/Identity_1:0' shape=(None, 256) dtype=float32>,
  <tf.Tensor 'lstm_1_1/Identity_2:0' shape=(None, 256) dtype=float32>])

In [0]:
decoder_model = tf.keras.models.Model([decoder_inputs] + decoder_states_inputs,  #Model inputs
                                      [decoder_outputs] + decoder_states)

***Build Prediction Function***

In [58]:
np.zeros((1,1)).shape

(1, 1)

In [0]:
target_text_seq = np.zeros((1,1))

In [60]:
type(target_text_seq)

numpy.ndarray

In [61]:
decoder_tk.word_index["<start>"]

1

In [0]:
target_text_seq[0][0] = decoder_tk.word_index["<start>"]

In [63]:
target_text_seq[0][0] 

1.0

In [64]:
## Lets check what is the word for index 1
int_to_word_decoder[1]

'<start>'

In [0]:
## Prediction function which runs both the encoder and the decoder model
def decoder_output_sentence(input_lang_sequence):
  """Greedily translate one padded source sequence into a target sentence.

  The encoder model turns the input into the initial decoder states (h, c).
  The decoder model is then called one word at a time, starting from
  '<start>'; the most probable word and the returned states of each step are
  fed into the next step.

  Args:
    input_lang_sequence: padded source sequence(s) passed to
      ``encoder_model.predict``. It is expected to hold a single sentence
      (e.g. ``encoder_input_data[i : i+1]``), because the decoder input is
      built with batch size 1.

  Returns:
    The predicted words joined by single spaces, without the '<start>' and
    '<end>' markers. An empty string if the very first prediction already
    ends the sentence.
  """
  ## Sentence embedding from the encoder: the [h, c] states that seed the decoder
  decoder_states_value = list(encoder_model.predict(input_lang_sequence))
  ## The decoder receives one token per step; the first one is always <start>
  target_text_seq = np.zeros((1,1))
  target_text_seq[0][0] = decoder_tk.word_index["<start>"]

  predicted_words = []
  ## Bounded loop with the same cap as the former while-loop:
  ## at most max_decoder_seq_length + 1 words are emitted.
  for _ in range(max_decoder_seq_length + 1):
    predicted_outputs,h,c = decoder_model.predict([target_text_seq]+decoder_states_value)
    ## Index of the most probable word at the last (only) time step
    predicted_output = int(np.argmax(predicted_outputs[0,-1,:]))
    ## The dense layer has vocab size + 1 outputs, but int_to_word_decoder only
    ## contains the indices from decoder_tk.word_index. An index without a word
    ## (the padding class 0) used to raise KeyError; it now ends the sentence.
    predicted_word = int_to_word_decoder.get(predicted_output)
    if predicted_word is None or predicted_word == "<end>":
      break
    predicted_words.append(predicted_word)
    ## The predicted word and the new states become the next step's input
    target_text_seq[0][0] = predicted_output
    decoder_states_value = [h,c]
  return " ".join(predicted_words)

***Call the Prediction function using any random sentence***

In [66]:
#Pick a random starting index (the upper bound stays 10 short of the sentence count)
start_num = np.random.randint(0, high=len(encoder_text) - 10)

#Translate the 5 consecutive sentences beginning at that index
sample_texts = encoder_text[start_num : start_num + 5]
sample_rows = encoder_input_data[start_num : start_num + 5]
for source_text, source_row in zip(sample_texts, sample_rows):
    input_sequence = source_row[np.newaxis, :]  # restore the batch axis: shape (1, seq_len)
    predicted_sentence = decoder_output_sentence(input_sequence)
    print('--------')
    print ('Input sentence: ', source_text)
    print ('Predicted sentence: ', predicted_sentence )

--------
Input sentence:  Let me introduce my wife.
Predicted sentence:  यह मेरी पत्नी है।
--------
Input sentence:  Let me know your address.
Predicted sentence:  मुझे अपना पता बतादेना।
--------
Input sentence:  My father died of cancer.
Predicted sentence:  मेरे पिताजी कैंसर से चल बसे।
--------
Input sentence:  Please wait five minutes.
Predicted sentence:  कृपया पाँच मिनट ठहरिए।
--------
Input sentence:  She is an obstinate girl.
Predicted sentence:  वह एक ज़िद्दी लड़की है।


***Save encoder and Decoder Model***

In [67]:
#Compile each inference model before saving (done to avoid an error), then save it
for inference_model, save_path in (
        (encoder_model, 'model/seq2seq_encoder_eng_hin.hd5'),  #Encoder model
        (decoder_model, 'model/seq2seq_decoder_eng_hin.hd5'),  #Decoder model
):
    inference_model.compile(optimizer='adam',loss='categorical_crossentropy')
    inference_model.save(save_path)

INFO:tensorflow:Assets written to: model/seq2seq_encoder_eng_hin.hd5/assets
INFO:tensorflow:Assets written to: model/seq2seq_decoder_eng_hin.hd5/assets


***Save Encoder and Decoder Tokenizer***

In [0]:
## The tokenizers must be saved because the word indexing cannot change once
## the model has been trained.
import pickle

## `with` closes (and flushes) each file as soon as the dump is done; the former
## inline open(...) calls left both handles open.
with open('model/encoder_tokenizer_eng','wb') as encoder_tk_file:
    pickle.dump(encoder_tk, encoder_tk_file)
with open('model/decoder_tokenizer_hin','wb') as decoder_tk_file:
    pickle.dump(decoder_tk, decoder_tk_file)