# Overview
This codelab will demonstrate how to build a LSTM model for MNIST recognition using keras & how to convert the model to TensorFlow Lite.

---



In [None]:
#!pip install tf-nightly

### Prerequisites
We're going to override the environment variable `TF_ENABLE_CONTROL_FLOW_V2` since for TensorFlow Lite control flows.

It needs tensorflow version == 1.14.0 to be able to support the LSTM and Desne layers in tf.lite.

In [1]:
# TF_ENABLE_CONTROL_FLOW_V2 -----> This needs to be overriden and enabled.
import os
os.environ['TF_ENABLE_CONTROL_FLOW_V2'] = '1'

import ssl
ssl._create_default_https_context = ssl._create_unverified_context

import numpy as np
import json
import numpy
import pandas as pd
import tensorflow as tf
import keras
from keras import losses
from keras import optimizers
from keras.callbacks import Callback
from keras.layers import Dense
from keras.layers import LSTM, Bidirectional, BatchNormalization, SimpleRNN
from keras.layers import Embedding
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
stop_words = set(stopwords.words('english'))
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
import re
import matplotlib.pyplot as plt
numpy.random.seed(7)
import matplotlib.pyplot as plt
%matplotlib inline

Using TensorFlow backend.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/vsatpathy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Embeddings

Embedding is a requirement for the weight matrix as an initializer to the Embedding layer in the model.
One can add any text file of pre-trained embedding such as Glove.

This returns 2 values:

    Word vocabulary
    Embedding matrix corresponding to every word

In [2]:
# GLOVE--EMBEDDING
def read_data(file_name):
    with open(file_name,'r') as f:
        word_vocab = set() # not using list to avoid duplicate entry
        word2vector = {}
        for line in f:
            line_ = line.strip() #Remove white space
            words_Vec = line_.split()
            word_vocab.add(words_Vec[0])
            word2vector[words_Vec[0]] = numpy.array(words_Vec[1:],dtype=float)
    print("Total Words in DataSet:",len(word_vocab))
    return word_vocab,word2vector

word_vocab,w2v = read_data('glove.6B.100d.txt')

Total Words in DataSet: 400000


# Pre-Process

The pre-processing can vary user to user.

    1. Conversion into lower text.
    2. Removal of stop words.
    3. Removal of single characters.
    4. Removal of white spaces.

These are the few examples to the same.

In [3]:
def helper(text):
    dummy=[]
    for word in text:
        dummy.append(str(word))
    final=' '.join(dummy)
    return final

# Note

Keep the stopwords while training for intent classification.

In case of auto-correction stopwords need to be removed.

In [4]:
def preprocess(text):
    text=str(text)
    text=text.split(" ")
    text=helper(text)
    text = str(text.lower())
    # Remove all the special characters
    text = re.sub(r'\W', ' ', text)
    #text = re.sub(r'[^a-zA-Z ]+', '', text)
    # remove all single characters
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    # Remove single characters from the start
    text = re.sub(r'\^[a-zA-Z]\s+', ' ', text)
    # Substituting multiple spaces with single space
    text = re.sub(r'\s+', ' ', text, flags=re.I)
    #tokenize the texts using nltk
    text = word_tokenize(text)
    
    #######   STOPWORDS.   #######
    #text = [word for word in text if word not in stop_words]
    #Lemmatize the words
    word_net_lemmatizer = WordNetLemmatizer()
    text = [word_net_lemmatizer.lemmatize(word) for word in text]
    text = ' '.join(text)
    return text

# Data preparation

This process involves:

    1. Reading of data from excel
    2. Encoding the labels
    3. Creating one unanimous DataFrame
    4. Segregating into x,y variables for passing into the model
    5. Tokenizing the input sequences
    6. Padding the sequences for constant input length to the model

In [5]:
xls=pd.ExcelFile('ML Data set.xlsx')
df=pd.read_excel(xls,'Intent Training Set')

labels=[]
for col in df.columns:
    labels.append(col)
print(labels)
master_values={}
dummy=0
cor_word={}
for lab in labels:
    master_values[lab]=df[lab].values
    cor_word[dummy]=lab
    dummy+=1

flag=0
concat_values=[]
corres_labels=[]
for key,values in master_values.items():
    concat_values.extend(master_values[key])
    for j in range(len(values)):
        corres_labels.append(flag)
    flag+=1
    
final_data=pd.DataFrame({'text':concat_values,'feature':corres_labels})

['log_medication', 'content _search_details ', 'content _search_pricing', 'content _search_routine', 'content _search_safety', 'content _search_support', 'user_search_instances', 'user_search_quantity', 'user_search_last_instance', 'native_search']


In [6]:
final_data.text = final_data.text.apply(preprocess)

X = final_data.text
y = final_data.feature

In [7]:
max_length = 20

tokenizer = Tokenizer()
tokenizer.fit_on_texts(final_data.text)

X = tokenizer.texts_to_sequences(X)
X = pad_sequences(X, maxlen=max_length, padding='post')

num_words = len(tokenizer.word_index) + 1
embedding_matrix = numpy.zeros((num_words, 25))

# Generation of embedding matrix

We check if the existing word exists in the pre-trained vocab.

    if True:
        Add it to the embedding matrix for the corresponding word.
    else:
        pass it as dummy matrix.

In [8]:
dummy_matrix = numpy.zeros(shape = (25,))

for word,i in tokenizer.word_index.items():
    try:
        embedding_vector = w2v[word]
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    except:
        embedding_matrix[i] = dummy_matrix

## Step 1 Build the MNIST LSTM model.

Note we will be using **`tf.lite.experimental.nn.TFLiteLSTMCell`** & **`tf.lite.experimental.nn.dynamic_rnn`**

As the tflite wrapper doesn't directly support the LSTM layers of keras, hence a function is placed for the manual addition of LSTM layers with the use of tf.lit.experimental package.

For more canonical lstm codelab, please see [here](https://github.com/kerasteam/keras/blob/master/examples/imdb_lstm.py).


In [9]:
def buildLstmLayer(inputs, num_layers, num_units):
  """Build the lstm layer.

  Args:
    inputs: The input data.
    num_layers: How many LSTM layers do we want.
    num_units: The unmber of hidden units in the LSTM cell.
    
  """
  lstm_cells = []
  for i in range(num_layers):
    lstm_cells.append(
        tf.lite.experimental.nn.TFLiteLSTMCell(
            num_units, forget_bias=0, name='rnn{}'.format(i)))
  lstm_layers = tf.keras.layers.StackedRNNCells(lstm_cells)
  # Assume the input is sized as [batch, time, input_size], then we're going
  # to transpose to be time-majored.
  transposed_inputs = tf.transpose(
      inputs, perm=[1, 0, 2])
  outputs, _ = tf.lite.experimental.nn.dynamic_rnn(
      lstm_layers,
      transposed_inputs,
      dtype='float32',
      time_major=True)
  unstacked_outputs = tf.unstack(outputs, axis=0)
  return unstacked_outputs[-1]

#tf.reset_default_graph()
model = tf.keras.models.Sequential([
  tf.keras.layers.Input(shape=(X.shape[1],), name='input'),
  tf.keras.layers.Embedding(num_words , 25,weights=[embedding_matrix],trainable=True,input_length=X.shape[1]),
  tf.keras.layers.Lambda(buildLstmLayer, arguments={'num_layers' : 2, 'num_units' : 64}),
  tf.keras.layers.Flatten(),
  tf.keras.layers.Dense(len(labels), activation=tf.nn.softmax, name='output')
])
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 25)            4225      
_________________________________________________________________
lambda (Lambda)              (None, 64)                56064     
_________________________________________________________________
flatten (Flatten)            (None, 64)                0         
_________________________________________________________________
output (Dense)               (None, 10)                650       
Total params: 60,939
Trainable params: 60,939
Non-trainable params: 0
_________________________________________________________________


## Step 2: Train & Evaluate the model.

The data is then split into training data and testing data.

Things to lookout for are the hyper parameters that need to be amended basis on the dataset size and the variance in the same. Currently used hyper parameters are:

    1. Epochs
    2. Batch size
    
Other hyper parameters that can come into play:

    1. Learning rate
    2. Decay ratio
    3. Neurons per LSTM
    
The model training begins.

In [10]:
x_train, x_test,y_train, y_test = train_test_split(X,y,test_size=0.2)
#Addition of early stop
early_stop=keras.callbacks.callbacks.EarlyStopping(monitor='val_acc',min_delta=0.002, patience=15)
model.fit(X, y, epochs=200,batch_size=16,validation_data=(x_test,y_test),verbose=1,callbacks=[early_stop])

Train on 430 samples, validate on 86 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200


<tensorflow.python.keras.callbacks.History at 0x14ad89f60>

## Step 3: Convert the Keras model to TensorFlow Lite model.

Note here: we convert to TensorFlow Lite model and export it to the pre-defined path.

In [11]:
sess = tf.keras.backend.get_session()
input_tensor = sess.graph.get_tensor_by_name('input:0')
output_tensor = sess.graph.get_tensor_by_name('output/Softmax:0')
converter = tf.lite.TFLiteConverter.from_session(
    sess, [input_tensor], [output_tensor])
tflite = converter.convert()
open("tf_models/intent_class_update_1.tflite","wb").write(tflite)
print('Model converted successfully!')

Instructions for updating:
Use `tf.compat.v1.graph_util.convert_variables_to_constants`
Instructions for updating:
Use `tf.compat.v1.graph_util.extract_sub_graph`
INFO:tensorflow:Froze 27 variables.
INFO:tensorflow:Converted 27 variables to const ops.
Instructions for updating:
Use `tf.compat.v1.graph_util.remove_training_nodes`
Model converted successfully!


## Step 4: Check the converted TensorFlow Lite model.

We're just going to load the TensorFlow Lite model and use the TensorFlow Lite python interpreter to verify the results.

The steps involved constitute of the following:

    1. Giving the input text for testing
    2. Passing it in sets of 2 words to the auto_correct function
    3. Replicating the pre-processing methodology as used before
    4. Loading the tflite model in ints interpreter.
    5. Passing into the model for prediction
    6. Restitching the corrected texts for output
    
The algorithm can be amended to the users liking. But the steps involved remains the same.

In [12]:
def auto_correct(text,tf_lite_model):
    data = preprocess(text)
    #print("data: ----->",data)
    tokenized = tokenizer.texts_to_sequences([data])
    #print("tokenized: ----->", tokenized)
    padded = pad_sequences(tokenized, maxlen=max_length, padding='post')
    ip = np.array(padded,dtype="float32")
    #print("ip: ----->",ip)
    
    interpreter = tf.lite.Interpreter(model_content=tf_lite_model)
    
    try:
        interpreter.allocate_tensors()
    except ValueError:
        assert False
    
    dummy=[[]]
    if tokenized!=dummy:
        input_index = (interpreter.get_input_details()[0]['index'])
        print(input_index,ip)
        interpreter.set_tensor(input_index,ip)
        interpreter.invoke()
        output_index = (interpreter.get_output_details()[0]['index'])
        result = interpreter.get_tensor(output_index)
        
        # Reset all variables so it will not pollute other inferences.
        interpreter.reset_all_variables()
        return result
    else:
        return dummy[0]

In [13]:
def get_correct_word(shortlisted):
    dummy=[]
    for word in shortlisted:
        if word not in dummy:
            dummy.append(word)
    final_text=(" ").join(dummy)
    return final_text

In [None]:
sentence_to_autocorrect = 'i want to log m gallery'
print("input: -----> ",sentence_to_autocorrect)
words = sentence_to_autocorrect.split(' ')
shortlisted=[]

for i in range(len(words)-1):
    sub_text = words[i],words[i+1]
    results = auto_correct(str(sub_text),tflite)
    if len(results)>0:
        shortlisted.append(cor_word[np.argmax(results)])
    else:
        shortlisted.extend(sub_text)
    
corrected=get_correct_word(shortlisted)
print("output: -----> ",corrected)

# Inferencing for intent

In [16]:
input_text="screen is too bright"
data = preprocess(input_text)
#print("data: ----->",data)
tokenized = tokenizer.texts_to_sequences([data])
#print("tokenized: ----->", tokenized)
padded = pad_sequences(tokenized, maxlen=max_length, padding='post')
ip = np.array(padded,dtype="float32")
print("ip: ----->",ip)

interpreter = tf.lite.Interpreter(model_content=tflite)

try:
    interpreter.allocate_tensors()
except ValueError:
    assert False
    
input_index = (interpreter.get_input_details()[0]['index'])
interpreter.set_tensor(input_index,ip)
interpreter.invoke()
output_index = (interpreter.get_output_details()[0]['index'])
result = interpreter.get_tensor(output_index)
print(result)
print("Intent: ",labels[np.argmax(result[0])])
# Reset all variables so it will not pollute other inferences.
interpreter.reset_all_variables()

ip: -----> [[120.  20.  45. 161.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.]]
[[2.2154201e-02 1.4697796e-01 1.1103599e-02 8.6035654e-03 1.3480875e-02
  6.9347718e-03 1.0445220e-04 9.6677151e-03 5.0666803e-03 7.7590621e-01]]
Intent:  native_search


# Exporting
The vocabulary of the new words are to be saved in a .json format.

In [15]:
import json
with open('intent_class_embeddings_1.json', 'w') as f: 
    json.dump(tokenizer.word_index,f)