# Import all dependencies

This cell imports every dependency used in the notebook, including the Keras layers needed for model construction and the NLTK resources used for language pre-processing.

In [1]:
import ssl
# Replace the default HTTPS context factory with the unverified one, which
# disables certificate verification for this process — presumably so the
# nltk.download calls below work without a valid CA bundle.
# NOTE(review): this uses private ssl attributes and affects every HTTPS
# request made from this kernel; confirm it is still required.
ssl._create_default_https_context = ssl._create_unverified_context

import json
import numpy
import pandas as pd
from keras import losses
from keras import optimizers
from keras.callbacks import Callback
from keras.layers import Dense
from keras.layers import LSTM, Bidirectional
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
# The stop-word list must be downloaded before stopwords.words() can read it.
nltk.download('stopwords')
# English stop words as a set; preprocess() filters tokens against it.
stop_words = set(stopwords.words('english'))
nltk.download('wordnet')
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer
import re
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from keras.models import load_model

# Seed numpy's global random generator.
numpy.random.seed(7)

Using TensorFlow backend.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vsatpathy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/vsatpathy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/vsatpathy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Embeddings

Pre-trained word embeddings are used to build the weight matrix that initialises the Embedding layer of the model.
Any text file of pre-trained embeddings, such as GloVe, can be used.

This returns 2 values:

    Word vocabulary
    Embedding matrix corresponding to every word

In [2]:

# GLOVE--EMBEDDING
def read_data(file_name):
    """Load pre-trained word vectors from a GloVe-style text file.

    Every non-blank line is expected to hold a token followed by its
    whitespace-separated vector components.

    Parameters
    ----------
    file_name : str
        Path to the embedding text file.

    Returns
    -------
    tuple
        ``(word_vocab, word2vector)``: the set of tokens and a dict mapping
        each token to a float ``numpy`` array built from the rest of its line.
    """
    word_vocab = set()  # a set, so a repeated token is stored only once
    word2vector = {}
    # Read as UTF-8 explicitly: the platform default codec can fail on
    # non-ASCII tokens in the embedding file.
    with open(file_name, 'r', encoding='utf-8') as f:
        for line in f:
            words_Vec = line.split()  # split() also discards surrounding whitespace
            if not words_Vec:
                continue  # blank line: nothing to index
            word = words_Vec[0]
            word_vocab.add(word)
            word2vector[word] = numpy.array(words_Vec[1:], dtype=float)
    print("Total Words in DataSet:", len(word_vocab))
    return word_vocab, word2vector

# Load the pre-trained vectors; w2v maps each word to its vector and is used
# later to fill the embedding matrix.
# NOTE(review): the file name suggests 100-dimensional vectors, while the
# embedding matrix below is created with 25 columns — confirm the two agree.
word_vocab,w2v = read_data('glove.6B.100d.txt')

Total Words in DataSet: 400000


# Pre-Process

The pre-processing steps can vary from user to user.

    1. Conversion into lower text.
    2. Removal of stop words.
    3. Removal of single characters.
    4. Removal of white spaces.

These are a few examples of such steps.

In [3]:
def helper(text):
    """Return the items of ``text``, each converted with ``str``, joined by single spaces."""
    return ' '.join(str(word) for word in text)
## While preprocessing total word count in corpus is stored
# NOTE(review): nothing in the visible cells appends to or reads this list —
# confirm whether it is still needed.
word_count = []
def preprocess(text, verbose=True):
    """Normalise a raw utterance into a space-separated string of tokens.

    Steps: lower-case, drop every character that is not a letter or space,
    remove single-letter words, collapse whitespace, tokenise with NLTK and
    filter out English stop words.

    Parameters
    ----------
    text : object
        Raw input; converted with ``str`` before processing.
    verbose : bool, optional
        When true (the default) the cleaned text is printed.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (may be empty).
    """
    text = str(text).lower()
    # Turn non-word characters into spaces, then delete whatever is still not
    # a letter or a space (digits and underscores vanish without a space).
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'[^a-zA-Z ]+', '', text)
    # Remove single characters that have whitespace on both sides
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    # Remove a single character at the start of the text. The anchor must be
    # an unescaped ^; an escaped caret only matches a literal '^', which the
    # substitutions above have already removed.
    text = re.sub(r'^[a-zA-Z]\s+', ' ', text)
    # Collapse runs of whitespace into one space
    text = re.sub(r'\s+', ' ', text)
    # Tokenise with nltk and drop stop words
    tokens = [word for word in word_tokenize(text) if word not in stop_words]
    cleaned = ' '.join(tokens)
    if verbose:
        print(cleaned)
    return cleaned

# Data preparation

This process involves:

    1. Reading of data from excel
    2. Encoding the labels
    3. Creating one unified DataFrame
    4. Segregating into x,y variables for passing into the model
    5. Tokenizing the input sequences
    6. Padding the sequences for constant input length to the model

In [9]:
# Each column of the sheet is treated as one class: the column header is the
# label and the cells below it are the example texts for that label.
df = pd.read_excel('USBank_Intent - RM.xlsx')

labels = list(df.columns)
print(labels)

# Columns of unequal length are padded with NaN by pandas; drop those cells so
# they do not turn into literal "nan" training samples once passed through str().
master_values = {lab: df[lab].dropna().values for lab in labels}

# Class index -> label name, in column order
cor_word = dict(enumerate(labels))

# Flatten the per-label examples into parallel lists of texts and class indices
concat_values = []
corres_labels = []
for class_index, lab in enumerate(labels):
    examples = master_values[lab]
    concat_values.extend(examples)
    corres_labels.extend([class_index] * len(examples))

final_data = pd.DataFrame({'text': concat_values, 'feature': corres_labels})

['Show Bill', 'Show Amount', 'Pay ', 'Payment mode', 'Confusion', 'Minimum due', 'Recurring payments', 'Add credit card', 'Add biller', 'Corporate']


In [10]:
# Clean every utterance, then expose the model inputs and integer targets.
final_data['text'] = final_data['text'].apply(preprocess)
X, y = final_data['text'], final_data['feature']

hey show bills
show bills
hey many bills
hey bills pay immediately
want pay bills
bills mine due
many bills
much excel energy bill
bill amount excel energy
much excel energy
much pay excel energy
much pay excel energy
due amount
show details bill
okay pay bill
pay
pay excel energy today
pay electricity bill
okay pay bill today
want pay bill
lets pay bill
pay using savings account
pay credit card
pay credit card
pay savings account
pay checking account
pay using credit card
pay using savings account
understand
would

mean
understand
would

pay minimum balance credit card
pay minimum card
minimum avoid interest card
pay minimum due
lets pay minimum amount possible
pay minimum avoid interest
pay minimum balance credit card
set th every month
yes set nd
set th
set th
set auto pay
set pay date th every month
setup recurring payment
want add credit card
add credit card
please add credit card
add new credit card
add credit card
please add credit card
add new credit card
add new biller
want ad

In [11]:
# Every encoded utterance is brought to this fixed length.
max_length = 10

# Build the word -> integer index vocabulary from the cleaned corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(final_data.text)

# Encode the texts as integer sequences and pad them at the end.
X = pad_sequences(tokenizer.texts_to_sequences(X), maxlen=max_length, padding='post')

# One row more than the vocabulary size (presumably because Keras word
# indices start at 1, leaving row 0 for the padding value — TODO confirm).
num_words = 1 + len(tokenizer.word_index)
# NOTE(review): the matrix is 25 columns wide, while the vectors are loaded
# from 'glove.6B.100d.txt' — confirm the widths agree before relying on the
# pre-trained initialisation.
embedding_matrix = numpy.zeros(shape=(num_words, 25))

# Generation of embedding matrix

For every word in the corpus vocabulary, we check whether it exists in the pre-trained vocab.

    if True:
        Add it to the embedding matrix for the corresponding word.
    else:
        leave its row in the embedding matrix as zeros.

In [12]:
# Copy the pre-trained vector of every corpus word into its row of the matrix.
# Rows of words without a usable vector keep their initial values.
embedding_width = embedding_matrix.shape[1]
missing_words = []     # not present in the pre-trained vocabulary
mismatched_words = []  # present, but the vector length differs from the matrix width
for word, i in tokenizer.word_index.items():
    embedding_vector = w2v.get(word)
    if embedding_vector is None:
        missing_words.append(word)
    elif len(embedding_vector) != embedding_width:
        mismatched_words.append(word)
    else:
        embedding_matrix[i] = embedding_vector

# Report what was skipped instead of hiding it behind a bare ``except``:
# a width mismatch would otherwise leave the whole matrix untouched unnoticed.
if missing_words:
    print("Words without a pre-trained vector:", len(missing_words))
if mismatched_words:
    print("WARNING:", len(mismatched_words),
          "pre-trained vectors were skipped because their length does not match",
          "the embedding matrix width of", embedding_width)

# Call back created

If the model takes too long to train, or training has already reached the desired accuracy, a callback can be used to terminate the training process early. The callback below only stops training; the model is saved separately in the export cells.

In [13]:
# Training is stopped once the reported accuracy exceeds this value.
ACCUSRACY_THRESHOLD = .98

class Call_back(Callback):
    """Keras callback that halts training once accuracy passes ACCUSRACY_THRESHOLD."""

    def on_epoch_end(self, epoch, logs=None):
        """Request a stop if this epoch's accuracy is above the threshold.

        Parameters
        ----------
        epoch : int
            Index of the epoch that just finished (unused).
        logs : dict or None
            Metric values for the epoch; may be ``None`` or lack an accuracy entry.
        """
        logs = logs or {}
        # The model in this notebook reports 'acc'; fall back to 'accuracy'
        # in case the metric was registered under that name instead.
        accuracy = logs.get('acc', logs.get('accuracy'))
        # Skip the comparison when no accuracy was logged rather than
        # comparing None with a float.
        if accuracy is not None and accuracy > ACCUSRACY_THRESHOLD:
            print('Reached accuracy')
            self.model.stop_training = True

callback = Call_back()

# Model Architecture

The hyperparameters should be adjusted to the size of the dataset and the variance within it. The hyperparameters currently set are:

    1. Epochs
    2. Batch size
    
Other hyper parameters that can come into play:

    1. Learning rate
    2. Decay ratio
    3. Neurons per LSTM

In [15]:
# Architecture: Embedding (initialised from embedding_matrix and trainable)
# -> bidirectional LSTM -> softmax over the labels.
embedding_model = Sequential()
embedding_model.add(Embedding(
    num_words,
    embedding_matrix.shape[1],  # take the width from the matrix instead of repeating the literal
    weights=[embedding_matrix],
    trainable=True,
    input_length=X.shape[1],
))
embedding_model.add(Bidirectional(LSTM(16)))
embedding_model.add(Dense(len(labels), activation='softmax'))
# Targets are integer class indices, hence the sparse categorical loss.
embedding_model.compile(optimizer=optimizers.Adam(),
              loss=losses.sparse_categorical_crossentropy,
              metrics=['acc'])
embedding_model.summary()

# To stop early once the accuracy threshold is reached, add callbacks=[callback].
history = embedding_model.fit(X, y, epochs=100, batch_size=2)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 10, 25)            1275      
_________________________________________________________________
bidirectional_2 (Bidirection (None, 32)                5376      
_________________________________________________________________
dense_2 (Dense)              (None, 10)                330       
Total params: 6,981
Trainable params: 6,981
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
E

Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


# Post processing and Testing

The steps involved are as follows:

    1. Giving the input text for testing
    2. Passing it in sets of 2 words to the prediction function
    3. Replicating the pre-processing methodology as used before
    4. Passing into the model for prediction
    5. Restitching the corrected texts for output
    
The algorithm can be amended to the user's liking, but the steps involved remain the same.

In [16]:
def prediction(test_word,max_length=max_length,tokenizer=tokenizer):
    """Run the trained model on one piece of text.

    The text is cleaned with ``preprocess``, encoded with ``tokenizer`` and
    padded at the end to ``max_length`` before being passed to
    ``embedding_model.predict``.

    Returns the model's prediction, or an empty list when the encoded
    sequence is empty.
    """
    cleaned = [preprocess(test_word)]
    sequences = tokenizer.texts_to_sequences(cleaned)
    # Nothing to feed the model: signal it with an empty list
    if sequences == [[]]:
        return []
    padded = pad_sequences(sequences, maxlen=max_length, padding='post')
    return embedding_model.predict(padded)

In [17]:
def stitch(final_text):
    """Join the words of ``final_text`` with spaces, keeping only the first occurrence of each."""
    # dict keys preserve insertion order, so this keeps first occurrences in order
    unique_words = dict.fromkeys(final_text)
    return " ".join(unique_words)

In [18]:
input_text='pay with my credit card'
window_size=1
updated_text=input_text.split()
final_text=[]

# Slide over every pair of words that are window_size positions apart.
for i in range(len(updated_text) - window_size):
    test_word = " ".join((updated_text[i], updated_text[i+window_size]))
    prob = prediction(test_word, max_length)
    # A non-empty prediction replaces the pair with the predicted label name;
    # otherwise the original pair is kept.
    if len(prob) > 0:
        test_word = cor_word[numpy.argmax(prob)]
    final_text.extend(test_word.split(" "))

corrected = stitch(final_text)
print('Input text: ',input_text)
print('Corrected text: ',corrected)

pay

credit
credit card
Input text:  pay with my credit card
Corrected text:  Pay  with my Confusion Payment mode


# Testing for intent_classification

In [21]:
input_text_1="Pay the minimum balance on my credit card"
prob=prediction(input_text_1,max_length)
# prediction() returns an empty list when the cleaned text encodes to an
# empty sequence; indexing prob[0] would then raise IndexError.
if len(prob) > 0:
    print(labels[numpy.argmax(prob[0])])
else:
    print('No known words in the input; intent could not be predicted')

pay minimum balance credit card
Minimum due


# Exporting

The model needs to be saved in a .h5 format.

The vocabulary (the word-to-index mapping of the tokenizer) is saved in a .json format.

In [None]:
import json
import os

# Create the output folders first: open() raises FileNotFoundError when the
# target folder does not exist.
os.makedirs('ios_models', exist_ok=True)
os.makedirs('embeddings_json', exist_ok=True)

# Save the trained model and the tokenizer's word -> index mapping.
embedding_model.save('ios_models/intent_class_new.h5')
with open('embeddings_json/core_vocab_intent_class_new.json','w') as vocab:
    json.dump(tokenizer.word_index,vocab)

In [16]:
import json
import os

# Create the output folders first: open() raises FileNotFoundError when the
# target folder does not exist.
os.makedirs('ios_models', exist_ok=True)
os.makedirs('embeddings_json', exist_ok=True)

# NOTE(review): this saves the same embedding_model and tokenizer objects
# under an "auto_correct" name — presumably kept from a run trained on
# different data; confirm before running it in this notebook.
embedding_model.save('ios_models/auto_correct_new.h5')
with open('embeddings_json/core_vocab_auto_correct_new.json','w') as vocab:
    json.dump(tokenizer.word_index,vocab)

In [22]:
import json

# Save the trained model next to the notebook, together with the tokenizer's
# word -> index mapping serialised as JSON.
embedding_model.save('intent_class_QA.h5')
with open('core_vocab_intent_class_QA.json','w') as vocab:
    vocab.write(json.dumps(tokenizer.word_index))