In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
from numpy.testing import assert_allclose
import pandas as pd
import pickle
import random
import json
import string
import re

import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk.stem.lancaster import LancasterStemmer
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Activation, Dropout, LSTM, Bidirectional, Reshape
from tensorflow.keras.optimizers import SGD

In [2]:
with open('kpi_identifier.json') as file:
    data = json.load(file)

In [3]:
stemmer = LancasterStemmer()

In [4]:
# define the checkpoint
filepath="tmp/checkpoint"

my_callbacks = [
#     tf.keras.callbacks.EarlyStopping(monitor="loss", patience=10),
#     tf.keras.callbacks.TensorBoard(log_dir='./logs'),
    tf.keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.01, patience=10, min_lr=0.001, verbose=1),
    tf.keras.callbacks.ModelCheckpoint(filepath=filepath, monitor='loss', verbose=1, save_best_only=True, save_weights_only=True, mode='min')
]

In [5]:
words = []
classes = []
documents = []
ignore_words = ["'s"]
ignore_words.extend(string.punctuation)

# try:
#     with open('data.pickle', 'rb') as file:
#         words, classes, documents = pickle.load(file)
# except:
# loop through each sentence in our intents pattern
for intent in data['intents']:
    for pattern in intent['patterns']:
        # tokenize each word in the sentence
        wrds = nltk.word_tokenize(pattern)
        words.extend(wrds) # add to word list
        documents.append((wrds, intent['tag'])) # add to documents in our corpus

    if intent['tag'] not in classes:
        classes.append(intent['tag']) # add to our class list

# stem and lower each word and remove duplicates
words = [stemmer.stem(w.lower()) for w in words if w not in ignore_words]
words = sorted(list(set(words)))

classes = sorted(list(set(classes))) # sort classes

print(f"{len(documents)} documents\n{len(classes)} classes\n{len(words)} unique stemmed words")

with open('data.pickle', 'wb') as file:
    pickle.dump((words, classes, documents), file)

639 documents
12 classes
163 unique stemmed words


In [6]:
# !pip install numpy==1.19.5

In [7]:
# create our training data
training = []
output_empty = [0 for _ in range(len(classes))]

for doc in documents: # bag of words
    bag = []
    s_words = [stemmer.stem(w.lower()) for w in doc[0] if w not in ignore_words]
    
    for w in words:
        bag.append(1) if w in s_words else bag.append(0)
        
    output_row = output_empty[:]
    output_row[classes.index(doc[1])] = 1
    
    training.append([bag, output_row])
    
# shuffle our features and turn intp np.array
random.shuffle(training)
training = np.array(training)

# create train and test list.
train_x = tf.cast(np.array(list(training[:, 0])), tf.float32)
train_y = tf.cast(np.array(list(training[:, 1])), tf.float32)

train_x = tf.reshape(train_x, (train_x.shape[0], 1, train_x.shape[1]))
print(f"{train_x.shape} {train_y.shape}")

model = Sequential()
model.add(LSTM(units=train_x.shape[2], input_shape=(1, train_x.shape[2])))
model.add(Dropout(0.2))

model.add(Dense(128, activation="relu"))
model.add(Dropout(0.2))

model.add(Dense(64, activation="relu"))
model.add(Dropout(0.2))

model.add(Dense(train_y.shape[1], activation="softmax"))

sgd = SGD(lr=0.001, decay=1e-6, momentum=0.9, nesterov=True)

if not tf.io.gfile.exists(filepath):
    print("\n============================= Create a new model =============================\n")
    model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])
else:
    print("\n============================= Load existing model weights =============================\n")
    model = tf.keras.models.load_model("kpi_identifier.hdf5")
    model.load_weights(filepath)
    

# Fit the model
model.fit(train_x, train_y, epochs=200, batch_size=1, verbose=1, callbacks=my_callbacks)
# model.load_weights("./models/ramses_kpi/tmp/checkpoint")
# model.save("./models/ramses_kpi/model.h5")

(639, 1, 163) (639, 12)


Epoch 1/200
Epoch 00001: loss improved from inf to 0.00194, saving model to tmp/checkpoint
Epoch 2/200
Epoch 00002: loss did not improve from 0.00194
Epoch 3/200
Epoch 00003: loss did not improve from 0.00194
Epoch 4/200
Epoch 00004: loss improved from 0.00194 to 0.00162, saving model to tmp/checkpoint
Epoch 5/200
Epoch 00005: loss did not improve from 0.00162
Epoch 6/200
Epoch 00006: loss did not improve from 0.00162
Epoch 7/200
Epoch 00007: loss did not improve from 0.00162
Epoch 8/200
Epoch 00008: loss did not improve from 0.00162
Epoch 9/200
Epoch 00009: loss did not improve from 0.00162
Epoch 10/200
Epoch 00010: loss did not improve from 0.00162
Epoch 11/200
Epoch 00011: loss did not improve from 0.00162
Epoch 12/200
Epoch 00012: loss did not improve from 0.00162
Epoch 13/200
Epoch 00013: loss did not improve from 0.00162
Epoch 14/200
Epoch 00014: ReduceLROnPlateau reducing learning rate to 0.001.

Epoch 00014: loss did not improve from 0.00162
Epoch 15/2

<tensorflow.python.keras.callbacks.History at 0x7f6bcc74b490>

In [8]:
def clean_up_sentence(sentence):
    s_words = nltk.word_tokenize(sentence)
    s_words = [stemmer.stem(w.lower()) for w in s_words if w not in ignore_words]
    
    return s_words

def bow(sentence, words, show_details=True):
    s_words = clean_up_sentence(sentence)
    
    bag = [0 for _ in range(len(words))]
    
    for s in s_words:
        for i, w in enumerate(words):
            if w == s:
                bag[i] = 1
                
                if show_details:
                    print(f"found in bag: {s}")
                    
    return np.array(bag)

def classify_local(sentence):
    ERROR_THRESHOLD = 0.6
    
    input_data = pd.DataFrame([bow(sentence, words, show_details=False)], dtype=float, index=['input'])
    input_data = input_data.values.reshape(-1, 1, input_data.shape[1])
        
    results = model.predict([input_data])[0]
    
    #filter out prediction below a threshold, and provide intent index
    results = [[i, r] for i, r in enumerate(results) if r > ERROR_THRESHOLD]
    
    # sort by strength of probability
    results.sort(key=lambda x: x[1], reverse=False)
    
    return_list = []
    for r in results:
        return_list.append((classes[r[0]], str(r[1])))
        
#     print(return_list)
    return return_list
    
# while True:
#     inp = input("You: ")

#     if inp == 'quit' or inp == 'stop' or inp == 'q':
#         break

#     try:
#         results = classify_local(inp)[0]
        
#         for intent in data['intents']:
#             if intent['tag'] == results[0]:
#                 responses = results[0], inp

#         print(responses)
        
#     except:
#         print("Don't understand your query please use a different term\n")

    

In [9]:
model.load_weights(filepath)
model.save("kpi_identifier.hdf5")

In [10]:
class KpiIdentifier:
    def __init__(self, data_path, model_path, json_data_path):
        try:
            with open(data_path, 'rb') as file:
                self.words, self.classes, self.documents = pickle.load(file)
        except Exception as e:
            print(e)
            raise FileNotFoundError(f"{data_path} doesn't exist")
            
        try:
            self.model = tf.keras.models.load_model(model_path)
        except Exception as e:
            print(e)
            raise FileNotFoundError(f"{model_path} doesn't exist")
            
        try:
            with open(json_data_path, "rb") as file:
                self.data = json.load(file)
        except Exception as e:
            print(e)
            raise FileNotFoundError(f"{json_data_path} doesn't exist")
           
    @property
    def ignore_words(self):
        return ["'s"] + [s for s in string.punctuation]
    
    def clean_up_sentence(self, sentence):
        s_words = nltk.word_tokenize(sentence)
        s_words = [stemmer.stem(w.lower()) for w in s_words if w not in self.ignore_words]

        return s_words

    def bow(self, sentence, words, show_details=True):
        s_words = self.clean_up_sentence(sentence)

        bag = [0 for _ in range(len(words))]

        for s in s_words:
            for i, w in enumerate(words):
                if w == s:
                    bag[i] = 1

                    if show_details:
                        print(f"found in bag: {s}")

        return np.array(bag)

    def classify_local(self, sentence):
        ERROR_THRESHOLD = 0.6

        input_data = pd.DataFrame([self.bow(sentence, self.words, show_details=False)], dtype=float, index=['input'])
        input_data = tf.cast(input_data.values.reshape(-1, 1, input_data.shape[1]), tf.float32)
        
        results = self.model.predict(input_data)[0]

        #filter out prediction below a threshold, and provide intent index
        results = [[i, r] for i, r in enumerate(results) if r > ERROR_THRESHOLD]

        # sort by strength of probability
        results.sort(key=lambda x: x[1], reverse=False)

        return_list = []
        for r in results:
            return_list.append((self.classes[r[0]], str(r[1])))
                
        print(return_list)
        return return_list

    def run(self, text):
        try:
            results = self.classify_local(text)[0]
            
            for intent in self.data["intents"]:
                if intent["tag"] == results[0]: 
                    for query in text.split():
                        for val in intent["validate"]:
                            if stemmer.stem(nltk.word_tokenize(query)[0]) == stemmer.stem(val): return results[0], text
                            elif results[0] == "definition" and re.search(r"""(?:is\s)\w{3,}""", text): return results[0], text
                    return None
                
        except IndexError:
            return None
                    
        except Exception as e:
            print(e)
            return None

    
# identifier = KpiIdentifier(data_path="data.pickle", model_path="kpi_identifier.hdf5", json_data_path="kpi_identifier.json")
# identifier.run(text="")

In [11]:
identifier = KpiIdentifier(data_path="data.pickle", model_path="kpi_identifier.hdf5", json_data_path="kpi_identifier.json")
text = input("Enter Text: ")

while True:
    if text in ["quit", "exit", "stop", "q", "s", "e"]:
        break
    
    display(identifier.run(text=text))
    text = input("\nEnter Text: ")

Enter Text:  how are we doing on invoice


[('invoice', '0.9999963')]


('invoice', 'how are we doing on invoice')


Enter Text:  what is my revenue for today


[('revenue', '1.0')]


('revenue', 'what is my revenue for today')


Enter Text:  quit
