# Import libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import pickle
import random
import json
import string
import re
import traceback

import tensorflow
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk.stem.lancaster import LancasterStemmer
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Activation, Dropout, LSTM, Bidirectional, Reshape
from tensorflow.keras.optimizers import SGD
import sys
import os

# Constants

In [158]:
class ModelConstants:
    JSON_DATASET_PATH = "/home/rash/Documents/univers-py-api/ramses/data/kpi_identifier.json"
    PICKLE_DATASET_PATH = "/home/rash/Documents/univers-py-api/test-notebooks/data.pickle"
    TRAINING_CHECKPOINT_FILE_PATH = "/home/rash/Documents/univers-py-api/test-notebooks/tmp/checkpoint"
    TRAINED_MODEL_PATH = "/home/rash/Documents/univers-py-api/test-notebooks/kpi_identifier.h5"
    RAMSES_ERROR_THRESHOLD = 0.6

# Build model

In [163]:
class Model:
    # Loading json dataset
    def __load_json_dataset__():
        try:
            with open(ModelConstants.JSON_DATASET_PATH, "rb") as file:
                data = json.load(file)

                return data

        except Exception as e:
            raise FileNotFoundError(f"{ModelConstants.DATASET_PATH} does not exist")
    
    # Loading pickle dataset
    def __load_pickle_dataset__():
        try:
            with open(ModelConstants.PICKLE_DATASET_PATH, 'rb') as file:
                words, classes, documents = pickle.load(file)

                return words, classes, documents

        except Exception as e:
            raise FileNotFoundError(f"{ModelConstants.PICKLE_DATASET_PATH} does not exist")

    
        
    __data__ = __load_json_dataset__()
    __words__, __classes__, __documents__ = [], [], []
    
    __stemmer__ = nltk.stem.LancasterStemmer()
    
    __loaded_words__, __loaded_classes__, __loaded_documents__ = __load_pickle_dataset__()
    __loaded_model__ = tensorflow.keras.models.load_model(ModelConstants.TRAINED_MODEL_PATH)

    
    def __init__(self):
        if Model.__model__ is not None:
            raise Exception("This class is a singleton!")

        else:
            Model.__model__ = self
    
    @staticmethod
    def __ignored_words__():
        return ["'s"] + [character for character in string.punctuation]
    
    @staticmethod
    def __callbacks__():
        my_callbacks = [
        #     tensorflow.keras.callbacks.EarlyStopping(monitor="loss", patience=10),
        #     tensorflow.keras.callbacks.TensorBoard(log_dir='./logs'),
            tensorflow.keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.01, patience=10, min_lr=0.001, verbose=1),
            tensorflow.keras.callbacks.ModelCheckpoint(filepath=ModelConstants.TRAINING_CHECKPOINT_FILE_PATH, monitor='loss', verbose=1, save_best_only=True, save_weights_only=True, mode='min')
        ]
        
        return my_callbacks
        
    @staticmethod
    def __clean_up_sentence__(sentence):
        s_words = nltk.word_tokenize(sentence)
        s_words = [Model.__stemmer__.stem(word.lower()) for word in s_words if
                   word not in Model.__ignored_words__()]

        return s_words

    @staticmethod
    def __bow__(sentence, words, show_details=True):
        s_words = Model.__clean_up_sentence__(sentence=sentence)

        bag = [0 for _ in range(len(words))]

        for s_word in s_words:
            for index, word in enumerate(words):
                if word == s_word:
                    bag[index] = 1

                    if show_details:
                        print(f"found in bag: {s_word}")

        return np.array(bag)

    @staticmethod
    def __classify_local__(sentence):
        ERROR_THRESHOLD = ModelConstants.RAMSES_ERROR_THRESHOLD

        input_data = pd.DataFrame(
            [Model.__bow__(sentence=sentence, words=Model.__loaded_words__, show_details=False)],
            dtype=float, index=['input'])
        input_data = input_data.values.reshape(-1, 1, input_data.shape[1])

        results = Model.__loaded_model__.predict([input_data])[0]
        context_probabilities = {}

        for index, probability in enumerate(results):
            context_probabilities[Model.__loaded_classes__[index]] = f"{round(probability, 4)}%"

        # filter out prediction below a threshold, and provide intent index
        results = [[index, result] for index, result in enumerate(results) if result > ERROR_THRESHOLD]

        # sort by strength of probability
        results.sort(key=lambda x: x[1], reverse=False)
        return_list = []

        for result in results:
            return_list.append((Model.__loaded_classes__[result[0]], str(result[1])))

        if not return_list: return None, context_probabilities

        return return_list[0], context_probabilities

    @staticmethod
    def __predict__(doc: str):
        try:
            results, context_probabilities = Model.__classify_local__(sentence=doc)
            
            for intent in Model.__data__["intents"]:
                if intent["tag"] == results[0]:
                    for query in doc.split():
                        for validate in intent["validate"]:
                            if Model.__stemmer__.stem(
                                    nltk.word_tokenize(query)[0]) == Model.__stemmer__.stem(validate):
                                return results[0], context_probabilities
                            elif results[0] == "definition" and re.search(r"""(?:is\s)\w{3,}""", doc):
                                return results[0], context_probabilities
                    return None, None
        
        except Exception:
            return None, None
    
    @staticmethod
    def __feature_engineering__():
        for intent in Model.__data__['intents']:
            for pattern in intent['patterns']:
                # tokenize each word in the sentence
                words = nltk.word_tokenize(pattern)
                Model.__words__.extend(words) # add to word list
                Model.__documents__.append((words, intent['tag'])) # add to documents in our corpus

            if intent['tag'] not in Model.__classes__:
                Model.__classes__.append(intent['tag']) # add to our class list

        # stem and lower each word and remove duplicates
        Model.__words__ = [Model.__stemmer__.stem(word.lower()) for word in Model.__words__ if word not in Model.__ignored_words__()]
        Model.__words__ = sorted(list(set(Model.__words__)))

        Model.__classes__ = sorted(list(set(Model.__classes__))) # sort classes

        print(f"{len(Model.__documents__)} documents\n{len(Model.__classes__)} classes\n{len(Model.__words__)} unique stemmed words")

        with open(ModelConstants.PICKLE_DATASET_PATH, 'wb') as file:
            pickle.dump((Model.__words__, Model.__classes__, Model.__documents__), file)
    
    @staticmethod
    def __normalize__():
        training = []
        output_empty = [0 for _ in range(len(Model.__classes__))]

        for doc in Model.__documents__: # bag of words
            bag = []
            s_words = [Model.__stemmer__.stem(word.lower()) for word in doc[0] if word not in Model.__ignored_words__()]

            for word in Model.__words__:
                bag.append(1) if word in s_words else bag.append(0)

            output_row = output_empty[:]
            output_row[Model.__classes__.index(doc[1])] = 1

            training.append([bag, output_row])

        # shuffle our features and turn intp np.array
        random.shuffle(training)
        training = np.array(training)

        # create train and test list.
        train_x = tensorflow.cast(np.array(list(training[:, 0])), tensorflow.float32)
        train_y = tensorflow.cast(np.array(list(training[:, 1])), tensorflow.float32)

        train_x = tensorflow.reshape(train_x, (train_x.shape[0], 1, train_x.shape[1]))
        print(f"{train_x.shape} {train_y.shape}")
        
        return train_x, train_y
    
    @staticmethod
    def __build__(train_x, train_y):
        # Build Model
        model = Sequential()
        model.add(LSTM(units=train_x.shape[2], input_shape=(1, train_x.shape[2])))
        model.add(Dropout(0.2))

        model.add(Dense(128, activation="relu"))
        model.add(Dropout(0.2))

        model.add(Dense(64, activation="relu"))
        model.add(Dropout(0.2))

        model.add(Dense(train_y.shape[1], activation="softmax"))
        
        return model
    
    @staticmethod
    def __train__(epochs: int, batch_size: int, save_model: bool, verbose: int = 1):
        # Feature Engineering on data
        Model.__feature_engineering__()
        
        # Normalize data
        train_x, train_y = Model.__normalize__()
        
        # Build Model
        model = Model.__build__(train_x=train_x, train_y=train_y)

        # Model Optimizer
        sgd = SGD(lr=0.001, decay=1e-6, momentum=0.9, nesterov=True)

        if not tf.io.gfile.exists(ModelConstants.TRAINING_CHECKPOINT_FILE_PATH):
            print("\n============================= Create a new model =============================\n")
            model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])
        
        else:
            print("\n============================= Load existing model weights =============================\n")
            model = tf.keras.models.load_model(ModelConstants.TRAINED_MODEL_PATH)
            model.load_weights(ModelConstants.TRAINING_CHECKPOINT_FILE_PATH)


        # Fit the model
        model.fit(train_x, train_y, epochs=epochs, batch_size=batch_size, verbose=verbose, callbacks=Model.__callbacks__())
        
        if save_model:
            Model.__save__(model=model)
            
        return model
    
    @staticmethod
    def __save__(model: object):
        model.load_weights(ModelConstants.TRAINING_CHECKPOINT_FILE_PATH)
        model.save(ModelConstants.TRAINED_MODEL_PATH)
        
        return
    
    @staticmethod
    def __validate_helper__():
        predictions, wrong_predictions = [], []
        accuracy, recall, precision, f_score = [], [], [], []
        
        for intent in Model.__data__["intents"]:
            prediction_dict = {}
            
            for pattern in intent["patterns"]:
                # Model prediction
                context, probabilties = Model.predict(doc=pattern)
                
                prediction_dict["text"] = pattern
                prediction_dict["actual"] = intent.get("tag")
                prediction_dict["predicted"] = context
                prediction_dict["accuracy"] = probabilties.get(context) if probabilties is not None else None
                
                predictions.append(prediction_dict)
        
        for prediction in predictions:
            print(f"Question: {prediction.get('text')} \t Actual: {prediction.get('actual')} \t Predicted: {prediction.get('predicted')} \t Accuracy: {prediction.get('accuracy')}")
            
            if prediction.get("actual") != prediction.get("predicted"):
                wrong_predictions.append(prediction)
            
        wrong_predictions = [prediction for prediction in predictions if prediction.get("actual") != prediction.get("predicted")]
        return True if not wrong_predictions else False

    @staticmethod
    def train(epochs: int = 20, batch_size: int = 1, save_model: bool = True):
        model = Model.__train__(epochs=epochs, batch_size=batch_size, save_model=save_model)
        return model
    
    @staticmethod
    def predict(doc: str):
        if Model.__loaded_model__ is None:
            Model()

        return Model.__predict__(doc=doc)
    
    
    @staticmethod
    def validate():
        if Model.__loaded_model__ is None:
            Model()

        return Model.__validate_helper__()

In [164]:
# model = Model.train(epochs=200)

639 documents
12 classes
163 unique stemmed words
(639, 1, 163) (639, 12)


Epoch 1/200

Epoch 00001: loss improved from inf to 0.00071, saving model to /home/rash/Documents/univers-py-api/test-notebooks/tmp/checkpoint
Epoch 2/200

Epoch 00002: loss did not improve from 0.00071
Epoch 3/200

Epoch 00003: loss did not improve from 0.00071
Epoch 4/200

Epoch 00004: loss did not improve from 0.00071
Epoch 5/200

Epoch 00005: loss did not improve from 0.00071
Epoch 6/200

Epoch 00006: loss did not improve from 0.00071
Epoch 7/200

Epoch 00007: loss improved from 0.00071 to 0.00047, saving model to /home/rash/Documents/univers-py-api/test-notebooks/tmp/checkpoint
Epoch 8/200

Epoch 00008: loss did not improve from 0.00047
Epoch 9/200

Epoch 00009: loss did not improve from 0.00047
Epoch 10/200

Epoch 00010: loss did not improve from 0.00047
Epoch 11/200

Epoch 00011: loss did not improve from 0.00047
Epoch 12/200

Epoch 00012: loss did not improve from 0.00047
Epoch 13/200

Epoch 00013: loss

In [162]:
# Model.predict(doc="what is my revenue for today?")
# Model.validate()

Question: month forecast revenue 	 Actual: forecast 	 Predicted: forecast 	 Accuracy: 1.0%
Question: month forecast revenue 	 Actual: forecast 	 Predicted: forecast 	 Accuracy: 1.0%
Question: month forecast revenue 	 Actual: forecast 	 Predicted: forecast 	 Accuracy: 1.0%
Question: month forecast revenue 	 Actual: forecast 	 Predicted: forecast 	 Accuracy: 1.0%
Question: month forecast revenue 	 Actual: forecast 	 Predicted: forecast 	 Accuracy: 1.0%
Question: month forecast revenue 	 Actual: forecast 	 Predicted: forecast 	 Accuracy: 1.0%
Question: month forecast revenue 	 Actual: forecast 	 Predicted: forecast 	 Accuracy: 1.0%
Question: month forecast revenue 	 Actual: forecast 	 Predicted: forecast 	 Accuracy: 1.0%
Question: month forecast revenue 	 Actual: forecast 	 Predicted: forecast 	 Accuracy: 1.0%
Question: month forecast revenue 	 Actual: forecast 	 Predicted: forecast 	 Accuracy: 1.0%
Question: month forecast revenue 	 Actual: forecast 	 Predicted: forecast 	 Accuracy: 1.0%

True