In [1]:
import json
import nltk
from collections import defaultdict
import numpy
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


In [2]:
# Global Variables
words = []
ignore_list = [".", ",", "?", "/", "'s", "'m"]
stop_words = set(nltk.corpus.stopwords.words("english"))
lemmatizer = nltk.stem.WordNetLemmatizer()
processed_data = defaultdict(list)


In [3]:
# Reading the file
data_file = open('intents.json').read()
intents = json.loads(data_file)
intents = intents["intents"]


In [4]:
# Preprocessing the data
for intent in intents:
    for pattern in intent["patterns"]:
        tokens = nltk.word_tokenize(pattern)
        tokens = [token for token in tokens if token not in ignore_list]
        tokens = [token for token in tokens if token not in stop_words]
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
        processed_data[intent["tag"]].append(tokens)
        words.extend(tokens)


In [5]:
# Making the dataset
divided_data = [[], []]
for key in processed_data.keys():
    for sentence in processed_data[key]:
        divided_data[0].append(key)
        divided_data[1].append(' '.join(sentence))


In [6]:
# Printing the results
for i in range(len(divided_data[0])):
    print(divided_data[0][i], "=>",  divided_data[1][i])
    print("_"*150)



XDR Platform => What XDR platform
______________________________________________________________________________________________________________________________________________________
XDR Platform => Why XDR platform
______________________________________________________________________________________________________________________________________________________
XDR Platform => How helpful XDR platform
______________________________________________________________________________________________________________________________________________________
XDR Platform => How login XDR platform
______________________________________________________________________________________________________________________________________________________
XDR Platform => What XDR platform
______________________________________________________________________________________________________________________________________________________
XDR Platform => Why XDR platform
_______________________________

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

x = numpy.array(divided_data[1])
x = vectorizer.fit_transform(x)
x = x.toarray()
y = numpy.array(divided_data[0])

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=0)


In [8]:
#Using Random Forest Classifier to predict the data
forest = RandomForestClassifier()
forest.fit(x_train, y_train)
y_predf = forest.predict(x_test)
print(classification_report(y_test, y_predf))

                                precision    recall  f1-score   support

               Data Protection       1.00      1.00      1.00         2
                Email Security       1.00      1.00      1.00         2
             Endpoint Security       1.00      1.00      1.00         2
Network Detection and Response       1.00      1.00      1.00         3
                  XDR Platform       1.00      1.00      1.00         1

                      accuracy                           1.00        10
                     macro avg       1.00      1.00      1.00        10
                  weighted avg       1.00      1.00      1.00        10



In [9]:
sent = "How to login to XDR"
tokens = nltk.word_tokenize(sent)
tokens = [token for token in tokens if token not in ignore_list]
tokens = [token for token in tokens if token not in stop_words]
tokens = [lemmatizer.lemmatize(token) for token in tokens]

sent = [" ".join(tokens)]
sent = numpy.array(sent)

ques = vectorizer.transform(sent)
ques = ques.toarray()

fin = forest.predict(ques)
print(fin)

['XDR Platform']


In [10]:
import pickle
vect = open("chtbot_vect.pkl", "wb")
pickle.dump(vectorizer, vect)
vect.close()

mdl = open("chtbot_mdl.pkl", "wb")
pickle.dump(forest, mdl)
mdl.close()