In [36]:
#load dataset

import pandas as pd

dataset = pd.read_csv('preggybot_datasets.csv', encoding= 'unicode_escape')
dataset.head()

Unnamed: 0,Category,Questions,Answers
0,pregnancy_genInfo,What are the early signs of pregnancy?,The most common sign of pregnancy is a missin...
1,pregnancy_genInfo,Early signs of pregnancy.,The most common sign of pregnancy is a missin...
2,pregnancy_genInfo,What are the symptoms of pregnancy?,The most common sign of pregnancy is a missin...
3,pregnancy_genInfo,Signs of pregnancy.,The most common sign of pregnancy is a missin...
4,pregnancy_genInfo,Common signs of pregnancy,The most common sign of pregnancy is a missin...


In [37]:
# create vectorizer to calculate each word. this will serve as the vocabulary

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(stop_words= 'english')

# To actuallt create the vectorizer, we simply need to call fit in the text data that we want to fix
vectorizer.fit(dataset['Questions'])

# Now, we can inspect how our vectorizer vectorized the texy
# This will print out a list of words used and their index in the vectors
print(vectorizer.vocabulary_)

# if we would like to actually create a vector, 
# we can do so by passing the dataset into the vectorizer to get the back counts
vector = vectorizer.transform(dataset['Questions'])

{'early': 167, 'signs': 368, 'pregnancy': 313, 'symptoms': 393, 'common': 136, 'know': 245, 'pregnant': 314, 'missed': 272, 'period': 303, 'im': 233, 'having': 219, 'nausea': 279, 'havin': 218, 'heartburn': 224, 'constipation': 141, 'normal': 287, 'experiencing': 182, 'breast': 116, 'tenderness': 398, 'terder': 400, 'tender': 397, 'notice': 289, 'soon': 378, 'long': 254, 'start': 383, 'week': 446, 'occur': 294, 'quickly': 321, 'does': 159, 'drink': 162, 'caffeine': 120, 'coffee': 131, 'okay': 295, 'safe': 344, 'want': 443, 'drinking': 163, 'bad': 98, 'good': 213, 'weight': 448, 'expect': 179, 'gain': 209, 'range': 322, 'gane': 210, 'desired': 155, 'travel': 414, 'drive': 164, 'driving': 165, 'exercise': 176, 'recommended': 327, 'woman': 449, 'important': 234, 'need': 280, 'hours': 230, 'atleast': 89, 'kinds': 243, 'activities': 78, 'recommend': 325, 'exercises': 177, 'benefits': 102, 'advantages': 79, 'andvantages': 83, 'feel': 189, 'dizzy': 158, 'tired': 412, 'women': 450, 'exhausted'

In [38]:
# creating and testing the model
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

X_train_matrix = vector
y = dataset["Category"]

X_train, X_test, y_train, y_test = train_test_split(X_train_matrix, y, test_size=0.2)

mlpModel = MLPClassifier(hidden_layer_sizes=(10, 10, 10), max_iter=1000)
mlpModel.fit(X_train, y_train.values.ravel())

y_pred = mlpModel.predict(X_test)


# Testing of accuracy, precision, recall and f1 score
from sklearn import metrics

acc_score = metrics.accuracy_score(y_test, y_pred)
precision_score = metrics.precision_score(y_test, y_pred, average='macro')
recall_score = metrics.recall_score(y_test, y_pred, average='macro')
f1_score = metrics.f1_score(y_test, y_pred, average='macro')

print('Accuracy score: {}'.format(acc_score*100))
print('Precision score: {}'.format(precision_score*100))
print('F1 Score score: {}'.format(f1_score*100))
print('Recall score: {}'.format(recall_score*100))

Accuracy score: 98.92086330935251
Precision score: 98.94956842733838
F1 Score score: 98.86996463558393
Recall score: 98.79329004329006


In [39]:
from sklearn.metrics import cohen_kappa_score, matthews_corrcoef

cohen_score = cohen_kappa_score(y_test, y_pred)
print(('Kappa Score: {}'.format(cohen_score*100)))

mcc = matthews_corrcoef(y_test, y_pred)
print(('Mattheews Corrcoef Score: {}'.format(mcc*100)))

Kappa Score: 97.73995989377269
Mattheews Corrcoef Score: 97.74273353593081


In [40]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Print the Confusion Matrix
cm = confusion_matrix(y_test, y_pred, )
print("Confusion Matrix\n")

print('\nTrue Positive(TP) =', cm[0,0])
print('True Negatives(TN) =', cm[1,1])
print('False Positive(FP) =', cm[0,1])
print('False Negative(FN) =', cm[1,0])

Confusion Matrix


True Positive(TP) = 108
True Negatives(TN) = 167
False Positive(FP) = 2
False Negative(FN) = 1


In [41]:
from sklearn.metrics import matthews_corrcoef

TP = cm[0,0]
TN = cm[1,1]
FP = cm[0,1]
FN = cm[1,0]

# print classification accuracy

specificity = TN / (TN + FP)
print('Specificity : {0:0.4f}'.format(specificity))

false_positive_rate = FP / float(FP + TN)
print('False Positive Rate : {0:0.4f}'.format(false_positive_rate))

false_negative_rate = FN/(TP+FN)
print('False Negative Rate : {0:0.4f}'.format(false_negative_rate))

Specificity : 0.9882
False Positive Rate : 0.0118
False Negative Rate : 0.0092


In [42]:
# user query prediction

query = ["signs of pregnancy?"]
print ("Question: ", query)

vector_query = vectorizer.transform(query)
print(vector_query)

prediction = mlpModel.predict(vector_query)

print("Answer: ", prediction)

Question:  ['signs of pregnancy?']
  (0, 313)	1
  (0, 368)	1
Answer:  ['pregnancy_genInfo']
