In [5]:
# Fundamentals
import matplotlib.pyplot as plt
import seaborn as sns


# Multinomial Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB

# Import Tf-idf Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Import the Label Encoder
from sklearn.preprocessing import LabelEncoder

# Import the train test split
from sklearn.model_selection import train_test_split

# To evaluate our model
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score

In [6]:
#load dataset

import pandas as pd

data = pd.read_csv('preggybot_datasets.csv', encoding= 'unicode_escape')
data.head()

Unnamed: 0,Category,Questions,Answers
0,General Information,What are the early signs of pregnancy?,The most common sign of pregnancy is a missin...
1,General Information,Early signs of pregnancy.,The most common sign of pregnancy is a missin...
2,General Information,What are the symptoms of pregnancy?,The most common sign of pregnancy is a missin...
3,General Information,Signs of pregnancy.,The most common sign of pregnancy is a missin...
4,General Information,Common signs of pregnancy,The most common sign of pregnancy is a missin...


In [7]:
# Select the features and the target
X = data['Answers']
y = data['Category']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Create the tf-idf vectorizer
vectorizer = TfidfVectorizer(strip_accents='ascii')

# First fit the vectorizer with our training set
tfidf_train = vectorizer.fit_transform(X_train)

# Now we can fit our test data with the same vectorizer
tfidf_test = vectorizer.transform(X_test)

# Initialize the Multinomial Naive Bayes classifier
nb = MultinomialNB()

# Fit the model
nb.fit(tfidf_train, y_train)

# Print the accuracy score
print("Accuracy:",nb.score(tfidf_test, y_test))

Accuracy: 0.9964028776978417


In [8]:
# Predict the labels
y_pred = nb.predict(tfidf_test)

# Print the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix\n")
print(cm)

# Print the Classification Report
cr = classification_report(y_test, y_pred)
print("\n\nClassification Report\n")
print(cr)


# Print the Receiver operating characteristic Auc score
auc_score = roc_auc_score(y_test, y_pred)
print("\nROC AUC Score:",auc_score)

# Get probabilities.
y_pred_proba = nb.predict(tfidf_test)

# Get False Positive rate, True Positive rate and the threshold
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

# Visualize the ROC curve.
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('FP Rate')
plt.ylabel('TP Rate')
plt.title('ROC')
plt.show()

Confusion Matrix

[[119   0   0]
 [  0  93   0]
 [  1   0  65]]


Classification Report

                     precision    recall  f1-score   support

   Baby Development       0.99      1.00      1.00       119
General Information       1.00      1.00      1.00        93
          Medicines       1.00      0.98      0.99        66

           accuracy                           1.00       278
          macro avg       1.00      0.99      1.00       278
       weighted avg       1.00      1.00      1.00       278



  y_score = check_array(y_score, ensure_2d=False)


ValueError: Unable to convert array of bytes/strings into decimal numbers with dtype='numeric'