In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Naive Bayes Classification

*   **Berniulli / Multivariant**
*   **Multinomial**
*   **Gaussian**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

data = pd.read_csv('/content/drive/MyDrive/ML/#dataset/Naive-Bayes-Classification-Data.csv')

X = data[['glucose', 'bloodpressure']]
y = data['diabetes']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

bernoulli_nb = BernoulliNB()
multinomial_nb = MultinomialNB()
gaussian_nb = GaussianNB()

bernoulli_nb.fit(X_train > 0, y_train)
multinomial_nb.fit(X_train, y_train)
gaussian_nb.fit(X_train, y_train)

y_pred_bernoulli = bernoulli_nb.predict(X_test > 0)
y_pred_multinomial = multinomial_nb.predict(X_test)
y_pred_gaussian = gaussian_nb.predict(X_test)

def evaluate_model(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=1)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    return accuracy, precision, recall, f1


bernoulli_metrics = evaluate_model(y_test, y_pred_bernoulli)
multinomial_metrics = evaluate_model(y_test, y_pred_multinomial)
gaussian_metrics = evaluate_model(y_test, y_pred_gaussian)


print(f"Bernoulli Naive Bayes:\n Accuracy: {bernoulli_metrics[0] * 100:.2f}%\n Precision: {bernoulli_metrics[1]:.2f}\n Recall: {bernoulli_metrics[2]:.2f}\n F1 Score: {bernoulli_metrics[3]:.2f}\n")
print(f"Multinomial Naive Bayes:\n Accuracy: {multinomial_metrics[0] * 100:.2f}%\n Precision: {multinomial_metrics[1]:.2f}\n Recall: {multinomial_metrics[2]:.2f}\n F1 Score: {multinomial_metrics[3]:.2f}\n")
print(f"Gaussian Naive Bayes:\n Accuracy: {gaussian_metrics[0] * 100:.2f}%\n Precision: {gaussian_metrics[1]:.2f}\n Recall: {gaussian_metrics[2]:.2f}\n F1 Score: {gaussian_metrics[3]:.2f}\n")


Bernoulli Naive Bayes:
 Accuracy: 46.73%
 Precision: 1.00
 Recall: 0.00
 F1 Score: 0.00

Multinomial Naive Bayes:
 Accuracy: 73.87%
 Precision: 0.83
 Recall: 0.64
 F1 Score: 0.72

Gaussian Naive Bayes:
 Accuracy: 92.96%
 Precision: 0.93
 Recall: 0.93
 F1 Score: 0.93



# Naive Bayes Classification


## **SMS Classification**


In [2]:
#https://www.ritchieng.com/machine-learning-multinomial-naive-bayes-vectorization/

import pandas as pd
url = 'https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv'
sms = pd.read_table(url, header=None, names=['label', 'message'])
sms.shape

(5572, 2)

In [3]:
sms.head()
print(sms.label.value_counts())
sms['label_num'] = sms.label.map({'ham':0, 'spam':1})
sms.head()

label
ham     4825
spam     747
Name: count, dtype: int64


Unnamed: 0,label,message,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [4]:
X = sms.message
y = sms.label_num
print(X.shape)
print(y.shape)

(5572,)
(5572,)


In [5]:
X_train = X[0:4179]
X_test = X[4179:]
y_train = y[0:4179]
y_test = y[4179:]
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(4179,)
(1393,)
(4179,)
(1393,)


In [6]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(stop_words='english')
# learn training data vocabulary, then use it to create a document-term matrix
vect.fit(X_train)
print(vect.vocabulary_)

X_train_dtm = vect.transform(X_train)
X_train_dtm
X_test_dtm = vect.transform(X_test)
X_test_dtm



<1393x7231 sparse matrix of type '<class 'numpy.int64'>'
	with 9523 stored elements in Compressed Sparse Row format>

In [7]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes model
nb = MultinomialNB()

# Wall time is turn around time # cpu time is burst time
%time nb.fit(X_train_dtm, y_train)

CPU times: user 7.18 ms, sys: 821 µs, total: 8 ms
Wall time: 14.4 ms


In [8]:
y_pred_class = nb.predict(X_test_dtm)
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.9842067480258435

In [9]:
# examine class distribution
print(y_test.value_counts())
metrics.confusion_matrix(y_test, y_pred_class)

label_num
0    1211
1     182
Name: count, dtype: int64


array([[1202,    9],
       [  13,  169]])