## Practical 4 : Naive Bayes using `sklearn`

In [1]:
import numpy as np
from scipy.stats import norm, multivariate_normal
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score
import matplotlib.pyplot as plt

### Loading data

In [2]:
# Load the iris feature matrix and integer class labels as plain arrays.
X, y = load_iris(return_X_y=True)

# Deterministic interleaved split over rows 0..149:
# even-indexed rows form the training set, odd-indexed rows the test set.
X_train, X_test = X[0:150:2], X[1:150:2]
y_train, y_test = y[0:150:2], y[1:150:2]

### Gaussian NB Model

In [3]:
# Reference model: scikit-learn's Gaussian naive Bayes, fitted on the training rows.
gnb_model = GaussianNB()

gnb_model.fit(X=X_train, y=y_train)

GaussianNB()

In [4]:
# Predicted class label for every held-out (odd-indexed) row.
y_pred_gnb = gnb_model.predict(X=X_test)

In [5]:
# Rows are true classes, columns are predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_pred_gnb)

array([[25,  0,  0],
       [ 0, 24,  1],
       [ 0,  2, 23]])

In [6]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred_gnb)

0.96

### Custom Gaussian NB model

In [7]:
# Per-class, per-feature Gaussian parameters estimated from the training rows.
# `.std` uses NumPy's default ddof=0, i.e. the population standard deviation.
μ_0 = X_train[y_train == 0].mean(axis=0)
σ_0 = X_train[y_train == 0].std(axis=0)

μ_1 = X_train[y_train == 1].mean(axis=0)
σ_1 = X_train[y_train == 1].std(axis=0)

μ_2 = X_train[y_train == 2].mean(axis=0)
σ_2 = X_train[y_train == 2].std(axis=0)

# Class priors as relative label frequencies in the training set.
p_0 = (y_train == 0).sum() / y_train.size
p_1 = (y_train == 1).sum() / y_train.size
# The third prior is the complement, which assumes exactly three classes (0, 1, 2).
p_2 = 1 - p_0 - p_1

In [8]:
# One frozen normal distribution per class; loc/scale are per-feature vectors,
# so each distribution evaluates all features independently via broadcasting.
dist_0, dist_1, dist_2 = (
    norm(loc=mean, scale=std)
    for mean, std in ((μ_0, σ_0), (μ_1, σ_1), (μ_2, σ_2))
)

In [9]:
# Unnormalised class scores for every test row: the class prior times the product of
# per-feature Gaussian densities. The product is accumulated as a sum of log-densities
# over the last (feature) axis and exponentiated afterwards.
X_test_proba_0 = np.exp(dist_0.logpdf(X_test).sum(axis=-1)) * p_0
X_test_proba_1 = np.exp(dist_1.logpdf(X_test).sum(axis=-1)) * p_1
X_test_proba_2 = np.exp(dist_2.logpdf(X_test).sum(axis=-1)) * p_2

In [10]:
# Stack the three score vectors as rows (row index == class label) and pick,
# for each test sample, the class with the largest score.
y_pred_custom_gnb = np.stack(
    [X_test_proba_0, X_test_proba_1, X_test_proba_2]
).argmax(axis=0)

In [11]:
# Accuracy of the hand-rolled Gaussian NB on the same test rows.
accuracy_score(y_true=y_test, y_pred=y_pred_custom_gnb)

0.96

In [12]:
# Confusion matrix of the hand-rolled model (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred_custom_gnb)

array([[25,  0,  0],
       [ 0, 24,  1],
       [ 0,  2, 23]])

### Bernoulli NB model

In [13]:
# Toy corpus: five short documents with one binary label each.
# NOTE: this rebinds X and y, replacing the iris arrays loaded earlier in the notebook.
X = np.array([
    'Tirth Tirth Patel',
    'Tirth Hihoriya Ramesh',
    'Hihoriya Ashesh Ramesh',
    'Tirth Ashesh Patel',
    'Ramesh Hihoriya',
])
y = np.array([1, 0, 1, 1, 0])

In [14]:
# binary=True records only whether a word occurs in a document, not how often.
cv = CountVectorizer(binary=True)
# NOTE: X is rebound from the raw strings to the dense document-term matrix,
# so this cell is not safe to re-run on its own without re-running the corpus cell.
X = cv.fit_transform(raw_documents=X).toarray()

In [15]:
# Mapping from each (lower-cased) token to its column index in the document-term matrix.
cv.vocabulary_

{'tirth': 4, 'patel': 2, 'hihoriya': 1, 'ramesh': 3, 'ashesh': 0}

In [16]:
# Dense binary document-term matrix: one row per document, one column per vocabulary token.
X

array([[0, 0, 1, 0, 1],
       [0, 1, 0, 1, 1],
       [1, 1, 0, 1, 0],
       [1, 0, 1, 0, 1],
       [0, 1, 0, 1, 0]])

In [17]:
# Bernoulli naive Bayes on the binary presence/absence features.
bnb_model = BernoulliNB()

bnb_model.fit(X=X, y=y)

BernoulliNB()

In [18]:
# Predictions on the same five documents the model was fitted on (no held-out set here).
y_pred_bnb = bnb_model.predict(X=X)

In [19]:
# Training-set confusion matrix (rows: true class, columns: predicted class).
confusion_matrix(y_true=y, y_pred=y_pred_bnb)

array([[2, 0],
       [1, 2]])

In [20]:
# Training-set accuracy; measured on the fitting data, so not an estimate of generalisation.
accuracy_score(y_true=y, y_pred=y_pred_bnb)

0.8

### Multinomial NB Model

In [21]:
# Same five-document toy corpus and labels as the Bernoulli example, rebuilt from scratch
# because X was overwritten there by the binary document-term matrix.
X = np.array([
    'Tirth Tirth Patel',
    'Tirth Hihoriya Ramesh',
    'Hihoriya Ashesh Ramesh',
    'Tirth Ashesh Patel',
    'Ramesh Hihoriya',
])
y = np.array([1, 0, 1, 1, 0])

In [22]:
# Default CountVectorizer keeps raw term counts, which is what the multinomial model uses.
cv = CountVectorizer()
# NOTE: X is rebound from the raw strings to the dense count matrix,
# so this cell is not safe to re-run on its own without re-running the corpus cell.
X = cv.fit_transform(raw_documents=X).toarray()

In [23]:
# Mapping from each (lower-cased) token to its column index in the count matrix.
cv.vocabulary_

{'tirth': 4, 'patel': 2, 'hihoriya': 1, 'ramesh': 3, 'ashesh': 0}

In [24]:
# Dense term-count matrix: one row per document, one column per vocabulary token.
X

array([[0, 0, 1, 0, 2],
       [0, 1, 0, 1, 1],
       [1, 1, 0, 1, 0],
       [1, 0, 1, 0, 1],
       [0, 1, 0, 1, 0]])

In [25]:
# Multinomial naive Bayes on the raw term counts.
mnb_model = MultinomialNB()

mnb_model.fit(X=X, y=y)

MultinomialNB()

In [26]:
# Predictions on the same five documents the model was fitted on (no held-out set here).
y_pred_mnb = mnb_model.predict(X=X)

In [27]:
# Training-set confusion matrix (rows: true class, columns: predicted class).
confusion_matrix(y_true=y, y_pred=y_pred_mnb)

array([[2, 0],
       [1, 2]])

In [28]:
# Training-set accuracy; measured on the fitting data, so not an estimate of generalisation.
accuracy_score(y_true=y, y_pred=y_pred_mnb)

0.8