# Naive Bayes

Bayes Theorem:
$$ P(A|B) = \frac{P(B|A) * P(A)}{P(B)} $$
In our case:
$$ P(y|X) = \frac{P(X|y) * P(y)}{P(X)} $$
with feature vector X:
$$ X = (x_{1}, x_{2}, x_{3}....x_{n}) $$
Assume that all features are mutually independent
$$ P(y|X) = \frac{P(x_{1}|y) * P(x_{2}|y) * \dots * P(x_{n}|y) * P(y)}{P(X)} $$
Select the class with the highest probability:
$$ y = argmax_{y} P(y|X) = argmax_{y} \frac{P(x_{1}|y) * P(x_{2}|y) * \dots * P(x_{n}|y) * P(y)}{P(X)} $$
$$ y = argmax_{y} P(x_{1}|y) * P(x_{2}|y) * \dots * P(x_{n}|y) * P(y) $$
$$ y = argmax_{y} log(P(x_{1}|y)) + log(P(x_{2}|y)) + \dots + log(P(x_{n}|y)) + log(P(y)) $$
Prior probability P(y): the relative frequency of class y in the training data

Class conditional probability 
$$ P(X_i|y) $$

$$ P(x_{i}|y) = \frac{1}{\sqrt{2\pi\sigma_{y}^{2}}} * exp(- \frac{(x_{i} - \mu_{y})^{2}}{2\sigma_{y}^{2}}) $$

In [1]:
import numpy as np

## very detailed

In [4]:
from sklearn.model_selection import train_test_split
from sklearn import datasets

In [28]:
# Synthetic binary classification problem generated by sklearn:
# 1000 samples with 10 features each, fixed seed for reproducibility.
X, y = datasets.make_classification(
    n_samples=1000,
    n_features=10,
    n_classes=2,
    random_state=123,
)
# Hold out 20% of the samples as a test set (same fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [29]:
# Show the dimensions of the training features and training labels, one per line.
print(X_train.shape, y_train.shape, sep="\n")

(800, 10)
(800,)


In [30]:
# init parameters
# All sizes are taken from the training split: the model is estimated from
# X_train / y_train only, so the sample count used to normalise the class
# priors must be the number of training rows, not the size of the full dataset.
n_samples, n_features = X_train.shape
# Distinct class labels (sorted by np.unique); computed once and reused.
classes_ = np.unique(y_train)
n_classes = len(classes_)

In [32]:
# calculate mean, var, and prior for each class
# One row per class, one column per feature.
mean_ = np.zeros((n_classes, n_features), dtype=np.float64)
var_ = np.zeros((n_classes, n_features), dtype=np.float64)
priors_ = np.zeros(n_classes, dtype=np.float64)

# Normalise the class counts by the number of training rows so that the
# priors form a proper probability distribution (they sum to 1).
n_train = X_train.shape[0]

for idx, c in enumerate(classes_):
    # Training rows that belong to class c.
    X_c = X_train[y_train == c]
    mean_[idx, :] = X_c.mean(axis=0)
    var_[idx, :] = X_c.var(axis=0)
    priors_[idx] = X_c.shape[0] / float(n_train)

In [43]:
# predict
y_pred = []

for x in X_test:
    # Log-posterior (up to the shared constant log P(X)) for every class.
    log_posteriors = []

    for idx, c in enumerate(classes_):
        log_prior = np.log(priors_[idx])

        mean = mean_[idx]
        var = var_[idx]
        # Per-feature Gaussian log-density, evaluated directly in log space.
        # Taking np.log of the density itself breaks down for points far from
        # the class mean: exp() underflows to 0, log(0) is -inf, and once all
        # classes score -inf the argmax silently falls back to the first class.
        log_likelihood = np.sum(
            -0.5 * np.log(2.0 * np.pi * var) - ((x - mean) ** 2) / (2.0 * var)
        )

        log_posteriors.append(log_prior + log_likelihood)

    # Pick the label whose log-posterior is largest.
    y_pred.append(classes_[np.argmax(log_posteriors)])

In [44]:
# Fraction of test samples whose predicted label equals the true label.
accuracy = np.sum(np.asarray(y_pred) == y_test) / len(y_test)
print("Naive Bayes classification accuracy", accuracy)

Naive Bayes classification accuracy 0.965


## clean version

In [2]:
class NaiveBayes:
    """Gaussian Naive Bayes classifier.

    Every feature is modelled, per class, as an independent normal
    distribution. Prediction picks the class that maximises
    ``log P(y) + sum_i log P(x_i | y)``.

    Parameters
    ----------
    var_smoothing : float, default=1e-9
        Fraction of the largest per-feature variance of the training data that
        is added to every per-class variance. This keeps the variances
        strictly positive when a feature is constant within a class (the
        unsmoothed estimate would be 0 and lead to a division by zero).
        If every feature is constant over the whole training set the added
        term is 0 and no smoothing takes place.
    """

    def __init__(self, var_smoothing=1e-9):
        self.var_smoothing = var_smoothing

    def fit(self, X, y):
        """Estimate per-class feature means, variances and class priors.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training feature matrix.
        y : array-like of shape (n_samples,)
            Class label of every training row.

        Returns
        -------
        self : NaiveBayes
            The fitted estimator, so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` has no rows or ``X`` and ``y`` differ in length.
        """
        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y)
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("cannot fit NaiveBayes on an empty dataset")
        if y.shape[0] != n_samples:
            raise ValueError("X and y must contain the same number of samples")

        self._classes = np.unique(y)
        n_classes = len(self._classes)

        # Variance floor shared by all classes and features.
        epsilon = self.var_smoothing * X.var(axis=0).max()

        # calculate mean, var, and prior for each class
        self._mean = np.zeros((n_classes, n_features), dtype=np.float64)
        self._var = np.zeros((n_classes, n_features), dtype=np.float64)
        self._priors = np.zeros(n_classes, dtype=np.float64)

        for idx, c in enumerate(self._classes):
            X_c = X[y == c]
            self._mean[idx, :] = X_c.mean(axis=0)
            self._var[idx, :] = X_c.var(axis=0) + epsilon
            self._priors[idx] = X_c.shape[0] / float(n_samples)

        return self

    def predict(self, X):
        """Return the predicted class label for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray of shape (n_samples,)
        """
        X = np.asarray(X, dtype=np.float64)
        y_pred = [self._predict(x) for x in X]
        return np.array(y_pred)

    def _predict(self, x):
        """Return the label with the highest log-posterior for one sample."""
        # log P(y) + sum_i log P(x_i | y) for each class; the evidence P(X) is
        # identical for all classes and therefore dropped.
        log_posteriors = [
            np.log(self._priors[idx]) + np.sum(self._log_pdf(idx, x))
            for idx in range(len(self._classes))
        ]

        # return class with highest posterior probability
        return self._classes[np.argmax(log_posteriors)]

    def _log_pdf(self, class_idx, x):
        """Per-feature Gaussian log-density of ``x`` under class ``class_idx``.

        Computed directly in log space: evaluating the density first and
        taking its logarithm afterwards underflows to ``log(0) = -inf`` for
        samples far away from the class mean.
        """
        mean = self._mean[class_idx]
        var = self._var[class_idx]
        return -0.5 * np.log(2.0 * np.pi * var) - ((x - mean) ** 2) / (2.0 * var)

    def _pdf(self, class_idx, x):
        """Per-feature Gaussian density of ``x`` under class ``class_idx``.

        Not used for prediction (see ``_log_pdf``); the value may underflow
        to 0 for samples far from the class mean.
        """
        return np.exp(self._log_pdf(class_idx, x))

In [3]:
# Testing
if __name__ == "__main__":
    # Imports
    from sklearn import datasets
    from sklearn.model_selection import train_test_split

    def accuracy(y_true, y_pred):
        """Return the fraction of entries of y_pred that equal y_true."""
        n_correct = np.sum(y_true == y_pred)
        return n_correct / len(y_true)

    # Synthetic two-class dataset with a fixed seed, split 80/20 into train/test.
    X, y = datasets.make_classification(n_samples=1000, n_features=10, n_classes=2, random_state=123)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

    # Fit the hand-written classifier on the training split and score it on the test split.
    nb = NaiveBayes()
    nb.fit(X_train, y_train)
    predictions = nb.predict(X_test)

    print("Naive Bayes classification accuracy", accuracy(y_test, predictions))

Naive Bayes classification accuracy 0.965


## sklearn

In [47]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Same synthetic dataset and split parameters as in the cells above.
# `datasets` is the sklearn module imported in an earlier cell.
X, y = datasets.make_classification(
    n_samples=1000, n_features=10, n_classes=2, random_state=123
)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

# Reference implementation: fit sklearn's Gaussian Naive Bayes, then predict.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
print(
    "Number of mislabeled points out of a total %d points : %d"
    % (X_test.shape[0], (y_test != y_pred).sum())
)

print("Naive Bayes classification accuracy", accuracy_score(y_test, y_pred))

Number of mislabeled points out of a total 200 points : 7
Naive Bayes classification accuracy 0.965
