### Probabilistic model
Naive Bayes models the conditional probability of classes $C_k$, given an instance represented by a feature vector $x=(x_1, \dots, x_n)$, as 
\begin{align}
p(C_k \mid x) = \frac{p(x \mid C_k) p(C_k)}{p(x)}.
\end{align}
The most important feature of the Naive Bayes model is that it assumes that all features are mutually independent conditional on the category $C_k$, e.g., 
\begin{align}
p(x_i \mid x_1, \dots, x_{i-1}, x_{i+1}, \dots, x_n, C_k) = p(x_i \mid C_k).
\end{align}

### Naive Bayes classifier
The Naive Bayes classifier is based on the MAP (maximum a posteriori) estimate of the conditional probability $p(C_k, x)$, i.e., given a feature vector $x$, we predict it being of the class
\begin{align}
\hat{y} = \text{argmax}_{k \in [K]} p(C_k) \prod_{i=1}^n p(x_i \mid C_k),
\end{align}
or equivalently (for computational reasons)
\begin{align}
\hat{y} = \text{argmax}_{k \in [K]} \left[ \log \left( p(C_k) \right) + \sum_{i=1}^n \log \left( p(x_i \mid C_k \right) \right].
\end{align}
For simplicity we assume here that $p(C_k) = c_k$ is constant. 

#### Modeling  the conditional probabilities
One can choose any model for the conditional probabilities $p(x_i \mid C_k)$, e.g., Gaussian, Bernoulli, Multinomial, etc. Note that the Naive Bayes classifier can easily handle mixtures of categorical and real-valued features.

In [144]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [156]:
class NaiveBayes:
    def __init__(self, class_prior, features, n_classes):
        self.class_prior = class_prior
        self.features = features
        self.n_classes = n_classes
        
        self._init_weights()
        
    def _init_weights(self):
        self.weights = []
        
        for i, feature in enumerate(self.features):
            if feature == 'gaussian':
                self.weights.append((self._random_normal(), self._random_normal()))
            
            elif feature == 'bernoulli':
                self.weights.append((self._random_uniform()))
     
    def _random_normal(self, loc_in=0.0, scale_in=1.0, size_in=None):
        if size_in is None:
            size_in = self.n_classes
        
        return np.random.normal(loc=loc_in, scale=scale_in, size=size_in)
    
    def _random_uniform(self, low_in=0.0, high_in=1.0, size_in=None):
        if size_in is None:
            size_in = self.n_classes
        
        return np.random.uniform(low=low_in, high=high_in, size=size_in)
        
    def _conditional_log_probability(self, feature, class_index, feature_index):
        if self.features[feature_index] == 'gaussian':
            return - (1/2)*np.log(2*np.pi*self.weights[feature_index][1][class_index]**2) \
                   - (feature-self.weights[feature_index][0][class_index])**2 \
                   / (2*self.weights[feature_index][1][class_index]**2)
        
        elif self.features[feature_index] == 'bernoulli':
            if feature == 0:
                return np.log(1-self.weights[feature_index][class_index])
            else:
                return np.log(self.weights[feature_index][class_index])
    
    def log_likelihood(self, X, Y):
        log_likelihood = 0.0
        
        for x, y in zip(X, Y):
            sum_of_logs = 0.0
            for j, _ in enumerate(self.features):
                sum_of_logs += np.log(self.class_prior[y]) + self._conditional_log_probability(x[j], y, j)
                
            log_likelihood += sum_of_logs
            
        return log_likelihood
    
    ### assumes all features are gaussian
    def gaussian_maximum_likelihood_fit(self, X, Y):
        means = np.zeros((len(self.features), self.n_classes))
        counts = np.zeros((self.n_classes,))
        
        for x, y in zip(X, Y):
            for j, _ in enumerate(self.features):
                means[j, y] += x[j]
            counts[y] += 1
                
        print(counts)
        for i in range(self.n_classes):
             means[:, i] /= counts[i]
                
        variances = np.zeros((len(self.features), self.n_classes))
        
        for x, y in zip(X, Y):
            for j, _ in enumerate(self.features):
                variances[j, y] += (x[i] - means[i, y])**2 / counts[y]
                
        for i in range(self.n_classes):
            for j, _ in enumerate(self.features):
                self.weights[j][0][i] = means[j, i]
                self.weights[j][1][i] = np.sqrt(variances[j, i])
                
    def predictions(self, X):
        predictions = []
        
        for x in X:
            
            class_predictions = []
            for i in range(self.n_classes):
                
                class_predictions.append(self.log_likelihood([x], [i]))
                
            predictions.append(class_predictions)
            
        return predictions

In [157]:
from sklearn.datasets import load_iris
from sklearn.utils import shuffle

X, Y = load_iris(return_X_y=True)
X, Y = shuffle(X, Y)

naive_bayes_iris = NaiveBayes([1/3.]*3, ['gaussian']*4, 3)

print('Log-likelihood:', naive_bayes_iris.log_likelihood(X, Y))

naive_bayes_iris.gaussian_maximum_likelihood_fit(X, Y)

print('Log-likelihood:', naive_bayes_iris.log_likelihood(X, Y))

Log-likelihood: -269720.4033337989
[50. 50. 50.]
Log-likelihood: -984.3869954533072


In [166]:
predictions = naive_bayes_iris.predictions(X[:])

In [170]:
np.sum(Y==np.argmax(np.array([predictions]), axis=2)) / 150

0.9266666666666666