In [21]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB as skGaussianNB

In [22]:
# Baseline: scikit-learn's GaussianNB, trained and scored on a 50/50 split of iris
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0
)
skgnb = skGaussianNB()
skgnb.fit(X_train, y_train)
y_pred = skgnb.predict(X_test)
# Report how many held-out samples were assigned the wrong class
print(
    "Number of mislabeled points out of a total %d points : %d"
    % (X_test.shape[0], (y_test != y_pred).sum())
)

Number of mislabeled points out of a total 75 points : 4


In [23]:
# My implementation of GaussianNB
# Reference https://scikit-learn.org/stable/modules/naive_bayes.html#gaussian-naive-bayes
class GaussianNB():
    """Gaussian Naive Bayes classifier implemented with NumPy.

    Each feature is modelled, per class, as an independent normal
    distribution whose mean and variance are estimated from the training
    data. Prediction picks the class with the highest joint log likelihood
    ``log P(c) + sum_j log N(x_j; theta_cj, sigma_cj)``.

    Parameters
    ----------
    var_smoothing : float, default=1e-9
        Fraction of the largest per-feature variance of the training data
        that is added to every variance when evaluating the likelihood.
        This keeps the computation finite when a feature is constant within
        a class (variance 0). Pass 0 to disable smoothing.

    Attributes (set by ``fit``)
    ---------------------------
    classes_ : ndarray of shape (n_classes,)
        Sorted unique class labels seen during ``fit``.
    class_count_ : ndarray of shape (n_classes,)
        Number of training samples observed in each class.
    class_prior_ : ndarray of shape (n_classes,)
        Empirical probability of each class.
    theta_ : ndarray of shape (n_classes, n_features)
        Mean of each feature per class.
    sigma_ : ndarray of shape (n_classes, n_features)
        (Unsmoothed) variance of each feature per class.
    epsilon_ : float
        Absolute value added to the variances at prediction time.
    """

    def __init__(self, var_smoothing=1e-9):
        self.var_smoothing = var_smoothing

    def fit(self, X, y):
        """Estimate per-class priors, feature means and feature variances.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training samples.
        y : array-like of shape (n_samples,)
            Class label of each training sample.

        Returns
        -------
        self : GaussianNB
            The fitted estimator, so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` is not a non-empty 2-D array or if ``X`` and ``y``
            disagree on the number of samples.
        """
        # Coerce to arrays so plain Python lists work with .shape and the
        # element-wise `y == c` mask below.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be 2-dimensional, got %d dimension(s)" % X.ndim)
        if X.size == 0:
            raise ValueError("X must contain at least one sample and one feature")
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                "X and y have inconsistent numbers of samples: %d != %d"
                % (X.shape[0], y.shape[0])
            )

        self.classes_ = np.unique(y)  # class labels known to the classifier, (n_classes,)
        n_features = X.shape[1]
        n_classes = len(self.classes_)
        self.class_count_ = np.zeros(n_classes)  # number of training samples observed in each class
        self.sigma_ = np.zeros((n_classes, n_features))  # variance of each feature per class
        self.theta_ = np.zeros((n_classes, n_features))  # mean of each feature per class
        for i, c in enumerate(self.classes_):
            X_c = X[y == c]  # training examples observed in this class
            self.class_count_[i] = len(X_c)
            self.sigma_[i] = np.var(X_c, axis=0)
            self.theta_[i] = np.mean(X_c, axis=0)
        # Scale the smoothing term by the largest overall feature variance so
        # it is meaningful regardless of the units of the data.
        self.epsilon_ = self.var_smoothing * np.var(X, axis=0).max()
        self.class_prior_ = self.class_count_ / np.sum(self.class_count_)  # probability of each class
        return self

    def _joint_log_likelihood(self, X):
        """Return ``log P(c) + log P(x | c)`` for every sample and class.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples, n_classes)
            Unnormalised log posterior of each class for each sample.

        Raises
        ------
        ValueError
            If ``X`` is not 2-dimensional.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError("X must be 2-dimensional, got %d dimension(s)" % X.ndim)
        # Smoothed variances: avoids log(0) and division by zero for features
        # that are constant within a class.
        var = self.sigma_ + self.epsilon_
        joint_log_likelihood = np.zeros((X.shape[0], len(self.classes_)))
        for i in range(len(self.classes_)):
            log_prior = np.log(self.class_prior_[i])
            # Log of the normal density, evaluated independently per feature
            log_pdf = -0.5 * np.log(2 * np.pi * var[i]) - 0.5 * (X - self.theta_[i]) ** 2 / var[i]
            joint_log_likelihood[:, i] = log_prior + np.sum(log_pdf, axis=1)
        return joint_log_likelihood

    # Backward-compatible alias for the original (misspelled) method name.
    _joint_log_likelyhood = _joint_log_likelihood

    def predict(self, X):
        """Predict the most likely class label for each sample in ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples,)
            Entries of ``classes_`` with the highest joint log likelihood.
        """
        joint_log_likelihood = self._joint_log_likelihood(X)
        return self.classes_[np.argmax(joint_log_likelihood, axis=1)]


# Backward-compatible alias for the original (misspelled) class name.
GausssianNB = GaussianNB

In [24]:
# Score the hand-written classifier on the same train/test split
gnb = GaussianNB()
# Drop the earlier predictions first (presumably so a stale y_pred cannot be reported by mistake)
del y_pred
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
print(
    "Number of mislabeled points out of a total %d points : %d"
    % (X_test.shape[0], (y_test != y_pred).sum())
)

Number of mislabeled points out of a total 75 points : 4
