In [1]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.utils import check_array
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

In [2]:
class idfGaussianNB:
    def __init__(self, var_smoothing=1e-9):
        self.var_smoothing = var_smoothing
        
    def fit(self, X, y):
        n_smaples, n_features = X.shape
        if(n_smaples != y.shape[0]):
            raise ValueError('X and y have incompatible shapes.')
        self._classes = unique_y = np.unique(y)
        n_classes = unique_y.shape[0]
        self._theta = np.zeros((n_classes, n_features))
        self._sigma = np.zeros((n_classes, n_features))
        self._class_prior = np.zeros(n_classes)
        self._epsilon = self.var_smoothing * np.var(X, axis=0).max()
        for i, y_i in enumerate(unique_y):
            self._theta[i, :] = np.mean(X[y == y_i, :], axis=0)
            self._sigma[i, :] = np.var(X[y == y_i, :]) + self._epsilon
            self._class_prior[i] = np.float(np.sum(y == y_i)) / n_smaples
        return self
    
    def predict(self, X):
        jll = self.joint_log_likelihood(X)
        return self._classes[np.argmax(jll, axis=1)]
    
    def predict_proba(self, X):
        return np.exp(self.predict_log_prob(X))
    
    def predict_log_prob(self, X):
        jll = self.joint_log_likelihood(X)
        log_prob_x = np.log(np.sum(np.exp(jll)))
        return jll-np.atleast_2d(log_prob_x).T
        
    def joint_log_likelihood(self, X):
        X = np.atleast_2d(X)
        joint_log_likelihood = []
        for i in range(np.size(self._classes)):
            jointi = np.log(self._class_prior[i])
            n_ij = - 0.5 * np.sum(np.log(2. * np.pi * self._sigma[i, :]))
            n_ij -= 0.5 * np.sum(((X - self._theta[i, :]) ** 2) /
                                 (self._sigma[i, :]), 1)
            joint_log_likelihood.append(jointi + n_ij)

        joint_log_likelihood = np.array(joint_log_likelihood).T
        return joint_log_likelihood
        

In [3]:
data = load_iris()

X = data.data
y = data.target

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)

In [4]:
idf_clf = idfGaussianNB()
clf = GaussianNB()

In [5]:
idf_clf.fit(X_train, y_train)
clf.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [6]:
idf_clf.predict(X_test)

array([1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 2, 1, 0, 0, 0, 2, 2,
       1, 0, 1, 0, 1, 1, 1, 2, 1, 1, 2, 1, 1, 0, 2, 2])

In [7]:
clf.predict(X_test)

array([1, 2, 0, 1, 0, 1, 1, 1, 0, 1, 1, 2, 1, 0, 0, 2, 1, 0, 0, 0, 2, 2,
       2, 0, 1, 0, 1, 1, 1, 2, 1, 1, 2, 2, 2, 0, 2, 2])

In [8]:
print('Same prediction on training set:\n', 
      np.bincount(idf_clf.predict(X_train) == clf.predict(X_train)))
print('Same prediction on test set:\n', 
      np.bincount(idf_clf.predict(X_test) == clf.predict(X_test)))

Same prediction on training set:
 [ 12 100]
Same prediction on test set:
 [ 5 33]
