In [None]:
import numpy as np
from util import get_data
from datetime import datetime
from scipy.stats import norm
from scipy.stats import multivariate_normal as mvn

In [None]:
class NaiveBayes:
    """Gaussian naive Bayes classifier.

    Each class is modelled by an independent Gaussian per feature (a
    diagonal covariance), fitted from the per-class sample mean and variance.
    Class labels may be any hashable, comparable values; they do not have to
    be the integers ``0..K-1``.
    """

    def fit(self, X, Y, smoothing=10e-3):
        """Estimate per-class feature means, variances and priors.

        Args:
            X: array-like of shape (N, D) holding the training samples.
            Y: array-like of length N holding the class label of each sample.
            smoothing: constant added to every per-feature variance so that
                features with zero variance within a class do not produce a
                degenerate Gaussian.
        """
        X = np.asarray(X)
        Y = np.asarray(Y)
        self.gaussians = {}
        self.priors = {}
        # np.unique yields the labels in sorted order, so the dicts are
        # populated deterministically (lowest label first).
        for c in np.unique(Y):
            current_x = X[Y == c]
            self.gaussians[c] = {
                'mean': current_x.mean(axis=0),
                'var': current_x.var(axis=0) + smoothing,
            }
            # Prior = fraction of training samples belonging to class c.
            self.priors[c] = len(current_x) / len(X)

    def score(self, X, Y):
        """Return the fraction of samples in X whose prediction equals Y."""
        P = self.predict(X)
        return np.mean(P == Y)

    def predict(self, X):
        """Return the most probable class label for every row of X.

        Args:
            X: array-like of shape (N, D).

        Returns:
            Array of length N containing the predicted labels (the same
            label values that were passed to ``fit``).
        """
        X = np.asarray(X)
        N, _ = X.shape
        classes = list(self.gaussians)
        # One column of log-posteriors (up to a shared constant) per class.
        P = np.zeros((N, len(classes)))
        for k, c in enumerate(classes):
            g = self.gaussians[c]
            # A 1-D `cov` is interpreted by scipy as the diagonal of the
            # covariance matrix, which is the naive independence assumption.
            P[:, k] = mvn.logpdf(X, mean=g['mean'], cov=g['var']) + np.log(self.priors[c])
        # Index columns by position and translate back to the real labels,
        # so labels need not be contiguous integers starting at zero.
        return np.asarray(classes)[np.argmax(P, axis=1)]
        

In [None]:
# Load the data set and split it in half: first half trains, second half tests.
X, Y = get_data(10000)
Ntrain = len(Y) // 2
Xtrain, Xtest = X[:Ntrain], X[Ntrain:]
Ytrain, Ytest = Y[:Ntrain], Y[Ntrain:]

# Fit the classifier and report how long fitting took.
model = NaiveBayes()
t0 = datetime.now()
model.fit(Xtrain, Ytrain)
print("Training time: {}".format(datetime.now() - t0))

# Accuracy on the training half; the reported time also covers the print above it.
t0 = datetime.now()
train_accuracy = model.score(Xtrain, Ytrain)
print("Train accuracy: {}".format(train_accuracy))
print("Time to compute train accuracy: {}, Train size: {}".format(datetime.now() - t0, len(Ytrain)))

# Accuracy on the held-out half, timed the same way.
t0 = datetime.now()
test_accuracy = model.score(Xtest, Ytest)
print("Test accuracy: {}".format(test_accuracy))
print("Time to compute test accuracy: {}, Test size: {}".format(datetime.now() - t0, len(Ytest)))
