# Naive Bayes with MNIST


In [3]:
import numpy as np
import import_ipynb
from utils import get_mnist_data
from datetime import datetime
from scipy.stats import norm
from scipy.stats import multivariate_normal as mvn

For the naive bayes model, we will need to store the Gaussian paramaeters in a dictionary containing the means and variance for each label (Y values). 

To calculate the priors more efficiently (since they are monotonically increasing), we use the log  likelihood. This in

In [None]:
class NaiveBayes(object):
    def fit(self, X, Y, smoothing=10e-3):
        self.gaussians = dict()
        self.priors = dict()
        labels = set(Y) # store all unique values in Y
        
        for c in labels:
            current_x = X[Y == c] # give the current value of X where y is the same as the current label c.
            self.gaussians[c] = {
                'mean': current_x.mean(axis=0),
                'var': current_x.var(axis=0) + smoothing
            }
            
            self.priors[c] = -np.log(float(len(Y[Y==c]))/len(Y))
            
    def score(self,X,Y):
        P = self.predict(X)
        return np.mean(P==Y)
    
    def predict(self,X):
        N,D = X.shape
        K = len(self.gaussians) # for each N sample there are K different probabilities to be calculated
        P = np.zeros((N,K))
        # loop through all gaussians to get the mean and the variance
        for c, g in self.gaussians.items():
            mean, var = g['mean'],g['var']
            P[:,c] = mvn.logpdf(X, mean=mean, cov=var) + self.priors[c]
        return np.argmax(P,axis=1)
        
    
if __name__ == '__main__':
    fn = '../mnist/train.csv'
    X,Y = get_mnist_data(fn,10000)
    Ntrain = int(len(Y)/2)
    Xtrain, Ytrain = X[:Ntrain], Y[:Ntrain]
    Xtest, Ytest = X[Ntrain:], Y[Ntrain:]
    
    model = NaiveBayes()
    
    # time how long it takes to fit the model
    t0 = datetime.now()
    model.fit(Xtrain,Ytrain)
    print('Training time:',(datetime.now() - t0))
    
    
    # time how long it takes to take the training accuracy
    t0 = datetime.now()
    print('Training accuracy:', model.score(Xtrain,Ytrain))
    print('Time taken to compute train accuracy of train size:',len(Ytrain),'::', (datetime.now()-t0))
    
    #time how long it takes to get the test accuracy
    t0 = datetime.now()
    print('Testing accuracy:',model.score(Xtest,Ytest))
    print('Time taken to compute testing accuracy of sample size:',len(Ytest),'::',(datetime.now()-t0))

Training time: 0:00:00.723595
Training accuracy: 0.8148
Time taken to compute train accuracy of train size: 5000 :: 0:00:34.926350
