In [16]:
# import the libraries
import numpy as np
import pandas as pd
from scipy.stats import norm
from scipy.stats import multivariate_normal as mvn
from datetime import datetime

In [31]:
# get data
def get_data(size=None, path='/Users/sunpreet/Downloads/train.csv'):
    """Load a labelled CSV, shuffle its rows and split it into features/labels.

    The first column of the file is returned as the label vector; every
    remaining column is scaled by 1/255 and returned as the feature matrix
    (presumably 8-bit pixel intensities -- confirm against the data file).

    Parameters
    ----------
    size : int or None
        Number of rows to read from the top of the file; None reads all rows.
    path : str
        Location of the CSV file. Defaults to the path the notebook was
        originally written against.

    Returns
    -------
    X : numpy.ndarray, shape (n_rows, n_columns - 1)
        Feature columns divided by 255.
    Y : numpy.ndarray, shape (n_rows,)
        Values of the first column.
    """
    df = pd.read_csv(path, nrows=size)
    # DataFrame.as_matrix() no longer exists in current pandas; to_numpy() is
    # its replacement. An explicit copy guarantees a writable array for the
    # in-place shuffle below.
    data = df.to_numpy(copy=True)
    # Shuffles whole rows (axis 0), so each label stays with its features.
    np.random.shuffle(data)
    X = data[:, 1:] / 255.0
    Y = data[:, 0]
    return X, Y
    

In [37]:
# define the naive bayes class

class NaiveBayes(object):
    """Gaussian naive Bayes classifier with additive variance smoothing.

    After ``fit`` the instance holds two dicts keyed by class label:
    ``gaussian`` maps a label to ``{'mean': ..., 'var': ...}`` (per-feature
    statistics of that class) and ``prior`` maps a label to the fraction of
    training samples carrying it.
    """

    def __init__(self):
        self.gaussian = {}
        self.prior = {}

    def fit(self, X, Y, smoothing=1e-2):
        """Estimate per-class feature means, variances and class priors.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
        Y : array-like, shape (n_samples,)
            Class labels; any hashable values that numpy can sort.
        smoothing : float
            Constant added to every variance so that a feature which is
            constant within a class does not yield a zero variance.
        """
        X = np.asarray(X)
        Y = np.asarray(Y)
        self.gaussian = {}
        self.prior = {}
        n_samples = float(len(Y))
        # np.unique yields the labels in sorted order, which fixes the
        # (deterministic) order in which predict() lays out its columns.
        for label in np.unique(Y):
            members = X[Y == label]
            self.gaussian[label] = {
                'mean': members.mean(axis=0),
                'var': members.var(axis=0) + smoothing,
            }
            self.prior[label] = len(members) / n_samples

    def score(self, X, Y):
        """Return the fraction of rows in X whose prediction equals Y."""
        Prediction = self.predict(X)
        return np.mean(np.asarray(Y) == Prediction)

    def predict(self, X):
        """Return the most probable class label for every row of X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray, shape (n_samples,)
            The label (not a column index) with the highest log posterior;
            ties go to the label that was fitted first.

        Raises
        ------
        ValueError
            If the model has not been fitted, or X is not two-dimensional.
        """
        X = np.asarray(X)
        n_samples, _ = X.shape  # also rejects anything that is not 2-D
        labels = list(self.gaussian)
        if not labels:
            raise ValueError("NaiveBayes.predict called before fit")
        # One column per class, addressed by position rather than by the label
        # value, so labels need not be the integers 0..k-1.
        log_posterior = np.empty((n_samples, len(labels)))
        for column, label in enumerate(labels):
            params = self.gaussian[label]
            # A 1-D `cov` is interpreted by scipy as a diagonal covariance.
            log_posterior[:, column] = (
                mvn.logpdf(X, mean=params['mean'], cov=params['var'])
                + np.log(self.prior[label])
            )
        best_column = np.argmax(log_posterior, axis=1)
        return np.asarray(labels)[best_column]
        
            

In [42]:
if __name__ == '__main__':
    # Load 10,000 rows; the first half trains the model, the rest tests it.
    X, Y = get_data(10000)
    Ntrain = len(Y) // 2
    Xtrain, Xtest = X[:Ntrain], X[Ntrain:]
    Ytrain, Ytest = Y[:Ntrain], Y[Ntrain:]
    model = NaiveBayes()

    # Wall-clock cost of fitting.
    t0 = datetime.now()
    model.fit(Xtrain, Ytrain)
    fit_elapsed = datetime.now() - t0
    print("Training time:", fit_elapsed)

    # Accuracy on the data the model was fitted on, and how long scoring took.
    t0 = datetime.now()
    train_accuracy = model.score(Xtrain, Ytrain)
    print("Train accuracy:", train_accuracy)
    print("Time to compute train accuracy:", datetime.now() - t0)

    # Accuracy on the held-out half, and how long scoring took.
    t0 = datetime.now()
    test_accuracy = model.score(Xtest, Ytest)
    print("Test accuracy:", test_accuracy)
    print("Time to compute test accuracy:", datetime.now() - t0)
    

Training time: 0:00:00.166615
Train accuracy: 0.812
Time to compute train accuracy: 0:00:01.853663
Test accuracy: 0.803
Time to compute test accuracy: 0:00:01.835861


In [39]:
# import Naive Bayes from SKlearn
from sklearn.naive_bayes import GaussianNB

In [41]:
# Repeat the same thing from SKlearn
# Reload (and reshuffle) 10,000 rows, then split them into equal halves.
X, Y = get_data(10000)
# Floor division keeps the split point an int: under Python 3 `len(Y)/2` is a
# float, which NumPy does not accept as a slice bound.
Ntrain = len(Y) // 2
Xtrain, Ytrain = X[:Ntrain], Y[:Ntrain]
Xtest, Ytest = X[Ntrain:], Y[Ntrain:]
# NOTE(review): the recorded run scores lower than the hand-written model above;
# presumably because GaussianNB's default variance smoothing is much smaller
# than the 1e-2 used there -- confirm before drawing conclusions.
model1 = GaussianNB()

# Wall-clock cost of fitting.
t0 = datetime.now()
model1.fit(Xtrain, Ytrain)
print("Training time:", (datetime.now() - t0))

# Accuracy on the training half, and how long scoring took.
t0 = datetime.now()
print("Train accuracy:", model1.score(Xtrain, Ytrain))
print("Time to compute train accuracy:", (datetime.now() - t0))

# Accuracy on the held-out half, and how long scoring took.
t0 = datetime.now()
print("Test accuracy:", model1.score(Xtest, Ytest))
print("Time to compute test accuracy:", (datetime.now() - t0))



  app.launch_new_instance()


Training time: 0:00:00.109602
Train accuracy: 0.5842
Time to compute train accuracy: 0:00:00.526996
Test accuracy: 0.5584
Time to compute test accuracy: 0:00:00.575445
