In [72]:
import numpy as np 
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [95]:
class NBC:
    def __init__(self, feature_types, num_classes):
        self.feature_types = feature_types 
        self.num_classes = num_classes
    
    def fit(self, X, y):
        classes, classCounts = np.unique(y, return_counts=True)
        
        pis = np.expand_dims(classCounts / y.shape[0], axis=1)
        
        allMus = [] 
        allSigmaSqrs = []
        
        for i in range(classes.shape[0]):
            # Get class examples  
            classExampleIdxs = np.argwhere(y==classes[i])[:,0]
        
            # Calculate class parameters
            # Empirical mean / Bernoulli distribution parameter
            mus = np.average(X[classExampleIdxs,:], axis=0)
            # Emperical variance 
            sigmaSqrs = np.var(X[classExampleIdxs,:], axis=0)
            
            allMus.append(mus)
            allSigmaSqrs.append(sigmaSqrs)
        
        allMus = np.array(allMus)
        allSigmaSqrs = np.array(allSigmaSqrs)
        
        self.pis = pis
        self.mus = allMus
        self.sigmaSqrs = allSigmaSqrs
        self.classes = classes
    
    def calcRealProb(self, X, realIndices, clsIdx): 
        mean = self.mus[clsIdx,realIndices]
        variances = self.sigmaSqrs[clsIdx,realIndices] 
        realXs = X[:,realIndices]
        probs = np.exp(-np.square(realXs-mean)/(2*(variances+1e-6)))/np.sqrt(2*np.pi*(variances+1e-6))
        return probs
    
    def calcCatProb(self, X, binIndices, clsIdx):
        params = self.mus[clsIdx,binIndices]
        binXs = X[:,binIndices]
        probs = np.zeros_like(binXs) 
        for i in range(params.shape[0]):
            feature = binXs[:,i]
            featureProbs = np.zeros_like(feature)
            featureProbs[feature == 1] = params[i]
            featureProbs[feature == 0] = 1 - params[i]
            probs[:,i] = featureProbs
        return probs
        
    def calcClassProb(self, X, clsIdx):
        featureTypes = self.feature_types
        binIndices = [i for i, x in enumerate(featureTypes) if x == 'b']
        realIndices = [i for i, x in enumerate(featureTypes) if x == 'r']
        
        realProbs = self.calcRealProb(X, realIndices, clsIdx)
        catProbs = self.calcCatProb(X, binIndices, clsIdx)
        
        # Ensure no zeros
        realProbs[realProbs == 0] = 1e-6
        catProbs[catProbs == 0] = 1e-6
        
        realProbs = np.log(realProbs)
        catProbs = np.log(catProbs)
        realProbs = np.sum(realProbs,axis=1)
        catProbs = np.sum(catProbs, axis=1)
        
        return realProbs + catProbs
    
    def predict(self, X):
        classProbs = []
        for i in range(self.classes.shape[0]):
            classProbs.append(self.calcClassProb(X, i))
        classProbs = np.array(classProbs)
        predictedClassIdx = np.argmax(classProbs,axis=0)
        return self.classes[predictedClassIdx]
        

In [58]:
nbc = NBC(feature_types=['b','r','b','r'], num_classes=2)
nbc.fit(np.array([[[1],[0.5],[1],[0.5]],[[1],[0.5],[0],[0.5]],[[1],[0.5],[0],[0.5]]]),np.array([[2],[0],[2]]))

[[1. ]
 [0.5]
 [0. ]
 [0.5]]
[[1. ]
 [0.5]
 [0.5]
 [0.5]]


In [81]:
X = np.array([[[1],[0.5],[1],[0.5]],[[1],[0.5],[0],[0.5]],[[1],[0.5],[0],[0.5]]])
print(X.shape)
yhat = nbc.predict(X)
y = np.array([[2],[0],[2]])
print(y.shape)
test_accuracy = np.mean(y == yhat)
print(test_accuracy)

(3, 4, 1)
(3, 1)
0.6666666666666666


In [131]:
# Iris Dataset 
iris = load_iris() 
X, y = iris['data'], iris['target']
XTrain, XTest, yTrain, yTest =\
    train_test_split(X, y, test_size=0.25)
XTrain = np.expand_dims(XTrain,axis=2)
XTest = np.expand_dims(XTest,axis=2)
yTrain = np.expand_dims(yTrain,axis=1)
yTest = np.expand_dims(yTest,axis=1)

nbc = NBC(feature_types=['r','r','r','r'], num_classes=3)
nbc.fit(XTrain, yTrain)
nbcYTrainPredict = nbc.predict(XTrain)
nbcYTestPredict = nbc.predict(XTest)
nbcTrainAccuracy = np.mean(nbcYTrainPredict == yTrain)
nbcTestAccuracy = np.mean(nbcYTestPredict == yTest)

logReg = LogisticRegression(\
    solver='lbfgs', multi_class='multinomial', max_iter=1000)
logReg.fit(np.squeeze(XTrain), np.squeeze(yTrain))
logRegYTrainPredict = logReg.predict(np.squeeze(XTrain))
logRegYTestPredict = logReg.predict(np.squeeze(XTest))
logRegTrainAccuracy = np.mean(logRegYTrainPredict == yTrain)
logRegTestAccuracy = np.mean(logRegYTestPredict == yTest)

print("Naive Bayes Training Accuracy", nbcTrainAccuracy)
print("Naive Bayes Test Accuracy", nbcTestAccuracy)
print("Logistic Regression Training Accuracy", logRegTrainAccuracy)
print("Logistic Regression Test Accuracy", logRegTestAccuracy)


Naive Bayes Training Accuracy 0.9553571428571429
Naive Bayes Test Accuracy 0.9736842105263158
Logistic Regression Training Accuracy 0.3335459183673469
Logistic Regression Test Accuracy 0.3365650969529086
