In [219]:
import numpy as np
from numpy import array as ar
import pandas as pd

class NaiveBayesSingleVar:

    def __init__(self):
         pass
        
    def fit(self, X_train, Y_train):

        self.Y_unique = np.unique(Y_train)
        print(self.Y_unique)
        
        self.y_count = len(self.Y_unique)
        self.X_unique = np.unique(X_train)
        print(self.X_unique)
        
        self.x_count = len(self.X_unique)
        self.__X_given_y = [X_train[Y_train == y] for y in np.unique(Y_train)]

        prob_table = np.zeros((self.x_count,self.y_count))
        priors = np.zeros(self.y_count)
        x_normalization = np.zeros(self.x_count)
        n_points = len(Y_train)
        for i, x in enumerate(self.X_unique):
            x_normalization[i] = len(X_train[X_train == x])/n_points
            for j, y in enumerate(self.Y_unique):
                denominator = len(Y_train[Y_train == y])
                
                numerator = len(X_train[(X_train == x) & (Y_train == y)])
                prob = numerator/denominator
                prob_table[i,j] = prob

        for j, y in enumerate(self.Y_unique):
            denominator = len(Y_train[Y_train == y])
            priors[j] = denominator/n_points

        self.prob_table = pd.DataFrame(prob_table.T, columns = self.X_unique, index = self.Y_unique)
        self.priors = pd.DataFrame(priors, index = self.Y_unique)
        self.x_normalization = pd.DataFrame(x_normalization, index = self.X_unique)


    def predict_proba(self, X_test):

        x_test_len = len(X_test)

        results = np.ones((x_test_len, len(self.Y_unique)))
        for i, x_test in enumerate(X_test):
            for j,y in enumerate(self.Y_unique):
                likelihood = self.prob_table[x_test][y]
                prior = self.priors.loc[y]
                x_norm =self.x_normalization.loc[x_test]
                
                results[i,j] = (likelihood*prior)/x_norm

        return results


if __name__ == '__main__':
    X_train = ar([3,3,3,3,3,2,2,2,2,2,7])
    Y_train = ar([0,1,0,1,1,0,1,1,0,0,0])

    naivebayes_single_var = NaiveBayesSingleVar()
    naivebayes_single_var.fit(X_train, Y_train)

    X_test = ar([3,2])

    results = naivebayes_single_var.predict_proba(X_test)

#     print(results)


[0 1]
[2 3 7]


# test data
## https://medium.com/@rangavamsi5/naïve-bayes-algorithm-implementation-from-scratch-in-python-7b2cc39268b9

In [206]:
df = pd.read_csv('naivebayes/toy_data.csv')

X_train = df.drop('Play', axis = 1).values.T

X_train

Y_train = df['Play'].values

In [207]:
X_train[:,]

array([['Rainy', 'Rainy', 'Overcast', 'Sunny', 'Sunny', 'Sunny',
        'Overcast', 'Rainy', 'Rainy', 'Sunny', 'Rainy', 'Overcast',
        'Overcast', 'Sunny'],
       ['Hot', 'Hot', 'Hot', 'Mild', 'Cool', 'Cool', 'Cool', 'Mild',
        'Cool', 'Mild', 'Mild', 'Mild', 'Hot', 'Mild'],
       ['High', 'High', 'High', 'High', 'Normal', 'Normal', 'Normal',
        'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'High'],
       ['f', 't', 'f', 'f', 'f', 't', 't', 'f', 'f', 'f', 't', 't', 'f',
        't']], dtype=object)

In [208]:
__prob_table_dd = []
__X_normalization_dd = []

for X_train_single_feature in X_train[:,]:
    naivebayes_single_var = NaiveBayesSingleVar()
    naivebayes_single_var.fit(X_train_single_feature, Y_train)

    __prob_table_dd.append(naivebayes_single_var.prob_table)
    __X_normalization_dd.append(naivebayes_single_var.x_normalization)

__priors = naivebayes_single_var.priors

Y_unique = np.unique(Y_train)

['no' 'yes']
['Overcast' 'Rainy' 'Sunny']
['no' 'yes']
['Cool' 'Hot' 'Mild']
['no' 'yes']
['High' 'Normal']
['no' 'yes']
['f' 't']


In [213]:
X_test = X_test.T

In [214]:
X_test

array([3, 2])

In [215]:
X_test = ar([['Rainy','Mild','Normal','t']])

In [216]:
for i in range(X_train[:,].shape[0]):
   print(__prob_table_dd[0])

     Overcast     Rainy     Sunny
no   0.000000  0.600000  0.400000
yes  0.444444  0.222222  0.333333
     Overcast     Rainy     Sunny
no   0.000000  0.600000  0.400000
yes  0.444444  0.222222  0.333333
     Overcast     Rainy     Sunny
no   0.000000  0.600000  0.400000
yes  0.444444  0.222222  0.333333
     Overcast     Rainy     Sunny
no   0.000000  0.600000  0.400000
yes  0.444444  0.222222  0.333333


In [217]:
results = np.zeros((X_test.shape[0], len(Y_unique)))
for i, x_test in enumerate(X_test):
    print(x_test)
    print(i)
    for j, y in enumerate(Y_unique):

        likelihood = []
        prior = __priors.loc[y][0]
        x_norm = []
        
        # Naive Bayes Assumption of independence among dimensions
        for d in range(X_train[:,].shape[0]):
            x_test_d = x_test[d]
            
            likelihood.append(__prob_table_dd[d][x_test_d][y])
            x_norm.append(__X_normalization_dd[d].loc[x_test_d])
        
        likelihood_prod = np.prod(likelihood)
#         print(likelihood_prod)
        posterior_numerator = likelihood_prod*prior
#         print(posterior_numerator)
        posterior_denominator = np.prod(x_norm)
#         print(posterior_denominator)

        posterior = posterior_numerator/posterior_denominator
        
        results[i,j] = posterior

['Rainy' 'Mild' 'Normal' 't']
0


In [218]:
results

array([[0.3136    , 0.43017833]])