In [1]:
import numpy as np
import pandas as pd

# Naive Bayes Classifier 
It is a conditional probability model, with formula: <br>
$ P(C| x_1, x_2, x_3, ...) = \frac{P(C)P(X|C)}{P(X)}$ <br>
It is naive because we have naive assumption such that every pair of features are independent from each other given C.<br>
So we can rewrite the formula as: <br>
$ P(C| x_1, x_2, x_3, ...) = P(C)P(x_1|C)P(x_2|C)... = P(C)\prod^{n}_{i=1} P(x_i|C)$

In [2]:
class Naive_Bayes():
    """
    
    Naive Bayes classifer
    
    Attributes:
        prior: P(Y)
        likelihood: P(X_j | Y)
    """
    
    def __init__(self):
        """
            Some initializations, if neccesary
        """
        
        self.model_name = 'Naive Bayes'
    
    
    def fit(self, X_train, y_train):
        
        """ 
            The fit function fits the Naive Bayes model based on the training data. 
            Here, we assume that all the features are **discrete** features. 
            
            X_train is a matrix or 2-D numpy array, represnting training instances. 
            Each training instance is a feature vector. 

            y_train contains the corresponding labels. There might be multiple (i.e., > 2) classes.
        """
        
        """
            TODO: 1. Modify and add some codes to the following for-loop
                     to compute the correct prior distribution of all y labels.
                  2. Make sure they are normalized to a distribution.
        """
        X_train = np.array(X_train)
        y_train = np.array(y_train).ravel()
        unique_y = np.unique(y_train)
        self.priors = dict()
        self.likelihood = dict()
        for y in unique_y:
            self.priors[f'Y={y}'] = np.sum(y_train == y) / len(y_train)
        prior_sum = sum(self.priors.values())
        for y in unique_y:
            self.priors[f'Y={y}'] /= prior_sum


            
        """
            TODO: 3. Modify and add some codes to the following for-loops
                     to compute the correct likelihood P(X_j | Y).
                  4. Make sure they are normalized to distributions.
        """


        for y in unique_y:
            X_y = X_train[y_train == y]
            for j in range(X_train.shape[1]):
                values, counts = np.unique(X_y[:, j], return_counts=True)
                for v, c in zip(values, counts):
                    self.likelihood[f'X{j}={v}|Y={y}'] = c / len(X_y)
#         print(self.likelihood)

        """
            TODO: 5. Think about whether we really need P(X_1 = x_1, X_2 = x_2, ..., X_d = x_d)
                     in practice?
                  6. Does this really matter for the final classification results?
        """

        
    def ind_predict(self, x : list):
        
        """ 
            Predict the most likely class label of one test instance based on its feature vector x.
        """
        
        """
            TODO: 7. Enumerate all possible class labels and compute the likelihood 
                     based on the given feature vector x. Don't forget to incorporate 
                     both the prior and likelihood.
                  8. Pick the label with the higest probability. 
                  9. How to deal with very small probability values, especially
                     when the feature vector is of a high dimension. (Hint: log)
                  10. How to how to deal with unknown feature values?
        """
        
        ret, max_prob = None, 0
        for y in self.priors.keys():
            prob = self.priors[y]
            for i in range(x.shape[0]):
                for j in range(x.shape[1]):
                    likelihood_key = f'X{j}={x[i,j]}|{y}'
#                     print(likelihood_key)
                    if likelihood_key in self.likelihood:
                        prob *= self.likelihood[likelihood_key]
                    else:
                        prob *= np.log(prob)
            if prob > max_prob:
                max_prob = prob
                ret = y.split('=')[1]
        return ret
    

    
    def predict(self, X):
        
        """
            X is a matrix or 2-D numpy array, represnting testing instances. 
            Each testing instance is a feature vector. 
            
            Return the predictions of all instances in a list.
        """
        
        """
            TODO: 11. Revise the following for-loop to call ind_predict to get predictions.
        """
        
        ret = []
        for x in X:
            ret.append(self.ind_predict(x))
        
        return ret
        

In [3]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data'
col = ['class_name','left_weight','left_distance','right_weight','right_distance']
data = pd.read_csv(url, delimiter = ',', names = col)

In [4]:
data

Unnamed: 0,class_name,left_weight,left_distance,right_weight,right_distance
0,B,1,1,1,1
1,R,1,1,1,2
2,R,1,1,1,3
3,R,1,1,1,4
4,R,1,1,1,5
...,...,...,...,...,...
620,L,5,5,5,1
621,L,5,5,5,2
622,L,5,5,5,3
623,L,5,5,5,4


In [5]:
data.class_name.value_counts()

R    288
L    288
B     49
Name: class_name, dtype: int64

In [6]:
X = np.matrix(data.iloc[:,1:])
y = data.class_name
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,random_state = 88)
y_test

188    R
614    L
30     B
386    R
472    L
      ..
449    R
383    R
378    B
170    R
205    L
Name: class_name, Length: 207, dtype: object

In [31]:
clf = Naive_Bayes()
clf.fit(X_train, y_train)
y_test = np.array(y_test)
y_hat = clf.predict(X_test)

Overall Accuracy

In [8]:
sum(y_hat == y_test)/ 207  # you should get something like 0.88

0.8840579710144928

In [9]:
len(y_test)

207

In [10]:
len(y_hat)

207

My data

In [26]:
df = pd.read_csv('diabetes_binary_health_indicators_BRFSS2015.csv')
df_clean = df.drop(columns = ['Fruits','AnyHealthcare','Sex','NoDocbcCost'])
X = np.matrix(df.iloc[:,1:])
y = df['Diabetes_binary']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,random_state = 88)
y_test = y_test.astype(str)
y_test

243435    0.0
168878    0.0
5656      0.0
155364    0.0
105198    0.0
         ... 
171531    1.0
66331     0.0
204922    0.0
1052      0.0
169474    0.0
Name: Diabetes_binary, Length: 83715, dtype: object

In [29]:
clf = Naive_Bayes()
clf.fit(X_train, y_train)
y_test = np.array(y_test)
y_hat = clf.predict(X_test)

In [30]:
sum(y_hat == y_test)/len(y_test)

0.8151466284417368