In [1]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
from keras.datasets import mnist

# Load the MNIST train/test splits through Keras.
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Flatten every sample into a single feature row. The number of rows is read
# from the array itself and the number of features is inferred (-1), so the
# cell yields (60000, 784) and (10000, 784) for the standard MNIST splits
# without hard-coding those sizes.
x_train = x_train.reshape(x_train.shape[0], -1)
x_test = x_test.reshape(x_test.shape[0], -1)

In [3]:
# Total number of rows once the train and test splits are stacked together.
n_samples = len(x_train) + len(x_test)

In [4]:
# Number of feature columns per flattened sample (last axis of the 2-D array).
n_features = x_train.shape[-1]

In [5]:
# Creating the dataset: one (n_samples, n_features + 1) array with the feature
# columns first and the class label in the last column. Training rows come
# first and test rows follow, so the boundary between them is len(x_train)
# rather than a hard-coded row number.
dataset = np.zeros([n_samples, n_features + 1])
dataset[:len(x_train), :-1] = x_train
dataset[:len(x_train), -1] = y_train
dataset[len(x_train):, :-1] = x_test
dataset[len(x_train):, -1] = y_test

In [41]:
def naive_bayes_scratch(dataset,alpha,n_train=60000):
    """Fit a Gaussian naive Bayes classifier and measure its error on held-out rows.

    The first ``n_train`` rows of ``dataset`` are used to estimate, for every
    class, the prior P(class) and an independent Gaussian (mean, variance) per
    feature. Every remaining row is assigned the class with the largest
    log-posterior ``log P(class) + sum_f log N(x_f | mean_f, var_f + alpha)``.

    Parameters
    ----------
    dataset : array-like of shape (n_samples, n_features + 1)
        Feature columns followed by the class label in the last column.
    alpha : float
        Constant added to every per-class feature variance. It keeps a feature
        that is constant within a class (variance 0) from causing a division
        by zero, so it should be strictly positive.
    n_train : int, optional
        Number of leading rows used for training; the remaining rows are used
        for evaluation. Defaults to 60000.

    Returns
    -------
    total_error : float
        Fraction of evaluation rows that were misclassified.
    digit_error : list of float
        Misclassification rate per class, ordered like the sorted unique
        labels. ``nan`` for a class with no evaluation rows.
    confusion_matrix : numpy.ndarray of shape (n_classes, n_classes)
        Entry ``[t, p]`` counts evaluation rows whose true class is ``t`` and
        whose predicted class is ``p`` (indices into the sorted unique labels).

    Raises
    ------
    ValueError
        If the split leaves no training rows or no evaluation rows.
    """
    dataset = np.asarray(dataset)

    train_set = dataset[:n_train,:]
    test_set = dataset[n_train:,:]
    if train_set.shape[0] == 0 or test_set.shape[0] == 0:
        raise ValueError("n_train must leave at least one training row and one test row")

    train_x, train_y = train_set[:,:-1], train_set[:,-1]
    test_x, test_y = test_set[:,:-1], test_set[:,-1]

    # np.unique returns the labels sorted, so a label's position in `classes`
    # is used as its row/column index everywhere below.
    classes = np.unique(dataset[:,-1])
    n_classes = len(classes)

    # log P(class) + log P(x | class) for every (test row, class) pair.
    # A class without any training rows keeps -inf and is never predicted.
    log_posterior = np.full((test_x.shape[0], n_classes), -np.inf)

    for j, c in enumerate(classes):
        class_rows = train_x[train_y == c]
        if class_rows.shape[0] == 0:
            continue

        mean = class_rows.mean(axis=0)
        # Adding constant to variance to avoid variance with zero value
        var = class_rows.var(axis=0) + alpha
        log_prior = np.log(class_rows.shape[0] / train_x.shape[0])

        # Gaussian log-density evaluated directly in log space. Taking
        # log(exp(...)) instead underflows to -inf once the exponent gets
        # large, which makes all classes indistinguishable.
        log_likelihood = -0.5 * (np.log(2 * np.pi * var) + (test_x - mean) ** 2 / var)
        log_posterior[:, j] = log_likelihood.sum(axis=1) + log_prior

    # Prediction is the class index with the highest log-posterior
    # (argmax keeps the first index on ties).
    predicted_idx = np.argmax(log_posterior, axis=1)
    true_idx = np.searchsorted(classes, test_y)

    # Confusion matrix: rows are true classes, columns are predicted classes.
    # Both correct and incorrect predictions are counted.
    confusion_matrix = np.zeros([n_classes,n_classes])
    for t, p in zip(true_idx, predicted_idx):
        confusion_matrix[t, p] += 1

    # Per-class misclassification rate: 1 - (correct for class / rows of class)
    class_totals = confusion_matrix.sum(axis=1)
    digit_error = []
    for j in range(n_classes):
        if class_totals[j] == 0:
            digit_error.append(float("nan"))
        else:
            digit_error.append(float(1 - confusion_matrix[j, j] / class_totals[j]))

    # Overall misclassification rate
    correct_prediction = int(np.count_nonzero(predicted_idx == true_idx))
    total_error = 1-(correct_prediction/test_set.shape[0])

    return (total_error,digit_error,confusion_matrix)

In [42]:
def naive_bayes_with_smoothing(dataset, alphas=(500,1000,2000)):
    """Evaluate naive Bayes for several smoothing constants and keep the best one.

    ``naive_bayes_scratch`` is run once per candidate value and the run with
    the lowest total error is returned (the first one wins on ties). The
    winning result is reused rather than fitting the model a second time.

    NOTE(review): the error used for the selection is the one reported by
    ``naive_bayes_scratch``, i.e. it is measured on the held-out rows of
    ``dataset``. The returned error is therefore tuned on the same rows it is
    reported for.

    Parameters
    ----------
    dataset : array of shape (n_samples, n_features + 1)
        Passed unchanged to ``naive_bayes_scratch``.
    alphas : iterable of float, optional
        Candidate variance-smoothing constants. Defaults to (500, 1000, 2000).

    Returns
    -------
    tuple
        ``(total_error, digit_error, confusion_matrix)`` of the best run.

    Raises
    ------
    ValueError
        If ``alphas`` is empty.
    """
    alphas = list(alphas)
    if not alphas:
        raise ValueError("alphas must contain at least one smoothing value")

    # One fit/evaluation per candidate; keep every result so the best one can
    # be returned without recomputing it.
    results = [naive_bayes_scratch(dataset, a) for a in alphas]

    # min() returns the first minimal index, so ties go to the earliest alpha.
    index_alpha = min(range(len(alphas)), key=lambda i: results[i][0])
    print("Alpha value with minimum total error = ", alphas[index_alpha])

    (total_error,digit_error,confusion_matrix) = results[index_alpha]

    return (total_error,digit_error,confusion_matrix)
    

In [43]:
# Sweep the smoothing constants and keep the metrics of the best-performing run.
total_error, digit_error, confusion_matrix = naive_bayes_with_smoothing(dataset)

Alpha value with minimum total error =  1000


In [44]:
# Turn the per-digit error rates into a single-column table, one row per digit.
digit_error = np.array(digit_error).reshape(10, 1)
digit_error_df = pd.DataFrame(data=digit_error, columns=['Digit Misclassification Error'])
print('Digitwise Misclassification Error ')
digit_error_df

Digitwise Misclassification Error 


Unnamed: 0,Digit Misclassification Error
0,0.079592
1,0.034361
2,0.246124
3,0.194059
4,0.348269
5,0.356502
6,0.097077
7,0.185798
8,0.237166
9,0.107037


In [45]:
# Overall misclassification rate returned by the smoothing sweep.
print("Total Classification Error: ")
total_error

Total Classification Error: 


0.18500000000000005