# **3. Implement Naive Bayes**

In [1]:
# Importing necessary libraries.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score

In [14]:
# Defining own Gaussian-Naive-Bayes Classifier class.
class GaussianNaiveBayes:
    """Gaussian Naive Bayes classifier.

    Each feature is modelled, per class, as an independent normal
    distribution. Prediction picks the class with the largest log-posterior:
    ``log(prior) + sum over features of log N(x; mean, variance)``.

    Parameters
    ----------
    var_smoothing : float, default=1e-9
        Fraction of the largest per-feature variance of the training data
        that is added to every class variance. This keeps the variance
        strictly positive when a feature is constant within a class, which
        would otherwise produce a division by zero (NaN posteriors).

    Attributes (set by ``fit``)
    ---------------------------
    classes : ndarray of the sorted unique labels.
    mean : dict mapping label -> per-feature mean vector.
    variance : dict mapping label -> per-feature (smoothed) variance vector.
    priors : dict mapping label -> fraction of training rows with that label.
    """

    def __init__(self, var_smoothing=1e-9):
        self.var_smoothing = var_smoothing

    def __gaussian_log_density(self, mean, variance, x):
        """Return the element-wise log of the normal density N(x; mean, variance).

        The log is computed analytically instead of as ``log(exp(...))`` so that
        points far from the mean yield a large negative number rather than
        ``log(0) = -inf`` after the exponential underflows.
        """
        return -0.5 * (np.log(2.0 * np.pi * variance) + (x - mean) ** 2 / variance)

    def fit(self, X, Y):
        """Estimate per-class means, variances and priors.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features (NumPy array, DataFrame or nested list). A 1-D
            input is treated as a single feature column.
        Y : array-like of shape (n_samples,)
            Class labels.

        Returns
        -------
        self : GaussianNaiveBayes
            The fitted estimator, to allow call chaining.

        Raises
        ------
        ValueError
            If ``X`` is empty or ``X`` and ``Y`` have different lengths.
        """
        X = np.asarray(X, dtype=float)
        Y = np.asarray(Y).ravel()
        if X.ndim == 1:
            X = X.reshape(-1, 1)
        if X.shape[0] == 0:
            raise ValueError("Cannot fit GaussianNaiveBayes on an empty training set.")
        if X.shape[0] != Y.shape[0]:
            raise ValueError(
                "X and Y must contain the same number of samples "
                "(got %d and %d)." % (X.shape[0], Y.shape[0])
            )

        # Variance floor, scaled by the data so it is negligible for
        # well-behaved features but prevents zero variances.
        epsilon = self.var_smoothing * np.var(X, axis=0).max()
        if epsilon <= 0:
            # Every feature is constant over the whole training set.
            epsilon = self.var_smoothing

        self.classes = np.unique(Y)
        self.mean = {}
        self.variance = {}
        self.priors = {}

        for cls in self.classes:
            X_cls = X[Y == cls]
            self.mean[cls] = np.mean(X_cls, axis=0)
            self.variance[cls] = np.var(X_cls, axis=0) + epsilon
            self.priors[cls] = X_cls.shape[0] / float(X.shape[0])

        return self

    def predict(self, X):
        """Predict the most probable class for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to classify (NumPy array, DataFrame or nested list). A 1-D
            input is treated as one scalar value per sample.

        Returns
        -------
        list
            One predicted label per sample, in input order. Ties are resolved
            in favour of the class that comes first in ``self.classes``.
        """
        X = np.asarray(X, dtype=float)
        if X.size == 0:
            return []
        if X.ndim == 1:
            X = X.reshape(-1, 1)

        # Stack the per-class statistics into (n_classes, n_features) arrays.
        means = np.stack([np.atleast_1d(self.mean[cls]) for cls in self.classes])
        variances = np.stack([np.atleast_1d(self.variance[cls]) for cls in self.classes])
        log_priors = np.log(np.array([self.priors[cls] for cls in self.classes], dtype=float))

        # Broadcast to (n_samples, n_classes, n_features) and sum the
        # per-feature log-likelihoods (the naive independence assumption).
        log_likelihood = self.__gaussian_log_density(
            means, variances, X[:, np.newaxis, :]
        ).sum(axis=2)

        best = np.argmax(log_priors + log_likelihood, axis=1)
        return list(self.classes[best])

In [3]:
# Read the anemia CSV file into the DataFrame DF, then display its first five rows.
# NOTE(review): '/content/anemia.csv' is a hard-coded absolute path (presumably a
# Google Colab upload location); adjust it when running the notebook elsewhere.
DF = pd.read_csv('/content/anemia.csv')
DF.head()

Unnamed: 0,Gender,Hemoglobin,MCH,MCHC,MCV,Result
0,1,14.9,22.7,29.1,83.7,0
1,0,15.9,25.4,28.3,72.0,0
2,0,9.0,21.5,29.6,71.2,1
3,0,14.9,16.0,31.4,87.5,0
4,1,14.7,22.0,28.2,99.5,0


In [4]:
# Check DF for missing values and inspect each column's dtype
# (info() lists the non-null count and dtype per column).
DF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1421 entries, 0 to 1420
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Gender      1421 non-null   int64  
 1   Hemoglobin  1421 non-null   float64
 2   MCH         1421 non-null   float64
 3   MCHC        1421 non-null   float64
 4   MCV         1421 non-null   float64
 5   Result      1421 non-null   int64  
dtypes: float64(4), int64(2)
memory usage: 66.7 KB


In [5]:
# Summary statistics for every column: count, mean, standard deviation, min,
# Q1 (25%), median (50%), Q3 (75%) and max.
DF.describe()

Unnamed: 0,Gender,Hemoglobin,MCH,MCHC,MCV,Result
count,1421.0,1421.0,1421.0,1421.0,1421.0,1421.0
mean,0.52076,13.412738,22.90563,30.251232,85.523786,0.436312
std,0.499745,1.974546,3.969375,1.400898,9.636701,0.496102
min,0.0,6.6,16.0,27.8,69.4,0.0
25%,0.0,11.7,19.4,29.0,77.3,0.0
50%,1.0,13.2,22.7,30.4,85.3,0.0
75%,1.0,15.0,26.2,31.4,94.2,1.0
max,1.0,16.9,30.0,32.5,101.6,1.0


In [7]:
# Pairwise Pearson correlation between all columns. The purpose is to check whether
# the features depend (linearly) on one another, since the Naive Bayes model used
# in this notebook treats the features as independent within each class.
DF.corr()
# The feature-to-feature correlations in the recorded output are close to zero,
# so PCA is not applied here (for simplicity).

Unnamed: 0,Gender,Hemoglobin,MCH,MCHC,MCV,Result
Gender,1.0,0.010972,0.001042,0.014574,-0.040447,0.253169
Hemoglobin,0.010972,1.0,0.014081,-0.042597,-0.025885,-0.796261
MCH,0.001042,0.014081,1.0,0.018795,-0.015948,-0.028678
MCHC,0.014574,-0.042597,0.018795,1.0,0.06845,0.048067
MCV,-0.040447,-0.025885,-0.015948,0.06845,1.0,-0.020571
Result,0.253169,-0.796261,-0.028678,0.048067,-0.020571,1.0


In [19]:
# Split the data into training and test sets first, then scale the features with
# MinMaxScaler. The scaler is fitted on the training features only, so no
# information from the test set (its min/max) leaks into preprocessing. The target
# column is left unscaled and DF itself is not overwritten, which keeps this cell
# idempotent when it is re-run.
X = DF.iloc[:, 0:5]   # feature columns
y = DF.iloc[:, -1]    # target column ('Result')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scalar = MinMaxScaler()
# Wrap the scaled arrays back into DataFrames (same columns and row index) so that
# downstream cells can keep calling DataFrame methods such as .to_numpy().
X_train = pd.DataFrame(scalar.fit_transform(X_train), columns=X.columns, index=X_train.index)
X_test = pd.DataFrame(scalar.transform(X_test), columns=X.columns, index=X_test.index)

In [20]:
# Train the hand-written GaussianNaiveBayes class on the training split and
# evaluate it on the test split (accuracy first, then the confusion matrix).
my_gnb = GaussianNaiveBayes()
my_gnb.fit(X=X_train.to_numpy(), Y=y_train.to_numpy())

y_pred_1 = my_gnb.predict(X=X_test.to_numpy())

print(accuracy_score(y_true=y_test, y_pred=y_pred_1))
confusion_matrix(y_true=y_test, y_pred=y_pred_1)

0.9508771929824561


array([[150,   7],
       [  7, 121]])

In [21]:
# Reference run: scikit-learn's GaussianNB trained and evaluated on the same
# split, for comparison with the hand-written classifier.
gnb = GaussianNB().fit(X_train, y_train)
y_pred_2 = gnb.predict(X=X_test)

print(accuracy_score(y_true=y_test, y_pred=y_pred_2))
confusion_matrix(y_true=y_test, y_pred=y_pred_2)

0.9508771929824561


array([[150,   7],
       [  7, 121]])