# Naive Bayes Classifier on Wine Dataset

## Import Modules

In [1]:
import numpy as np
import pandas as pd

import random
import math

## Load and Prepare Data

In [2]:
df=pd.read_csv("Datasets/Wine.csv")
df.head()

Unnamed: 0,Wine,Alcohol,Malic.acid,Ash,Acl,Mg,Phenols,Flavanoids,Nonflavanoid.phenols,Proanth,Color.int,Hue,OD,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [3]:
# Copy the class column "Wine" into a new trailing column named "label",
# then drop the original so the target ends up as the last column
# (later cells slice features with [:, :-1] and the target with [:, -1]).
df["label"] = df.Wine
df = df.drop("Wine",axis=1)
df.head()

Unnamed: 0,Alcohol,Malic.acid,Ash,Acl,Mg,Phenols,Flavanoids,Nonflavanoid.phenols,Proanth,Color.int,Hue,OD,Proline,label
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065,1
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050,1
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185,1
3,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480,1
4,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Alcohol               178 non-null    float64
 1   Malic.acid            178 non-null    float64
 2   Ash                   178 non-null    float64
 3   Acl                   178 non-null    float64
 4   Mg                    178 non-null    int64  
 5   Phenols               178 non-null    float64
 6   Flavanoids            178 non-null    float64
 7   Nonflavanoid.phenols  178 non-null    float64
 8   Proanth               178 non-null    float64
 9   Color.int             178 non-null    float64
 10  Hue                   178 non-null    float64
 11  OD                    178 non-null    float64
 12  Proline               178 non-null    int64  
 13  label                 178 non-null    int64  
dtypes: float64(11), int64(3)
memory usage: 19.6 KB


## Train-Test Data Split

In [5]:
def train_test_split(df,test_size):
    """Randomly partition ``df`` into a training and a test DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split; rows are selected by index label.
    test_size : int or float
        A float is interpreted as the fraction of rows to hold out (rounded
        to the nearest row count); any other value is used directly as the
        number of test rows.

    Returns
    -------
    (train_df, test_df)
        ``test_df`` holds the sampled rows in sampled order; ``train_df``
        holds every remaining row in its original order.

    Sampling uses the global ``random`` module, so call ``random.seed``
    beforehand for a reproducible split.
    """
    # Resolve a fractional test size into an absolute number of rows.
    n_test = round(test_size*len(df)) if isinstance(test_size,float) else test_size

    # Draw the held-out index labels without replacement.
    held_out = random.sample(population=df.index.tolist(),k=n_test)

    return df.drop(held_out),df.loc[held_out]

In [6]:
# Seed Python's global `random` module, which train_test_split samples from,
# so the same rows are held out on every run.
random.seed(0)
# test_size is an int here, so it is used as an absolute number of test rows.
train_df,test_df=train_test_split(df,test_size=25)

In [7]:
test_df.head()

Unnamed: 0,Alcohol,Malic.acid,Ash,Acl,Mg,Phenols,Flavanoids,Nonflavanoid.phenols,Proanth,Color.int,Hue,OD,Proline,label
98,12.37,1.07,2.1,18.5,88,3.52,3.75,0.24,1.95,4.5,1.04,2.77,660,2
107,12.72,1.75,2.28,22.5,84,1.38,1.76,0.48,1.63,3.3,0.88,2.42,488,2
10,14.1,2.16,2.3,18.0,105,2.95,3.32,0.22,2.38,5.75,1.25,3.17,1510,1
66,13.11,1.01,1.7,15.0,78,2.98,3.18,0.26,2.28,5.3,1.12,3.18,502,2
130,12.86,1.35,2.32,18.0,122,1.51,1.25,0.21,0.94,4.1,0.76,1.29,630,3


In [8]:
train_df.head()

Unnamed: 0,Alcohol,Malic.acid,Ash,Acl,Mg,Phenols,Flavanoids,Nonflavanoid.phenols,Proanth,Color.int,Hue,OD,Proline,label
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065,1
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050,1
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185,1
3,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480,1
4,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735,1


## Converting to numpy array

In [9]:
data = train_df.values
data[:5]

array([[1.423e+01, 1.710e+00, 2.430e+00, 1.560e+01, 1.270e+02, 2.800e+00,
        3.060e+00, 2.800e-01, 2.290e+00, 5.640e+00, 1.040e+00, 3.920e+00,
        1.065e+03, 1.000e+00],
       [1.320e+01, 1.780e+00, 2.140e+00, 1.120e+01, 1.000e+02, 2.650e+00,
        2.760e+00, 2.600e-01, 1.280e+00, 4.380e+00, 1.050e+00, 3.400e+00,
        1.050e+03, 1.000e+00],
       [1.316e+01, 2.360e+00, 2.670e+00, 1.860e+01, 1.010e+02, 2.800e+00,
        3.240e+00, 3.000e-01, 2.810e+00, 5.680e+00, 1.030e+00, 3.170e+00,
        1.185e+03, 1.000e+00],
       [1.437e+01, 1.950e+00, 2.500e+00, 1.680e+01, 1.130e+02, 3.850e+00,
        3.490e+00, 2.400e-01, 2.180e+00, 7.800e+00, 8.600e-01, 3.450e+00,
        1.480e+03, 1.000e+00],
       [1.324e+01, 2.590e+00, 2.870e+00, 2.100e+01, 1.180e+02, 2.800e+00,
        2.690e+00, 3.900e-01, 1.820e+00, 4.320e+00, 1.040e+00, 2.930e+00,
        7.350e+02, 1.000e+00]])

In [10]:
X_train = data[:,:-1]
X_train

array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00,
        1.065e+03],
       [1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00,
        1.050e+03],
       [1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00,
        1.185e+03],
       ...,
       [1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00,
        8.350e+02],
       [1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00,
        8.400e+02],
       [1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00,
        5.600e+02]])

In [11]:
Y_train = data[:,-1]
Y_train

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 3., 3., 3., 3., 3., 3., 3., 3.,
       3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
       3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.])

In [12]:
data1 = test_df.values
X_test = data1[:,:-1]
X_test

array([[1.237e+01, 1.070e+00, 2.100e+00, 1.850e+01, 8.800e+01, 3.520e+00,
        3.750e+00, 2.400e-01, 1.950e+00, 4.500e+00, 1.040e+00, 2.770e+00,
        6.600e+02],
       [1.272e+01, 1.750e+00, 2.280e+00, 2.250e+01, 8.400e+01, 1.380e+00,
        1.760e+00, 4.800e-01, 1.630e+00, 3.300e+00, 8.800e-01, 2.420e+00,
        4.880e+02],
       [1.410e+01, 2.160e+00, 2.300e+00, 1.800e+01, 1.050e+02, 2.950e+00,
        3.320e+00, 2.200e-01, 2.380e+00, 5.750e+00, 1.250e+00, 3.170e+00,
        1.510e+03],
       [1.311e+01, 1.010e+00, 1.700e+00, 1.500e+01, 7.800e+01, 2.980e+00,
        3.180e+00, 2.600e-01, 2.280e+00, 5.300e+00, 1.120e+00, 3.180e+00,
        5.020e+02],
       [1.286e+01, 1.350e+00, 2.320e+00, 1.800e+01, 1.220e+02, 1.510e+00,
        1.250e+00, 2.100e-01, 9.400e-01, 4.100e+00, 7.600e-01, 1.290e+00,
        6.300e+02],
       [1.187e+01, 4.310e+00, 2.390e+00, 2.100e+01, 8.200e+01, 2.860e+00,
        3.030e+00, 2.100e-01, 2.910e+00, 2.800e+00, 7.500e-01, 3.640e+00,
        3.80

In [13]:
Y_test = data1[:,-1]
Y_test

array([2., 2., 1., 2., 3., 2., 2., 2., 2., 2., 3., 1., 2., 1., 2., 3., 1.,
       3., 2., 3., 3., 1., 2., 1., 1.])

## Naive Bayes Algorithm

In [14]:
class NaiveBayes:
    """Gaussian Naive Bayes classifier for continuous features.

    Every feature is modelled, per class, as an independent normal
    distribution whose mean and variance are estimated in ``fit``.
    ``predict`` returns, for each sample, the class with the largest
    log-posterior (log prior + sum of per-feature log densities).
    """

    # Relative floor applied to the per-class variances: variances smaller
    # than this fraction of the largest overall feature variance are raised
    # to it, so a feature that is constant within a class cannot cause a
    # division by zero.
    _VAR_SMOOTHING = 1e-9

    def fit(self, X, y):
        """Estimate per-class feature means, variances and class priors.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features.
        y : array-like of shape (n_samples,)
            Class label of each training sample.
        """
        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y)
        n_samples, n_features = X.shape
        self._classes = np.unique(y)     # sorted array of the distinct labels
        n_classes = len(self._classes)

        # One row per class, one column per feature.
        self._mean = np.zeros((n_classes, n_features), dtype=np.float64)
        self._var = np.zeros((n_classes, n_features), dtype=np.float64)
        self._priors = np.zeros(n_classes, dtype=np.float64)

        for idx, c in enumerate(self._classes):
            X_c = X[y == c]
            self._mean[idx, :] = X_c.mean(axis=0)
            self._var[idx, :] = X_c.var(axis=0)
            # Prior = fraction of training samples belonging to this class.
            self._priors[idx] = X_c.shape[0] / float(n_samples)

        # Floor the variances so zero-variance features stay usable.
        scale = X.var(axis=0).max() if X.size else 0.0
        floor = self._VAR_SMOOTHING * scale
        if not floor > 0:
            # All features constant (or no data): fall back to an absolute floor.
            floor = self._VAR_SMOOTHING
        np.maximum(self._var, floor, out=self._var)

    def predict(self, X):
        """Return a list with the predicted class label of each row of ``X``."""
        return [self._predict(x) for x in X]

    def _predict(self, x):
        """Return the most probable class label for a single sample ``x``."""
        log_posteriors = np.log(self._priors) + self._log_likelihood(x)
        return self._classes[np.argmax(log_posteriors)]

    def _log_likelihood(self, x):
        """Return, for every class, the summed Gaussian log-density of ``x``.

        The log-density is evaluated analytically instead of as
        ``log(exp(...))``: for samples far from a class mean ``exp``
        underflows to 0 and the logarithm becomes ``-inf`` for every class,
        which makes the arg-max meaningless.
        """
        sq_dev = (x - self._mean) ** 2
        per_feature = np.log(2.0 * np.pi * self._var) + sq_dev / self._var
        return -0.5 * np.sum(per_feature, axis=1)

    def _pdf(self, class_idx, x):
        """Return the per-feature Gaussian density of ``x`` under one class."""
        mean = self._mean[class_idx]
        var = self._var[class_idx]
        numerator = np.exp(-((x - mean) ** 2) / (2.0 * var))
        denominator = np.sqrt(2.0 * np.pi * var)
        return numerator / denominator

## Implementation

In [15]:
nb = NaiveBayes()
nb.fit(X_train,Y_train)
Y_pred = nb.predict(X_test)
Y_pred

[2.0,
 2.0,
 1.0,
 2.0,
 3.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 3.0,
 1.0,
 2.0,
 1.0,
 2.0,
 3.0,
 1.0,
 3.0,
 2.0,
 3.0,
 3.0,
 1.0,
 2.0,
 2.0,
 1.0]

## Calculating Accuracy

In [16]:
# Y_pred is a plain list and Y_test a numpy array; comparing them yields an
# element-wise boolean array marking the correctly classified test samples.
val = Y_pred==Y_test
# The mean of the boolean array is the fraction of correct predictions.
accuracy = val.mean()
accuracy

0.96

### We have achieved 96% accuracy on the 25-sample test set