# Naive Bayes - Mushroom Classification
The goal is to predict the class of a mushroom (edible or poisonous) from its observed features.
We will use a Naive Bayes model for this classification.


### Load the Dataset




In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Read the mushroom dataset (path is relative to the notebook's folder) and preview the first 10 rows
df = pd.read_csv('../Datasets/Mushrooms/mushrooms.csv')
df.head(10)

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
5,e,x,y,y,t,a,f,c,b,n,...,s,w,w,p,w,o,p,k,n,g
6,e,b,s,w,t,a,f,c,b,g,...,s,w,w,p,w,o,p,k,n,m
7,e,b,y,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,s,m
8,p,x,y,w,t,p,f,c,n,p,...,s,w,w,p,w,o,p,k,v,g
9,e,b,s,y,t,a,f,c,b,g,...,s,w,w,p,w,o,p,k,s,m


In [3]:
df.shape

(8124, 23)

In [4]:
# The dataset has 8124 mushrooms and 23 columns: the class label (`type`) plus 22 features.
# Every column is categorical (single-letter codes), so we must
# convert the categorical data to numerical data before modelling.

### Encode the Categorical Data into Numerical Data

In [5]:
le = LabelEncoder()
# Encode every column with scikit-learn's LabelEncoder: within each column the
# distinct category letters are replaced by the integer codes 0, 1, 2, ...
# The same result could be obtained by mapping each column's values by hand in a for loop.
# Note: the single encoder is re-fitted on each column in turn.
ds = df.apply(lambda column: le.fit_transform(column))

In [6]:
print(type(ds))
ds.head()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [7]:
# Convert the encoded DataFrame into a NumPy array
data = ds.to_numpy()
print(data.shape)
print(type(data))
print(data[:5])

# Column 0 is the class label (output); the remaining columns are the input features
data_x = data[:, 1:]
data_y = data[:, 0]

(8124, 23)
<class 'numpy.ndarray'>
[[1 5 2 4 1 6 1 0 1 4 0 3 2 2 7 7 0 2 1 4 2 3 5]
 [0 5 2 9 1 0 1 0 0 4 0 2 2 2 7 7 0 2 1 4 3 2 1]
 [0 0 2 8 1 3 1 0 0 5 0 2 2 2 7 7 0 2 1 4 3 2 3]
 [1 5 3 8 1 6 1 0 1 5 0 3 2 2 7 7 0 2 1 4 2 3 5]
 [0 5 2 3 0 5 1 1 0 4 1 3 2 2 7 7 0 2 1 0 3 0 1]]


### Break the Data into train and test

In [8]:
# 80% training data and 20% test data.
# random_state pins the shuffle so the split (and every score computed from it)
# is the same on each Restart & Run All.
x_train,x_test,y_train,y_test = train_test_split(data_x,data_y,test_size=0.2,random_state=42)

In [9]:
print(x_train.shape,y_train.shape)
print(x_test.shape,y_test.shape)

(6499, 22) (6499,)
(1625, 22) (1625,)


In [10]:
print(x_test[1])
print(x_test[-1])

[2 3 3 1 5 1 0 0 9 1 1 2 2 7 7 0 2 1 4 3 5 0]
[ 3  0  9  0  5  1  0  1 10  0  0  1  3  7  4  0  2  1  0  7  4  0]


In [18]:
# Classes Of Mushroom
np.unique(y_train)

array([0, 1])

In [19]:
# So there are only two classes of mushroom in the data, encoded as 0 and 1

### Building Our Classifier! (Naive Bayes Classifier)

In [20]:
# a = np.array([0,5,5,1,1,1,0,1])
# print(a==5)
# np.sum(a==5)

In [21]:
def prior_prob(y_train,label):
    """Return the prior probability P(y = label).

    Computed as the fraction of entries in ``y_train`` that equal ``label``.

    Parameters
    ----------
    y_train : np.ndarray
        1-D array of training labels.
    label
        The class value whose prior is wanted.

    Returns
    -------
    float
        (number of entries equal to ``label``) / (total number of entries).
    """
    n_matching = np.sum(y_train == label)
    n_total = float(y_train.shape[0])
    return n_matching / n_total

In [22]:
#y = np.array([0,5,5,1,1,1,1,0,0,0])

#prior_prob(y,5)

In [23]:
# conditional probability of a particular feature
#label-class(either 0 or 1 in this case)
def cond_prob(x_train,y_train,feature_col,feature_val,label):
    """Return the conditional probability P(x[feature_col] = feature_val | y = label).

    Parameters
    ----------
    x_train : np.ndarray
        2-D array of training features (one row per example).
    y_train : np.ndarray
        1-D array of training labels aligned with the rows of ``x_train``.
    feature_col : int
        Index of the feature column to look at.
    feature_val
        The value of that feature whose probability is wanted.
    label
        The class to condition on.

    Returns
    -------
    float
        Among the training rows of class ``label``, the fraction whose
        ``feature_col`` entry equals ``feature_val``.
    """
    # Boolean mask selecting the training rows that belong to the given class
    in_class = (y_train == label)

    n_matching = np.sum(x_train[in_class, feature_col] == feature_val)
    n_in_class = np.sum(in_class)

    return n_matching / float(n_in_class)


### Next Step : Compute Posterior Prob for each test example and make predictions

In [24]:
print(np.unique(y_train))

[0 1]


In [25]:
# Given an unknown mushroom predict the class of the mushroom
# We will calculate posterior probability of every class
def predict(x_train,y_train,xtest):
    """Predict the class of a single test point with Naive Bayes.

    For every class present in ``y_train`` the (unnormalised) posterior is
    computed as

        prior(class) * product over features f of P(x_f = xtest[f] | class)

    and the class with the largest posterior is returned.

    Parameters
    ----------
    x_train : np.ndarray
        2-D array of training features, shape (n_examples, n_features).
    y_train : np.ndarray
        1-D array of training labels.
    xtest : sequence
        One test point: a single row with ``n_features`` values.

    Returns
    -------
    The predicted class label, i.e. an element of ``np.unique(y_train)``.
    """

    classes = np.unique(y_train)
    n_features = x_train.shape[1]
    post_probs = []  # one unnormalised posterior per class, in the order of `classes`

    for label in classes:
        # Naive independence assumption: the likelihood is the product of the
        # per-feature conditional probabilities.
        likelihood = 1.0
        for f in range(n_features):
            likelihood *= cond_prob(x_train,y_train,f,xtest[f],label)

        prior = prior_prob(y_train,label)
        post_probs.append(likelihood*prior)

    # argmax yields a position in `classes`; map it back to the actual label so
    # the result is correct even when the labels are not exactly 0..k-1.
    return classes[np.argmax(post_probs)]

In [26]:
output = predict(x_train,y_train,x_test[1])
print(output)
print(y_test[1])

0
0


In [27]:
def score(x_train,y_train,x_test,y_test):
    """Return the accuracy of the Naive Bayes classifier on a test set.

    Each row of ``x_test`` is classified with ``predict`` (using ``x_train`` /
    ``y_train`` as the training data) and compared with the matching entry of
    ``y_test``.

    Returns
    -------
    The fraction of test rows whose predicted label equals the true label.
    """
    # Classify the test rows one at a time
    predictions = np.array([
        predict(x_train,y_train,x_test[row])
        for row in range(x_test.shape[0])
    ])

    n_correct = np.sum(predictions==y_test)
    return n_correct/y_test.shape[0]

In [28]:
print(score(x_train,y_train,x_test,y_test))

0.9963076923076923


### ...and it works well!

### Multinomial Naive Bayes

In [12]:
class Multinomailnb:
    """Multinomial Naive Bayes classifier with additive smoothing.

    Feature values are treated as non-negative counts. For every class the
    model stores

    * a prior: the fraction of training rows that belong to the class, and
    * a likelihood vector: the smoothed share of each feature in the class's
      total count, ``(count_f + alpha) / sum_f (count_f + alpha)``.

    Prediction picks the class that maximises
    ``log(prior) + sum_f x_f * log(likelihood_f)``.

    Parameters
    ----------
    alpha : float, default 1
        Additive smoothing constant added to every per-class feature count.
    """

    def __init__(self,alpha=1):
        self.alpha = alpha

    def fit(self, X_train, y_train):
        """Estimate the class priors and per-class feature likelihoods.

        Parameters
        ----------
        X_train : array-like of shape (n_examples, n_features)
        y_train : array-like of shape (n_examples,)
        """
        # Accept plain lists as well as arrays so boolean-mask indexing below works.
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)

        m, n = X_train.shape
        self._classes = np.unique(y_train)
        n_classes = len(self._classes)

        # init: Prior & Likelihood
        self._priors = np.zeros(n_classes)
        self._likelihoods = np.zeros((n_classes, n))

        # Get Prior and Likelihood
        for idx, c in enumerate(self._classes):
            X_train_c = X_train[y_train == c]
            self._priors[idx] = X_train_c.shape[0] / m
            # Smoothed per-feature totals for this class, normalised to sum to 1
            smoothed_counts = X_train_c.sum(axis=0) + self.alpha
            self._likelihoods[idx, :] = smoothed_counts / np.sum(smoothed_counts)

    def calc_likelihood(self, cls_likeli, x_test):
        """Return the per-feature log-likelihood terms ``x_f * log(p_f)``."""
        return np.log(cls_likeli) * x_test

    def _predict(self, x_test):
        """Return the predicted class label for a single test row."""
        # Calculate the log-posterior (up to a constant) for each class
        posteriors = []
        for idx, c in enumerate(self._classes):
            prior_c = np.log(self._priors[idx])
            likelihoods_c = self.calc_likelihood(self._likelihoods[idx,:], x_test)
            posteriors_c = np.sum(likelihoods_c) + prior_c
            posteriors.append(posteriors_c)

        return self._classes[np.argmax(posteriors)]

    def predict(self, X_test):
        """Return a list with the predicted class label of every row in ``X_test``."""
        return [self._predict(x_test) for x_test in X_test]

    def score(self, X_test, y_test):
        """Return the accuracy (fraction of correct predictions) on a test set."""
        # Compare as arrays: `list == list` would collapse to a single bool
        # and give a wrong accuracy when y_test is a plain Python list.
        y_pred = np.asarray(self.predict(X_test))
        y_true = np.asarray(y_test)
        return np.sum(y_pred == y_true)/len(y_true)


In [13]:
mnb=Multinomailnb()

In [15]:
mnb.fit(x_train,y_train)

In [16]:
# NOTE(review): this displays the prediction for every test row; slice it (e.g. [:10]) to keep the output short
mnb.predict(x_test)

[0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,


In [17]:
mnb.score(x_test,y_test)

0.8387692307692308

### Multinomial Naive Bayes using sklearn

In [29]:
from sklearn.naive_bayes import MultinomialNB

In [30]:
# NOTE(review): rebinds `mnb`, which earlier in the notebook referred to the hand-written Multinomailnb instance
mnb=MultinomialNB()

In [31]:
mnb.fit(x_train,y_train)

MultinomialNB()

In [32]:
mnb.predict(x_test)

array([0, 0, 0, ..., 0, 0, 1])

In [33]:
mnb.score(x_test,y_test)

0.8387692307692308

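### Why did Multinomial Naive Bayes give lower accuracy than the hand-written Naive Bayes?

Multinomial Naive Bayes treats every feature value as an event count, but here the values are arbitrary integer codes produced by `LabelEncoder`, so their magnitudes carry no meaning. The first classifier instead estimates P(feature = value | class) by counting exact matches, which is the appropriate model for categorical features.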