## Naive Bayes - Mushroom Dataset

In [13]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
# Load the mushroom dataset from a CSV path relative to the notebook's working directory.
df = pd.read_csv("../data/mushrooms.csv")

In [11]:
# Per-column summary (count, unique, top, freq) of the raw categorical data.
df.describe()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,...,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,2,9,2,2,2,12,...,4,9,9,1,4,3,5,9,6,7
top,e,x,y,n,f,n,f,c,b,b,...,s,w,w,p,w,o,p,w,v,d
freq,4208,3656,3244,2284,4748,3528,7914,6812,5612,1728,...,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


In [18]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


#### Encoding the Categorical Data into Numerical Data

In [14]:
le = LabelEncoder()
# Apply the encoder's fit_transform to each column of df, giving an integer-coded frame `ds`.
# NOTE(review): the single `le` instance is refit on every column, so it presumably only
# retains the last column's classes afterwards - confirm before using it for inverse_transform.
ds = df.apply(le.fit_transform)

In [20]:
ds.head()
# The encoding step above converted every categorical column into integer codes.

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [36]:
# Work on a copy: `ds.values` can share memory with `ds`, so shuffling it in place
# could silently reorder the DataFrame itself (or fail if the view is read-only).
data = ds.values.copy()
# Use a seeded generator so the row order is identical on every Restart & Run All.
# Generator.shuffle permutes the rows (axis 0) in place.
np.random.default_rng(42).shuffle(data)

In [37]:
# Column 0 is used as the class label; every remaining column is a feature.
data_y = data[:, 0]
data_x = data[:, 1:]

#### Break the data into train and test

In [47]:
# Hold out 20% of the rows for testing. random_state is pinned so that, for a given
# data_x/data_y, the same rows land in the train and test sets on every run.
x_train,x_test,y_train,y_test = train_test_split(data_x,data_y,test_size=0.2,random_state=42)

In [48]:
# Sanity check: shapes of the train/test feature and label arrays.
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((6499, 22), (1625, 22), (6499,), (1625,))

In [50]:
# List the distinct class labels present in the training split.
np.unique(y_train)

array([0, 1])

### Building our Classifier

In [80]:
def prior_prob(y_train,label):
    """Return the class prior P(y = label).

    This is the fraction of entries in ``y_train`` (an array of training
    labels) that are equal to ``label``.
    """
    is_label = (y_train == label)
    return is_label.sum() / y_train.shape[0]
def cond_prob(x_train,y_train,feature_col,feature_val,label):
    """Return the conditional probability P(x_i = feature_val | y = label).

    Among the training rows whose label equals ``label``, this is the share
    whose column ``feature_col`` holds ``feature_val``.
    """
    in_class = (y_train == label)
    # Values of the requested feature, restricted to rows of the given class.
    class_column = x_train[in_class, feature_col]
    matches = (class_column == feature_val).sum()
    return matches / in_class.sum()
    

#### Computing the posterior probability for each test example and making predictions

In [108]:
def predict(x_train,y_train,xtest):
    """Predict the class label of a single example with categorical Naive Bayes.

    For every class c found in ``y_train`` the unnormalised posterior
    P(c) * prod_f P(x_f = xtest[f] | c) is computed, and the class with the
    largest value is returned.

    Parameters
    ----------
    x_train : 2-D array of training features, one row per example.
    y_train : 1-D array of training labels aligned with the rows of x_train.
    xtest : 1-D array with the feature values of the example to classify.

    Returns
    -------
    The element of ``np.unique(y_train)`` with the highest posterior.
    """
    classes = np.unique(y_train)
    n_features = x_train.shape[1]
    post_prob = []  # unnormalised posterior of each class, in the order of `classes`
    for label in classes:
        # Naive Bayes assumption: the likelihood factorises over the features.
        likelihood = 1.0
        for f in range(n_features):
            likelihood = likelihood*cond_prob(x_train,y_train,f,xtest[f],label)
        prior = prior_prob(y_train,label)
        post_prob.append(likelihood*prior)
    # np.argmax gives a position within `classes`, not a label. Map it back so the
    # result is also correct when the labels are not exactly 0..k-1.
    return classes[np.argmax(post_prob)]

In [113]:
def accuracy(x_train,y_train,x_test,y_test):
    """Return the percentage (0-100) of test rows whose prediction equals y_test.

    Each row of ``x_test`` is classified with ``predict`` using the given
    training data and compared against the matching entry of ``y_test``.
    """
    n_examples = x_test.shape[0]
    n_correct = sum(
        1
        for idx in range(n_examples)
        if y_test[idx] == predict(x_train,y_train,x_test[idx])
    )
    return n_correct/n_examples*100
            

In [114]:
# Percentage of held-out test examples that the classifier labels correctly.
accuracy(x_train,y_train,x_test,y_test)

99.93846153846154

50