# Naive Bayes- Mushroom Dataset

Goal is to to predict the class of mushroom given the features of mushroom, will use Naive Bayes model for this classifiction

## Load the dataset

In [5]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv('../Datasets/Mushroom/mushrooms.csv')
df.head(n=10)

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
5,e,x,y,y,t,a,f,c,b,n,...,s,w,w,p,w,o,p,k,n,g
6,e,b,s,w,t,a,f,c,b,g,...,s,w,w,p,w,o,p,k,n,m
7,e,b,y,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,s,m
8,p,x,y,w,t,p,f,c,n,p,...,s,w,w,p,w,o,p,k,v,g
9,e,b,s,y,t,a,f,c,b,g,...,s,w,w,p,w,o,p,k,s,m


In [4]:
df.shape

(8124, 23)

- the data is in textual form, we need to map the values to numbers, one way is to iterate over entire column, figure out unique values, make a dictionary and run a for loop
- use sklearn

## Encode the Categorical data into Numerical Data

In [11]:
le = LabelEncoder()#make an object

#Applies transformation
ds = df.apply(le.fit_transform)

In [10]:
#accepts one array and returns target value
le.fit_transform?
#by default axis=0 means it will apply a function over each column
df.apply?

In [12]:
ds.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [15]:
data = ds.values

print(data.shape)
print(type(data))
print(data[:5,:])

(8124, 23)
<class 'numpy.ndarray'>
[[1 5 2 4 1 6 1 0 1 4 0 3 2 2 7 7 0 2 1 4 2 3 5]
 [0 5 2 9 1 0 1 0 0 4 0 2 2 2 7 7 0 2 1 4 3 2 1]
 [0 0 2 8 1 3 1 0 0 5 0 2 2 2 7 7 0 2 1 4 3 2 3]
 [1 5 3 8 1 6 1 0 1 5 0 3 2 2 7 7 0 2 1 4 2 3 5]
 [0 5 2 3 0 5 1 1 0 4 1 3 2 2 7 7 0 2 1 0 3 0 1]]


## Break the data into train and test

In [16]:
x_data = data[:,1:]
y_data = data[:,0]

print(x_data.shape,y_data.shape)

(8124, 22) (8124,)


In [52]:
x_train,x_test,y_train,y_test = train_test_split(x_data,y_data,test_size=0.2)

print(x_train.shape,y_train.shape)
print(x_test.shape,y_test.shape)

(6499, 22) (6499,)
(1625, 22) (1625,)


In [19]:
# 2 unique mushrooms
np.unique(y_train)

array([0, 1])

### use inverse transformation from sklearn to convert back to categorical data

## Building our classifier

In [25]:
def prior_probab(y_train,label):
    total_examples = y_train.shape[0]
    class_examples = np.sum(y_train==label)
    
    return class_examples/float(total_examples)

In [27]:
y = np.array([1,1,5,5,0,0,0,0,1,1])
print(prior_probab(y,5))
prior_probab(y,1)

0.2


0.4

- P(xi,y=1) is conditional probability (1 feature)
- pi(P(xi,y=1)) is Likelihood  (Product of all features)
- feature_col is like color of mushroom, feature_val is 'green'
- we want to calculate P(mushroom_color='green'|y=2(label))
- we will filter out those rows which have class 2(our sample space got changed here), then we will check which of these mushrooms have color == 'green'

In [53]:
# if our of 10 mushrooms have 3 have 'class-2' and out of those 3, 2 have color=='green' then cond_probab = 2/3
def cond_probab(x_train,y_train,feature_col,feature_val,label):
    
    x_filtered = x_train[y_train == label] #filter the rows
    numerator = np.sum(x_filtered[:,feature_col] == feature_val)
    denominator = np.sum(y_train==label)
    
    return numerator/float(denominator)

## Compute Posterior probability for each test example and make predictions

- for a test example, we will calculate posterior probabilities of belonging it to a particular class, for the class that have max posterior, that will be ans

- given a test point which has n-features,for all possible classes we will calc. cond probabilty of each feature then likelihood
- [0.6 0.4] then we will predict it belongs to class 0

In [37]:
def predict(x_train,y_train,xtest):
    """xtest is a single testing point with n-features"""
    
    classes = np.unique(y_train)
    features = x_train.shape[1]
    
    post_probs = []
    # calc. posterior for each class 
    for label in classes:
        
        likelihood = 1.0
        for f in range(features):
            cond = cond_probab(x_train,y_train,f,xtest[f],label)
            likelihood *= cond
        
        prior = prior_probab(y_train,label)
        post = likelihood*prior
        post_probs.append(post)
        
    
    pred = np.argmax(post_probs)
    return pred    

In [54]:
output = predict(x_train,y_train,x_test[1])
print(output)
print(y_test[1])

1
1


In [55]:
def score(x_train,y_train,x_test,y_test):
    pred = []
    for i in range(x_test.shape[0]):
        pred_label = predict(x_train,y_train,x_test[i])
        pred.append(pred_label)
        
    pred = np.array(pred)
    accuracy = np.sum(pred == y_test)/y_test.shape[0]
    
    return accuracy

In [56]:
score(x_train,y_train,x_test,y_test)

0.9981538461538462

In [61]:
ds.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [96]:
inverse_label = le.inverse_transform(y_data)

In [97]:
np.unique(inverse_label)

array(['d', 'g'], dtype=object)

In [80]:
df['cap_shape'].unique()

array(['x', 'b', 's', 'f', 'k', 'c'], dtype=object)

In [82]:
le.classes_

array(['d', 'g', 'l', 'm', 'p', 'u', 'w'], dtype=object)

In [93]:
le.get_params('a')

{}