In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv('../Datasets/Mushroom/mushrooms.csv')
df.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [4]:
df.shape

(8124, 23)

In [6]:
df['type'].unique()

array(['p', 'e'], dtype=object)

## convert the categorical data into numberical data

In [8]:
le= LabelEncoder() #make an object

In [9]:
le

LabelEncoder()

In [10]:
ds = df.apply(le.fit_transform)

In [12]:
ds.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [13]:
ds['type'].unique()

array([1, 0])

In [18]:
np.unique(le.inverse_transform(ds['type']))

array(['d', 'g'], dtype=object)

In [20]:
data = ds.values
print(type(data))
print(data.shape)
print(data[:5,:])

<class 'numpy.ndarray'>
(8124, 23)
[[1 5 2 4 1 6 1 0 1 4 0 3 2 2 7 7 0 2 1 4 2 3 5]
 [0 5 2 9 1 0 1 0 0 4 0 2 2 2 7 7 0 2 1 4 3 2 1]
 [0 0 2 8 1 3 1 0 0 5 0 2 2 2 7 7 0 2 1 4 3 2 3]
 [1 5 3 8 1 6 1 0 1 5 0 3 2 2 7 7 0 2 1 4 2 3 5]
 [0 5 2 3 0 5 1 1 0 4 1 3 2 2 7 7 0 2 1 0 3 0 1]]


In [21]:
x_data = data[:,1:]
y_data = data[:,0]
print(x_data.shape,y_data.shape)

(8124, 22) (8124,)


In [22]:
x_train,x_test,y_train,y_test = train_test_split(x_data,y_data,test_size=0.2)

print(x_train.shape,y_train.shape)
print(x_test.shape,y_test.shape)

(6499, 22) (6499,)
(1625, 22) (1625,)


## classifier

In [23]:
def prior_probability(y_train,label):
    total_examples = y_train.shape[0]
    class_examples = np.sum(y_train == label)
    
    return class_examples/total_examples

In [25]:
y = np.array([0,0,1,1,1,1,1,1,7,8])
prior_probability(y,1)

0.6

In [31]:
def conditional_probability(x_train,y_train,label,feature_col,feature_val):
    x_filtered = x_train[y_train == label]  #change the sample space..we want only those that are of a particular class
    numerator = np.sum(x_filtered[:,feature_col] == feature_val)
    denominator = np.sum(y_train==label)
    
    return numerator/denominator

In [34]:
def predict(x_train,y_train,xtest):
    classes = np.unique(y_train)
    #print(classes)
    n_features = x_train.shape[1]
    posterior_probabilities = []
    for label in classes:
        
        likelihood = 1.0
        for f in range(n_features):
            cond = conditional_probability(x_train,y_train,label,f,xtest[f])
            likelihood *= cond

        prior = prior_probability(y_train,label)
        posterior = likelihood*prior
        posterior_probabilities.append(posterior)
        
    predict = np.argmax(posterior_probabilities)
    return predict        

In [37]:
pred = predict(x_train,y_train,x_test[1])
print(pred)
print(y_test[1])

1
1


In [38]:
def score(x_train,y_train,x_test,y_test):
    
    pred = []
    for i in range(x_test.shape[0]):
        pred_label = predict(x_train,y_train,x_test[i])
        pred.append(pred_label)
        
    pred = np.array(pred)
    accuracy = np.sum(y_test == pred)/y_test.shape[0]
    
    return accuracy

In [39]:
score(x_train,y_train,x_test,y_test)

0.9969230769230769