In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Load the mushroom dataset; the path is relative to the notebook's working directory.
df=pd.read_csv("mushrooms.csv")

In [18]:
df.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [19]:
le=LabelEncoder()

In [20]:
# Integer-encode every column (the "type" target included).
# NOTE: DataFrame.apply calls le.fit_transform once per column, re-fitting the
# same encoder each time, so afterwards `le` only holds the classes of the last
# column it processed and cannot be used to inverse-transform the other columns.
df=df.apply(le.fit_transform)

In [21]:
# Features: every column except the "type" target.
X=df.drop(["type"],axis=1)
# Target: the encoded "type" column (two classes, 0 and 1, per the priors computed below).
y=df["type"]

In [22]:
X.columns

Index(['cap_shape', 'cap_surface', 'cap_color', 'bruises', 'odor',
       'gill_attachment', 'gill_spacing', 'gill_size', 'gill_color',
       'stalk_shape', 'stalk_root', 'stalk_surface_above_ring',
       'stalk_surface_below_ring', 'stalk_color_above_ring',
       'stalk_color_below_ring', 'veil_type', 'veil_color', 'ring_number',
       'ring_type', 'spore_print_color', 'population', 'habitat'],
      dtype='object')

In [23]:
type(y)  # Selecting a single DataFrame column yields a pandas Series

pandas.core.series.Series

In [24]:
# X.info()

In [25]:
# Hold out 33% of the rows as a test set; random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [26]:
# Classes from the most recent fit of `le` only: df.apply(le.fit_transform) re-fits
# the encoder for each column, so this is not a mapping for every column.
le.classes_

array([0, 1, 2, 3, 4, 5, 6])

In [27]:
df.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [28]:
# NOTE: check_prior is defined in a later cell (In [31]); on a fresh kernel that
# definition must be executed before this cell.
check_prior(y,0) # Prior probability that the label is 0

0.517971442639094

In [29]:
check_prior(y,1) # Prior probability that the label is 1

0.48202855736090594

In [30]:
X.loc[0] # Feature values of the single row whose index label is 0

cap_shape                   5
cap_surface                 2
cap_color                   4
bruises                     1
odor                        6
gill_attachment             1
gill_spacing                0
gill_size                   1
gill_color                  4
stalk_shape                 0
stalk_root                  3
stalk_surface_above_ring    2
stalk_surface_below_ring    2
stalk_color_above_ring      7
stalk_color_below_ring      7
veil_type                   0
veil_color                  2
ring_number                 1
ring_type                   4
spore_print_color           2
population                  3
habitat                     5
Name: 0, dtype: int64

In [31]:
def check_prior(data,value):
    """Return the fraction of entries in ``data`` that are equal to ``value``.

    Parameters
    ----------
    data : array-like
        1-D collection of labels (pandas Series, NumPy array, list or tuple).
    value : scalar
        The label whose relative frequency is wanted.

    Returns
    -------
    float
        ``count(data == value) / len(data)``.
    """
    # Coerce to an ndarray so the comparison is element-wise for plain Python
    # sequences as well; ``[0, 1] == 0`` is a single False and would make the
    # result 0.0 regardless of the contents.
    matches = np.asarray(data) == value
    return np.sum(matches) / len(matches)

In [34]:
X_train.columns

Index(['cap_shape', 'cap_surface', 'cap_color', 'bruises', 'odor',
       'gill_attachment', 'gill_spacing', 'gill_size', 'gill_color',
       'stalk_shape', 'stalk_root', 'stalk_surface_above_ring',
       'stalk_surface_below_ring', 'stalk_color_above_ring',
       'stalk_color_below_ring', 'veil_type', 'veil_color', 'ring_number',
       'ring_type', 'spore_print_color', 'population', 'habitat'],
      dtype='object')

In [35]:
set(X_train["cap_shape"])

{0, 1, 2, 3, 4, 5}

In [37]:
# Feature rows whose label is 0.
select=X.loc[y==0]

In [43]:
# Number of rows in `select` whose cap_shape equals 2 (numerator of P(cap_shape=2 | class)).
np.sum(select["cap_shape"]==2)

1596

In [44]:
class NB:
    """Naive Bayes classifier for discrete (categorical / label-encoded) features.

    ``fit`` estimates, for every class, its prior probability and, for every
    column, the conditional probability of each value observed in the training
    data. ``predict`` returns, for each row, the class label that maximises
    ``prior * product(conditional probabilities)``.

    Attributes set by ``fit``
    -------------------------
    classes_ : list
        Sorted distinct class labels found in ``y``.
    prior : dict
        ``{class_label: P(class)}``.
    model : dict
        ``{class_label: {column: {value: P(value | class)}}}``.
    """

    def __init__(self, alpha=0.0):
        """Create an unfitted classifier.

        Parameters
        ----------
        alpha : float, default 0.0
            Additive (Laplace) smoothing constant. With the default of 0 the
            conditional probabilities are plain relative frequencies. A
            positive value keeps any single unseen or class-absent value from
            forcing a class's score to exactly zero.
        """
        self.alpha = alpha

    def fit(self,X,y):
        """Estimate class priors and per-column conditional probabilities.

        Parameters
        ----------
        X : pandas.DataFrame
            Training features; every column is treated as categorical.
        y : pandas.Series or array-like
            Class label of each row of ``X`` (same length and row order; a
            Series must share ``X``'s index).
        """
        alpha = self.alpha
        classes = sorted(set(y))
        # Distinct values per column, computed once over the whole training set
        # so every class gets an entry (possibly 0) for every observed value.
        levels = {col: set(X[col]) for col in X.columns}

        model = {}
        prior = {}
        unseen = {}
        for kclass in classes:
            is_member = y == kclass
            prior[kclass] = np.sum(is_member) / len(y)

            # The class subset does not depend on the column or value, so it is
            # selected once per class rather than inside the innermost loop.
            rows = X.loc[is_member]
            n_rows = len(rows)

            model[kclass] = {}
            unseen[kclass] = {}
            for col, values in levels.items():
                column = rows[col]
                denom = n_rows + alpha * len(values)
                model[kclass][col] = {
                    val: (np.sum(column == val) + alpha) / denom for val in values
                }
                # Probability assigned at prediction time to a value that never
                # occurred in this column during training (0 when alpha == 0).
                unseen[kclass][col] = alpha / denom

        self.classes_ = classes
        self.model = model
        self.prior = prior
        self._unseen = unseen

    def predict_point(self,point):
        """Return the most probable class label for a single observation.

        Parameters
        ----------
        point : mapping (e.g. a DataFrame row as a pandas Series)
            Must provide a value for every column seen in ``fit``.

        Returns
        -------
        The class label (an element of ``classes_``) with the highest score;
        ties resolve to the first such class in ``classes_`` order.
        """
        scores = []
        for kclass in self.classes_:
            p = self.prior[kclass]
            for col, table in self.model[kclass].items():
                # Values not present in the training data fall back to the
                # smoothed "unseen" probability instead of raising KeyError.
                p *= table.get(point[col], self._unseen[kclass][col])
            scores.append(p)
        # argmax is a position in `scores`; map it back to the actual label so
        # labels need not be the integers 0..k-1.
        return self.classes_[int(np.argmax(scores))]

    def predict(self,X):
        """Predict a class label for every row of the DataFrame ``X``.

        Returns
        -------
        numpy.ndarray
            Predicted labels, in the row order of ``X``.
        """
        return np.array([self.predict_point(row) for _, row in X.iterrows()])

    def score(self,X,y):
        """Return the accuracy of ``predict(X)`` against the true labels ``y``.

        Labels are compared by position, so ``y`` must be in the same row order
        as ``X``.
        """
        yp=self.predict(X)
        return np.mean(np.asarray(y)==yp)

In [45]:
nb=NB()

In [46]:
nb.fit(X_train,y_train)

In [173]:
nb.predict(X_test[:10])

array([0, 1, 1, 0, 1, 1, 1, 1, 0, 0], dtype=int64)

In [174]:
y_test[:10]

1971    0
6654    1
5606    1
3332    0
6988    1
5761    1
5798    1
3064    1
1811    0
3422    0
Name: type, dtype: int32

In [175]:
# Accuracy on the held-out split: fraction of test rows whose predicted label equals y_test.
nb.score(X_test,y_test)

0.9973890339425587