# Naive Bayes classifier
## Dataset

In [103]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from utils import cluster_plot
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from collections import Counter, defaultdict

df = pd.read_csv('datasets/mushrooms.csv')
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


Attribute Information: (classes: edible=e, poisonous=p)

cap-shape: bell=b,conical=c,convex=x,flat=f, knobbed=k,sunken=s

cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s

cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y

bruises: bruises=t,no=f

odor: almond=a,anise=l,creosote=c,fishy=y,foul=f,musty=m,none=n,pungent=p,spicy=s

gill-attachment: attached=a,descending=d,free=f,notched=n

gill-spacing: close=c,crowded=w,distant=d

gill-size: broad=b,narrow=n

gill-color: black=k,brown=n,buff=b,chocolate=h,gray=g, green=r,orange=o,pink=p,purple=u,red=e,white=w,yellow=y

stalk-shape: enlarging=e,tapering=t

stalk-root: bulbous=b,club=c,cup=u,equal=e,rhizomorphs=z,rooted=r,missing=?

stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s

stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s

stalk-color-above-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y

stalk-color-below-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y

veil-type: partial=p,universal=u

veil-color: brown=n,orange=o,white=w,yellow=y

ring-number: none=n,one=o,two=t

ring-type: cobwebby=c,evanescent=e,flaring=f,large=l,none=n,pendant=p,sheathing=s,zone=z

spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r,orange=o,purple=u,white=w,yellow=y

population: abundant=a,clustered=c,numerous=n,scattered=s,several=v,solitary=y

habitat: grasses=g,leaves=l,meadows=m,paths=p,urban=u,waste=w,woods=d

In [21]:
# Select only some of columns
index = ['cap-shape', 'cap-surface', 'cap-color', 'odor', 'stalk-shape', 'stalk-root', 'ring-number', 'ring-type', 'habitat']
X = df[index]
x_le = LabelEncoder()
X = X.apply(lambda c: x_le.fit_transform(c), axis=0).values

y_le = LabelEncoder()
y = y_le.fit_transform(df['class'])

X_train, X_test, y_train, y_test = train_test_split(X, y)

## Manual implementation

In [106]:
class NaiveBayes:
    def __init__(self):
        pass
    
    def fit(self, X, y):
        self.cls = np.unique(y)
        self.probs = {}
        for c in self.cls:
            self.probs[c] = defaultdict(list)
        # Get probabilities by classes
        self.cls_probs = self._count_freq(y)
        # Get sum of all elements in column by classes
        for c in self.cls:
            subset = X_train[y_train==c]
            for j in range(subset.shape[1]):
                self.probs[c][j] += list(subset[:,j])
        # Find percent for all columns
        for c in self.cls:
            for i in range(X.shape[1]):
                self.probs[c][i] = self._count_freq(self.probs[c][i])
        return self
                 
    def predict_proba(self, X):
        probas = []
        for x in X:
            # For every point there is a dict
            # {class1:proba1, class2:proba2 ... classn:proban}
            res = {}
            for c in self.cls:
                # Count probability that point x belongs to certain class
                prob = self.cls_probs[c]
                for i in range(x.shape[0]):
                    col_probs = self.probs[c][i]
                    # Use Bayes formula to find probability
                    # by multiplying probability of every column 
                    # and probability of the point to be the class 
                    if x[i] in col_probs.keys():
                        prob *= col_probs[x[i]]
                    else:
                        prob = 0
                    res[c] = prob
            probas.append(res)
        return np.array(probas)
    
    def predict(self, X):
        pred = self.predict_proba(X)
        res = []
        for p in pred:
            res.append(np.argmax(list(p.values())))
        return np.array(res)
    
    def _count_freq(self, data):
        probs = dict(Counter(data))
        for k in probs.keys():
            probs[k] /= float(len(data))
        return probs

In [107]:
nb = NaiveBayes()
nb = nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

## Sklearn Naive Bayes

In [104]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB

# Gaussian NB
gnb_sk = GaussianNB()
gnb_sk = gnb_sk.fit(X_train, y_train)
y_pred_gnb = gnb_sk.predict(X_test)

# Bernoulli NB
bnb_sk = BernoulliNB()
bnb_sk = bnb_sk.fit(X_train, y_train)
y_pred_bnb = bnb_sk.predict(X_test)

# Multinomial NB
mnb_sk = MultinomialNB()
mnb_sk = mnb_sk.fit(X_train, y_train)
y_pred_mnb = mnb_sk.predict(X_test)

## Accuracy test

In [108]:
acc = accuracy_score(y_test, y_pred)
gnb_acc = accuracy_score(y_test, y_pred_gnb)
bnb_acc = accuracy_score(y_test, y_pred_bnb)
mnb_acc = accuracy_score(y_test, y_pred_mnb)
print('Accuracy on test data:\n',
     f'Manual implementation: {acc*100:.2f}%\n',
     f'Gaussian naive bayes: {gnb_acc*100:.2f}%\n',
     f'Bernoulli naive bayes: {bnb_acc*100:.2f}%\n',
     f'Multinomial naive bayes: {mnb_acc*100:.2f}%\n')

Accuracy on test data:
 Manual implementation: 98.52%
 Gaussian naive bayes: 88.63%
 Bernoulli naive bayes: 68.29%
 Multinomial naive bayes: 79.47%



# ¯\\\_(ツ)\_/¯
## Manual rules!?