Gaussian Naive Bayes Classifier

In [262]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

from collections import Counter
import math

In [263]:
# data
def create_data():
    """Load the iris dataset and return its first 100 samples.

    The four measurements and the target are gathered into one table, the
    first 100 rows are kept, and the table is converted to a single NumPy
    array before being split into features and labels.

    Returns:
        tuple: ``(features, labels)`` where ``features`` holds every column
        except the last and ``labels`` is the last column of that array.
        Because both come from one array, the labels share the features'
        dtype (the fitted model output shows them as 0.0 / 1.0).
    """
    column_names = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']

    bunch = load_iris()
    # Name the measurement columns directly; the label column is appended next.
    frame = pd.DataFrame(bunch.data, columns=column_names[:-1])
    frame[column_names[-1]] = bunch.target

    first_hundred = np.array(frame.iloc[:100, :])
    features = first_hundred[:, :-1]
    labels = first_hundred[:, -1]
    return features, labels

In [4]:
X, y = create_data()
# Hold out 30% of the samples for evaluation. random_state is fixed so the
# split, and therefore the fitted model and its score, is reproducible after
# a kernel restart instead of changing on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [247]:
class Naive_Bayes:
    """Gaussian Naive Bayes classifier for numeric features.

    ``fit`` stores, for every class label, one ``(mean, stdev)`` pair per
    feature in ``self.model``. To classify a sample, the per-feature Gaussian
    densities are multiplied together for each class and the label with the
    largest product is returned. No class prior is applied: every class starts
    with the same weight regardless of how often it occurs in the training
    data.
    """

    # Lower bound applied to a standard deviation before it is used as a
    # divisor. A feature that is constant within a class has stdev == 0, which
    # would otherwise raise ZeroDivisionError in gaussian_prob.
    _MIN_STDEV = 1e-9

    def __init__(self):
        # {label: [(mean, stdev), ...]} after fit(); None until then.
        self.model = None

    def mean(self, X):
        """Return the arithmetic mean of the values in ``X``."""
        return np.mean(X)

    def stdev(self, X):
        """Return the standard deviation of ``X`` as computed by ``np.std``
        with its default arguments (population form, ``ddof=0``)."""
        return np.std(X)

    def gaussian_prob(self, x, mean, stdev):
        """Evaluate the Gaussian density N(mean, stdev**2) at ``x``.

        Args:
            x: Feature value to evaluate.
            mean: Mean of the distribution.
            stdev: Standard deviation of the distribution. Values below
                ``_MIN_STDEV`` (including 0) are raised to ``_MIN_STDEV`` so
                the computation never divides by zero.

        Returns:
            float: The density value; for a floored ``stdev`` this is a very
            narrow peak at ``mean`` and 0.0 away from it.
        """
        stdev = max(stdev, self._MIN_STDEV)
        exponent = math.exp(-(math.pow(x - mean, 2) /
                              (2 * math.pow(stdev, 2))))
        return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent

    def summarize(self, train_data):
        """Return ``[(mean, stdev), ...]`` with one pair per feature column.

        Args:
            train_data: Iterable of equally long samples (rows); ``zip(*...)``
                transposes it so each feature column is summarized separately.
        """
        return [(self.mean(column), self.stdev(column))
                for column in zip(*train_data)]

    def fit(self, X, y):
        """Group the samples by label and store per-class feature statistics.

        Args:
            X: Iterable of samples.
            y: Iterable of labels, aligned with ``X``.

        Side effects:
            Sets ``self.model`` to ``{label: summarize(samples_of_label)}``
            and prints a confirmation message.
        """
        grouped = {label: [] for label in set(y)}
        for sample, label in zip(X, y):
            grouped[label].append(sample)

        self.model = {
            label: self.summarize(samples)
            for label, samples in grouped.items()
        }
        print('Model fitting is done')

    def calculate_prob(self, input_data):
        """Return ``{label: likelihood}`` for one sample.

        The likelihood of a class is the product of the Gaussian densities of
        each feature value under that class's stored ``(mean, stdev)``.

        Args:
            input_data: A single sample, indexable by feature position.
        """
        prob = {}
        for label, feature_stats in self.model.items():
            # Start from 1: no class prior is applied, so all classes are
            # weighted equally and only the feature likelihoods matter.
            likelihood = 1
            for index, (mean, stdev) in enumerate(feature_stats):
                likelihood *= self.gaussian_prob(input_data[index], mean, stdev)
            prob[label] = likelihood
        return prob

    def predict(self, X_test):
        """Return the label with the highest likelihood for one sample.

        Args:
            X_test: A single sample (not a batch).
        """
        # e.g. {0.0: 2.9680340789325763e-27, 1.0: 3.5749783019849535e-26}
        likelihoods = self.calculate_prob(X_test)
        # Sort ascending by likelihood and take the last entry; on a tie the
        # label that comes later in self.model wins (sorted() is stable).
        ranked = sorted(likelihoods.items(), key=lambda item: item[-1])
        return ranked[-1][0]

    def score(self, X_test, y_test):
        """Return the fraction of samples whose predicted label equals ``y``.

        Args:
            X_test: Iterable of samples.
            y_test: Iterable of true labels, aligned with ``X_test``.

        Raises:
            ZeroDivisionError: If ``X_test`` is empty.
        """
        right = 0
        for sample, expected in zip(X_test, y_test):
            if self.predict(sample) == expected:
                right += 1
        return right / float(len(X_test))





In [248]:
# Train the classifier on the training split. fit() stores the per-class
# (mean, stdev) summaries in NB.model and prints a confirmation message.
NB=Naive_Bayes()
NB.fit(X_train, y_train)


Model fitting is done


In [249]:
# Accuracy on the held-out split: fraction of test samples whose predicted
# label equals the true label.
NB.score(X_test, y_test)

1.0

In [250]:
# Inspect the fitted parameters: {label: [(mean, stdev), ...]} with one pair
# per feature, in feature-column order.
NB.model

{0.0: [(4.965000000000001, 0.3275286247032464),
  (3.3875, 0.36618813470673783),
  (1.4849999999999999, 0.16209565077447327),
  (0.2425, 0.10461237976453838)],
 1.0: [(5.97, 0.49742671151973067),
  (2.8133333333333335, 0.25655841873191815),
  (4.3, 0.48373546489791297),
  (1.3366666666666664, 0.19232495649002207)]}