In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [73]:
import numpy as np
import pandas as pd


class NB():
    def __init__(self):
        self.priors = {}
        self.likelihoods = {}
        self.classes = []

    def fit(self, X, y):
        X = pd.DataFrame(X)
        self.classes = np.unique(y)
        for outcome in self.classes:
            self.priors[outcome] = sum(y == outcome) / len(y)
        k = len(X.columns)
        for feature in X.columns:
            self.likelihoods[feature] = {}
            for outcome in self.classes:
                outcome_count = sum(y == outcome)
                likelihood = X[feature][y[y == outcome].index.values.tolist()].value_counts().to_dict()
                for feat_val, count in likelihood.items():
                    if feat_val not in self.likelihoods[feature]:
                        self.likelihoods[feature][feat_val] = {}
                    self.likelihoods[feature][feat_val][outcome] = (count + 1) / (outcome_count + k)
        return self

    def predict(self, data):
        results = []
        data = pd.DataFrame(data)
        X = np.array(data)

        for sample in X:
            probs = {}
            for outcome in self.classes:
                prob = np.log(self.priors[outcome])
                for feature, feat_val in zip(data.columns, sample):
                    if feat_val not in self.likelihoods[feature]:
                        prob += np.log(1/(len(self.likelihoods[feature])+1))
                    elif outcome not in self.likelihoods[feature][feat_val]:
                        prob += np.log(self.priors[outcome] / len(self.likelihoods[feature]))
                    else:
                        prob += np.log(self.likelihoods[feature][feat_val][outcome])
                probs[outcome] = prob
            results.append(max(probs, key=lambda x: probs[x]))
        return results


In [35]:
df = pd.read_csv('data.csv')

X, y = df.drop('Iris-setosa', 1), df['Iris-setosa']

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=12345)

In [37]:
from sklearn.naive_bayes import GaussianNB

results = GaussianNB().fit(X_train, y_train).predict(X_test)

accuracy_score(y_test, results)

0.96

In [76]:
nb = NB().fit(X_train, y_train)

results = nb.predict(X_test)

accuracy_score(y_test, results)

0.94