In [60]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [150]:
import numpy as np


class NB():
    def __init__(self):
        self.priors = {}
        self.likelihoods = {}
        self.classes = []

    def fit(self, X, y):
        X = pd.DataFrame(X)
        self.classes = np.unique(y)
        for outcome in self.classes:
            self.priors[outcome] = sum(y == outcome) / len(y)
        
        for feature in X.columns:
            self.likelihoods[feature] = {}
            for outcome in self.classes:
                outcome_count = sum(y == outcome)
                likelihood = X[feature][y[y == outcome].index.values.tolist()].value_counts().to_dict()
                for feat_val, count in likelihood.items():
                    self.likelihoods[feature][feat_val] = {}
                    self.likelihoods[feature][feat_val][outcome] = (count + 1) / (outcome_count + len(self.classes))
        return self

    def predict(self, data):
        results = []
        data = pd.DataFrame(data)
        X = np.array(data)

        for sample in X:
            probs = {}
            for outcome in self.classes:
                prob = np.log(self.priors[outcome])
                for feature, feat_val in zip(data.columns, sample):
                    try:
                        prob += np.log(self.likelihoods[feature][feat_val][outcome])
                    except KeyError:
                        prob += np.log(1/(len(self.likelihoods[feature])+1))

                probs[outcome] = prob
            results.append(max(probs, key=lambda x: probs[x]))
        return results


In [151]:
df = pd.read_csv('data.csv')

X, y = df.drop('Iris-setosa', 1), df['Iris-setosa']

In [152]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=12345)

In [153]:
nb = NB().fit(X_train, y_train)

In [154]:
results = nb.predict(X_test)

In [155]:
accuracy_score(y_test, results)

0.84

In [156]:
nb = NB().fit(df.drop('Iris-setosa', 1), df['Iris-setosa'])

accuracy_score(df['Iris-setosa'], nb.predict(df.drop('Iris-setosa', 1)))

0.87248322147651