In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import collections

In [2]:
class NaiveBayesClassifier:
    """Categorical Naive Bayes classifier with additive (Laplace) smoothing.

    ``fit`` simply stores the training frame; every count and probability is
    computed from that frame on demand. The labels are read from the column
    named ``'class'`` (wherever it sits in the frame) and every other column
    is treated as a categorical feature.
    """

    # Name of the column holding the labels. Used consistently by every
    # method so that the position of the column in the frame does not matter.
    LABEL_COLUMN = 'class'

    def __init__(self, alpha = 1):
        """
        Parameters
        ----------
        alpha : number, default 1
            Additive smoothing constant added to every (class, value) count.
        """
        self.alpha = alpha

    def counts(self, feature):
        """
        Count how often each distinct value occurs in one training column.

        Parameters
        ----------
        feature : column label of the fitted training frame.

        Returns
        -------
        (dict, int)
            ``{value: occurrences}`` with keys in order of first appearance,
            and the total number of rows in the column.
        """
        features = self.dataset[feature]

        # Counter keeps first-appearance order, which gives callers a
        # deterministic iteration order (unlike a set of strings).
        classCounts = dict(collections.Counter(features))

        return classCounts, len(features)

    def likelihood(self, choosenClass, feature, choosenFeature):
        """
        Smoothed estimate of P(feature == choosenFeature | class == choosenClass).

        Computed as ``(count + alpha) / (N + alpha * V)`` where ``count`` is the
        number of training rows of ``choosenClass`` whose ``feature`` equals
        ``choosenFeature``, ``N`` is the number of training rows of
        ``choosenClass`` and ``V`` is the number of distinct values of
        ``feature`` in the training frame.

        A ``feature`` that is not a column of the training frame is scored as
        an unseen value with an empty vocabulary, i.e. ``alpha / N``.

        Raises
        ------
        ZeroDivisionError
            If the denominator is zero (e.g. ``choosenClass`` has no training
            rows and the smoothing term does not compensate).
        """
        in_class = self.dataset[self.LABEL_COLUMN] == choosenClass
        # int(...) keeps plain Python numbers, so the result is a Python float.
        N = int(in_class.sum())

        if feature not in self.dataset:
            return (0 + self.alpha)/(N)

        features = self.dataset[feature]
        V = len(set(features))
        counts = int((in_class & (features == choosenFeature)).sum())

        return (counts + self.alpha)/(N + self.alpha * V)

    def fit(self, dataset):
        """
        Store the training frame and the set of labels found in it.

        Parameters
        ----------
        dataset : pandas.DataFrame
            Training rows; must contain a ``'class'`` column.
        """
        self.dataset = dataset
        self.classes = set(self.dataset[self.LABEL_COLUMN])

    def predict(self, test_dataset):
        """
        Score every row of ``test_dataset`` against every training class.

        The score of a class is its prior (class frequency in the training
        frame) multiplied by the smoothed likelihood of each test column's
        value. Scores are not normalised. A ``'class'`` column in
        ``test_dataset`` is ignored so that labelled frames can be passed
        without leaking the label into the score.

        Returns
        -------
        (list of dict, list)
            ``prediction[i]`` maps each class to its score for row ``i``
            (classes in order of first appearance in the training frame);
            ``best_guess[i]`` is the highest-scoring class for row ``i``,
            ties going to the class that appears first in the training frame.

        Raises
        ------
        ValueError
            If there are test rows but the training frame has no classes.
        """
        classCounts, m = self.counts(self.LABEL_COLUMN)
        features = [col for col in test_dataset.columns
                    if col != self.LABEL_COLUMN]

        prediction = []
        best_guess = []

        for i in range(len(test_dataset)):
            row = test_dataset.iloc[i]
            pred = {}
            # classCounts is insertion-ordered, so both the dict layout and
            # the tie-breaking of max() are reproducible across kernels.
            for c, class_count in classCounts.items():
                probability = class_count/m
                for j in features:
                    probability *= self.likelihood(c, j, row[j])
                pred[c] = probability

            prediction.append(pred)
            best_guess.append(max(pred, key = pred.get))

        return prediction,best_guess



    

In [3]:
# Build the classifier with its default smoothing constant (alpha = 1).
nb = NaiveBayesClassifier()

In [4]:
# Toy training set, written column by column: two features plus the
# label in the 'class' column.
dataset = pd.DataFrame(
    {
        'color': ['Green', 'Yellow', 'Red', 'Red', 'Yellow'],
        'size': [3, 2, 1, 1, 4],
        'class': ['Apple', 'Apple', 'Grape', 'Grape', 'Lemon'],
    }
)

In [5]:
# fit() only stores the training frame and the set of labels found in its
# 'class' column; all counting happens later, inside predict().
nb.fit(dataset)

In [6]:
# Unlabelled rows to classify, written column by column; same feature
# columns as the training frame.
test = pd.DataFrame(
    {
        'color': ['Green', 'Yellow', 'Red'],
        'size': [3, 4, 1],
    }
)

In [7]:
# predict() returns a pair: one {class: score} dict per test row, and the
# highest-scoring class per row. A score is the class prior multiplied by the
# smoothed likelihood of each feature value; scores are not normalised, so
# they do not sum to 1 within a row.
nb.predict(test)

([{'Apple': 0.053333333333333344,
   'Grape': 0.013333333333333336,
   'Lemon': 0.010000000000000002},
  {'Apple': 0.026666666666666672,
   'Grape': 0.013333333333333336,
   'Lemon': 0.04000000000000001},
  {'Apple': 0.013333333333333336,
   'Grape': 0.12,
   'Lemon': 0.010000000000000002}],
 ['Apple', 'Lemon', 'Grape'])