In [17]:
import numpy as np
import pandas as pd
import random
import operator
import scipy.stats

from sklearn import model_selection
from sklearn.model_selection import train_test_split

In [18]:
#load data
df = pd.read_csv('../data/breast-cancer-wisconsin.data.txt')

#clean data
df = df.replace('?',-99999)
df = df.astype(float)
df = df.drop(['id'],1)

#shuffle data
df = df.reindex(np.random.permutation(df.index))
df = df.reset_index(drop=True)

#copy data
tf = df.copy()

In [19]:
# prepare configuration for cross validation test harness
seed = 7

X_train, X_test, y_train, y_test = model_selection.train_test_split(tf.iloc[:,:9], 
                                                                    tf.iloc[:,9], 
                                                                    test_size=0.25, 
                                                                    random_state=seed)

In [20]:
class NaiveBayes(object):
    
    #initialize parameters
    def __init__(self, printing=False):
        self.printing = printing
    
    #build model    
    def fit(self, X_train, y_train):
        self.trainSet = pd.concat([X_train, y_train],axis=1)
        
        
        class_summaries = self.class_summaries(self.trainSet)
        
        self.summaries = class_summaries
        
        return class_summaries
    
    #summarize class by mean and stdev
    def class_summaries(self, dataset):
        summaries = {}
        
        for cls in dataset['class'].unique():
            cls_data = dataset[dataset['class'] == cls]
            summary = cls_data.describe().loc[['mean','std']]
            summaries[cls] = summary
            
        return summaries
    
    #find gaussian probability
    def gaussian_probability(self, x, mean, std):
        probability = scipy.stats.norm(mean, std).pdf(x)
        return probability
    
    #calculate probabilities of testInstance belonging to each class
    def class_probabilities(self, summaries, testInstance):
        probabilities = {}

        #for each class/summary...
        for cls, classSummary in summaries.items():
            probabilities[cls] = 1

            #for each feature...
            for i in range(np.shape(summaries[cls].values)[1] - 1):
                mean = classSummary.iloc[0][i]
                stdev = classSummary.iloc[1][i]
                x = testInstance[i]

                #apply chain rule of probabilities
                probabilities[cls] *= self.gaussian_probability(x, mean, stdev)

        return probabilities
     
    #predict class for datapoint
    def class_predict(self, summaries, testInstance):
        probabilities = self.class_probabilities(summaries, testInstance)
        bestLabel, bestProb = None, -1
        
        for cls, probability in probabilities.items():
            if bestLabel is None or probability > bestProb:
                bestProb = probability
                bestLabel = cls
        
        return bestLabel
    
    #make predictions
    def predict(self, X_test):
        self.testSet = X_test
        predictions = []
        
        for i in range(len(self.testSet)):
            testInstance = self.testSet.iloc[i]
            result = self.class_predict(self.summaries, testInstance)
            predictions.append(result)
        return predictions
        
    #evaluate predictions
    def score(self, predictions, y_test):
        actual = y_test
        correct = sum(predictions == actual) 
        accuracy = correct / len(actual)
        return accuracy
            

In [21]:
nb = NaiveBayes()
summaries = nb.fit(X_train, y_train)
predictions = nb.predict(X_test)
nb.score(predictions, y_test)

0.97142857142857142