In [2]:
import numpy as np
import pandas as pd
import random
import operator
import scipy.stats

In [3]:
#load data
df = pd.read_csv('../data/breast-cancer-wisconsin.data.txt')

#preprocess data
df = df.replace('?',-99999) #not vulnerable to outliers
df = df.astype(float)
df = df.drop(['id'],1)

#Shuffle the Data
df = df.reindex(np.random.permutation(df.index))
df = df.reset_index(drop=True)

In [4]:
#create copy
tf = df.copy()
#tf = tf.drop('class',axis=1) #drop class

#feature names
features = tf.columns

#Define the percentage of the data that you want to use for testing
test_size = 0.2

#Grabs the first (1-test_size) of the data
train_data = tf[:-int(test_size*len(tf))]

#Grabs the last (test_size) of the data
test_data = tf[-int(test_size*len(tf)):]

#save as value arrays
#train_data = train_data.values
#test_data = test_data.values

In [5]:
class NaiveBayes(object):
    
    #initialize parameters
    def __init__(self, printing=False):
        self.printing = printing
    
    #build model    
    def fit(self, trainSet, testSet):
        self.trainSet = trainSet
        self.testSet = testSet
        
        class_summaries = self.class_summaries(self.trainSet)
        
        self.summaries = class_summaries
        
        return class_summaries
    
    #summarize class by mean and stdev
    def class_summaries(self, dataset):
        summaries = {}
        
        for cls in dataset['class'].unique():
            cls_data = dataset[dataset['class'] == cls]
            summary = cls_data.describe().loc[['mean','std']]
            summaries[cls] = summary
            
        return summaries
    
    #find gaussian probability
    def gaussian_probability(self, x, mean, std):
        probability = scipy.stats.norm(mean, std).pdf(x)
        return probability
    
    #calculate probabilities of testInstance belonging to each class
    def class_probabilities(self, summaries, testInstance):
        probabilities = {}

        #for each class/summary...
        for cls, classSummary in summaries.items():
            probabilities[cls] = 1

            #for each feature...
            for i in range(np.shape(summaries[cls].values)[1] - 1):
                mean = classSummary.iloc[0][i]
                stdev = classSummary.iloc[1][i]
                x = testInstance[i]

                #apply chain rule of probabilities
                probabilities[cls] *= self.gaussian_probability(x, mean, stdev)

        return probabilities
     
    #predict class for datapoint
    def class_predict(self, summaries, testInstance):
        probabilities = self.class_probabilities(summaries, testInstance)
        bestLabel, bestProb = None, -1
        
        for cls, probability in probabilities.items():
            if bestLabel is None or probability > bestProb:
                bestProb = probability
                bestLabel = cls
        
        return bestLabel
    
    #make predictions
    def predict(self):
        predictions = []
        
        for i in range(len(self.testSet)):
            testInstance = self.testSet.iloc[i]
            result = self.class_predict(self.summaries, testInstance)
            predictions.append(result)
        return predictions
        
    #evaluate predictions
    def score(self, predictions):
        actual = pd.DataFrame(self.testSet)['class']
        correct = sum(predictions == actual) 
        accuracy = correct / len(self.testSet)
        return accuracy
            

In [7]:
nb = NaiveBayes()
summaries = nb.fit(train_data,test_data)
predictions = nb.predict()
nb.score(predictions)

0.98561151079136688