In [143]:
import numpy as np
import pandas as pd
import random
import operator

In [144]:
class kNN_Classifier(object):
    
    """k-Nearest Neighbors Classifier"""
    
    def __init__(self, _k=5, printing=False):
        self.k = _k
        self.printing = printing
        
    def fit(self, trainingSet, testSet):
        predictions = []
        
        #for each test instance...
        for x in range(len(testSet)):
            testInstance = testSet[x]
            
            #find k nearest neighbors
            neighbors = self.k_neighbors(trainingSet, testInstance)
            
            #vote on class
            result = self.predict(neighbors)
            predictions.append(result)
            
            #printing options
            if self.printing:
                print('> predicted=' + repr(result) + ', actual=' + repr(testInstance[-1]))
        
        #assess accuracy
        accuracy = self.score(testSet, predictions)
        print('Accuracy: ' + repr(accuracy) + '%')
        
    def euclidean_distance(self,instance1,instance2):
        #euclidean_distance = np.sqrt(np.sum((np.array(instance1) - np.array(instance2)**2)))
        euclidean_distance = np.linalg.norm(np.array(instance1) - np.array(instance2))
        return euclidean_distance
    
    def k_neighbors(self, trainingSet, testInstance):
        distances = []
        
        #for each training sample...
        for x in range(len(trainingSet)):
            #calculate the distance to the testInstance
            dist = self.euclidean_distance(trainingSet[x], testInstance)
            distances.append((trainingSet[x], dist))
        
        #sort distances
        distances.sort(key=operator.itemgetter(1))
        
        #create neighborhood
        neighborhood = []
        
        #for each of the top k nearsest neighbors...
        for x in range(self.k):
            #add to neighborhood
            neighborhood.append(distances[x][0])
        
        return neighborhood
    
    def predict(self, neighbors):
        classVotes = {}
        
        #for each of the top k nearest neighbors...
        for x in range(len(neighbors)):
            #record neighbor class
            response = neighbors[x][-1]
            
            #update class vote count
            if response in classVotes:
                classVotes[response] += 1
            else:
                classVotes[response] = 1
        
        #return class with most votes
        vote = max(classVotes.items(), key=operator.itemgetter(1))[0]
        return vote

    def score(self, testSet, predictions):
        correct = 0
        
        #for each testInstance...
        for x in range(len(testSet)):
            #add point if actual class == predicted class...
            if testSet[x,-1] == predictions[x]:
                correct += 1
        return (correct/float(len(testSet))) * 100.0

In [145]:
#load data
df = pd.read_csv('../data/breast-cancer-wisconsin.data.txt')

#preprocess data
df = df.replace('?',-99999) #not vulnerable to outliers
df = df.astype(float)
df = df.drop(['id'],1)

#Shuffle the Data
df = df.reindex(np.random.permutation(df.index))
df = df.reset_index(drop=True)

#preview data
df.head()

Unnamed: 0,clump_thickness,uni_cell_size,uni_cell_shape,marg_adhes,epith_cell_size,bare_nuclei,bland_chrom,norm_nucleoli,mitoses,class
0,4.0,2.0,3.0,5.0,3.0,8.0,7.0,6.0,1.0,4.0
1,5.0,4.0,6.0,10.0,2.0,10.0,4.0,1.0,1.0,4.0
2,1.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,2.0
3,5.0,1.0,1.0,1.0,2.0,1.0,3.0,2.0,1.0,2.0
4,7.0,5.0,6.0,10.0,5.0,10.0,7.0,9.0,4.0,4.0


In [146]:
#create copy
tf = df.copy()
#tf = tf.drop('class',axis=1) #drop class

#Define the percentage of the data that you want to use for testing
test_size = 0.2

#Grabs the first (1-test_size) of the data
train_data = tf[:-int(test_size*len(tf))]

#Grabs the last (test_size) of the data
test_data = tf[-int(test_size*len(tf)):]

#save as value arrays
train_data = train_data.values
test_data = test_data.values

In [147]:
#run model
knn = kNN_Classifier(printing=False).fit(train_data,test_data)

Accuracy: 98.56115107913669%
