# Implementing KNN

How KNN trains:
    - Record every data point and its class

How KNN classifies a new data point:
    - Check how far is the new data point to the "trained" data point.
    - Sort existing data point by distance to the new point
    - Select the majority class for the closest K points

In [1]:
import pandas as pd
import numpy as np

from sklearn import metrics
from sklearn import cross_validation

In [2]:
def distance(point1, point2):
    # calculate the distance between two single points based on manhattam distance
    
    distance = 0.0 # replace this
    distance = np.sum(point1[1] - point2[1])
    
    return abs(distance)


class MyKNN(object):
    
    n_neighbors = None
    fitted_X = None
    fitted_y = None
    
    def __init__(self, n_neighbors = 10):
        self.n_neighbors = n_neighbors
        self.fitted_X = None
        self.fitted_y = None
        
    def fit(self, X, y):
        # record instances in self.learned_instances
        self.fitted_X = X
        self.fitted_y = y

        return self
    
    def predict(self, X):
        # write this
        
        y_pred = [] # replace this
        
        for x in X.iterrows():
            x_dist = []
            for x_t, y_t in zip(self.fitted_X.iterrows(), self.fitted_y):
                x_dist.append((distance(x, x_t), y_t))
            
            x_dist.sort(reverse=False)
            x_classes = {}
            for _, y_t in x_dist[:self.n_neighbors]:
                if y_t not in x_classes:
                    x_classes[y_t] = 1.0
                else:
                    x_classes[y_t] += 1.0
            
            x_class_sorted = sorted(x_classes.items(), key=lambda (k,v): (v,k), reverse=True)
            majority_class, _freq = x_class_sorted[0]        

            y_pred.append(majority_class)
        
        return y_pred

### Load data and test

In [8]:
column_names = ['id',
                'clump_thickness',
                'cell_size_uniformity',
                'cell_shape_uniformity',
                'marginal_adhesion',
                'single_epithelial_size',
                'bare_nuclei',
                'bland_chromatin',
                'normal_nucleoli',
                'mitoses',
                'class']

df = pd.read_csv('../../assets/data/breast-cancer-wisconsin.csv',
                 names=column_names, na_values=['?'], index_col="id")
df.dropna(inplace=True)
df['class'] = df['class'] == 2

In [9]:
X = df[list(set(column_names)-set(['id','class']))]
y = df['class']

# x_cols = ['clump_thickness','cell_size_uniformity']
# X = df[x_cols]

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, train_size=0.5, test_size=0.5)

In [10]:
k = MyKNN(10)

k.fit(X_train, y_train)
y_pred = k.predict(X_test)

print metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
print metrics.accuracy_score(y_true=y_test, y_pred=y_pred)

[[112   3]
 [  4 223]]
0.979532163743
