In [1]:
from collections import Counter
import math

In [14]:
def knn(data, query, k, dist_fn, choice_fn):
    """Predict a value for ``query`` from its ``k`` nearest rows in ``data``.

    Every row of ``data`` is laid out as ``[feature_0, ..., feature_n, label]``:
    all elements except the last are the features handed to ``dist_fn``, and
    the last element is the label.

    Args:
        data: sequence of rows; each row is a sequence whose last element is
            the label.
        query: feature values to compare against each row's features.
        k: number of nearest rows to keep.
        dist_fn: callable ``dist_fn(features, query)`` returning a sortable
            distance.
        choice_fn: callable that reduces the list of the ``k`` nearest labels
            to a single prediction (e.g. a mean or a mode).

    Returns:
        A tuple ``(neighbors, prediction)`` where ``neighbors`` is the list of
        the ``k`` smallest ``(distance, row_index)`` pairs in ascending order
        and ``prediction`` is ``choice_fn`` applied to their labels.
    """
    neighbor_dist_and_indices = [
        (dist_fn(example[:-1], query), i) for i, example in enumerate(data)
    ]
    print('dist&indices : ',neighbor_dist_and_indices)

    # Tuples sort by distance first and row index second, so equal distances
    # are resolved in favour of the earlier row.
    k_nearest_neighbor_dist_and_indices = sorted(neighbor_dist_and_indices)[:k]
    # Report the actual k instead of a hard-coded "3" (same text when k == 3).
    print('sorted k = {} : '.format(k), k_nearest_neighbor_dist_and_indices)

    # The label is the LAST column, mirroring the example[:-1] feature slice
    # above; indexing column 1 only worked for single-feature rows.
    k_entries_labels = [
        data[i][-1] for _, i in k_nearest_neighbor_dist_and_indices
    ]
    print('k_entries_labels',k_entries_labels)

    return k_nearest_neighbor_dist_and_indices, choice_fn(k_entries_labels)


# Use mean as choice_fn for regression (numeric labels).
def mean(labels):
    """Return the arithmetic mean of ``labels``.

    Args:
        labels: a sized collection of numbers.

    Returns:
        The sum of the labels divided by how many there are.

    Raises:
        ZeroDivisionError: if ``labels`` is empty.
    """
    total = sum(labels)
    count = len(labels)
    return total / count


# Use mode as choice_fn for classification (categorical labels).
def mode(labels):
    """Return the label that occurs most often in ``labels``.

    Args:
        labels: an iterable of hashable labels.

    Returns:
        The first entry reported by ``Counter.most_common(1)``.

    Raises:
        IndexError: if ``labels`` is empty (there is no most-common entry).
    """
    tally = Counter(labels)
    top_entries = tally.most_common(1)
    winner, _count = top_entries[0]
    return winner


def euclidean_dist(p1, p2):
    """Return the Euclidean distance between points ``p1`` and ``p2``.

    Args:
        p1: sequence of numeric coordinates.
        p2: sequence of numeric coordinates with the same length as ``p1``.

    Returns:
        The square root of the sum of squared coordinate differences, as a
        float.

    Raises:
        ValueError: if the two points do not have the same number of
            coordinates. Without this check, extra coordinates in ``p2`` were
            silently ignored and a shorter ``p2`` surfaced as an IndexError.
    """
    if len(p1) != len(p2):
        raise ValueError(
            'points must have the same number of dimensions: '
            '{} != {}'.format(len(p1), len(p2))
        )

    sum_sq_dist = 0
    for a, b in zip(p1, p2):
        sum_sq_dist += math.pow(a - b, 2)
    return math.sqrt(sum_sq_dist)


In [15]:
def main():
    """Run the k-NN demo on two small hard-coded data sets and print results.

    The first run is a regression example that combines neighbor labels with
    ``mean``; the second is a classification example that combines them with
    ``mode``. Both use ``k=3`` and ``euclidean_dist``.
    """
    # Regression data.
    # Column 0: height (inches)
    # Column 1: weight (pounds)
    heights_and_weights = [
        [65.75, 112.99],
        [71.52, 136.49],
        [69.40, 153.03],
        [68.22, 142.34],
        [67.79, 144.30],
        [68.70, 123.30],
        [69.80, 141.49],
        [70.01, 136.46],
        [67.90, 112.37],
        [66.49, 127.45],
    ]
    height_query = [60]

    regression_result = knn(
        heights_and_weights,
        height_query,
        k=3,
        dist_fn=euclidean_dist,
        choice_fn=mean,
    )
    # regression_result is the (neighbors, prediction) pair returned by knn.
    print('reg_k_nearest_neighbors : {}, reg_pred : {}'.format(*regression_result))

    # Classification data.
    # Column 0: the single feature value
    # Column 1: class label (0 or 1)
    labelled_points = [
        [22, 1],
        [23, 1],
        [21, 1],
        [18, 1],
        [19, 1],
        [25, 0],
        [27, 0],
        [29, 0],
        [31, 0],
        [45, 0],
    ]
    point_query = [33]

    classification_result = knn(
        labelled_points,
        point_query,
        k=3,
        dist_fn=euclidean_dist,
        choice_fn=mode,
    )
    print('clf_k_nearest_neighbors : {}, clf_pred : {}'.format(*classification_result))


In [16]:
	# Entry point: run the demo only when this code executes as the main module.
	# NOTE(review): these two lines carry a leading tab. Presumably the notebook
	# runtime tolerates an indented cell, but in a flat .py export they would be
	# parsed as part of the preceding function body - confirm and dedent.
	if __name__ == '__main__':
		main()


dist&indices :  [(5.75, 0), (11.519999999999996, 1), (9.400000000000006, 2), (8.219999999999999, 3), (7.790000000000006, 4), (8.700000000000003, 5), (9.799999999999997, 6), (10.010000000000005, 7), (7.900000000000006, 8), (6.489999999999995, 9)]
sorted k = 3 :  [(5.75, 0), (6.489999999999995, 9), (7.790000000000006, 4)]
k_entries_labels [112.99, 127.45, 144.3]
reg_k_nearest_neighbors : [(5.75, 0), (6.489999999999995, 9), (7.790000000000006, 4)], reg_pred : 128.24666666666667
dist&indices :  [(11.0, 0), (10.0, 1), (12.0, 2), (15.0, 3), (14.0, 4), (8.0, 5), (6.0, 6), (4.0, 7), (2.0, 8), (12.0, 9)]
sorted k = 3 :  [(2.0, 8), (4.0, 7), (6.0, 6)]
k_entries_labels [0, 0, 0]
clf_k_nearest_neighbors : [(2.0, 8), (4.0, 7), (6.0, 6)], clf_pred : 0
