In [1]:
import numpy as np
from collections import Counter

In [2]:
def kNN_classify(inputs, datasets, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Neighbours are ranked by squared Euclidean distance (the square root is
    skipped because it does not change the ordering).

    Args:
        inputs: query vector, shape=[dim].
        datasets: numpy array of reference samples, shape=[num, dim].
        labels: numpy array of reference labels, shape=[num, 1] or [num].
        k: number of neighbours to consult, 1 <= k <= num.

    Returns:
        The label that occurs most often among the k nearest samples. When
        several labels are tied, the one met first while walking the
        neighbours from nearest to farthest is returned.

    Raises:
        ValueError: if datasets is not 2-D, if inputs does not have `dim`
            elements, or if k is outside [1, num].
    """
    datasets = np.asarray(datasets)
    query = np.asarray(inputs).reshape(-1)
    # Explicit checks instead of `assert`, which is stripped under `python -O`.
    if datasets.ndim != 2:
        raise ValueError(
            "datasets must be 2-D with shape [num, dim], got shape {}".format(datasets.shape))
    num_samples, dim = datasets.shape
    if query.shape[0] != dim:
        raise ValueError(
            "inputs has {} elements but datasets has {} features".format(query.shape[0], dim))
    if not 1 <= k <= num_samples:
        raise ValueError(
            "k must be between 1 and {} (number of samples), got {}".format(num_samples, k))

    # Accept both a column vector [num, 1] and a flat vector [num].
    label_column = np.asarray(labels)
    if label_column.ndim > 1:
        label_column = label_column[:, 0]

    # Broadcasting subtracts the query from every row without building a
    # tiled copy of it.
    sq_dists = np.sum((query - datasets) ** 2, axis=1)
    nearest = np.argsort(sq_dists)[:k]
    votes = Counter(label_column[nearest])
    return votes.most_common(1)[0][0]

In [3]:
def load_data(path='datingTestSet.txt'):
    """Read a tab-separated dating data file into feature and label arrays.

    Each non-blank line must hold at least three numeric fields followed by a
    textual label ("didntLike", "smallDoses" or "largeDoses") as the last
    field. Blank lines are skipped.

    Args:
        path: file to read; defaults to 'datingTestSet.txt' in the working
            directory.

    Returns:
        (datasets, labels): datasets is a float array of shape [num, 3] with
        the first three fields of every row; labels is a float array of shape
        [num, 1] holding 1, 2 or 3 for the three label names respectively.

    Raises:
        KeyError: if the last field of a row is not a known label name.
        ValueError: if a numeric field cannot be parsed or a row has fewer
            than three numeric fields.
    """
    label_dict = {"didntLike":1, "smallDoses":2, "largeDoses":3}
    with open(path) as f:
        # Drop blank lines (e.g. a trailing newline) before sizing the arrays.
        rows = [line.strip().split('\t') for line in f if line.strip()]
    datasets = np.zeros([len(rows), 3])
    labels = np.zeros([len(rows), 1])
    for index, fields in enumerate(rows):
        datasets[index] = [float(value) for value in fields[0:3]]
        labels[index] = label_dict[fields[-1]]
    return datasets, labels

In [4]:
def norm_data(datasets):
    """Min-max scale every column of `datasets` into the range [0, 1].

    Args:
        datasets: numpy array of shape [num, dim].

    Returns:
        Array of the same shape where each column has been mapped with
        (value - column_min) / (column_max - column_min). A column whose
        values are all identical is mapped to 0 instead of producing NaN
        from a 0/0 division.
    """
    datasets = np.asarray(datasets)
    mins = np.min(datasets,axis=0,keepdims=True)
    maxs = np.max(datasets,axis=0,keepdims=True)
    ranges = maxs - mins
    # A constant column has zero range; divide by 1 there so the result is 0.
    ranges[ranges == 0] = 1
    return (datasets-mins)/ranges

In [5]:
# Load the data, rescale the features with norm_data, then show the first
# five rows and the shape of both arrays as a sanity check.
datasets, labels = load_data()
datasets = norm_data(datasets)
for preview in (datasets[0:5], labels[0:5], datasets.shape, labels.shape):
    print(preview)

[[ 0.44832535  0.39805139  0.56233353]
 [ 0.15873259  0.34195467  0.98724416]
 [ 0.28542943  0.06892523  0.47449629]
 [ 0.82320073  0.62848007  0.25248929]
 [ 0.42010233  0.07982027  0.0785783 ]]
[[ 3.]
 [ 2.]
 [ 1.]
 [ 1.]
 [ 1.]]
(1000, 3)
(1000, 1)


In [6]:
# Hold-out evaluation of kNN_classify: the first 90% of the rows are the
# reference set, the remaining rows are classified one by one with k=5.
# NOTE(review): `datasets` was already min-max scaled over all rows before
# this split, so the held-out rows contributed to the scaling statistics.
splitRatio = 0.9
splitIdx = int(datasets.shape[0] * splitRatio)
train_datasets, test_datasets = datasets[:splitIdx], datasets[splitIdx:]
train_labels, test_labels = labels[:splitIdx], labels[splitIdx:]

test_pred = np.zeros(test_labels.shape)
for i, sample in enumerate(test_datasets):
    test_pred[i] = kNN_classify(sample, train_datasets, train_labels, 5)

# Error rate = 1 - fraction of held-out rows whose prediction matches.
accuracy = np.mean(test_labels == test_pred)
error = 1 - accuracy
print("The error rate is {}".format(error))

The error rate is 0.050000000000000044


In [7]:
# Cross-check the hand-written classifier against scikit-learn's
# KNeighborsClassifier on the same split with the same number of neighbours.
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=5)
# fit() expects a 1-D target; passing the [num, 1] column vector makes
# scikit-learn emit a DataConversionWarning, so flatten it first.
neigh.fit(train_datasets, train_labels.ravel())
# predict() returns a flat array; reshape to a column so the element-wise
# comparison with test_labels ([num, 1]) does not broadcast to [num, num].
test_pred1 = neigh.predict(test_datasets).reshape(-1,1)
error1=1-np.mean((test_labels==test_pred1)*1)
print("The error rate is {}".format(error1))

The error rate is 0.050000000000000044


  app.launch_new_instance()
