In [0]:
import numpy as np
import operator
import os
from collections import Counter

# 手写KNN

In [0]:
def createDataSet():
    """Return the toy training set used by the first KNN demonstration.

    Returns
    -------
    tuple
        ``(group, labels)`` where ``group`` is a 4x2 numpy array of sample
        coordinates and ``labels`` is a list with the class letter of each row.
    """
    samples = [[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]]
    classes = list('AABB')  # one class letter per row of `samples`
    return np.array(samples), classes

In [0]:
def classify0(inX, dataSet, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Parameters
    ----------
    inX : array-like
        Feature vector of the sample to classify.
    dataSet : array-like
        Training samples, one row per sample, same number of columns as
        ``inX`` has entries. Plain nested lists are accepted.
    labels : sequence
        Class label of each row of ``dataSet`` (indexed by row position).
    k : int
        Number of nearest neighbours that vote; must be at least 1.

    Returns
    -------
    The label that occurs most often among the k nearest training samples.
    When several labels are equally frequent, the one met first while walking
    the neighbours from nearest to farthest is returned.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got %r" % (k,))

    # 1. Euclidean distance from inX to every training row (numpy
    #    broadcasting). Converting to float first lets plain lists through and
    #    stops unsigned integer inputs from wrapping around in the subtraction.
    query = np.asarray(inX, dtype=float)
    samples = np.asarray(dataSet, dtype=float)
    dist = np.sum((query - samples) ** 2, axis=1) ** 0.5

    # 2. Labels of the k closest rows. argsort returns indices ordered from
    #    the smallest to the largest distance; a stable sort keeps equidistant
    #    rows in their original order so the result is deterministic.
    nearest = dist.argsort(kind="stable")[:k]
    k_labels = [labels[index] for index in nearest]

    # 3. Majority vote. most_common(1) yields e.g. [('label1', 2)], so [0][0]
    #    extracts the winning label itself.
    return Counter(k_labels).most_common(1)[0][0]

In [0]:
def test1():
    """
    First demonstration: print the toy data set and its labels, then print
    the class predicted for the point [0.1, 0.1] using 3 neighbours.
    """
    samples, sample_labels = createDataSet()
    for item in (samples, sample_labels):
        print(str(item))
    query_point = [0.1, 0.1]
    print(classify0(query_point, samples, sample_labels, 3))

In [5]:
# Run the toy example: prints the samples, their labels and the predicted class.
test1()

[[1.  1.1]
 [1.  1. ]
 [0.  0. ]
 [0.  0.1]]
['A', 'A', 'B', 'B']
B


In [0]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [0]:
# Load the iris data set into a DataFrame with short feature names and a
# trailing 'label' column holding the class index of each row.
iris = load_iris()
df = pd.DataFrame(
    iris.data,
    columns=['sepal length', 'sepal width', 'petal length', 'petal width'],
)
df['label'] = iris.target

In [0]:
# Keep the first 100 rows and only the 'sepal length', 'sepal width' and
# 'label' columns, then separate the features (X) from the target (y).
data = np.array(df.iloc[:100, [0, 1, -1]])
X, y = data[:, :-1], data[:, -1]
# Hold out 20% of the rows for testing. The fixed random_state makes the split,
# and every score computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [13]:
# Sanity check: 80% of the 100 selected rows, with 2 feature columns each.
X_train.shape

(80, 2)

In [0]:
def irisTest(k=3):
    """Evaluate classify0 on the held-out iris samples and print the results.

    Reads the notebook-level arrays X_train, y_train, X_test and y_test.
    Prints the error rate followed by the number of misclassified samples.

    Parameters
    ----------
    k : int, optional
        Number of neighbours handed to classify0. Defaults to 3, the value
        that used to be hard-coded.
    """
    # number of samples to test
    numTestVecs = X_test.shape[0]

    errorCount = 0
    for sample, expected in zip(X_test, y_test):
        predicted = classify0(sample, X_train, y_train, k)
        # int() keeps the counter a plain Python int instead of a numpy scalar
        errorCount += int(predicted != expected)
    print("the total error rate is: %f" % (errorCount / numTestVecs))
    print(errorCount)

In [15]:
# Print the error rate and error count of classify0 on the test split.
irisTest()

the total error rate is: 0.050000
1


# 李航版KNN

In [0]:
class KNN:
    """k-nearest-neighbour classifier with a configurable p-norm distance."""

    def __init__(self, X_train, y_train, n_neighbors=3, p=2):
        """
        Store the training data and the classifier settings.

        parameter: X_train training samples, one feature vector per entry
        parameter: y_train label of each training sample
        parameter: n_neighbors number of neighbours that vote (at least 1)
        parameter: p order of the norm used as distance (passed to
                   np.linalg.norm as ``ord``)

        raises: ValueError if n_neighbors is smaller than 1
        """
        if n_neighbors < 1:
            raise ValueError(
                "n_neighbors must be a positive integer, got %r" % (n_neighbors,)
            )
        self.n = n_neighbors
        self.p = p
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X):
        """Return the most frequent label among the n nearest training samples.

        When several labels are equally frequent, the one met first while
        walking the neighbours from nearest to farthest is returned.
        """
        query = np.asarray(X)
        # distance from the query to every training sample
        dists = np.array(
            [np.linalg.norm(query - sample, ord=self.p) for sample in self.X_train]
        )
        # indices of the n closest samples; the stable sort keeps equidistant
        # samples in training order so the result is deterministic
        nearest = np.argsort(dists, kind="stable")[:self.n]

        # majority vote: pick the label with the highest count, not the
        # largest label value
        votes = Counter(self.y_train[i] for i in nearest)
        return votes.most_common(1)[0][0]

    def score(self, X_test, y_test):
        """Return the fraction of samples in X_test whose prediction equals y_test.

        Raises ZeroDivisionError when X_test is empty.
        """
        right_count = sum(
            1 for X, y in zip(X_test, y_test) if self.predict(X) == y
        )
        return right_count / len(X_test)

In [0]:
# Build the classifier on the training split with the defaults n_neighbors=3, p=2.
clf = KNN(X_train, y_train)

In [18]:
# Fraction of held-out samples whose predicted label matches the true label.
clf.score(X_test, y_test)

0.95