In [1]:
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import math
import collections

In [2]:
iris = datasets.load_iris()
x = iris.data
y = iris.target
print(x[0:10])
print(y[0:10])
print(iris.DESCR)

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]]
[0 0 0 0 0 0 0 0 0 0]
.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   

In [3]:
X_train, X_test, Y_train, Y_test = train_test_split(x, y, train_size=0.8)
print(type(Y_train))
print(len(Y_train))
print(len(Y_test))
print(X_train[0:5])

<class 'numpy.ndarray'>
120
30
[[5.1 3.4 1.5 0.2]
 [5.5 4.2 1.4 0.2]
 [4.9 3.6 1.4 0.1]
 [4.6 3.6 1.  0.2]
 [7.6 3.  6.6 2.1]]


In [4]:
std = StandardScaler()
X_train = std.fit_transform(X_train)
X_test = std.transform(X_test)
print(X_train[0:5])
print(Y_train[0:5])

[[-0.90854496  0.7269493  -1.25466468 -1.30109579]
 [-0.42559747  2.51636296 -1.3105725  -1.30109579]
 [-1.15001871  1.17430272 -1.3105725  -1.43055806]
 [-1.51222932  1.17430272 -1.53420378 -1.30109579]
 [ 2.10987684 -0.16775753  1.59663418  1.1586873 ]]
[0 0 0 0 2]


In [5]:
class knn:
    def __init__(self, datas, labels, n_neighbors=6):
        self.datas = datas
        self.labels = labels
        self.k = n_neighbors

    def distance(self, cur, target):
        # cur和target都是一个一维向量，代表当前的坐标
        # 计算两个点之间的欧拉距离
        dis = 0
        for i in range(len(cur)):
            dis += (cur[i] - target[i]) ** 2
        dis = math.sqrt(dis)
        return dis

    def predict(self, cur_data):
        ans = []
        for i in cur_data:
            # dis存储当前这个点到相应点的距离，那个点所属的label
            dis = []
            for j in range(len(self.datas)):
                dis.append([self.distance(i, self.datas[j]), self.labels[j]])
            sorted_list = sorted(dis, key=lambda x: x[0])
            # t存储当前前k个最近的点的labels
            t = []
            for j in range(self.k):
                t.append(sorted_list[j][1])
            # 找出t中出现次数最多的labels做为i的label
            count = collections.Counter(t)
            ans.append(count.most_common(1)[0][0])
        return ans

In [6]:
KNN = knn(X_train, Y_train)
pre = KNN.predict(X_test)

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, Y_train)
Y_predict = knn.predict(X_test)

print("实际结果为")
print(Y_test)
print("调包预测结果为")
print(Y_predict)
print("手算预测结果为")
print(pre)

实际结果为
[1 1 1 0 1 1 0 0 2 0 1 0 0 0 0 2 1 1 1 2 2 0 2 2 2 1 2 2 1 2]
调包预测结果为
[1 1 1 0 1 1 0 0 2 0 1 0 0 0 0 2 1 1 1 1 2 0 2 2 2 1 2 2 1 2]
手算预测结果为
[1, 1, 1, 0, 1, 1, 0, 0, 2, 0, 1, 0, 0, 0, 0, 2, 1, 1, 1, 1, 2, 0, 2, 2, 2, 1, 2, 2, 1, 2]
