### kNN算法
kNN算法是机器学习中最简单的算法
- 分类，选取k个近邻，如果一个样本在特征空间中的k个最相似(即特征空间中最邻近)的样本中的大多数属于某一个类别，则该样本也属于这个类别。  

  
- 回归，选取k个近邻，该样本在特征空间中的k个最相似(即特征空间中最邻近)的样本中均值

In [1]:
import numpy as np 
np.random.seed(0)
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,mean_squared_error
X,y=load_iris().data,load_iris().target
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.7)

In [2]:
class kNN:
    '''k:选取的近邻个数，
    kind:如果为C为分类器，如果为R为回归模型
    '''
    def __init__(self,k,kind='classification'):
        self.k=k         #k个近邻
        if (kind!='classification') and (kind!='regression'):
            raise('Algorithm type error')   #如果输入的任务类型参数有错，报错
        self.kind=kind
    def fit(self,X,y):
        self.interX=X=(X-np.min(X,axis=0))/(np.max(X,axis=0)-np.min(X,axis=0))  #将X数据标准化
        self.intery=y.reshape(-1,1)
        
    def predict(self,X):
        X=(X-np.min(X,axis=0))/(np.max(X,axis=0)-np.min(X,axis=0))  #将数据标准化
        y_pre=[]
        if self.kind=='classification':  #如果是分类任务
            for i in X:
                distance=np.sqrt(np.sum((self.interX-i)**2,axis=1)).reshape(-1,1)
                data=np.concatenate([distance,self.intery],axis=1)
                data = data[data[:,-2].argsort()] #按照倒数第二列距离排序
                class_list=data[:self.k,-1].tolist()  #获取前k个相近样本的预测值
                result=max(class_list,key=class_list.count)  #获取列表中出现最多的元素
                y_pre.append(int(result))
        if self.kind=='regression':  #如果为回归任务
            for i in X:
                distance=np.sqrt(np.sum((self.interX-i)**2,axis=1)).reshape(-1,1)
                data=np.concatenate([distance,self.intery],axis=1)
                data = data[data[:,-2].argsort()] #按照倒数第二列距离排序
                values=data[:self.k,-1]  #获取前k个相近样本的预测值
                result=np.mean(values)  #获取数组的平均值
                y_pre.append(result)
        return np.array(y_pre)

In [3]:
%time
model=kNN(k=5,kind='classification')
model.fit(X_train,y_train)
model.predict(X_test)

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 4.77 µs


array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 2, 1, 0, 1, 2, 1, 0, 1, 1, 1, 2, 0, 2, 0,
       0, 1, 2, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 0, 2, 1, 1, 1,
       1, 2, 0, 0, 2, 1, 0, 0, 1, 0, 2, 1, 0, 1, 2, 1, 0, 2, 2, 2, 2, 0,
       0, 2, 2, 0, 2, 0, 2, 2, 0, 0, 2, 0, 0, 0, 1, 2, 2])

In [4]:
accuracy_score(model.predict(X_test),y_test)

0.9333333333333333

In [5]:
from sklearn.datasets import load_boston
X,y=load_iris().data,load_iris().target
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3)

model=kNN(k=5,kind='regression')
model.fit(X_train,y_train)
result=model.predict(X_test)
mean_squared_error(result,y_test)

0.05866666666666668

In [6]:
%time
from sklearn.neighbors import KNeighborsRegressor
sklearn_kNN=KNeighborsRegressor(n_neighbors=5)
sklearn_kNN.fit(X_train,y_train)
sklearn_result=sklearn_kNN.predict(X_test)
mean_squared_error(sklearn_result,y_test)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.96 µs


0.03644444444444444