# KNN最近邻算法

In [None]:
'''
完全手动实现knn最近邻算法
使用了np.argsort()找排序后的序号
np.bincount()分箱计数
np.argmax()最大数的序号
'''

### 0.引入依赖

In [1]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score #准确率

### 1.数据加载和预处理

In [2]:
# Load the iris dataset and build a DataFrame holding the four feature
# columns plus the integer class label in a trailing 'class' column.
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(
    **{'class': iris.target}
)

# To show species names instead of integer codes, uncomment:
#df['class']=df['class'].map({0:iris.target_names[0], 1:iris.target_names[1], 2:iris.target_names[2]})

# Dump the table, then let the notebook echo the summary statistics.
print(df)
df.describe()

     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                  5.1               3.5                1.4               0.2   
1                  4.9               3.0                1.4               0.2   
2                  4.7               3.2                1.3               0.2   
3                  4.6               3.1                1.5               0.2   
4                  5.0               3.6                1.4               0.2   
5                  5.4               3.9                1.7               0.4   
6                  4.6               3.4                1.4               0.3   
7                  5.0               3.4                1.5               0.2   
8                  4.4               2.9                1.4               0.2   
9                  4.9               3.1                1.5               0.1   
10                 5.4               3.7                1.5               0.2   
11                 4.8      

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),class
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


In [3]:
# Feature matrix, used as-is.
x = iris.data
# The target vector is 1-D; turn it into a single-column 2-D array.
y = np.reshape(iris.target, (-1, 1))
# Echo the container type of x (an ndarray).
type(x)

numpy.ndarray

In [4]:
# Echo the shapes of the feature matrix and the label column.
(x.shape, y.shape)

((150, 4), (150, 1))

In [5]:
# Hold out 30% of the samples for testing (a 3/7 split).
# random_state fixes the random seed; stratify=y additionally splits each
# class in the same proportion (stratified sampling).
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=35,
    stratify=y,
)

In [6]:
# Echo the held-out labels (the notebook displays the value of a cell's last expression).
y_test

array([[2],
       [1],
       [2],
       [2],
       [0],
       [0],
       [2],
       [0],
       [1],
       [1],
       [2],
       [0],
       [1],
       [1],
       [1],
       [2],
       [2],
       [0],
       [1],
       [2],
       [1],
       [0],
       [0],
       [0],
       [1],
       [2],
       [0],
       [2],
       [0],
       [0],
       [2],
       [1],
       [0],
       [2],
       [1],
       [0],
       [2],
       [1],
       [2],
       [2],
       [1],
       [1],
       [1],
       [0],
       [0]])

In [7]:
# Toy walk-through of the voting step using five made-up distances.
dist = np.array([3, 2, 53, 37, 43])
# argsort yields the indices that would sort the distances ascending.
nn_index = np.argsort(dist)

picked = y_train[nn_index]
print(picked)  # the result is a 2-D array
print(type(picked))
print(picked.reshape(-1))  # reshape(-1) is equivalent to ravel() here
print(picked.ravel())  # returns a flat array
print(picked)

t_category = picked.ravel()

# e.g. [1, 0, 1, 1, 2]
# Find the class that occurs most often among the selected labels.
bb = np.bincount(t_category)  # np.bincount: per-value counts of non-negative ints
print(bb)
cc = np.argmax(bb)  # index of the largest count, i.e. the winning class
print(cc)


[[2]
 [1]
 [2]
 [1]
 [2]]
<class 'numpy.ndarray'>
[2 1 2 1 2]
[2 1 2 1 2]
[[2]
 [1]
 [2]
 [1]
 [2]]
[0 2 3]
2


### 2.核心算法实现

In [24]:
#距离函数定义
def l1_distance(a, b):
    """Return the L1 (Manhattan) distance from ``b`` to every row of ``a``.

    ``a - b`` is evaluated element-wise (matrix minus vector), and the
    absolute differences are summed along axis 1, so the result holds one
    distance per row of ``a``.
    """
    abs_diff = np.abs(a - b)
    return abs_diff.sum(axis=1)

def l2_distance(a, b):
    """Return the L2 (Euclidean) distance from ``b`` to every row of ``a``.

    The element-wise differences ``a - b`` are squared, summed along
    axis 1, and the square root of each row sum is returned.
    """
    squared_diff = np.square(a - b)
    return np.sqrt(squared_diff.sum(axis=1))


# knn分类器实现
class Knn(object):
    """Brute-force k-nearest-neighbours classifier.

    ``fit`` only stores the training data. ``predict`` computes, for every
    query sample, the distance to each stored training sample and assigns
    the class that occurs most often among the ``num_neighbors`` closest
    ones.

    Attributes:
        num_neighbors: Number of nearest training samples that take part
            in the vote.
        dist_func: Callable invoked as ``dist_func(x_train, sample)``; it
            must return one distance per training row.
    """

    def __init__(self, num_neighbors=1, dist_func=l1_distance):
        """Store the hyper-parameters.

        Args:
            num_neighbors: Number of neighbours used for voting.
            dist_func: Distance function; defaults to ``l1_distance``.
        """
        self.num_neighbors = num_neighbors
        self.dist_func = dist_func

    def fit(self, x, y):
        """Remember the training set; k-NN has no actual training step.

        Args:
            x: Training samples, one per row.
            y: Training labels, indexable by row position. ``predict``
                feeds them to ``np.bincount``, which requires
                non-negative integers.
        """
        self.x_train = x
        self.y_train = y

    def predict(self, x):
        """Predict a class label for every row of ``x``.

        Args:
            x: Array of query samples; it must expose ``shape`` and yield
                one sample per iteration.

        Returns:
            Array of shape ``(x.shape[0], 1)`` with the dtype of the stored
            training labels, holding one predicted label per query sample.
        """
        # Pre-allocate the result column.
        y_pred = np.zeros((x.shape[0], 1), dtype=self.y_train.dtype)

        for i, sample in enumerate(x):
            # Distance from this query sample to every training sample.
            distance = self.dist_func(self.x_train, sample)

            # Only the ordering matters, not the distances themselves:
            # argsort gives the training indices from nearest to farthest.
            dist_sorted = np.argsort(distance)

            # Indices of the k nearest training samples.
            dist_topk = dist_sorted[:self.num_neighbors]

            # Labels of those neighbours, taken from the data given to
            # fit(). The previous code read a module-level ``y_train``
            # here, which ignored the fitted labels. Indexing may return
            # a 2-D array, so flatten it before counting.
            nn_y = self.y_train[dist_topk].ravel()

            # Majority vote: bincount tallies each non-negative integer
            # label, and argmax picks the most frequent one (the first,
            # i.e. smallest, label when counts tie).
            category_pred = np.argmax(np.bincount(nn_y))

            y_pred[i] = category_pred

        return y_pred
        

In [25]:
# Create a classifier instance that votes among the 3 nearest neighbours
knn = Knn(num_neighbors=3)
# "Train" the model (this just hands it the training data)
knn.fit(x_train,y_train)
# Predict labels for the test samples
y_pred = knn.predict(x_test)

### 4.评估模型

In [26]:
# This is a 3-class problem; compute the accuracy of the predictions
accuracy = accuracy_score(y_test,y_pred)
print("预测准确率：",accuracy)

预测准确率： 0.9333333333333333


In [27]:
### 5. Hyper-parameter search
knn = Knn()
knn.fit(x_train, y_train)

# One [k, distance-function name, accuracy] row per configuration tried
result_list = []

# p == 1 selects the L1 norm, p == 2 the L2 norm
metrics = [('l1_distance', l1_distance), ('l2_distance', l2_distance)]
for p, (label, metric) in enumerate(metrics, start=1):
    knn.dist_func = metric
    for k in range(1, 10, 2):  # odd values of k only
        knn.num_neighbors = k
        y_pred = knn.predict(x_test)
        accuracy = accuracy_score(y_test, y_pred)
        result_list.append([k, label, accuracy])

# Collect the results in a table and let the notebook display it
df = pd.DataFrame(result_list, columns=['k', '距离函数', '准确率'])
df

Unnamed: 0,k,距离函数,准确率
0,1,l1_distance,0.933333
1,3,l1_distance,0.933333
2,5,l1_distance,0.977778
3,7,l1_distance,0.955556
4,9,l1_distance,0.955556
5,1,l2_distance,0.933333
6,3,l2_distance,0.933333
7,5,l2_distance,0.977778
8,7,l2_distance,0.977778
9,9,l2_distance,0.977778
