# KNN算法基本操作
## 1.载入库和相关模块

In [1]:
# Numeric, data-frame and plotting libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Use the SimHei font for sans-serif text (presumably so Chinese labels render in figures),
# and turn off the Unicode minus glyph so negative tick labels still display with that font.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# Suppress all warnings to keep the notebook output clean
import warnings
warnings.filterwarnings('ignore')

# scikit-learn modules used for the KNN workflow
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier  # KNN classifier
from sklearn.model_selection import train_test_split
import sklearn.metrics

## 2.载入数据，做简单的观察

In [2]:
# Load the iris dataset that ships with scikit-learn
iris = datasets.load_iris()

In [3]:
iris

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [4]:
iris.data[:10]
# Features: the first 10 rows of the feature matrix

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1]])

In [5]:
iris.target
# Target: the integer class label (0, 1 or 2) of every sample

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [6]:
# Class names of the dataset (three names, matching the three target codes 0, 1, 2)
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

## 3.将数据集拆分为训练集和测试集

In [7]:
# Split the data into a training set (X_train, y_train) and a test set (X_test, y_test).
# 30% of the samples are held out for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target,  # feature matrix, target labels
    test_size=0.3,
    random_state=123456,
)

In [8]:
X_train[:10]

array([[6.9, 3.1, 5.4, 2.1],
       [5.6, 3. , 4.5, 1.5],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.8, 1.5, 0.3],
       [4.3, 3. , 1.1, 0.1],
       [7.3, 2.9, 6.3, 1.8],
       [6.4, 2.7, 5.3, 1.9],
       [4.6, 3.1, 1.5, 0.2],
       [6.4, 3.1, 5.5, 1.8],
       [6.4, 2.9, 4.3, 1.3]])

In [9]:
X_train.shape # test_size is 0.3, so 70% is left for training: 150 * 0.7 = 105 rows

(105, 4)

In [10]:
iris.data.shape

(150, 4)

## 4.实例化算法模型并进行训练

    sklearn.neighbors.KNeighborsClassifier(
        n_neighbors=5,             # KNN中的K值，最近邻的个数，默认是5
        *,
        weights='uniform',           # 权重的计算方式，默认是按照各点的权重一致计算，设置为distance表示权重按距离计算，越近的点权重越大 
        algorithm='auto',
        leaf_size=30,
        p=2,                     # 闵氏距离的幂参数：p=1 为曼哈顿距离，p=2 为欧氏距离
        metric='minkowski',           # 距离的计算方式，默认是计算闵氏距离
        metric_params=None,
        n_jobs=None,
    )

In [11]:
# Build a KNN classifier with the default parameters: k = 5 neighbours, Euclidean distance
knn_model = KNeighborsClassifier()

In [12]:
# Fit the model on the training features and labels
knn_model.fit(X_train,y_train)

In [13]:
# Fit the model on the training set again.
# NOTE(review): this cell repeats the fit call from In [12] with the same arguments;
# it looks redundant - consider keeping only one of the two cells.
knn_model.fit(X_train, y_train)

KNN是一种惰性学习算法：fit方法并不学习任何模型参数，只是保存训练数据（并根据algorithm参数为其建立KD树、球树等索引），真正的距离计算发生在predict阶段

## 5.在测试集上进行预测

In [14]:
# Predict a class label for every sample in the held-out test set
y_pred = knn_model.predict(X_test)

In [15]:
y_pred

array([0, 2, 0, 1, 0, 0, 2, 2, 2, 0, 1, 2, 2, 0, 0, 2, 1, 2, 1, 0, 1, 2,
       1, 1, 1, 2, 2, 2, 2, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 2, 1, 2, 0, 0,
       2])

In [16]:
y_test

array([0, 2, 0, 1, 0, 0, 2, 2, 2, 0, 1, 2, 2, 0, 0, 2, 1, 2, 1, 0, 1, 2,
       1, 1, 1, 2, 2, 2, 2, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 2, 1, 2, 0, 0,
       1])

## 6.模型效果的评估
### 6.1 混淆矩阵

In [17]:
# Confusion matrix of true vs. predicted labels: rows are the true classes, columns the predicted classes
# (the single off-diagonal entry is a class-1 sample that was predicted as class 2)
sklearn.metrics.confusion_matrix(y_test, y_pred)

array([[15,  0,  0],
       [ 0, 14,  1],
       [ 0,  0, 15]], dtype=int64)

### 6.2 分类评估报告

In [18]:
# Print per-class precision, recall, F1-score and support, plus overall accuracy and averages
print(sklearn.metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       1.00      0.93      0.97        15
           2       0.94      1.00      0.97        15

    accuracy                           0.98        45
   macro avg       0.98      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45

