# 用sklearn中的KNN模型完成分类问题

## 一、数据的预处理

通过pandas读取csv文件，创建数据集X、目标y，并用numpy将它们保存为npy格式。

In [1]:
import numpy as np
import pandas as pd

# # # Mapping from iris species name to the integer class label 0, 1, 2.
d = {
    'Iris-setosa': 0,
    'Iris-versicolor': 1,
    'Iris-virginica': 2,
}

# # # Read iris.csv with pandas and replace the species names by their labels.
# # # Series.map yields NaN for any name that is missing from the mapping.
df = pd.read_csv("iris.csv")
df['Species'] = df['Species'].map(d)
df.head()

Unnamed: 0,Id,SepalLength[cm],SepalWidth[cm],PetalLength[cm],PetalWidth[cm],Species
0,1,5.1,3.5,1.4,0.2,0
1,2,4.9,3.0,1.4,0.2,0
2,3,4.7,3.2,1.3,0.2,0
3,4,4.6,3.1,1.5,0.2,0
4,5,5.0,3.6,1.4,0.2,0


In [2]:
# # # Build the feature matrix X from the columns at positions 1-4 and the
# # # target vector y from the encoded Species column.
# # # NOTE(review): the positional slice assumes column 0 is Id and columns 1-4
# # # are the four sepal/petal measurements -- confirm against iris.csv.
data_x = df.iloc[:, 1:5].values
data_y = df['Species'].values

# # # Persist X and y as .npy files so they can be reloaded with numpy.
np.save(file="data_x.npy", arr=data_x)
np.save(file="data_y.npy", arr=data_y)

## 二、模型及测试

这部分主要使用sklearn中的模块搭建机器学习流程，并进行训练和测试。

In [3]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline, FeatureUnion

# # # Reload the feature matrix X and the target vector y from their .npy files.
data_x, data_y = (np.load(npy_file) for npy_file in ("data_x.npy", "data_y.npy"))
data_y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

In [4]:
# # # Split the data into 80% training and 20% test samples. The split is
# # # shuffled with a fixed seed and stratified on the class labels.
X_train, X_test, y_train, y_test = train_test_split(
    data_x,
    data_y,
    test_size=0.2,
    shuffle=True,
    random_state=10,
    stratify=data_y,
)

# # # Dimensionality reduction with PCA.
pca = PCA(n_components=2)

# # # Chi-squared test on the features, keeping the k best-scoring ones.
selection = SelectKBest(score_func=chi2, k=2)

# # # FeatureUnion combines PCA and SelectKBest (applied in parallel).
combined_features = FeatureUnion(
    transformer_list=[
        ("pca", pca),
        ("chi2_select", selection),
    ]
)

# # # Pipeline chains combined_features, standardisation and KNN (applied in series).
pipe = Pipeline(
    steps=[
        ("features", combined_features),
        ("standard", StandardScaler()),
        ("knn", KNeighborsClassifier()),
    ]
)
pipe

Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=None,
       transformer_list=[('pca', PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('chi2_select', SelectKBest(k=2, score_func=<function chi2 at 0x0000016FA03701E0>))],
       transform...ki',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'))])

In [5]:
# # # Hyper-parameter grid for the grid search. Keys follow the
# # # "<step>__<parameter>" convention to address the nested pipeline steps.
params = {
    'features__pca__n_components': [1, 2, 3],
    'features__chi2_select__k': [1, 2],
    'knn__n_neighbors': [1, 3, 5],
    'knn__p': [1, 2],
}

# # # Exhaustive grid search over the pipeline with 10-fold cross-validation,
# # # fitted on the training split only.
clf = GridSearchCV(estimator=pipe, param_grid=params, cv=10)
clf.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=None,
       transformer_list=[('pca', PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('chi2_select', SelectKBest(k=2, score_func=<function chi2 at 0x0000016FA03701E0>))],
       transform...ki',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'features__pca__n_components': [1, 2, 3], 'features__chi2_select__k': [1, 2], 'knn__n_neighbors': [1, 3, 5], 'knn__p': [1, 2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [6]:
# # # Print the cross-validation results of the grid search: for every
# # # parameter combination the mean CV score and twice its standard
# # # deviation, followed by the best score and the parameters that achieved it.
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']

# # # The loop variable is deliberately not named `params`, so that the
# # # notebook-level `params` grid used to build the search is not overwritten
# # # by the last candidate dict.
for mean, std, candidate in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, candidate))

print("best CV_score: %0.3f   best parameter: %r" % (clf.best_score_, clf.best_params_))

0.908 (+/-0.174) for {'features__chi2_select__k': 1, 'features__pca__n_components': 1, 'knn__n_neighbors': 1, 'knn__p': 1}
0.908 (+/-0.174) for {'features__chi2_select__k': 1, 'features__pca__n_components': 1, 'knn__n_neighbors': 1, 'knn__p': 2}
0.933 (+/-0.163) for {'features__chi2_select__k': 1, 'features__pca__n_components': 1, 'knn__n_neighbors': 3, 'knn__p': 1}
0.933 (+/-0.163) for {'features__chi2_select__k': 1, 'features__pca__n_components': 1, 'knn__n_neighbors': 3, 'knn__p': 2}
0.942 (+/-0.167) for {'features__chi2_select__k': 1, 'features__pca__n_components': 1, 'knn__n_neighbors': 5, 'knn__p': 1}
0.942 (+/-0.167) for {'features__chi2_select__k': 1, 'features__pca__n_components': 1, 'knn__n_neighbors': 5, 'knn__p': 2}
0.933 (+/-0.125) for {'features__chi2_select__k': 1, 'features__pca__n_components': 2, 'knn__n_neighbors': 1, 'knn__p': 1}
0.933 (+/-0.125) for {'features__chi2_select__k': 1, 'features__pca__n_components': 2, 'knn__n_neighbors': 1, 'knn__p': 2}
0.950 (+/-0.082)

In [7]:
# # # Evaluate the fitted grid-search model on the held-out test split.
score = clf.score(X_test, y=y_test)

print("test score:", score)

test score: 1.0
