In [1]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

# Load the iris dataset and separate the feature matrix from the class labels.
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Hold out 20% of the samples as the test set and train on the remaining 80%.
# A very large hold-out fraction (e.g. 0.93) would leave only about 10 of the
# 150 iris samples for training, so a conventional 80/20 split is used here.
# When test_size is omitted (and train_size is not given) scikit-learn falls
# back to 0.25.  random_state seeds the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=666)


In [8]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
# Load the handwritten-digits dataset and pull out features and labels.
digits = datasets.load_digits()
X, y = digits.data, digits.target

# Split the data: 20% is held out for testing, with a fixed seed for
# reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=666,
)

# n_neighbors is the number of nearest points that vote on each prediction;
# n_jobs=-1 asks scikit-learn to use all available CPU cores.
# fit() returns the estimator itself, so construction and fitting are chained.
kNN_classifier = KNeighborsClassifier(n_neighbors=3, n_jobs=-1).fit(X_train, y_train)

# Report the classification accuracy on the held-out test split.
accuracy = kNN_classifier.score(X_test, y_test)
print('kNN 算法准确度:\n', accuracy)

kNN 算法准确度:
 0.9916666666666667


In [9]:
# coding:utf-8
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
# Search for the best hyperparameters with a hand-written grid search; a
# simpler approach is shown in the next section.
digits = datasets.load_digits()
X, y = digits.data, digits.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=666
)

# --- weights="distance": scan every (k, p) combination ----------------------
# weights selects the vote-weighting scheme and p is the Minkowski distance
# exponent.  The strict ">" keeps the first combination that reaches the top
# score, so later ties never replace it.
# NOTE(review): candidates are ranked by their score on X_test, so the
# reported best score is measured on the same data used for selection.
bestK1, bestP1, bestScore1 = -1, -1, 0.0
for k in range(1, 11):
    for p in range(1, 6):
        kNN_classifier = KNeighborsClassifier(
            n_neighbors=k,
            weights="distance",
            p=p,
            n_jobs=-1,
        ).fit(X_train, y_train)
        score = kNN_classifier.score(X_test, y_test)
        if score > bestScore1:
            bestK1, bestP1, bestScore1 = k, p, score

# --- weights="uniform": only k is searched ----------------------------------
# p is not passed for uniform weighting, so it stays at the estimator default.
bestK2, bestScore2 = -1, 0.0
for k in range(1, 11):
    kNN_classifier = KNeighborsClassifier(
        n_neighbors=k,
        weights="uniform",
        n_jobs=-1,
    ).fit(X_train, y_train)
    score = kNN_classifier.score(X_test, y_test)
    if score > bestScore2:
        bestK2, bestScore2 = k, score

# Report the winner of each search.
print("For method = distance:")
print("The best k is:\n", bestK1)
print("The best p is:\n", bestP1)
print("The best score is:\n", bestScore1)
print("For method = uniform:")
print("The best k is:\n", bestK2)
print("The best score is:\n", bestScore2)


For method = distance:
The best k is:
 3
The best p is:
 2
The best score is:
 0.9916666666666667
For method = uniform:
The best k is:
 3
The best score is:
 0.9916666666666667


In [None]:
# coding:utf-8
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV    # CV 表示 cross validation，交叉验证
# Hyperparameter grid.  Each dict is an independent sub-grid: uniform voting
# searches k only, while distance-weighted voting searches k together with the
# Minkowski exponent p.
param_grid = [
    {
        'weights': ['uniform'],
        'n_neighbors': list(range(1, 11)),
    },
    {
        'weights': ['distance'],
        'n_neighbors': list(range(1, 11)),
        'p': list(range(1, 6)),
    },
]

# Load the digits data and hold out 25% of it for the final evaluation.
digits = datasets.load_digits()
X, y = digits.data, digits.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    random_state=666,
)

# Run the cross-validated search on the training split only.  n_jobs=-1 uses
# all available CPU cores; verbose=2 makes the search log its progress.
kNN_classifier = KNeighborsClassifier()
grid_search = GridSearchCV(
    estimator=kNN_classifier,
    param_grid=param_grid,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Best cross-validation score and the parameter combination that achieved it.
print(grid_search.best_score_)
print(grid_search.best_params_)

# Take the best estimator found by the search and score it on the held-out
# test split.
kNN_classifier = grid_search.best_estimator_
print(kNN_classifier.score(X_test, y_test))
