In [47]:
import numpy as np
import pandas as pd 

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor as KNNR

#### 加载数据集

In [21]:
# 加载标题
with open("./housingPrice/readme.txt.txt", encoding='UTF8') as fr:
    columns = fr.readlines() 

columns = [x.split(" ")[0] for x in columns]

In [25]:
data = pd.read_csv("./housingPrice/housing.data.txt", sep='\\s+', header=None)
data.columns = columns

#### 拆分特征和标签列

In [30]:
features, labels = data.drop('MEDV', axis=1), data.MEDV

#### 拆分训练集和测试集

In [32]:
train, test, train_label, test_label = train_test_split(features, labels, test_size=0.2)

##### 数据归一化

In [43]:
scalar = MinMaxScaler()
scalar.fit(train)

train_scalar = scalar.transform(train)
test_scalar = scalar.transform(test)

#### 网格搜索查询最佳参数组合

In [52]:
# 网格搜索寻找最佳参数
parmas = {'n_neighbors': [3, 4, 5, 6, 7, 8, 9], 'p': [1, 2]}
model = KNNR(algorithm='brute', metric='minkowski')
grid_search = GridSearchCV(model, parmas, cv=5, scoring='neg_mean_squared_error')

grid_search.fit(train, train_label)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=KNeighborsRegressor(algorithm='brute', leaf_size=30,
                                           metric='minkowski',
                                           metric_params=None, n_jobs=None,
                                           n_neighbors=5, p=2,
                                           weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid={'n_neighbors': [3, 4, 5, 6, 7, 8, 9], 'p': [1, 2]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=0)

In [53]:
# 查看最佳参数
best_params = grid_search.best_params_
best_params

{'n_neighbors': 5, 'p': 1}

In [55]:
# 查看最佳得分
best_score = grid_search.best_score_
best_score

-35.9622297029703

#### 使用最佳参数训练模型

In [57]:
knnr_model = KNNR(algorithm='brute', metric='minkowski', **best_params)
knnr_model.fit(train, train_label)

KNeighborsRegressor(algorithm='brute', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=1,
                    weights='uniform')

In [59]:
knnr_model.predict(test)

array([22.22, 20.62, 23.54, 21.28, 24.7 , 13.36, 20.08, 29.7 , 12.98,
       22.18, 28.12, 27.78, 23.5 , 20.48, 28.6 , 40.6 , 44.2 , 18.  ,
       27.98, 20.44, 44.58, 24.02, 18.82, 34.16, 10.  , 30.94, 27.58,
       15.14, 24.5 , 28.34, 11.16, 33.7 , 31.24, 16.48, 17.14, 22.74,
       37.48, 13.8 , 16.54, 39.14, 19.36, 21.08, 32.96, 27.32, 14.48,
       32.96, 37.42, 13.42, 26.7 , 20.72, 21.68, 28.26, 34.8 , 13.66,
       29.04, 23.26, 20.12, 17.66, 19.9 , 24.7 , 13.42, 23.58, 20.72,
       12.98, 21.82, 15.08, 22.86, 20.72, 27.52, 37.1 , 21.2 , 28.92,
       24.06, 28.88, 19.38, 17.96, 14.8 , 24.1 , 20.82, 25.96, 20.8 ,
       11.98, 28.24, 12.6 , 18.48, 19.72, 40.6 , 15.  , 22.3 , 23.22,
       18.06, 25.62, 25.16, 13.96, 20.72, 23.52, 23.02, 14.7 , 20.8 ,
       29.54, 19.38, 27.34])

In [63]:
test_label.values

array([20. , 22.2, 28.1, 17.4, 18.7, 12.5, 19. , 29.1, 14.1, 29.8, 22.8,
       28.7, 27.1, 19.3, 28.2, 36.5, 31.6, 15.2, 26.7, 15. , 30.7, 50. ,
       16.5, 29.8,  5.6, 29.1, 19.4, 14.6, 22.3, 24.1, 10.2, 24.5, 23.3,
       17.8, 14.5, 23.3, 50. , 12.7, 19.9, 33.8, 19.8, 23.9, 32.9, 23.5,
       13.5, 34.9, 50. , 18.4, 18.9, 19.5, 22.5, 32. , 48.3, 16.4, 35.2,
       11.8, 22.6, 18. , 22.9, 23.4,  8.5, 22.6, 20.4, 16.1, 23.1, 16.7,
       22.2, 19.5, 23.1, 50. , 19.4, 16. , 32.2, 33.3, 18.8, 14.2, 27.5,
       23.9, 17.7, 23.1, 19.5,  9.5, 24.8, 12.7, 23. , 21.2, 36. , 17.1,
       34.7, 23.7, 18.4, 26.6, 28.4, 14.8, 19.3, 11.9, 20.4, 10.2, 19.9,
       50. , 20.3, 29. ])