## scikit-learn中的回归问题

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

In [2]:
# Load the Boston housing data set bundled with scikit-learn.
# NOTE(review): load_boston is not available in recent scikit-learn releases
# (removed in 1.2) - confirm the pinned version before re-running.
boston = datasets.load_boston()
X, y = boston.data, boston.target

# Keep only the samples whose target is below 50.0. The mask is built once
# from the unfiltered targets and applied to both arrays so rows stay aligned.
mask = y < 50.0
X, y = X[mask], y[mask]

In [3]:
# Sanity check: (rows, columns) of the feature matrix after filtering.
X.shape

(490, 13)

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Split features and targets into matching train/test subsets;
# random_state is pinned to 666.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)

## scikit-learn中的线性回归

In [6]:
from sklearn.linear_model import LinearRegression # 加载线性回归

In [7]:
# Create a LinearRegression estimator with its default settings.
lin_reg = LinearRegression()

In [8]:
# Fit the linear model on the training split.
lin_reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [9]:
lin_reg.coef_ # Fitted coefficients, one per feature column

array([-1.14235739e-01,  3.12783163e-02, -4.30926281e-02, -9.16425531e-02,
       -1.09940036e+01,  3.49155727e+00, -1.40778005e-02, -1.06270960e+00,
        2.45307516e-01, -1.23179738e-02, -8.80618320e-01,  8.43243544e-03,
       -3.99667727e-01])

In [10]:
lin_reg.intercept_ # Fitted intercept

32.64566083965359

In [11]:
# Score the fitted linear model on the held-out test split.
lin_reg.score(X_test, y_test)

0.8008916199519112

In [27]:
# Display the training feature matrix.
# NOTE(review): this shows the whole array; a slice such as X_train[:5]
# would keep the output short.
# NOTE(review): the execution counts of this cell and the two shape checks
# after it (27, 24, 25) are out of sequence with their neighbours, which
# suggests they were run after the later cells - re-run the notebook top to
# bottom before sharing.
X_train

array([[9.25200e-02, 3.00000e+01, 4.93000e+00, ..., 1.66000e+01,
        3.83780e+02, 7.37000e+00],
       [8.66400e-02, 4.50000e+01, 3.44000e+00, ..., 1.52000e+01,
        3.90490e+02, 2.87000e+00],
       [2.87500e-02, 2.80000e+01, 1.50400e+01, ..., 1.82000e+01,
        3.96330e+02, 6.21000e+00],
       ...,
       [6.71800e-01, 0.00000e+00, 1.81000e+01, ..., 2.02000e+01,
        4.30600e+01, 2.39800e+01],
       [9.59571e+00, 0.00000e+00, 1.81000e+01, ..., 2.02000e+01,
        3.76110e+02, 2.03100e+01],
       [2.06080e-01, 2.20000e+01, 5.86000e+00, ..., 1.91000e+01,
        3.72490e+02, 1.25000e+01]])

In [24]:
# Shape of the training feature matrix.
X_train.shape

(367, 13)

In [25]:
# Shape of the training targets; the row count should match X_train.
y_train.shape

(367,)

## kNN Regressor

In [12]:
from sklearn.neighbors import KNeighborsRegressor

In [13]:

# Baseline: a kNN regressor constructed with no arguments (library defaults),
# fitted on the training split and scored on the held-out test split.
knn_reg = KNeighborsRegressor()
knn_reg.fit(X_train, y_train)
knn_reg.score(X_test, y_test)

0.602674505080953

In [14]:
from sklearn.model_selection import GridSearchCV # 进行网格搜索

# Hyper-parameter grid for the kNN regressor, expressed as two sub-grids
# (10 + 10 * 5 = 60 candidate combinations in total).
param_grid = [
    # Uniform weighting: only the neighbour count (1..10) is varied.
    {
        'weights': ['uniform'],
        'n_neighbors': list(range(1, 11)),
    },
    # Distance weighting: neighbour count (1..10) crossed with p in 1..5.
    {
        'weights': ['distance'],
        'n_neighbors': list(range(1, 11)),
        'p': list(range(1, 6)),
    },
]

In [15]:
# Search param_grid for the best kNN hyper-parameters using the training
# split only. A fresh estimator is handed to the search; verbose=1 makes the
# search print its progress.
knn_reg = KNeighborsRegressor()
grid_search = GridSearchCV(
    estimator=knn_reg,
    param_grid=param_grid,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 60 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    2.9s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform'),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'weights': ['uniform'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}, {'weights': ['distance'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'p': [1, 2, 3, 4, 5]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [16]:
grid_search.best_params_  # Best parameter combination found; per the original note it uses Manhattan distance (p=1 in the recorded output)

{'n_neighbors': 6, 'p': 1, 'weights': 'distance'}

In [17]:
# Score the search reports for its best candidate.
# NOTE(review): presumably a cross-validation score on the training data, so
# not directly comparable with a test-set .score() - confirm against the docs.
grid_search.best_score_

0.6060327991735741

In [18]:
# Score the best estimator on the test split, i.e. measured the same way as
# the earlier .score(X_test, y_test) results so the numbers are comparable.
grid_search.best_estimator_.score(X_test, y_test)

0.7354244906092771