## 实现多元线性回归方程

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

In [None]:
# Load the Boston housing data and keep only samples whose target is below 50.0.
# NOTE(review): datasets.load_boston was removed in scikit-learn 1.2, so this
# cell needs an older scikit-learn release to run.
boston = datasets.load_boston()

y = boston.target
# Filter the features first, while `y` still has its full length; then filter `y`.
X = boston.data[y < 50.0]
y = y[y < 50.0]

In [11]:
# Shape of the feature matrix after the target < 50.0 filter was applied.
X.shape

(490, 13)

In [12]:
# Split the data into training and test sets with the project's own helper.
# NOTE(review): seed=666 presumably fixes the shuffle for reproducibility — confirm in playML.
from playML.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, seed=666)

In [13]:
# Fit the project's own LinearRegression implementation on the training split.
# NOTE(review): fit_normal presumably solves the normal equation — confirm in playML.
from playML.LinearRegression import LinearRegression

reg = LinearRegression()
# Left as the last expression so the cell displays the value returned by fit_normal.
reg.fit_normal(X_train, y_train)

LinearRegression()

In [14]:
# Fitted coefficients of the hand-written model (13 values in the recorded output).
reg.coef_

array([-1.20354261e-01,  3.64423279e-02, -3.61493155e-02,  5.12978140e-02,
       -1.15775825e+01,  3.42740062e+00, -2.32311760e-02, -1.19487594e+00,
        2.60101728e-01, -1.40219119e-02, -8.35430488e-01,  7.80472852e-03,
       -3.80923751e-01])

In [15]:
# Fitted intercept of the hand-written model; playML names this attribute
# `interception_`, unlike scikit-learn's `intercept_`.
reg.interception_

34.11739972320087

In [16]:
# Score of the hand-written model on the held-out test split.
# NOTE(review): presumably R^2, like scikit-learn's regressors — confirm in playML.
reg.score(X_test, y_test)

0.812979405621294

## 使用 scikit-learn 解决回归问题

In [17]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

In [18]:
# Load the Boston housing data and keep only samples whose target is below 50.0.
# NOTE(review): datasets.load_boston was removed in scikit-learn 1.2, so this
# cell needs an older scikit-learn release to run.
boston = datasets.load_boston()

y = boston.target
# Filter the features first, while `y` still has its full length; then filter `y`.
X = boston.data[y < 50.0]
y = y[y < 50.0]

In [19]:
# Shape of the feature matrix after the target < 50.0 filter was applied.
X.shape

(490, 13)

In [21]:
# Split with scikit-learn's helper; random_state=666 makes the split reproducible.
# This import rebinds `train_test_split`, replacing the playML version imported
# earlier in the notebook.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)

### scikit-learn 中的线性回归

In [24]:
# Create scikit-learn's LinearRegression estimator. This import rebinds the name
# `LinearRegression`, which earlier in the notebook referred to the playML class.
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()

In [25]:
# Fit on the training split; fit returns the estimator, which the cell displays.
lin_reg.fit(X_train, y_train)

LinearRegression()

In [26]:
lin_reg.coef_ # fitted coefficients

array([-1.20354261e-01,  3.64423279e-02, -3.61493155e-02,  5.12978140e-02,
       -1.15775825e+01,  3.42740062e+00, -2.32311760e-02, -1.19487594e+00,
        2.60101728e-01, -1.40219119e-02, -8.35430488e-01,  7.80472852e-03,
       -3.80923751e-01])

In [27]:
lin_reg.intercept_ # fitted intercept

34.117399723229596

In [28]:
# R^2 of the scikit-learn linear model on the held-out test split.
lin_reg.score(X_test, y_test)

0.812979405621281

### kNN Regressor

In [30]:
from sklearn.neighbors import KNeighborsRegressor

In [31]:
# Baseline: k-nearest-neighbours regression with default hyperparameters,
# fitted on the training split and scored (R^2) on the test split.
# NOTE(review): the features are not standardized before fitting; distance-based
# models are usually sensitive to feature scale — consider adding a scaling step.
knn_reg = KNeighborsRegressor()
knn_reg.fit(X_train, y_train)
knn_reg.score(X_test, y_test)

0.5865412198300899

#### 使用网格搜索寻找最优超参数

In [35]:
from sklearn.model_selection import GridSearchCV

# Two sub-grids: uniform weighting only varies k, while distance weighting also
# varies the Minkowski exponent p. That gives 10 + 10 * 5 = 60 candidates.
param_grid = [
    {'weights': ['uniform'], 'n_neighbors': list(range(1, 11))},
    {
        'weights': ['distance'],
        'n_neighbors': list(range(1, 11)),
        'p': list(range(1, 6)),
    },
]

# Exhaustive search over the grid; n_jobs=-1 uses all available cores and
# verbose=1 prints the fold/candidate summary.
knn_reg = KNeighborsRegressor()
grid_search = GridSearchCV(knn_reg, param_grid, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


GridSearchCV(estimator=KNeighborsRegressor(), n_jobs=-1,
             param_grid=[{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                          'weights': ['uniform']},
                         {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                          'p': [1, 2, 3, 4, 5], 'weights': ['distance']}],
             verbose=1)

In [36]:
# Hyperparameter combination with the best mean cross-validated score.
grid_search.best_params_

{'n_neighbors': 7, 'p': 1, 'weights': 'distance'}

In [37]:
grid_search.best_score_ # relatively low because it is a cross-validation (CV) score, not a test-set score

0.652216494152461

In [38]:
grid_search.best_estimator_.score(X_test, y_test) # test-set score of the estimator actually trained with the best grid-search parameters

0.7160666820548707