# 实现多元线性回归
2019/10/22   zx青 

In [10]:
import numpy as np
from sklearn.metrics import r2_score
from sklearn import datasets

![theta](./theta.png)
![theta](./theta1.jpg)

In [24]:
class LinearRegression:
    """Multiple linear regression solved in closed form (least squares).

    After ``fit`` the following attributes are available:

    * ``coef_`` -- weights of the individual features (``theta[1:]``).
    * ``intercept_`` -- the bias term (``theta[0]``).
    * ``interception_`` -- alias of ``intercept_``; kept so that code using
      the older, misspelled attribute name keeps working.
    """

    def __init__(self):
        """Create an unfitted Linear Regression model."""
        self.coef_ = None
        self.intercept_ = None
        self._theta = None  # private: full parameter vector [intercept, *coef]

    @property
    def interception_(self):
        """Backward-compatible alias of ``intercept_``."""
        return self.intercept_

    @interception_.setter
    def interception_(self, value):
        self.intercept_ = value

    def fit(self, X_train, y_train):
        """Fit the model on the training set ``X_train`` / ``y_train``.

        ``X_train`` must be 2-D (samples x features) and have as many rows
        as ``y_train`` has entries. Returns ``self``.

        Raises ``ValueError`` when the shapes are inconsistent.
        """
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)
        if X_train.ndim != 2:
            raise ValueError("X_train must be a 2-D array")
        if X_train.shape[0] != y_train.shape[0]:
            raise ValueError(
                "X_train and y_train must have the same number of samples")

        # Prepend a column of ones so that theta[0] acts as the intercept.
        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        # Solve the least-squares problem directly instead of inverting
        # X_b^T X_b: explicit inversion fails on collinear features.
        self._theta = np.linalg.lstsq(X_b, y_train, rcond=None)[0]
        self.intercept_ = self._theta[0]
        self.coef_ = self._theta[1:]
        return self

    def predict(self, X_predict):
        """Return the vector of predictions for the samples in ``X_predict``.

        Raises ``RuntimeError`` if the model has not been fitted and
        ``ValueError`` if the number of features does not match the fit.
        """
        if self._theta is None:
            raise RuntimeError("fit must be called before predict")
        X_predict = np.asarray(X_predict)
        if X_predict.ndim != 2 or X_predict.shape[1] != len(self.coef_):
            raise ValueError(
                "X_predict must be 2-D with the same number of features "
                "as the training data")
        X_b = np.hstack([np.ones((len(X_predict), 1)), X_predict])
        return X_b.dot(self._theta)

    def score(self, X_test, y_test):
        """Return the R^2 score of the model on ``X_test`` / ``y_test``."""
        y_predict = self.predict(X_test)
        return r2_score(y_test, y_predict)


In [25]:
# Boston housing dataset: feature matrix X and target vector y.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 -- confirm the installed version still provides it before re-running.
boston = datasets.load_boston()

X = boston.data
y = boston.target

In [26]:
np.max(y)

50.0

In [27]:
# Drop the problem samples: keep only rows whose target is below 50
# (50.0 is the maximum target value in the data).
X, y = X[y < 50], y[y < 50]

In [28]:
X.shape

(490, 13)

In [29]:
from sklearn.model_selection import train_test_split
# Split into train and test sets. No random_state is passed, so the split
# (and every score below) presumably changes from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Instantiate the model and fit it on the training split
reg = LinearRegression()
reg.fit(X_train,y_train )

<__main__.LinearRegression at 0x1f7d2843668>

In [30]:
reg.coef_

array([-1.08553039e-01,  3.72696654e-02, -6.38258405e-02, -3.88700870e-01,
       -1.17963032e+01,  3.52112746e+00, -1.57488148e-02, -1.22406414e+00,
        3.16239958e-01, -1.59757907e-02, -9.29830875e-01,  6.93548271e-03,
       -3.77135092e-01])

In [31]:
# Bias term learned by fit()
reg.intercept_

In [32]:
reg.score(X_test, y_test)

0.7700541118005138

## scikit-learn中的回归问题

In [33]:
# NOTE: this import rebinds the name LinearRegression, shadowing the class
# of the same name defined earlier in this file.
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()

In [34]:
lin_reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [35]:
lin_reg.coef_

array([-1.08553039e-01,  3.72696654e-02, -6.38258405e-02, -3.88700870e-01,
       -1.17963032e+01,  3.52112746e+00, -1.57488148e-02, -1.22406414e+00,
        3.16239958e-01, -1.59757907e-02, -9.29830875e-01,  6.93548271e-03,
       -3.77135092e-01])

In [36]:
lin_reg.intercept_

35.8388105167592

In [37]:
lin_reg.score(X_test, y_test)

0.7700541118005264

## KNN Regression

In [38]:
from sklearn.neighbors import KNeighborsRegressor

# k-NN regressor with default hyperparameters, trained on the same split
knn_reg = KNeighborsRegressor().fit(X_train, y_train)
knn_reg.score(X_test, y_test)

0.4556278587908029

In [40]:
from sklearn.model_selection import GridSearchCV

# Two sub-grids: the Minkowski exponent 'p' is only searched together with
# distance weighting.
param_grid = [
    {
        'weights': ['uniform'],
        'n_neighbors': list(range(1, 11)),
    },
    {
        'weights': ['distance'],
        'n_neighbors': list(range(1, 11)),
        'p': list(range(1, 6)),
    },
]
knn_reg = KNeighborsRegressor()
grid_search = GridSearchCV(
    estimator=knn_reg,
    param_grid=param_grid,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 60 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    2.5s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30,
                                           metric='minkowski',
                                           metric_params=None, n_jobs=None,
                                           n_neighbors=5, p=2,
                                           weights='uniform'),
             iid='warn', n_jobs=-1,
             param_grid=[{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                          'weights': ['uniform']},
                         {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                          'p': [1, 2, 3, 4, 5], 'weights': ['distance']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=1)

In [41]:
grid_search.best_params_

{'n_neighbors': 5, 'p': 1, 'weights': 'distance'}

In [45]:
grid_search.best_score_

0.6358684577690323

In [46]:
# Score of the best estimator found by the grid search, evaluated on the
# held-out test split (R^2 for scikit-learn regressors).
grid_search.best_estimator_.score(X_test, y_test)

0.655246008446609