# 数据准备

In [1]:
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

In [2]:
# Load the Boston house-prices dataset bundled with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell only runs on scikit-learn < 1.2 — confirm the pinned
# version or plan a switch to another dataset.
data = load_boston()

In [3]:
# Show the dataset's built-in description (attribute list, instance count).
print(data.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [4]:
# Split features and target into train/test subsets. The split is random, so
# random_state is pinned to make the scores and errors computed further down
# reproducible after a kernel restart. The test size is left at the library
# default.
x_train, x_test, y_train, y_test = train_test_split(
    data.data, data.target, random_state=42
)

In [5]:
# Instantiate the standardization API.
# x and y are different kinds of data, so each one gets its own scaler object.
std_x, std_y = StandardScaler(), StandardScaler()

# Standardize x: fit once on the training split, then reuse the fitted
# statistics for both splits.
std_x.fit(x_train)
x_train = std_x.transform(x_train)
x_test = std_x.transform(x_test)

# Standardize y the same way. Recent scikit-learn versions require a 2-D
# argument for fit/transform, hence the reshape into a single column.
# In reshape, a negative dimension is a placeholder: its size is computed
# automatically from the other dimension.
std_y.fit(y_train.reshape(-1, 1))
y_train = std_y.transform(y_train.reshape(-1, 1))
y_test = std_y.transform(y_test.reshape(-1, 1))

# 正规方程

In [6]:
# Linear regression fitted by least squares (the "normal equation" approach
# of this section).
lr = LinearRegression()

lr.fit(x_train, y_train)

# For a regressor, score() returns the coefficient of determination (R²),
# not a classification accuracy, so the label names the metric correctly.
print("R²（决定系数）：\n%s" % lr.score(x_test, y_test))

print("回归系数：\n%s" % lr.coef_)

# The target was standardized before fitting, so predictions are mapped back
# to the original scale before being shown.
y_lr_predict = std_y.inverse_transform(lr.predict(x_test))
print("预测值：\n%s" % y_lr_predict)

正确率：
0.7116668128256014
回归系数：
[[-0.10099135  0.11816513  0.00068658  0.06244738 -0.19968827  0.32017465
  -0.03684756 -0.3284982   0.27413815 -0.24057788 -0.2005851   0.1090139
  -0.36617659]]
预测值：
[[18.2714746 ]
 [19.4174104 ]
 [33.11124575]
 [22.41039739]
 [15.88186612]
 [23.69159188]
 [31.23147006]
 [ 9.98167705]
 [36.88064776]
 [13.45360187]
 [15.00138043]
 [21.78613706]
 [20.0772258 ]
 [ 6.71311287]
 [30.32824133]
 [ 1.37587071]
 [15.02058717]
 [29.29362038]
 [17.82421919]
 [21.58697389]
 [25.2820192 ]
 [10.87301315]
 [22.63721485]
 [27.9810624 ]
 [22.31296419]
 [21.6730532 ]
 [25.11191648]
 [32.33701737]
 [ 9.60141778]
 [14.46799803]
 [ 6.31587826]
 [26.55762999]
 [35.08513114]
 [33.81673905]
 [17.55665155]
 [38.38899877]
 [14.11686926]
 [27.56425939]
 [21.23960356]
 [ 8.95120254]
 [22.17435921]
 [20.87349516]
 [25.0160604 ]
 [13.73982764]
 [35.87765978]
 [24.4584668 ]
 [15.19944935]
 [39.06423588]
 [23.61178754]
 [14.23380142]
 [15.48420514]
 [19.23879523]
 [12.67991756]
 [28.60

# 梯度下降

In [7]:
# Linear regression fitted by stochastic gradient descent.
# SGD is stochastic, so random_state is pinned to make the fitted
# coefficients and the scores below reproducible across runs.
sgd = SGDRegressor(random_state=42)

# y_train is an (n, 1) column after standardization; SGDRegressor expects a
# 1-D target and emits a warning otherwise, hence ravel().
sgd.fit(x_train, y_train.ravel())

# For a regressor, score() returns the coefficient of determination (R²),
# not a classification accuracy, so the label names the metric correctly.
print("R²（决定系数）：\n%s" % sgd.score(x_test, y_test.ravel()))

print("回归系数：\n%s" % sgd.coef_)

# The target was standardized before fitting, so predictions are mapped back
# to the original scale. predict() returns a 1-D array here, while recent
# scikit-learn releases validate inverse_transform input as 2-D and reject
# 1-D arrays; reshape to a column first, then flatten so y_sgd_predict keeps
# its original 1-D shape.
y_sgd_predict = std_y.inverse_transform(
    sgd.predict(x_test).reshape(-1, 1)
).ravel()
print("预测值：\n%s" % y_sgd_predict)

正确率：
0.6985297885209896
回归系数：
[-0.08306893  0.07575347 -0.05270987  0.06676974 -0.13505587  0.34711112
 -0.04424577 -0.25985127  0.10435882 -0.07916862 -0.18489367  0.11151946
 -0.36199686]
预测值：
[18.17896888 19.99376904 32.66074986 22.87882833 16.29009101 23.67876478
 30.92381267 10.58606403 36.06357484 13.53847575 15.59519819 20.43983271
 19.55127847  6.27871188 30.38098856  0.14638519 16.85671117 30.12930857
 17.60339344 21.32029295 26.45951151  9.98156367 24.8452219  28.77116428
 22.42285055 19.82369029 25.66713725 31.67535819  9.4472396  14.75615918
  5.36425279 26.11191847 35.85636994 34.24642952 17.3177351  38.14642414
 14.53517204 26.97731758 21.64375508  9.41830639 21.32750631 21.54250344
 24.32483848 13.14716326 36.14406045 25.138888   15.8295685  38.82433523
 23.99935181 15.00659281 17.71600927 19.83638247 11.93189769 27.34744638
 20.05878267 18.66290202 18.90817602 39.694199    5.82735122 17.69594265
 18.61337748 28.60992176 25.09787899 20.48433157 24.32672616 10.01574576
 2

# 回归评估

In [8]:
# Mean squared error of each model, measured on the de-standardized target.
# The test target is mapped back to its original scale once and reused;
# the linear-regression error is printed first, then the SGD error.
y_test_original = std_y.inverse_transform(y_test)
for y_predicted in (y_lr_predict, y_sgd_predict):
    print(mean_squared_error(y_test_original, y_predicted))

31.339468090816005
32.76735559151371


# Ridge岭回归

In [9]:
from sklearn.linear_model import Ridge

In [10]:
# Ridge regression with regularization strength alpha=1.0, fitted on the
# standardized training data.
rd = Ridge(alpha=1.0)
rd.fit(x_train, y_train)

# For a regressor, score() returns the coefficient of determination (R²),
# not a classification accuracy, so the label names the metric correctly.
print("R²为：%s" % rd.score(x_test, y_test))

准确率为：0.7110556168147781
