### P093-房价预估-加载波士顿房价数据

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.datasets import load_boston

In [3]:
# Load the Boston housing bundle; later cells read its .data, .target and
# .feature_names attributes.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the scikit-learn version this notebook is pinned to.
raw_data = load_boston()

In [4]:
raw_data.data.shape

(506, 13)

In [5]:
raw_data.target.shape

(506,)

In [6]:
# Build a single frame holding every feature column followed by the label
# column, which is named "target".
df = pd.DataFrame(
    np.column_stack((raw_data.data, raw_data.target)),
    columns=[*raw_data.feature_names, "target"],
)

In [7]:
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


### P094-房价预估-计算房价和特征的相关性

In [8]:
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [9]:
df.corr()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
CRIM,1.0,-0.200469,0.406583,-0.055892,0.420972,-0.219247,0.352734,-0.37967,0.625505,0.582764,0.289946,-0.385064,0.455621,-0.388305
ZN,-0.200469,1.0,-0.533828,-0.042697,-0.516604,0.311991,-0.569537,0.664408,-0.311948,-0.314563,-0.391679,0.17552,-0.412995,0.360445
INDUS,0.406583,-0.533828,1.0,0.062938,0.763651,-0.391676,0.644779,-0.708027,0.595129,0.72076,0.383248,-0.356977,0.6038,-0.483725
CHAS,-0.055892,-0.042697,0.062938,1.0,0.091203,0.091251,0.086518,-0.099176,-0.007368,-0.035587,-0.121515,0.048788,-0.053929,0.17526
NOX,0.420972,-0.516604,0.763651,0.091203,1.0,-0.302188,0.73147,-0.76923,0.611441,0.668023,0.188933,-0.380051,0.590879,-0.427321
RM,-0.219247,0.311991,-0.391676,0.091251,-0.302188,1.0,-0.240265,0.205246,-0.209847,-0.292048,-0.355501,0.128069,-0.613808,0.69536
AGE,0.352734,-0.569537,0.644779,0.086518,0.73147,-0.240265,1.0,-0.747881,0.456022,0.506456,0.261515,-0.273534,0.602339,-0.376955
DIS,-0.37967,0.664408,-0.708027,-0.099176,-0.76923,0.205246,-0.747881,1.0,-0.494588,-0.534432,-0.232471,0.291512,-0.496996,0.249929
RAD,0.625505,-0.311948,0.595129,-0.007368,0.611441,-0.209847,0.456022,-0.494588,1.0,0.910228,0.464741,-0.444413,0.488676,-0.381626
TAX,0.582764,-0.314563,0.72076,-0.035587,0.668023,-0.292048,0.506456,-0.534432,0.910228,1.0,0.460853,-0.441808,0.543993,-0.468536


In [11]:
# Correlation of every column with `target` (DataFrame.corr defaults to
# Pearson), sorted from most positive to most negative. The leading 1.0 entry
# is `target` correlated with itself.
df.corr()["target"].sort_values(ascending=False)

target     1.000000
RM         0.695360
ZN         0.360445
B          0.333461
DIS        0.249929
CHAS       0.175260
AGE       -0.376955
RAD       -0.381626
CRIM      -0.388305
NOX       -0.427321
TAX       -0.468536
INDUS     -0.483725
PTRATIO   -0.507787
LSTAT     -0.737663
Name: target, dtype: float64

```
CRIM - 城镇人均犯罪率
ZN - 占地面积超过25,000平方英尺的住宅用地比例。
INDUS - 每个城镇非零售业务的比例。
CHAS - Charles River虚拟变量（如果地块毗邻河道，则为1；否则为0）
NOX - 一氧化氮浓度（单位：千万分之一）
RM - 每间住宅的平均房间数
AGE - 1940年以前建造的自住单位比例
DIS - 到波士顿五个就业中心的加权距离
RAD - 径向高速公路的可达性指数
TAX - 每10,000美元的全额物业税率
PTRATIO - 城镇的学生与教师比例
B - 1000（Bk - 0.63）^ 2其中Bk是城镇黑人的比例
LSTAT - 低社会地位人口所占比例（％）
MEDV - 自住房屋价格的中位数，单位1000美元（即本笔记本中的 target 列）

```

### P095 房价预估  - 分离特征和预估目标数据

In [None]:
# Preview the label column straight from df. The `target` variable is only
# created further down (via data.pop), so referencing it here would raise
# NameError on a fresh top-to-bottom run.
df["target"].head()

In [12]:
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [13]:
# Work on a copy so that removing the target column from `data` afterwards
# leaves `df` itself intact.
data = df.copy()

In [14]:
# pop() removes the "target" column from `data` in place and returns it, so
# `data` is left with the feature columns only.
# Not idempotent: re-running this cell without first re-running the
# `data = df.copy()` cell raises KeyError because the column is already gone.
target = data.pop("target")

In [15]:
data.shape

(506, 13)

In [16]:
target.shape

(506,)

In [17]:
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [18]:
target.head()

0    24.0
1    21.6
2    34.7
3    33.4
4    36.2
Name: target, dtype: float64

### P096 房价预估  - 拆分训练集和测试集

In [19]:
data.shape

(506, 13)

In [20]:
target.shape

(506,)

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
# Hold out a test split. No test_size is passed, so scikit-learn's default
# applies — the shapes shown in the next cell are 379 training rows and 127
# test rows. random_state=42 is passed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [23]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((379, 13), (127, 13), (379,), (127,))

### P097 房价预估  - 使用线性回归训练模型

In [24]:
from sklearn.linear_model import LinearRegression

In [25]:
regressor = LinearRegression()

In [26]:
regressor.fit(x_train, y_train)

LinearRegression()

In [27]:
# Evaluate the fitted linear model on the held-out split (for scikit-learn
# regressors, score() reports the R^2 coefficient of determination).
regressor.score(x_test, y_test)

0.6844267283527129

### P098 房价预估  - 在测试集上实现预估

In [29]:
x_test.shape

(127, 13)

In [30]:
x_test.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
173,0.09178,0.0,4.05,0.0,0.51,6.416,84.1,2.6463,5.0,296.0,16.6,395.5,9.04
274,0.05644,40.0,6.41,1.0,0.447,6.758,32.9,4.0776,4.0,254.0,17.6,396.9,3.53
491,0.10574,0.0,27.74,0.0,0.609,5.983,98.8,1.8681,4.0,711.0,20.1,390.11,18.07
72,0.09164,0.0,10.81,0.0,0.413,6.065,7.8,5.2873,4.0,305.0,19.2,390.91,5.52
452,5.09017,0.0,18.1,0.0,0.713,6.297,91.8,2.3682,24.0,666.0,20.2,385.09,17.27


In [31]:
target_pred = regressor.predict(x_test)

In [32]:
target_pred

array([28.83885359, 36.00783288, 15.08324755, 25.23090886, 18.87864064,
       23.21398327, 17.5931124 , 14.30508093, 23.05438985, 20.62008346,
       24.78514683, 18.66833668, -6.9788951 , 21.83575737, 19.20898992,
       26.2868054 , 20.54379176,  5.65713224, 40.42358065, 17.64146116,
       27.32258958, 30.05056174, 11.15013704, 24.11530393, 17.89145648,
       15.79348591, 22.94743453, 14.2586068 , 22.26731194, 19.24709013,
       22.26897546, 25.24344002, 25.69165643, 17.98759507, 16.70286649,
       17.11631225, 31.19643534, 20.17835831, 23.71828436, 24.79196868,
       13.94575895, 32.00389982, 42.53869791, 17.44523722, 27.15354457,
       17.07482215, 13.89272021, 26.06440323, 20.36888769, 29.97813037,
       21.35346608, 34.32287916, 15.88498671, 26.17757739, 39.50970314,
       22.84123308, 18.95049088, 32.68913818, 25.02057949, 12.90539147,
       22.76052302, 30.53884316, 31.60797905, 15.92162168, 20.50670563,
       16.50798147, 20.50202198, 26.00723901, 30.63860954, 11.42

### P099 房价预估  - 比较预估结果和真实值

In [34]:
# Put the true test labels next to the model's predictions, row-aligned by
# position (the original row labels of y_test are not carried over).
predictions = pd.DataFrame(
    {"y_test": y_test.to_numpy(), "target_pred": target_pred}
)

In [35]:
predictions.head()

Unnamed: 0,y_test,target_pred
0,23.6,28.838854
1,32.4,36.007833
2,13.6,15.083248
3,22.8,25.230909
4,16.1,18.878641


In [37]:
predictions["error"] = predictions["target_pred"] - predictions["y_test"]

In [38]:
# Magnitude of each signed error, ignoring its direction.
predictions["abs_error"] = predictions["error"].abs()

In [40]:
predictions.head(10)

Unnamed: 0,y_test,target_pred,error,abs_error
0,23.6,28.838854,5.238854,5.238854
1,32.4,36.007833,3.607833,3.607833
2,13.6,15.083248,1.483248,1.483248
3,22.8,25.230909,2.430909,2.430909
4,16.1,18.878641,2.778641,2.778641
5,20.0,23.213983,3.213983,3.213983
6,17.8,17.593112,-0.206888,0.206888
7,14.0,14.305081,0.305081,0.305081
8,19.6,23.05439,3.45439,3.45439
9,16.8,20.620083,3.820083,3.820083


### P100 房价预估  - 使用GBDT训练模型

In [41]:
from sklearn.ensemble import GradientBoostingRegressor

In [42]:
# Gradient-boosted trees with a fixed seed.
# NOTE: this rebinds `regressor`, replacing the LinearRegression instance
# created earlier; `target_pred` and `predictions` above were produced by that
# earlier linear model and are not refreshed here.
regressor = GradientBoostingRegressor(random_state=42)

In [43]:
regressor.fit(x_train, y_train)

GradientBoostingRegressor(random_state=42)

In [44]:
regressor.score(x_test, y_test)

0.8721047222908935

### P101 房价预估  - 模型的保存和加载

In [45]:
import pickle

In [46]:
# Serialize the current `regressor` object to model.pkl (a path relative to
# the working directory), overwriting any existing file of that name.
with open("model.pkl", "wb") as file:
    pickle.dump(regressor, file)

In [47]:
# Read the pickled estimator back into a separate name, `regressor_loaded`.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# files from a trusted source.
with open("model.pkl", "rb") as file:
    regressor_loaded = pickle.load(file)

In [48]:
regressor_loaded.predict(x_test)

array([23.04052211, 30.80125144, 16.59687893, 23.98455389, 17.51775094,
       22.20424803, 18.37679613, 13.87826852, 20.69567546, 21.05630329,
       20.65800177, 18.25866334,  7.37218556, 21.70412526, 20.42411344,
       25.68150134, 19.65025197,  9.04186004, 45.87531955, 16.24352807,
       24.16750847, 25.58242866, 13.55637357, 21.63216117, 15.24383795,
       16.02780139, 21.97039232, 14.1308893 , 19.7882457 , 21.4943401 ,
       19.96374277, 23.55638271, 23.44639924, 19.94756007, 14.59751632,
       17.07294658, 33.48430381, 19.44918017, 21.13751246, 23.5933276 ,
       18.32202324, 30.25253963, 45.28348352, 20.90449728, 22.53997442,
       15.13571919, 16.28600727, 23.60866936, 18.01993116, 27.80166399,
       20.29367355, 35.77815626, 16.5197479 , 25.490704  , 47.51880799,
       21.53764501, 15.99636471, 31.79864176, 21.85748794, 18.22407593,
       22.7379009 , 34.12951603, 30.7125856 , 19.8255971 , 25.01159597,
       18.05612794, 14.58785612, 23.67111194, 28.79028621, 15.10