# 洛杉矶房价预测

## 1 加载数据

In [1]:
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv("./data/data.csv")

In [3]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## 2 理解数据
- 理解各个字段（特征）的含义，实际应用若能理解业务最好
- 用describe()查看基本统计量
- 用info()查看空值情况

In [4]:
train.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-n

In [6]:
type(train)

pandas.core.frame.DataFrame

### 小结
- 通过查看dataFrame的数据类型与空值表现后，第一步往往是空值的填充与Outlier的清洗。

### 常用机器学习算法是否允许空值Nan?
- SVM是否容忍空值？ 基于数值计算、数值回归，一定要将内容转化为数值，因此不支持空值。
- DecisionTree能否容忍空值？ CART(ID3,C4.5基本不用)基于空间的划分，允许空值。
- DNN？基于数值计算，不允许空值。
- 通常通过梯度下降法求解的模型通常是需要归一化的，包括线性回归、逻辑回归、支持向量机、神经网络等模型。

## 3 数据填充
### 3.1 结构化数据分类与常见处理办法
- 连续型数据（Numerical Feature）：往往需要进行归一化。
- 离散型数据（Categorial Feature）：也叫类别型数据。往往需要进行标签编码（LabelEncode）转换成数值类型再使用。常见的编码有：
    - 序号编码（Ordinal Encoding）
    - 独热编码（One-hot Encoding）
    - 二进制编码（Binary Encoding）

### 3.2 如何统计数据缺失率
- 可以从以下几个维度思考：
    - 1.哪些列有缺失
    - 2.缺失多少
    - 3.缺失率是多少
    - 4.可否按照缺失率由大到小排列

In [7]:
train.get_dtype_counts()   # 得到各类数据类型的统计信息

float64     3
int64      35
object     43
dtype: int64

In [8]:
# 得到一个列表，每一列的空值数量
train.isnull().sum()[3]

259

In [9]:
# 缺失率统计
null_sum = train.isnull().sum()
null_rate = [i/len(train) for i in null_sum]
null_rate

[0.0,
 0.0,
 0.0,
 0.1773972602739726,
 0.0,
 0.0,
 0.9376712328767123,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.005479452054794521,
 0.005479452054794521,
 0.0,
 0.0,
 0.0,
 0.025342465753424658,
 0.025342465753424658,
 0.026027397260273973,
 0.025342465753424658,
 0.0,
 0.026027397260273973,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0006849315068493151,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.4726027397260274,
 0.05547945205479452,
 0.05547945205479452,
 0.05547945205479452,
 0.0,
 0.0,
 0.05547945205479452,
 0.05547945205479452,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.9952054794520548,
 0.8075342465753425,
 0.963013698630137,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0]

In [10]:
null_sum.tail(10)

PoolArea            0
PoolQC           1453
Fence            1179
MiscFeature      1406
MiscVal             0
MoSold              0
YrSold              0
SaleType            0
SaleCondition       0
SalePrice           0
dtype: int64

### count与sum的区别
- sum会将true=1的都加起来，false=1的不计算。
- count会无脑把所有的都加起来
- train.isnull().count() 

In [11]:
# 统计各列的缺失值数量，并按数量由低到高排列
tmp = train.isnull().sum().sort_values()
tmp[tmp>0]

Electrical         1
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtFinType1      37
BsmtFinType2      38
BsmtExposure      38
GarageQual        81
GarageFinish      81
GarageYrBlt       81
GarageType        81
GarageCond        81
LotFrontage      259
FireplaceQu      690
Fence           1179
Alley           1369
MiscFeature     1406
PoolQC          1453
dtype: int64

可以看出，共有19列有缺失值，且后面4列特征缺失率较大

## 一个算法的Baseline

### 算法的Pipeline:

数据采集->数据清洗->数据仓库->数据挖掘->数据标注->数据集市(训练集/评测集)->训练->评测->模型工程(Int8定点化)->模型的集成和应用

### 算法的BaseLine:

训练集的制作->训练->评测

在形成一个BaseLine之后，模型迭代的过程进步的标志是评测指标的提升

### 评测指标：

- Regression

    - MSE(Mean Squared Error)-L2 / MAE(Mean Absolute Error)-L1

- Classification

    - Cross-Entropy(交叉熵)

    - 信息量 -log2(P)-> 信息熵 -$\Sigma$(p * log2(p)) ->交叉熵 -$\Sigma$(p * log2(q))

    - Center-Loss

### 3.3 数据列的简单填充

In [12]:
train1 = train
train1.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


### 如何进行简单填充
#### 缺失值的类型
Numerical | Categorial

#### Numerical

1.均值填充

2.中位数填充

3.高频填充--高频会"稍微"少一些--连续型数据很难数字相等，但是很容易接近

4.分布拟合填充--回归一个分布，或者是 ($\mu$-均值 , $\sigma$-标准差)-(+-$\sigma$的范围中进行随机)

5.内在关系填充(根据列本身存在的意义和数据集内其他列的相关性进行构建数学映射模型进行填充)

#### Categorial

1.高频填充

"男"|"女"-->0|1

"小孩"|"青年"|"中年"|"老年"-->0|1|2|3

* No Free Lunch Theory（没有免费午餐定理）-不存在超级机器学习模型

2.内在关系填充


In [13]:
train1.mean()   # 数据列均值

Id                  730.500000
MSSubClass           56.897260
LotFrontage          70.049958
LotArea           10516.828082
OverallQual           6.099315
OverallCond           5.575342
YearBuilt          1971.267808
YearRemodAdd       1984.865753
MasVnrArea          103.685262
BsmtFinSF1          443.639726
BsmtFinSF2           46.549315
BsmtUnfSF           567.240411
TotalBsmtSF        1057.429452
1stFlrSF           1162.626712
2ndFlrSF            346.992466
LowQualFinSF          5.844521
GrLivArea          1515.463699
BsmtFullBath          0.425342
BsmtHalfBath          0.057534
FullBath              1.565068
HalfBath              0.382877
BedroomAbvGr          2.866438
KitchenAbvGr          1.046575
TotRmsAbvGrd          6.517808
Fireplaces            0.613014
GarageYrBlt        1978.506164
GarageCars            1.767123
GarageArea          472.980137
WoodDeckSF           94.244521
OpenPorchSF          46.660274
EnclosedPorch        21.954110
3SsnPorch             3.409589
ScreenPo

In [14]:
train1.median()   # 中位数

Id                  730.5
MSSubClass           50.0
LotFrontage          69.0
LotArea            9478.5
OverallQual           6.0
OverallCond           5.0
YearBuilt          1973.0
YearRemodAdd       1994.0
MasVnrArea            0.0
BsmtFinSF1          383.5
BsmtFinSF2            0.0
BsmtUnfSF           477.5
TotalBsmtSF         991.5
1stFlrSF           1087.0
2ndFlrSF              0.0
LowQualFinSF          0.0
GrLivArea          1464.0
BsmtFullBath          0.0
BsmtHalfBath          0.0
FullBath              2.0
HalfBath              0.0
BedroomAbvGr          3.0
KitchenAbvGr          1.0
TotRmsAbvGrd          6.0
Fireplaces            1.0
GarageYrBlt        1980.0
GarageCars            2.0
GarageArea          480.0
WoodDeckSF            0.0
OpenPorchSF          25.0
EnclosedPorch         0.0
3SsnPorch             0.0
ScreenPorch           0.0
PoolArea              0.0
MiscVal               0.0
MoSold                6.0
YrSold             2008.0
SalePrice        163000.0
dtype: float

### 3.3.1 对于连续性数据进行均值|中位数填充

In [15]:
# 均值填充
train1=train1.fillna(train1.mean())  # fillna函数会找到对应列的均值或者是中位数，对于该列进行相应的填充
train1

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.000000,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.000000,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.000000,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.000000,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.000000,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.000000,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.000000,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,70.049958,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.000000,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.000000,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000


In [16]:
tmp=train1.isnull().sum()
tmp[tmp>0].shape

(16,)

### 3.3.2 对于离散型数据进行填充

朴素的办法就是将NaN这种空类型变成'None'或者是'NA'的字段

In [17]:
train1=train1.fillna('None')
train1

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.000000,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.000000,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.000000,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.000000,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.000000,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.000000,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.000000,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,70.049958,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.000000,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.000000,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000


### 暴力填充完毕
- 到目前为止，我们已经完成了“暴力”填充阶段
- 目前的训练集噪声很大,留给后面的EDA(Exploratory Data Analysis 探索性数据分析)过程

In [18]:
# 统计数据集空值NaN的数量
def get_null_count(t):
    tmp = t.isnull().sum()
    print(tmp[tmp>0])

In [19]:
get_null_count(train1)

Series([], dtype: int64)


这样说明已经没有空值了。注意，这里的空值指的是panda认为的空值NaN。

## 4 训练数据
### 4.1 准备训练集和测试集

In [20]:
y = train1['SalePrice']
# 一定要去掉答案
train2 = train1.drop(['Id','SalePrice'],axis=1)
train1.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [21]:
train2.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [22]:
X = pd.get_dummies(train2)  # 获取训练集.如果没有指定列，默认只对category和objects数据（离散型数据）进行one-Hot编码
X.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,0,0,0,1,0,0,0,0,1,0
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,0,0,0,1,0,0,0,0,1,0
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,0,0,0,1,0,0,0,0,1,0
3,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,0,0,0,1,1,0,0,0,0,0
4,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,0,0,0,1,0,0,0,0,1,0


In [23]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [24]:
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2,random_state=123)
X_train.shape

(1168, 303)

In [25]:
X_test.shape

(292, 303)

In [26]:
col1 = X_test.columns
col1

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       ...
       'SaleType_ConLw', 'SaleType_New', 'SaleType_Oth', 'SaleType_WD',
       'SaleCondition_Abnorml', 'SaleCondition_AdjLand',
       'SaleCondition_Alloca', 'SaleCondition_Family', 'SaleCondition_Normal',
       'SaleCondition_Partial'],
      dtype='object', length=303)

### 4.2 模型的训练
#### xgboost参数
- booster[默认gbtree]：
    - 选择每次迭代的模型，有两种选择：
    - gbtree：基于树的模型
    - gbliner：线性模型
- silent[默认0]
    - 当这个参数值为1时，静默模式开启，不会输出任何信息。
    - 一般这个参数就保持默认的0，因为这样能帮我们更好地理解模型。
- nthread[默认值为最大可能的线程数]
    - 这个参数用来进行多线程控制，应当输入系统的核数。如果你希望使用CPU全部的核，那就不要输入这个参数，算法会自动检测它。
    - 还有两个参数，XGBoost会自动设置，目前你不用管它。接下来咱们一起看booster参数。
- eta[默认0.3]
    - 和GBM中的 learning rate 参数类似。
    - 通过减少每一步的权重，可以提高模型的鲁棒性。
    - 典型值为0.01-0.2。
- min_child_weight[默认1]
    - 决定最小叶子节点样本权重和。
    - 和GBM的 min_child_leaf 参数类似，但不完全一样。XGBoost的这个参数是最小样本权重的和，而GBM参数是最小样本总数。
    - 这个参数用于避免过拟合。当它的值较大时，可以避免模型学习到局部的特殊样本。
    - 但是如果这个值过高，会导致欠拟合。这个参数需要使用CV来调整。
- max_depth[默认6]
    - 和GBM中的参数相同，这个值为树的最大深度。
    - 这个值也是用来避免过拟合的。max_depth越大，模型会学到更具体更局部的样本。
    - 需要使用CV函数来进行调优。
    - 典型值：3-10
- max_leaf_nodes
    - 树上最大的节点或叶子的数量。
    - 可以替代max_depth的作用。因为如果生成的是二叉树，一个深度为n的树最多生成n2 n^2n2个叶子。
    - 如果定义了这个参数，GBM会忽略max_depth参数。
- gamma[默认0]
    - 在节点分裂时，只有分裂后损失函数的值下降了，才会分裂这个节点。Gamma指定了节点分裂所需的最小损失函数下降值。
    - 这个参数的值越大，算法越保守。这个参数的值和损失函数息息相关，所以是需要调整的。
- max_delta_step[默认0]
    - 这参数限制每棵树权重改变的最大步长。如果这个参数的值为0，那就意味着没有约束。如果它被赋予了某个正值，那么它会让这个算法更加保守。
    - 通常，这个参数不需要设置。但是当各类别的样本十分不平衡时，它对逻辑回归是很有帮助的。
    - 这个参数一般用不到，但是你可以挖掘出来它更多的用处。
- subsample[默认1]
    - 和GBM中的subsample参数一模一样。这个参数控制对于每棵树，随机采样的比例。
    - 减小这个参数的值，算法会更加保守，避免过拟合。但是，如果这个值设置得过小，它可能会导致欠拟合。
    - 典型值：0.5-1
- colsample_bytree[默认1]
    - 和GBM里面的max_features参数类似。用来控制每棵随机采样的列数的占比(每一列是一个特征)。
    - 典型值：0.5-1
- colsample_bylevel[默认1]
    - 用来控制树的每一级的每一次分裂，对列数的采样的占比。
    - 我个人一般不太用这个参数，因为subsample参数和colsample_bytree参数可以起到相同的作用。但是如果感兴趣，可以挖掘这个参数更多的用处。
- lambda[默认1]
    - 权重的L2正则化项。(和Ridge regression类似)。
    - 这个参数是用来控制XGBoost的正则化部分的。虽然大部分数据科学家很少用到这个参数，但是这个参数在减少过拟合上还是可以挖掘出更多用处的。
- alpha[默认1]
    - 权重的L1正则化项。(和Lasso regression类似)。
    - 可以应用在很高维度的情况下，使得算法的速度更快。
- scale_pos_weight[默认1]
    - 在各类别样本十分不平衡时，把这个参数设定为一个正值，可以使算法更快收敛。
- objective[默认reg:linear]
    - 这个参数定义需要被最小化的损失函数。最常用的值有：
    - binary:logistic 二分类的逻辑回归，返回预测的概率(不是类别)。
    - multi:softmax 使用softmax的多分类器，返回预测的类别(不是概率)。
    - 在这种情况下，你还需要多设一个参数：num_class(类别数目)。
    - multi:softprob 和multi:softmax参数一样，但是返回的是每个数据属于各个类别的概率。
- eval_metric[默认值取决于objective参数的取值]
    - 对于有效数据的度量方法。
    - 对于回归问题，默认值是rmse，对于分类问题，默认值是error。
- seed(默认0)
    - 随机数的种子
    - 设置它可以复现随机数据的结果，也可以用于调整参数

In [27]:
import xgboost as xgb

In [28]:
# 训练参数的意思得知道
xg_reg=xgb.XGBRegressor(objective='reg:linear',
                        colsample_bytree=0.6,
                        learning_rate=0.01,
                        max_depth=8,
                        alpha=10,
                        n_estimators=700,
                        subsample=0.7)

In [29]:
xg_reg.fit(X_train,y_train)

XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_delta_step=0,
       max_depth=8, min_child_weight=1, missing=None, n_estimators=700,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.7)

### 4.3 模型评测

In [30]:
pred = xg_reg.predict(X_test)

In [31]:
rmse = np.sqrt(mean_squared_error(y_test,pred))

In [32]:
rmse  # 标准差(均方根误差)

24922.42625711337

In [33]:
logrmse = np.sqrt(mean_squared_error(np.log(y_test),np.log(pred)))

In [34]:
logrmse   # 对数均方根误差

0.11403318080040263

In [35]:
# 可决系数
from sklearn.metrics import r2_score

In [36]:
r2_score(y_test, pred)

0.8994876975287012

## 5 使用中位数填充作为对比
### 5.1 简单填充数据

In [37]:
train3 = train
train3.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [38]:
train3 = train3.fillna(train2.median())  # 使用中位数填充

In [39]:
train3.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [40]:
tmp=train3.isnull().sum()
tmp[tmp>0].shape

(16,)

In [41]:
train3=train3.fillna('None')
train3.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [42]:
get_null_count(train3)

Series([], dtype: int64)


### 5.2 制作训练集与测试集

In [43]:
y=train3['SalePrice']

In [44]:
train3=train3.drop(['Id','SalePrice'], axis=1)
X=pd.get_dummies(train3)  
X.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,0,0,0,1,0,0,0,0,1,0
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,0,0,0,1,0,0,0,0,1,0
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,0,0,0,1,0,0,0,0,1,0
3,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,0,0,0,1,1,0,0,0,0,0
4,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,0,0,0,1,0,0,0,0,1,0


In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=123)
X_train.shape

(1168, 303)

In [46]:
X_test.shape

(292, 303)

### 5.3 模型的训练

In [47]:
# max_depth是标准的前剪枝，当枝达到5时就不增加了
xg_reg=xgb.XGBRegressor(objective='reg:linear',
                        colsample_bytree=0.60,
                        learning_rate=0.01,
                        max_depth=6,
                        alpha=10,
                        n_estimators=3000,
                        subsample=0.7)

In [48]:
xg_reg.fit(X_train,y_train)

XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_delta_step=0,
       max_depth=6, min_child_weight=1, missing=None, n_estimators=3000,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.7)

### 5.4 模型的评测

In [49]:
pred=xg_reg.predict(X_test)
rmse=np.sqrt(mean_squared_error(y_test,pred))
rmse

24470.368592646646

In [50]:
logrmse=np.sqrt(mean_squared_error(np.log(y_test),np.log(pred)))
logrmse

0.10952361765023395

In [51]:
r2_score(y_test, pred)

0.9031009309639566

## 6 小结
- 通过对数据集的简单填充、训练、评估，对机器学习的工业流程有进一步了解。
- 通过对比数值特征中位数和均值填充，可知，该项目使用中位数填充效果好一些（均方根误差小）。
- 在做更完备的数据填充之前，可以做探索性数据分析（EDA），分析特征与标签之间的相关性，以便更好地填充数据。

In [58]:
## 暴力调参
# from sklearn.model_selection import GridSearchCV
# GridSearch

# gs=GridSearch(xg_reg,{
#     "n_estimators":[100,500,1000,3000],
#     "alpha":[0.01,0.1,1.0,10]
#     "lambda":[...]
# })

# Randomized Search

# rs=RandomizedSearch(xg_reg,{
#     "n_estimators":[100,500,1000,3000],
#     "alpha":np.norm(1.0,0.7),
#     "lambda":[...]
# })

## 7 交叉验证（Cross Validation）

- 问题
1.数据有限如何发挥数据本来的效率
2.数据的训练集和评测集的矛盾

如果用更多的数据去训练，那么就会有更少的数据来评测->失去了对于真实泛化能力的考量
如果用更少的数据来训练，很大可能造成欠拟合，在评测集上表现就一定不好。

- 解决方法k折交叉验证：

1.将数据集拆成K份->(首先进行shuffle)

2.规定k-1份进行训练，剩下的1份进行评测，总共训练k次，轮流每个子数据集作为评测集

数据集被分成K份，1......K

第一次训练使用第1份数据集作为评测集，剩余的k-1份作为训练集

第i次训练使用第i份数据集作为评测集，剩余的k-1份作为训练集

做k次训练
3.k次训练之后，评测的分值=k次评测结果的平均

4.不同的模型会提供不同的子模型的合并方法，会将所有的k个子模型进行合并

In [59]:
params = {"objective":"reg:linear",'colsample_bytree': 0.7,'learning_rate': 0.1,'max_depth': 8, 'alpha': 10}

In [60]:
# 把训练集和label转换成xgboost可接受的数据类型
matrix=xgb.DMatrix(data=X,label=y)  # xgboost可以接受的Data的一种压缩后的数据结构,.lmdb/.h5,基于此可以对训练集的读写进行优化提升训练速度

In [61]:
cv_results=xgb.cv(dtrain=matrix,params=params,nfold=10,num_boost_round=500,metrics='rmse',as_pandas=True,verbose_eval=2)
# tree pruning end  剪枝
# 决策树的学习过程分为2个阶段，分裂和剪枝(前剪枝和后剪枝)

[14:05:16] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 48 extra nodes, 0 pruned nodes, max_depth=7
[14:05:16] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 52 extra nodes, 0 pruned nodes, max_depth=8
[14:05:16] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 54 extra nodes, 0 pruned nodes, max_depth=8
[14:05:16] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 44 extra nodes, 0 pruned nodes, max_depth=7
[14:05:16] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 50 extra nodes, 0 pruned nodes, max_depth=7
[14:05:16] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 48 extra nodes, 0 pruned nodes, max_depth=7
[14:05:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 48 extra nodes, 0 pruned nodes, max_depth=7
[14:05:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 56 extra nodes, 0 pruned nodes, max_depth=8
[14:05:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 48 extra nodes, 0 pruned nodes, max_

In [62]:
cv_results

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,178775.700000,923.382736,178656.100000,8722.863706
1,161900.451563,821.663729,162129.529688,8247.784847
2,146674.510937,750.482680,147257.670313,7761.388245
3,132932.560938,680.667064,133695.891406,7279.138560
4,120573.022656,640.171997,121685.979687,6994.706868
5,109401.315625,582.866401,110912.640625,6752.794112
6,99317.382812,532.411881,101432.971875,6727.770961
7,90219.503125,473.438328,92978.532812,6504.022887
8,82033.225781,426.640540,85174.523437,6400.745733
9,74629.846875,383.494283,78235.996094,6224.818375
