In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings 
warnings.filterwarnings("ignore")

In [2]:
train_data = pd.read_csv("new data/train_data_new.csv")
test_data = pd.read_csv("new data/test_data_new.csv")

print(train_data.shape)
print(test_data.shape)

(1458, 252)
(1459, 251)


将训练数据划分为训练集和验证集用来调参

In [3]:
from sklearn.model_selection import train_test_split

train_label = train_data['SalePrice']
train_features = train_data.drop(['SalePrice'], axis=1)

X_train, X_val, y_train, y_val = train_test_split(train_features, train_label, test_size=0.2, random_state=42)

定义K折交叉验证和评估指标，本次Kaggle比赛的官方评估指标是rmse

In [4]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error

kf = KFold(n_splits=10, random_state=42, shuffle=True)

def rmse(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

def cv_rmse(model, X=X_train):
    rmse = np.sqrt(-cross_val_score(model, X, y_train, scoring="neg_mean_squared_error", cv=kf))
    return rmse

## GBDT模型调参

先用初始参数拟合模型

In [5]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

gbr = GradientBoostingRegressor()
model = gbr.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
score_train = rmse(y_train, y_pred_train)

y_pred_val = model.predict(X_val)
score_val = rmse(y_val, y_pred_val)

print("初始参数训练集rmse = ", score_train)
print("初始参数测试集rmse = ", score_val)

初始参数训练集rmse =  0.0765984934729955
初始参数测试集rmse =  0.17173252389215457


训练集rmse比测试集rmse低很多，可能存在过拟合的情况，我们希望能够通过调参改变这种情况

调参按照如下顺序调节，先调节类参数，再调节弱学习器参数

先调类参数learning_rate和n_estimators，我们还需要设置其它参数的默认值

对于learning_rate和n_estimators而言，可以先将学习率固定为0.1，搜索迭代次数

由于我们样本数只有1458个，min_samples_split的取值范围一般是样本数目的0.5%-1%之间，所以可以取7到14，我们取5作为默认值，后续可以逐渐增加

min_samples_leaf的取值凭感觉取一个数只要不造成过拟合即可，我们也取5作为默认值

由于样本数目不多，在sklearn中max_depth默认为3，取默认值即可

max_features一般默认取sqrt，subsample一般默认取0.8，由于数据集比较多的离群值，所以loss取huber

In [6]:
param_test1 = {'n_estimators': [i for i in range(100, 500, 100)]}

gbr = GradientBoostingRegressor(learning_rate=0.1, 
                                min_samples_split=5,
                                min_samples_leaf=5,
                                max_depth=3,
                                max_features='sqrt', 
                                subsample=0.8,
                                loss='huber',
                                random_state=42)
grid_search = GridSearchCV(estimator=gbr, param_grid=param_test1, scoring=make_scorer(rmse), cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print(grid_search.best_score_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
{'n_estimators': 100}
0.19407874805162664


得到最佳迭代次数为100，此时学习率为0.1

这两个参数在后续可以成比例缩放进行调参

接下来调节弱学习器的参数，对结果影响最大的参数应该优先调节，弱学习器参数的调参顺序如下

1.调节max_depth和 min_samples_split

2.调节min_samples_leaf

3.调节max_features

调参max_depth和min_samples_split

由于样本数量不多所以我们将max_depth的取值范围设为[2, 4]

min_samples_split的取值范围设为[5, 15]

In [7]:
param_test2 = {'max_depth': [i for i in range(2, 5)],
              'min_samples_split': [i for i in range(5, 16, 2)]}

gbr = GradientBoostingRegressor(learning_rate=0.1, 
                                n_estimators=100,
                                min_samples_leaf=5,
                                max_features='sqrt', 
                                subsample=0.8,
                                loss='huber',
                                random_state=42)
grid_search = GridSearchCV(estimator=gbr, param_grid=param_test2, scoring=make_scorer(rmse), cv=5, verbose=True, n_jobs=-1)
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print(grid_search.best_score_)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
{'max_depth': 2, 'min_samples_split': 11}
0.21598003619187325


得到了max_depth为2，由于样本数比较少，我认为这是合理的，min_samples_split为11，我们可以将max_depth确定下来，但是min_samples_split受到其它参数的影响还不能确定下来

接下来我们对划分最小样本数min_samples_split和叶子节点最少样本数min_samples_leaf进行调参

In [8]:
param_test3 = {'min_samples_split': [i for i in range(5, 20, 2)],
              'min_samples_leaf': [i for i in range(5, 20, 2)]}
                
gbr = GradientBoostingRegressor(learning_rate=0.1, 
                                n_estimators=100,
                                max_depth=2,
                                max_features='sqrt', 
                                subsample=0.8,
                                loss='huber',
                                random_state=42)
grid_search = GridSearchCV(estimator=gbr, param_grid=param_test3, scoring=make_scorer(rmse), cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print(grid_search.best_score_)

Fitting 5 folds for each of 64 candidates, totalling 320 fits
{'min_samples_leaf': 19, 'min_samples_split': 5}
0.21920648938938977


得到了min_samples_leaf为19，min_samples_split为5

现在我们已经得到了绝大部分参数，可以先用验证集进行验证查看调参的情况

In [9]:
gbr = GradientBoostingRegressor(learning_rate=0.1, 
                                n_estimators=100,
                                max_depth=2,
                                min_samples_leaf=19,
                                min_samples_split=5,
                                max_features='sqrt', 
                                subsample=0.8,
                                loss='huber',
                                random_state=42)

model = gbr.fit(X_train, y_train)
y_pred_train = model.predict(X_train)
score_train = rmse(y_train, y_pred_train)

y_pred_val = model.predict(X_val)
score_val = rmse(y_val, y_pred_val)

print("部分调参后训练集rmse = ", score_train)
print("部分调参后测试集rmse = ", score_val)

部分调参后训练集rmse =  0.16959425123075478
部分调参后测试集rmse =  0.22097611132153475


可以看到通过部分调参后我们的训练集和测试集的rmse有所下降，但是有效的解决了过拟合的问题

接下来我们再对max_features和subsample进行调参

In [10]:
param_test4 = {'max_features': ['auto', 'sqrt', 'log2']}

gbr = GradientBoostingRegressor(learning_rate=0.1, 
                                n_estimators=100,
                                max_depth=2,
                                min_samples_leaf=19,
                                min_samples_split=5, 
                                subsample=0.8,
                                loss='huber',
                                random_state=42)
grid_search = GridSearchCV(gbr, param_test4, scoring=make_scorer(rmse), cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print(grid_search.best_score_)

{'max_features': 'log2'}
0.24299647040333908


In [11]:
param_test5 = {'subsample': np.arange(0.5, 0.9, 0.05)}

gbr = GradientBoostingRegressor(learning_rate=0.1, 
                                n_estimators=100,
                                max_depth=2,
                                min_samples_leaf=19,
                                min_samples_split=5, 
                                max_features='log2',
                                loss='huber',
                                random_state=42)

grid_search = GridSearchCV(gbr, param_test5, scoring=make_scorer(rmse), cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print(grid_search.best_score_)

{'subsample': 0.5}
0.2462582146591538


将得到的所有参数放入模型，并同时调整学习率和迭代次数，查看模型训练结果

In [12]:
gbr = GradientBoostingRegressor(learning_rate=0.05, 
                                n_estimators=500,
                                max_depth=2,
                                min_samples_leaf=19,
                                min_samples_split=5,
                                max_features='log2', 
                                subsample=0.5,
                                loss='huber',
                                random_state=42)

model = gbr.fit(X_train, y_train)
y_pred_train = model.predict(X_train)
score_train = rmse(y_train, y_pred_train)

y_pred_val = model.predict(X_val)
score_val = rmse(y_val, y_pred_val)

print("完成调参后训练集rmse = ", score_train)
print("完成调参后测试集rmse = ", score_val)

完成调参后训练集rmse =  0.1301719365550433
完成调参后测试集rmse =  0.20128830666446296


## 随机森林模型调参

随机森林模型的参数比GBDT模型的参数要少，也可以用类似的方式调参得到，这里就不重复演示

In [13]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=500,
                          max_depth=12,
                          min_samples_split=5,
                          min_samples_leaf=5,
                          max_features=None,
                          oob_score=True,
                          random_state=42)

model = rf.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
score_train = rmse(y_train, y_pred_train)

y_pred_val = model.predict(X_val)
score_val = rmse(y_val, y_pred_val)

print("完成调参后训练集rmse = ", score_train)
print("完成调参后袋外分数 = ", model.oob_score_)
print("完成调参后测试集rmse = ", score_val)

完成调参后训练集rmse =  0.10171651006696977
完成调参后袋外分数 =  0.7644437660923032
完成调参后测试集rmse =  0.20917457525260985


实际上由于随机森林有袋外数据的存在可以不用划分训练集和测试集

## XGBoost模型调参

同样先用初始参数拟合模型

In [14]:
from xgboost import XGBRegressor


xgboost = XGBRegressor(objective='reg:squarederror',
                       random_state=42)

model = xgboost.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
score_train = rmse(y_train, y_pred_train)

y_pred_val = model.predict(X_val)
score_val = rmse(y_val, y_pred_val)

print("初始参数训练集rmse = ", score_train)
print("初始参数测试集rmse = ", score_val)

初始参数训练集rmse =  0.00041859204434890906
初始参数测试集rmse =  0.20088078912490726


可以看到存在严重的过拟合问题

在XGBoost调参中也是先调类参数，再调弱学习器参数

类参数与GBDT类似，最重要的是迭代次数n_estimators和学习率learning_rate

弱学习器参数对模型影响最大的参数如下
1. 树的最大深度max_depth
2. 最小的子节点权重阈值min_child_weight
3. 决策树分裂所带来的损失减小阈值gamma

一般按照顺序先调这三个参数，如果这三个参数调整之后可以解决过拟合问题且模型效果还不错，剩下的参数可以设为默认值

剩下的参数包括子采样参数subsample，整棵树的特征采样比例colsample_bytree，正则化参数reg_alpha/reg_lambda等

In [15]:
param_test1 = {'n_estimators': [i for i in range(100, 500, 100)]}

xgboost = XGBRegressor(learning_rate=0.1, 
                       max_depth=5,
                       min_child_weight=1,
                       gamma=0,
                       subsample=0.8,
                       colsample_bytree=0.8,
                       reg_alpha=0,
                       reg_lambda=1,
                       objective='reg:squarederror',
                       random_state=42)

grid_search = GridSearchCV(estimator=xgboost, param_grid=param_test1, scoring=make_scorer(rmse), cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print(grid_search.best_score_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
{'n_estimators': 100}
0.17953909458338022


接下来调max_depth和min_child_weight

In [16]:
param_test2 = {'max_depth': [i for i in range(3, 10)], 
               'min_child_weight': [i for i in range(3, 10)]}

xgboost = XGBRegressor(n_estimators=100,
                       learning_rate=0.1, 
                       gamma=0,
                       subsample=0.8,
                       colsample_bytree=0.8,
                       reg_alpha=0,
                       reg_lambda=1,
                       objective='reg:squarederror',
                       random_state=42)

grid_search = GridSearchCV(estimator=xgboost, param_grid=param_test2, scoring=make_scorer(rmse), cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print(grid_search.best_score_)

Fitting 5 folds for each of 49 candidates, totalling 245 fits
{'max_depth': 9, 'min_child_weight': 3}
0.18068171277629458


得到max_depth为9，min_child_weight为3

接下来调gamma

In [17]:
param_test3 = {'gamma': np.arange(0.2, 0.9, 0.1)}

xgboost = XGBRegressor(n_estimators=100,
                       learning_rate=0.1, 
                       max_depth=9,
                       min_child_weight=3,
                       subsample=0.8,
                       colsample_bytree=0.8,
                       reg_alpha=0,
                       reg_lambda=1,
                       objective='reg:squarederror',
                       random_state=42)

grid_search = GridSearchCV(estimator=xgboost, param_grid=param_test3, scoring=make_scorer(rmse), cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print(grid_search.best_score_)

Fitting 5 folds for each of 7 candidates, totalling 35 fits
{'gamma': 0.8000000000000003}
0.19624260758369796


得到gamma为0.8

In [18]:
xgboost = XGBRegressor(learning_rate=0.1,
                       n_estimators=100,
                       max_depth=9,
                       min_child_weight=3,
                       gamma=0.8,
                       subsample=0.8,
                       colsample_bytree=0.8,
                       reg_alpha=0,
                       reg_lambda=1,
                       objective='reg:squarederror',
                       random_state=42)

model = xgboost.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
score_train = rmse(y_train, y_pred_train)

y_pred_val = model.predict(X_val)
score_val = rmse(y_val, y_pred_val)

print("部分调参后训练集rmse = ", score_train)
print("部分调参后测试集rmse = ", score_val)

部分调参后训练集rmse =  0.14826534667430974
部分调参后测试集rmse =  0.20633124732079536


可以看到已经解决了过拟合的问题，那么剩下的参数可以选择不调

最后同时缩放类参数，提高模型的泛化能力，同时注意过拟合

In [19]:
xgboost = XGBRegressor(learning_rate=0.05,
                       n_estimators=1000,
                       max_depth=9,
                       min_child_weight=3,
                       gamma=0.8,
                       subsample=0.8,
                       colsample_bytree=0.8,
                       reg_alpha=0,
                       reg_lambda=1,
                       objective='reg:squarederror',
                       random_state=42)

model = xgboost.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
score_train = rmse(y_train, y_pred_train)

y_pred_val = model.predict(X_val)
score_val = rmse(y_val, y_pred_val)

print("完成调参后训练集rmse = ", score_train)
print("完成调参后测试集rmse = ", score_val)

完成调参后训练集rmse =  0.14414262647675175
完成调参后测试集rmse =  0.19746198376168503


## lightgbm模型调参

In [20]:
from lightgbm import LGBMRegressor

lightgbm = LGBMRegressor(objective='regression',
                       random_state=42)

model = lightgbm.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
score_train = rmse(y_train, y_pred_train)

y_pred_val = model.predict(X_val)
score_val = rmse(y_val, y_pred_val)

print("初始参数训练集rmse = ", score_train)
print("初始参数测试集rmse = ", score_val)

初始参数训练集rmse =  0.020193735388523044
初始参数测试集rmse =  0.1718814611734789


In [21]:
param_test1 = {'n_estimators': [i for i in range(100, 500, 100)]}

lightgbm = LGBMRegressor(learning_rate=0.1, 
                         num_leaves=31,
                         max_depth=5,
                         min_child_weight=1,
                         subsample=0.8,
                         colsample_bytree=0.8,
                         reg_alpha=0,
                         reg_lambda=1,
                         objective='regression',
                         random_state=42)

grid_search = GridSearchCV(estimator=lightgbm, param_grid=param_test1, scoring=make_scorer(rmse), cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print(grid_search.best_score_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
{'n_estimators': 100}
0.17193835548870867


lightgbm中max_depth和num_leaves是相互影响的参数

一般而言，num_leaves <= 2^n-1，其中n为max_depth的值

In [22]:
param_test2 = {'max_depth': [i for i in range(3, 10)],
              'num_leaves': [i for i in range(10, 31, 2)]}

lightgbm = LGBMRegressor(learning_rate=0.1, 
                         n_estimators=100,
                         min_child_weight=1,
                         subsample=0.8,
                         colsample_bytree=0.8,
                         reg_alpha=0,
                         reg_lambda=1,
                         objective='regression',
                         random_state=42)

grid_search = GridSearchCV(estimator=lightgbm, param_grid=param_test2, scoring=make_scorer(rmse), cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print(grid_search.best_score_)

Fitting 5 folds for each of 77 candidates, totalling 385 fits
{'max_depth': 6, 'num_leaves': 30}
0.17211791364042112


In [23]:
param_test3 = {'min_child_weight': [i for i in range(3, 10)]}

lightgbm = LGBMRegressor(learning_rate=0.1, 
                         n_estimators=100,
                         max_depth=6,
                         num_leaves=30,
                         subsample=0.8,
                         colsample_bytree=0.8,
                         reg_alpha=0,
                         reg_lambda=1,
                         objective='regression',
                         random_state=42)

grid_search = GridSearchCV(estimator=lightgbm, param_grid=param_test3, scoring=make_scorer(rmse), cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print(grid_search.best_score_)

Fitting 5 folds for each of 7 candidates, totalling 35 fits
{'min_child_weight': 3}
0.17211791364042112


In [24]:
lightgbm = LGBMRegressor(learning_rate=0.1, 
                         n_estimators=100,
                         max_depth=6,
                         num_leaves=30,
                         min_child_weight=3,
                         subsample=0.8,
                         colsample_bytree=0.8,
                         reg_alpha=0,
                         reg_lambda=1,
                         objective='regression',
                         random_state=42)

model = lightgbm.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
score_train = rmse(y_train, y_pred_train)

y_pred_val = model.predict(X_val)
score_val = rmse(y_val, y_pred_val)

print("部分调参后训练集rmse = ", score_train)
print("部分调参后测试集rmse = ", score_val)

部分调参后训练集rmse =  0.030520437973626992
部分调参后测试集rmse =  0.16665352013041423


可以看到仍然存在过拟合的情况，我们还需要继续调剩下的参数

In [25]:
param_test4 = {'subsample': np.arange(0.5, 0.9, 0.1)}

lightgbm = LGBMRegressor(learning_rate=0.1, 
                         n_estimators=100,
                         max_depth=6,
                         num_leaves=30,
                         min_child_weight=3,
                         colsample_bytree=0.8,
                         reg_alpha=0,
                         reg_lambda=1,
                         objective='regression',
                         random_state=42)

grid_search = GridSearchCV(estimator=lightgbm, param_grid=param_test4, scoring=make_scorer(rmse), cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print(grid_search.best_score_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
{'subsample': 0.5}
0.17211791364042112


In [26]:
param_test5 = {'colsample_bytree': np.arange(0.5, 0.9, 0.1)}

lightgbm = LGBMRegressor(learning_rate=0.1, 
                         n_estimators=100,
                         max_depth=6,
                         num_leaves=30,
                         min_child_weight=3,
                         subsample=0.5,
                         reg_alpha=0,
                         reg_lambda=1,
                         objective='regression',
                         random_state=42)

grid_search = GridSearchCV(estimator=lightgbm, param_grid=param_test5, scoring=make_scorer(rmse), cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print(grid_search.best_score_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
{'colsample_bytree': 0.5}
0.17655370838035325


In [27]:
param_test6 = {'reg_alpha': [i for i in range(0, 11)],
              'reg_lambda': [i for i in range(1, 26)]}

lightgbm = LGBMRegressor(learning_rate=0.1, 
                         n_estimators=100,
                         max_depth=6,
                         num_leaves=30,
                         min_child_weight=3,
                         subsample=0.5,
                         colsample_bytree=0.5,
                         objective='regression',
                         random_state=42)

grid_search = GridSearchCV(estimator=lightgbm, param_grid=param_test6, scoring=make_scorer(rmse), cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print(grid_search.best_score_)

Fitting 5 folds for each of 275 candidates, totalling 1375 fits
{'reg_alpha': 10, 'reg_lambda': 24}
0.20524636352001685


In [28]:
lightgbm = LGBMRegressor(learning_rate=0.1, 
                         n_estimators=100,
                         max_depth=6,
                         num_leaves=30,
                         min_child_weight=3,
                         subsample=0.5,
                         colsample_bytree=0.5,
                         reg_alpha=10,
                         reg_lambda=24,
                         objective='regression',
                         random_state=42)

model = lightgbm.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
score_train = rmse(y_train, y_pred_train)

y_pred_val = model.predict(X_val)
score_val = rmse(y_val, y_pred_val)

print("完成调参后训练集rmse = ", score_train)
print("完成调参后测试集rmse = ", score_val)

完成调参后训练集rmse =  0.15493532208240576
完成调参后测试集rmse =  0.19966635030234303


In [29]:
lightgbm = LGBMRegressor(learning_rate=0.05, 
                         n_estimators=800,
                         max_depth=6,
                         num_leaves=30,
                         min_child_weight=3,
                         subsample=0.5,
                         colsample_bytree=0.5,
                         reg_alpha=10,
                         reg_lambda=24,
                         objective='regression',
                         random_state=42)

model = lightgbm.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
score_train = rmse(y_train, y_pred_train)

y_pred_val = model.predict(X_val)
score_val = rmse(y_val, y_pred_val)

print("完成调参后训练集rmse = ", score_train)
print("完成调参后测试集rmse = ", score_val)

完成调参后训练集rmse =  0.14947870244676917
完成调参后测试集rmse =  0.19495610296499724


可以看到调参后过拟合的情况有所改善并且模型精度有所提高