In [21]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.grid_search import GridSearchCV

## 获取训练数据

In [2]:
train_feature = np.genfromtxt("/Users/wangyajun/workspace/feature_train.txt", dtype=np.float32)

In [4]:
feature_num = len(train_feature[0])
train_feature = pd.DataFrame(train_feature)
train_label = train_feature.iloc[:, feature_num-1]
train_feature = train_feature.iloc[:, 0:feature_num-2]

In [5]:
train_feature

Unnamed: 0,0,1,2,3,4,5,6
0,0.005988,0.569231,0.647059,0.95122,-0.225434,0.837989,0.357258
1,0.161677,0.743195,0.682353,0.960976,-0.086705,0.780527,0.282945
2,0.113772,0.744379,0.541176,0.990244,-0.00578,0.721468,0.43411
3,0.053892,0.608284,0.764706,0.95122,-0.248555,0.821229,0.848604
4,0.173653,0.866272,0.682353,0.95122,0.017341,0.704709,-0.021002


In [6]:
train_label

0    320.0
1    361.0
2    364.0
3    336.0
4    358.0
Name: 8, dtype: float32

In [7]:
test_feature = np.genfromtxt("/Users/wangyajun/workspace/feature_test.txt", dtype=np.float32)

In [8]:
feature_num = len(test_feature[0])
test_feature = pd.DataFrame(test_feature)
test_label = test_feature.iloc[:, feature_num-1]
test_feature = test_feature.iloc[:, 0:feature_num-2]

In [9]:
test_feature

Unnamed: 0,0,1,2,3,4,5,6
0,0.005988,0.569231,0.647059,0.95122,-0.225434,0.837989,0.357258
1,0.161677,0.743195,0.682353,0.960976,-0.086705,0.780527,0.282945
2,0.113772,0.744379,0.541176,0.990244,-0.00578,0.721468,0.43411
3,0.053892,0.608284,0.764706,0.95122,-0.248555,0.821229,0.848604
4,0.173653,0.866272,0.682353,0.95122,0.017341,0.704709,-0.021002


In [10]:
test_label

0    320.0
1    361.0
2    364.0
3    336.0
4    358.0
Name: 8, dtype: float32

## GBDT模型
1) n_estimators: 也就是弱学习器的最大迭代次数，或者说最大的弱学习器的个数。

2) learning_rate: 即每个弱学习器的权重缩减系数，也称作步长。

3) subsample: 即我们在原理篇的正则化章节讲到的子采样，取值为(0,1]。注意这里的子采样和随机森林不一样，随机森林使用的是放回抽样，而这里是不放回抽样。推荐在[0.5, 0.8]之间，默认是1.0，即不使用子采样。

4) init: 即我们的初始化的时候的弱学习器，拟合对应原理篇里面的f0(x)，如果不输入，则用训练集样本来做样本集的初始化分类回归预测。否则用init参数提供的学习器做初始化分类回归预测。一般用在我们对数据有先验知识，或者之前做过一些拟合的时候，如果没有的话就不用管这个参数了。

5) loss: 即我们GBDT算法中的损失函数。分类模型和回归模型的损失函数是不一样的。
    
    对于分类模型，有对数似然损失函数"deviance"和指数损失函数"exponential"两者输入选择。默认是对数似然损失函数"deviance"。在原理篇中对这些分类损失函数有详细的介绍。一般来说，推荐使用默认的"deviance"。它对二元分离和多元分类各自都有比较好的优化。而指数损失函数等于把我们带到了Adaboost算法。
    
    对于回归模型，有均方差"ls", 绝对损失"lad", Huber损失"huber"和分位数损失“quantile”。默认是均方差"ls"。一般来说，如果数据的噪音点不多，用默认的均方差"ls"比较好。如果是噪音点较多，则推荐用抗噪音的损失函数"huber"。而如果我们需要对训练集进行分段预测的时候，则采用“quantile”。

6) alpha：这个参数只有GradientBoostingRegressor有，当我们使用Huber损失"huber"和分位数损失“quantile”时，需要指定分位数的值。默认是0.9，如果噪音点较多，可以适当降低这个分位数的值。

In [11]:
gbdt = GradientBoostingRegressor(
  loss = 'ls' 
, learning_rate = 0.1
, n_estimators = 100
, subsample = 1
, min_samples_split = 2
, min_samples_leaf = 1
, max_depth = 3
, init = None
, random_state = None
, max_features = None
, alpha = 0.9
, verbose = 0
, max_leaf_nodes = None
, warm_start = False
)

In [13]:
gbdt.fit(train_feature, train_label)
pre = gbdt.predict(test_feature)
total_error = 0

for i in range(pre.shape[0]):
    print("pre:", pre[i], '  label:', test_label[i])
print("均方误差：", np.sqrt((test_label - pre) ** 2).mean())

pre: 320.0008173984891   label: 320.0
pre: 360.99965033119537   label: 361.0
pre: 363.99928183902097   label: 364.0
pre: 336.0002344322584   label: 336.0
pre: 358.0000159974151   label: 358.0
均方误差： 0.0004271315892651728


## 调参过程

参考博客： https://www.cnblogs.com/DjangoBlog/p/6201663.html

In [8]:
data = pd.read_csv("/Users/wangyajun/workspace/Machine Learning Demo/GBDT/train_modified.csv", header = 0)

In [9]:
data

Unnamed: 0,Disbursed,Existing_EMI,ID,Loan_Amount_Applied,Loan_Tenure_Applied,Monthly_Income,Var4,Var5,Age,EMI_Loan_Submitted_Missing,...,Var2_2,Var2_3,Var2_4,Var2_5,Var2_6,Mobile_Verified_0,Mobile_Verified_1,Source_0,Source_1,Source_2
0,0.0,0.0,ID000002C20,300000.0,5.0,20000,1,0,37,1,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
1,0.0,0.0,ID000004E40,200000.0,2.0,35000,3,13,30,0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
2,0.0,0.0,ID000007H20,600000.0,4.0,22500,1,0,34,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.0,0.0,ID000008I30,1000000.0,5.0,35000,3,10,28,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0.0,25000.0,ID000009J40,500000.0,2.0,100000,3,17,31,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87015,0.0,14500.0,ID124813N30,1000000.0,5.0,71901,3,9,46,1,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
87016,0.0,0.0,ID124814O40,0.0,0.0,16000,5,1,25,0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
87017,0.0,0.0,ID124816Q10,0.0,0.0,118000,3,8,43,1,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
87018,0.0,13660.0,ID124818S30,800000.0,5.0,98930,3,18,38,1,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0


In [10]:
target = 'Disbursed'
train_label = data[target]

In [13]:
train_label.value_counts()

0.0    85747
1.0     1273
Name: Disbursed, dtype: int64

In [14]:
IDcol = 'ID'
feature_col = [x for x in data.columns if x not in [target, IDcol]]
train_feature = data[feature_col]

### 默认参数

In [22]:
gbdt_0 = GradientBoostingClassifier(random_state=10)
gbdt_0.fit(train_feature, train_label)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=10, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [23]:
pre_0 = gbdt_0.predict(train_feature)
pre_prob = gbdt_0.predict_proba(train_feature)[:,1]

In [30]:
from sklearn import model_selection, metrics
from sklearn.model_selection import GridSearchCV

import matplotlib.pylab as plt
%matplotlib inline

In [33]:
print("Accaurcy: %.4g" %metrics.accuracy_score(train_label, pre_0))

Accaurcy: 0.9856


In [37]:
print("AUC Score: %f" %metrics.roc_auc_score(train_label, pre_prob))

AUC Score: 0.862264


#### 1、对迭代次数进行网格搜索
首先我们从步长(learning rate)和迭代次数(n_estimators)入手。一般来说,开始选择一个较小的步长来网格搜索最好的迭代次数。这里，我们将步长初始值设置为0.1。对于迭代次数进行网格搜索如下

##### n_estimators

In [43]:
param_test1 = {'n_estimators':range(20,101,10)}
gsearch1 = GridSearchCV(
    estimator = GradientBoostingClassifier(
        learning_rate = 0.1
        , min_samples_split = 300
        , min_samples_leaf = 20
        , max_depth = 8
        , max_features = 'sqrt'
        , subsample = 0.8
        , random_state = 10
    )
    , param_grid = param_test1
    , scoring = 'roc_auc'
    , iid = False
    , cv = 5
)
gsearch1.fit(train_feature, train_label)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=GradientBoostingClassifier(criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=8,
                                                  max_features='sqrt',
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=20,
                                                  min_samples_split=300,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100,
                                                  n_iter_no_change=None,
                                                  pre

In [45]:
gsearch1.cv_results_

{'mean_fit_time': array([1.92715359, 2.64256039, 3.37707567, 4.03654528, 4.66962829,
        5.22545018, 5.82572961, 6.56250124, 7.35079737]),
 'std_fit_time': array([0.0662159 , 0.04726978, 0.07169142, 0.1206033 , 0.0710805 ,
        0.0753033 , 0.07085114, 0.19467505, 0.16936587]),
 'mean_score_time': array([0.0267415 , 0.0312242 , 0.03565388, 0.04057517, 0.04535813,
        0.04769998, 0.05198579, 0.05865922, 0.0617569 ]),
 'std_score_time': array([0.00086475, 0.00101114, 0.00215531, 0.00193362, 0.00207893,
        0.00253177, 0.00221425, 0.00439735, 0.00340013]),
 'param_n_estimators': masked_array(data=[20, 30, 40, 50, 60, 70, 80, 90, 100],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value='?',
             dtype=object),
 'params': [{'n_estimators': 20},
  {'n_estimators': 30},
  {'n_estimators': 40},
  {'n_estimators': 50},
  {'n_estimators': 60},
  {'n_estimators': 70},
  {'n_estimators': 80},
  {'n_estima

In [46]:
gsearch1.best_params_

{'n_estimators': 70}

In [47]:
gsearch1.best_score_

0.8371140502636909

#### 2、对决策树内部参数进行网格搜索

##### max_depth、min_samples_split

In [48]:
param_test2 = {'max_depth':range(3,14,2), 'min_samples_split':range(100,801,200)}
gsearch2 = GridSearchCV(
    estimator = GradientBoostingClassifier(
        learning_rate = 0.1
        , n_estimators = 70
        , min_samples_leaf = 20
        , max_features = 'sqrt'
        , subsample = 0.8
        , random_state = 10
    )
    , param_grid = param_test2
    , scoring = 'roc_auc'
    , iid = False
    , cv = 5
)
gsearch2.fit(train_feature, train_label)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=GradientBoostingClassifier(criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features='sqrt',
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=20,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=70,
                                                  n_iter_no_change=None,
                                                  presor

In [51]:
gsearch2.cv_results_

{'mean_fit_time': array([ 1.9606616 ,  1.98076715,  1.91867046,  1.88400292,  3.20682516,
         3.00555868,  3.00125575,  2.82209935,  4.81687179,  4.34802389,
         4.24468956,  4.00643873,  7.08417878,  5.9742547 ,  5.48670163,
         5.20206695,  9.60571437,  7.59467516,  6.84400673,  6.37569771,
        12.23614702,  9.40089245,  8.30377383,  7.75830951]),
 'std_fit_time': array([0.04403883, 0.03569067, 0.03315101, 0.02680777, 0.08043004,
        0.03215486, 0.08526881, 0.03001332, 0.075378  , 0.06271317,
        0.07524248, 0.03514032, 0.08080019, 0.06732483, 0.04501635,
        0.07327228, 0.11387817, 0.18288223, 0.07955533, 0.09825675,
        0.30893199, 0.24986221, 0.15289091, 0.31119739]),
 'mean_score_time': array([0.02937188, 0.03132019, 0.02904096, 0.02992325, 0.03855934,
        0.0368094 , 0.03660936, 0.0349494 , 0.04426403, 0.04376459,
        0.04401369, 0.0431478 , 0.05687609, 0.05117836, 0.05055671,
        0.05035143, 0.06156621, 0.06012368, 0.05774088, 0.05

In [52]:
gsearch2.best_params_

{'max_depth': 7, 'min_samples_split': 500}

In [53]:
gsearch2.best_score_

0.8407756723923849

##### min_samples_split、min_samples_leaf

In [54]:
param_test3 = {'min_samples_split':range(800,1900,200), 'min_samples_leaf':range(60,101,10)}
gsearch3 = GridSearchCV(
    estimator = GradientBoostingClassifier(
        learning_rate = 0.1
        , n_estimators = 60
        , max_depth = 7
        , max_features = 'sqrt'
        , subsample = 0.8
        , random_state = 10
    )
    , param_grid = param_test3
    , scoring = 'roc_auc'
    , iid = False
    , cv = 5
)
gsearch3.fit(train_feature, train_label)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=GradientBoostingClassifier(criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=7,
                                                  max_features='sqrt',
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=60,
                                                  n_iter_no_change=None,
                                                  presort

In [56]:
gsearch3.cv_results_

{'mean_fit_time': array([3.6136797 , 3.56349874, 3.38349857, 3.36260929, 3.34853096,
        3.233002  , 3.47678986, 3.33010006, 3.29263115, 3.17466578,
        3.32943439, 3.18062863, 3.60235925, 3.521417  , 3.4663476 ,
        3.35977783, 3.22683015, 3.13653383, 3.62920527, 3.63176136,
        3.54018869, 3.47565598, 3.38594227, 3.23226523, 3.69081707,
        3.57132921, 3.4125185 , 3.23171363, 3.19083114, 3.14731174]),
 'std_fit_time': array([0.17666672, 0.0899354 , 0.1257653 , 0.18477189, 0.05674742,
        0.09913924, 0.03927875, 0.05214116, 0.0381351 , 0.04976497,
        0.1737572 , 0.06691888, 0.122152  , 0.04988689, 0.08395796,
        0.1279552 , 0.07698787, 0.07008333, 0.06119059, 0.08580396,
        0.03685446, 0.10403667, 0.11780227, 0.10891754, 0.05479102,
        0.06761084, 0.10992821, 0.06511857, 0.04766657, 0.15175563]),
 'mean_score_time': array([0.04293318, 0.04032698, 0.04113879, 0.04037638, 0.04041753,
        0.04019051, 0.03979225, 0.03879657, 0.03900943, 0.03

In [57]:
gsearch3.best_params_

{'min_samples_leaf': 70, 'min_samples_split': 1000}

In [58]:
gsearch3.best_score_

0.84018902830047

In [59]:
gbdt_1 = GradientBoostingClassifier(
    learning_rate = 0.1
    , n_estimators = 70
    , max_depth = 7
    , min_samples_leaf = 70
    , min_samples_split = 1000
    , max_features = 'sqrt'
    , subsample = 0.8
    , random_state = 10
)
gbdt_1.fit(train_feature, train_label)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=7,
                           max_features='sqrt', max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=70, min_samples_split=1000,
                           min_weight_fraction_leaf=0.0, n_estimators=70,
                           n_iter_no_change=None, presort='auto',
                           random_state=10, subsample=0.8, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [63]:
pre_1 = gbdt_0.predict(train_feature)
pre_prob_1 = gbdt_1.predict_proba(train_feature)[:,1]

In [64]:
print("Accaurcy: %.4g" %metrics.accuracy_score(train_label, pre_1))

Accaurcy: 0.9856


In [65]:
print("AUC Score: %f" %metrics.roc_auc_score(train_label, pre_prob_1))

AUC Score: 0.884777


##### max_features

In [66]:
param_test4 = {'max_features':range(7,20,2)}
gsearch4 = GridSearchCV(
    estimator = GradientBoostingClassifier(
        learning_rate = 0.1
        , n_estimators = 70
        , max_depth = 7
        , min_samples_leaf = 70
        , min_samples_split = 1000
        , subsample = 0.8
        , random_state = 10
    ), 
    param_grid = param_test4
    , scoring = 'roc_auc'
    , iid = False
    , cv = 5
)
gsearch4.fit(train_feature, train_label)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=GradientBoostingClassifier(criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=7,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=70,
                                                  min_samples_split=1000,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=70,
                                                  n_iter_no_change=None,
                                                  preso

In [67]:
gsearch4.cv_results_

{'mean_fit_time': array([4.08061996, 4.63288827, 5.05220623, 5.68218942, 6.16083579,
        7.06394553, 7.65083318]),
 'std_fit_time': array([0.09017312, 0.0986523 , 0.0454801 , 0.10029491, 0.08689917,
        0.17256974, 0.23833731]),
 'mean_score_time': array([0.04510865, 0.0441052 , 0.04260511, 0.04222341, 0.04131503,
        0.04254513, 0.0400598 ]),
 'std_score_time': array([0.00222515, 0.00294608, 0.00190274, 0.00191421, 0.00110569,
        0.00217815, 0.00158179]),
 'param_max_features': masked_array(data=[7, 9, 11, 13, 15, 17, 19],
              mask=[False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_features': 7},
  {'max_features': 9},
  {'max_features': 11},
  {'max_features': 13},
  {'max_features': 15},
  {'max_features': 17},
  {'max_features': 19}],
 'split0_test_score': array([0.82863088, 0.82281758, 0.82302578, 0.82867799, 0.82706751,
        0.81927068, 0.82621026]),
 'split1_test_score': array([0.

In [68]:
gsearch4.best_params_

{'max_features': 7}

In [69]:
gsearch4.best_score_

0.840259343448756

In [70]:
param_test5 = {'subsample':[0.6,0.7,0.75,0.8,0.85,0.9]}
gsearch5 = GridSearchCV(
    estimator = GradientBoostingClassifier(
        learning_rate = 0.1
        , n_estimators = 70
        , max_depth = 7
        , min_samples_leaf = 70
        , min_samples_split = 1000
        , max_features = 7
        , random_state = 10
    ), 
    param_grid = param_test5
    , scoring = 'roc_auc'
    , iid = False
    , cv = 5
)
gsearch5.fit(train_feature, train_label)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=GradientBoostingClassifier(criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=7,
                                                  max_features=7,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=70,
                                                  min_samples_split=1000,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=70,
                                                  n_iter_no_change=None,
                                                  presort=

In [71]:
gsearch5.cv_results_

{'mean_fit_time': array([4.11828041, 4.3651608 , 4.40829864, 4.33629217, 4.32963405,
        4.63706155]),
 'std_fit_time': array([0.25395423, 0.17997033, 0.19645065, 0.13064765, 0.16989518,
        0.22603813]),
 'mean_score_time': array([0.05362363, 0.05125823, 0.05497155, 0.05680208, 0.04883304,
        0.06357045]),
 'std_score_time': array([0.00515803, 0.00497287, 0.00360297, 0.00818797, 0.004013  ,
        0.0256789 ]),
 'param_subsample': masked_array(data=[0.6, 0.7, 0.75, 0.8, 0.85, 0.9],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'subsample': 0.6},
  {'subsample': 0.7},
  {'subsample': 0.75},
  {'subsample': 0.8},
  {'subsample': 0.85},
  {'subsample': 0.9}],
 'split0_test_score': array([0.82334957, 0.8199904 , 0.81855794, 0.82863088, 0.82392706,
        0.82344595]),
 'split1_test_score': array([0.82712456, 0.82680741, 0.82512171, 0.83384931, 0.8377503 ,
        0.83242794]),
 'split2_test_sc

In [72]:
gsearch5.best_params_

{'subsample': 0.8}

In [73]:
gsearch5.best_score_

0.840259343448756

In [75]:
gbdt_2 = GradientBoostingClassifier(
    learning_rate = 0.05
    , n_estimators = 140
    , max_depth = 7
    , min_samples_leaf = 70
    , min_samples_split = 1000
    , max_features = 7
    , subsample=0.8
    , random_state = 10
)
gbdt_2.fit(train_feature, train_label)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.05, loss='deviance', max_depth=7,
                           max_features=7, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=70, min_samples_split=1000,
                           min_weight_fraction_leaf=0.0, n_estimators=140,
                           n_iter_no_change=None, presort='auto',
                           random_state=10, subsample=0.8, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [76]:
pre_2 = gbdt_2.predict(train_feature)

In [77]:
pre_prob_2 = gbdt_2.predict_proba(train_feature)[:,1]

In [78]:
print("Accaurcy: %.4g" %metrics.accuracy_score(train_label, pre_2))

Accaurcy: 0.9854


In [79]:
print("AUC Score: %f" %metrics.roc_auc_score(train_label, pre_prob_2))

AUC Score: 0.885606


In [80]:
gbdt_3 = GradientBoostingClassifier(
    learning_rate = 0.01
    , n_estimators = 700
    , max_depth = 7
    , min_samples_leaf = 70
    , min_samples_split = 1000
    , max_features = 7
    , subsample=0.8
    , random_state = 10
)
gbdt_3.fit(train_feature, train_label)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.01, loss='deviance', max_depth=7,
                           max_features=7, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=70, min_samples_split=1000,
                           min_weight_fraction_leaf=0.0, n_estimators=700,
                           n_iter_no_change=None, presort='auto',
                           random_state=10, subsample=0.8, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [81]:
pre_3 = gbdt_3.predict(train_feature)
pre_prob_3 = gbdt_3.predict_proba(train_feature)[:,1]
print("Accaurcy: %.4g" %metrics.accuracy_score(train_label, pre_3))
print("AUC Score: %f" %metrics.roc_auc_score(train_label, pre_prob_3))

Accaurcy: 0.9854
AUC Score: 0.888031


In [82]:
gbdt_4 = GradientBoostingClassifier(
    learning_rate = 0.005
    , n_estimators = 1400
    , max_depth = 7
    , min_samples_leaf = 70
    , min_samples_split = 1000
    , max_features = 7
    , subsample=0.8
    , random_state = 10
)
gbdt_4.fit(train_feature, train_label)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.005, loss='deviance', max_depth=7,
                           max_features=7, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=70, min_samples_split=1000,
                           min_weight_fraction_leaf=0.0, n_estimators=1400,
                           n_iter_no_change=None, presort='auto',
                           random_state=10, subsample=0.8, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [83]:
pre_4 = gbdt_4.predict(train_feature)
pre_prob_4 = gbdt_4.predict_proba(train_feature)[:,1]
print("Accaurcy: %.4g" %metrics.accuracy_score(train_label, pre_4))
print("AUC Score: %f" %metrics.roc_auc_score(train_label, pre_prob_4))

Accaurcy: 0.9854
AUC Score: 0.888795


In [84]:
gbdt_5 = GradientBoostingClassifier(
    learning_rate = 0.0025
    , n_estimators = 2800
    , max_depth = 7
    , min_samples_leaf = 70
    , min_samples_split = 1000
    , max_features = 7
    , subsample=0.8
    , random_state = 10
)
gbdt_5.fit(train_feature, train_label)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.0025, loss='deviance', max_depth=7,
                           max_features=7, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=70, min_samples_split=1000,
                           min_weight_fraction_leaf=0.0, n_estimators=2800,
                           n_iter_no_change=None, presort='auto',
                           random_state=10, subsample=0.8, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [85]:
pre_5 = gbdt_5.predict(train_feature)
pre_prob_5 = gbdt_5.predict_proba(train_feature)[:,1]
print("Accaurcy: %.4g" %metrics.accuracy_score(train_label, pre_5))
print("AUC Score: %f" %metrics.roc_auc_score(train_label, pre_prob_5))

Accaurcy: 0.9854
AUC Score: 0.888396
