In [25]:
import pandas as pd
import time
import numpy as np
from sklearn.cross_validation import train_test_split,StratifiedKFold, KFold
from sklearn.metrics import mean_absolute_error
from sklearn import preprocessing
from pylightgbm.models import GBMRegressor
import gc
from scipy.stats import skew, boxcox
from bayes_opt import BayesianOptimization
from scipy import sparse

%matplotlib inline

In [13]:
def log_mae(labels,preds,lift=200):
    return mean_absolute_error(np.exp(labels)-lift, np.exp(preds)-lift)

## Load Data

In [3]:
# Load data
start = time.time() 
train_data = pd.read_csv('../input/train.csv')
train_size=train_data.shape[0]
print ("Loading train data finished in %0.3fs" % (time.time() - start))        

test_data = pd.read_csv('../input/test.csv')
print ("Loading test data finished in %0.3fs" % (time.time() - start))      

Loading train data finished in 2.563s
Loading test data finished in 4.266s


#### Merge train and test

This will save our time on duplicating logics for train and test and will also ensure the transformations applied on train and test are the same.

In [4]:
full_data=pd.concat([train_data,test_data])
del( train_data, test_data)
print ("Full Data set created.")

Full Data set created.


In [5]:
data_types = full_data.dtypes  
cat_cols = list(data_types[data_types=='object'].index)
num_cols = list(data_types[data_types=='int64'].index) + list(data_types[data_types=='float64'].index)

id_col = 'id'
target_col = 'loss'
num_cols.remove('id')
num_cols.remove('loss')

### Numeric features

Two preprocessings on numeric features are applied:

   1. Apply box-cox transformations for skewed numeric features.

   2. Scale numeric features so they will fall in the range between 0 and 1.



In [8]:
skewed_cols = full_data[num_cols].apply(lambda x: skew(x.dropna()))

SSL = preprocessing.StandardScaler()
skewed_cols = skewed_cols[skewed_cols > 0.25].index.values
for skewed_col in skewed_cols:
    full_data[skewed_col], lam = boxcox(full_data[skewed_col] + 1)
for num_col in num_cols:
    full_data[num_col] = SSL.fit_transform(full_data[num_col].values.reshape(-1,1))

### Model LE Coding

#### Categorical features

   1. Label Encoding (Factorizing)


In [9]:
LBL = preprocessing.LabelEncoder()
start=time.time()
for cat_col in cat_cols:
    full_data[cat_col] = LBL.fit_transform(full_data[cat_col])
print ('Label enconding finished in %f seconds' % (time.time()-start))

Label enconding finished in 30.559000 seconds


In [12]:
lift = 200

full_cols = cat_cols+num_cols
train_x = full_data[full_cols][:train_size]
test_x = full_data[full_cols][train_size:]
train_y = np.log(full_data[:train_size].loss.values + lift)
ID = full_data.id[:train_size].values

X_train, X_val, y_train, y_val = train_test_split(train_x, train_y, train_size=.80, random_state=1234)

### LightGBM Tuning
* LightGBM

https://github.com/Microsoft/LightGBM

LightGBM is a gradient boosting framework that uses tree based learning algorithms. It is designed to be distributed and efficient with the following advantages:

    * Faster training speed and higher efficiency
    * Lower memory usage
    * Better accuracy
    * Parallel learning supported
    * Capable of handling large-scale data
    * 12/02/2016 : Release python-package beta version, welcome to have a try and provide issues and feedback

* pyLIghtGBM

pyLightGBM is a python binding for Microsoft LightGBM

https://github.com/ArdalanM/pyLightGBM

    * The official python-package was released a little late for the competition. The pyLightGBM was used.

#### 1. Tune num_leaves
* default=127, type=int, alias=num_leaf
* number of leaves in one tree
* control overfit
    * Smaller: underfit
    * larger: overfit

In [15]:
rgr = GBMRegressor(
                   learning_rate=0.1,
                   metric = 'l1',
                   num_threads = 4, #The acutal cores of CPU
                   num_iterations=10000,
                   early_stopping_round=50,
                   num_leaves=127,
                   verbose = True)

rgr.fit(X_train,y_train,
        test_data=[(X_val,y_val)])

y_pred = rgr.predict(X_val)
print 'best_round: ', rgr.best_round
print 'num_leaves: ', rgr.param['num_leaves']
print "MAE: ", log_mae(y_val,y_pred, 200)

[LightGBM] [Info] Finished loading parameters
[LightGBM] [Info] Finished loading data in 2.781159 seconds
[LightGBM] [Info] Number of data: 150654, number of features: 130
[LightGBM] [Info] Finished initializing training
[LightGBM] [Info] Started training...
[LightGBM] [Info] Iteration: 1, e:\temp\tmpeemhc6\X0_test.csv : l1 loss : 7.018314
[LightGBM] [Info] 0.062004 seconds elapsed, finished iteration 1
[LightGBM] [Info] Iteration: 2, e:\temp\tmpeemhc6\X0_test.csv : l1 loss : 6.316628
[LightGBM] [Info] 0.131008 seconds elapsed, finished iteration 2
[LightGBM] [Info] Iteration: 3, e:\temp\tmpeemhc6\X0_test.csv : l1 loss : 5.685116
[LightGBM] [Info] 0.200011 seconds elapsed, finished iteration 3
[LightGBM] [Info] Iteration: 4, e:\temp\tmpeemhc6\X0_test.csv : l1 loss : 5.116726
[LightGBM] [Info] 0.269015 seconds elapsed, finished iteration 4
[LightGBM] [Info] Iteration: 5, e:\temp\tmpeemhc6\X0_test.csv : l1 loss : 4.605308
[LightGBM] [Info] 0.335019 seconds elapsed, finished iteration 5
[

    num_leaves score
    31         1153.86041042
    63         1153.60820315
    127        1152.81015463
    255        1154.79474712
    511        1157.68422243
    1023       1162.16024421

In [16]:
num_leaves = 127

#### 2. Tune min_data_in_leaf
* default=100, type=int, alias=min_data_per_leaf , min_data
* Minimal number of data in one leaf. Can use this to deal with over-fit.
* control overfit
    * Smaller: overfit
    * larger: underfit

In [17]:
rgr = GBMRegressor(
                   learning_rate=0.1,
                   metric = 'l1',
                   num_threads = 4, #The acutal cores of CPU
                   num_iterations=10000,
                   early_stopping_round=50,
                   num_leaves=num_leaves,
                   min_data_in_leaf=280,
                   verbose = True)

rgr.fit(X_train,y_train,
        test_data=[(X_val,y_val)])

y_pred = rgr.predict(X_val)
print 'best_round: ', rgr.best_round
print 'min_data_in_leaf: ', rgr.param['min_data_in_leaf']
print "MAE: ", log_mae(y_val,y_pred, 200)

[LightGBM] [Info] Finished loading parameters
[LightGBM] [Info] Finished loading data in 2.089119 seconds
[LightGBM] [Info] Number of data: 150654, number of features: 130
[LightGBM] [Info] Finished initializing training
[LightGBM] [Info] Started training...
[LightGBM] [Info] Iteration: 1, e:\temp\tmp4shcxa\X0_test.csv : l1 loss : 7.018372
[LightGBM] [Info] 0.075004 seconds elapsed, finished iteration 1
[LightGBM] [Info] Iteration: 2, e:\temp\tmp4shcxa\X0_test.csv : l1 loss : 6.316750
[LightGBM] [Info] 0.151009 seconds elapsed, finished iteration 2
[LightGBM] [Info] Iteration: 3, e:\temp\tmp4shcxa\X0_test.csv : l1 loss : 5.685277
[LightGBM] [Info] 0.212012 seconds elapsed, finished iteration 3
[LightGBM] [Info] Iteration: 4, e:\temp\tmp4shcxa\X0_test.csv : l1 loss : 5.116922
[LightGBM] [Info] 0.276016 seconds elapsed, finished iteration 4
[LightGBM] [Info] Iteration: 5, e:\temp\tmp4shcxa\X0_test.csv : l1 loss : 4.605422
[LightGBM] [Info] 0.343020 seconds elapsed, finished iteration 5
[

    min_data_in_leaf score
    60               1155.52500899
    80               1154.09968427
    100              1152.81015463
    120              1153.33487688
    140              1152.54005633
    160              1150.64379314
    180              1152.37675518
    200              1154.83701585
    220              1151.61492247
    240              1152.06894503
    260              1152.96379754
    280              1152.9718746

In [18]:
min_data_in_leaf = 160

#### 3. Tune feature_fraction
* feature_fraction, default=1.0, type=double, 0.0 < feature_fraction < 1.0, alias=sub_feature
* LightGBM will random select part of features on each iteration if feature_fraction smaller than 1.0. For example, if * set to 0.8, will select 80% features before training each tree.
* Can use this to speed up training
* Can use this to deal with over-fit
    * Smaller: overfit
    * larger: underfit

In [19]:
rgr = GBMRegressor(
                   learning_rate=0.1,
                   metric = 'l1',
                   num_threads = 4, #The acutal cores of CPU
                   num_iterations=10000,
                   early_stopping_round=50,
                   num_leaves=num_leaves,
                   min_data_in_leaf=min_data_in_leaf,
                   feature_fraction = 0.3,
                   verbose = True)

rgr.fit(X_train,y_train,
        test_data=[(X_val,y_val)])

y_pred = rgr.predict(X_val)
print 'best_round: ', rgr.best_round
print 'feature_fraction: ', rgr.param['feature_fraction']
print "MAE: ", log_mae(y_val,y_pred, 200)

[LightGBM] [Info] Finished loading parameters
[LightGBM] [Info] Finished loading data in 2.349134 seconds
[LightGBM] [Info] Number of data: 150654, number of features: 130
[LightGBM] [Info] Finished initializing training
[LightGBM] [Info] Started training...
[LightGBM] [Info] Iteration: 1, e:\temp\tmpej1r6b\X0_test.csv : l1 loss : 7.018046
[LightGBM] [Info] 0.027002 seconds elapsed, finished iteration 1
[LightGBM] [Info] Iteration: 2, e:\temp\tmpej1r6b\X0_test.csv : l1 loss : 6.316356
[LightGBM] [Info] 0.058003 seconds elapsed, finished iteration 2
[LightGBM] [Info] Iteration: 3, e:\temp\tmpej1r6b\X0_test.csv : l1 loss : 5.684683
[LightGBM] [Info] 0.087005 seconds elapsed, finished iteration 3
[LightGBM] [Info] Iteration: 4, e:\temp\tmpej1r6b\X0_test.csv : l1 loss : 5.116394
[LightGBM] [Info] 0.108006 seconds elapsed, finished iteration 4
[LightGBM] [Info] Iteration: 5, e:\temp\tmpej1r6b\X0_test.csv : l1 loss : 4.604854
[LightGBM] [Info] 0.132008 seconds elapsed, finished iteration 5
[

    feature_fraction score
    1                1150.64379314
    0.9              1151.16820046
    0.8              1154.22553973
    0.7              1152.51609115
    0.6              1152.44971615
    0.5              1151.15217181
    0.4              1151.43463957
    0.3              1149.56731238
    0.2              1149.85465289
    0.1              1157.74466215

In [20]:
feature_fraction = 0.3

#### 4. Tune bagging_freq
* default=0, type=int
* Frequency for bagging, 0 means disable bagging. k means will perform bagging at every k iteration.
* Note: To enable bagging, should set bagging_fraction as well (1 is default).

In [22]:
rgr = GBMRegressor(
                   learning_rate=0.1,
                   metric = 'l1',
                   num_threads = 4, #The acutal cores of CPU
                   num_iterations=10000,
                   early_stopping_round=50,
                   num_leaves=num_leaves,
                   min_data_in_leaf=min_data_in_leaf,
                   feature_fraction = feature_fraction,
                   bagging_freq = 1, # this has to be set to an integer greater than 0 to enable bagging
                   bagging_fraction = 0.3,
                   verbose = True)

rgr.fit(X_train,y_train,
        test_data=[(X_val,y_val)])

y_pred = rgr.predict(X_val)
print 'best_round: ', rgr.best_round
print 'bagging_fraction: ', rgr.param['bagging_fraction']
print "MAE: ", log_mae(y_val,y_pred, 200)

[LightGBM] [Info] Finished loading parameters
[LightGBM] [Info] Finished loading data in 2.390137 seconds
[LightGBM] [Info] Number of data: 150654, number of features: 130
[LightGBM] [Info] Finished initializing training
[LightGBM] [Info] Started training...
[LightGBM] [Info] Iteration: 1, e:\temp\tmpetmzgh\X0_test.csv : l1 loss : 7.018109
[LightGBM] [Info] 0.020001 seconds elapsed, finished iteration 1
[LightGBM] [Info] Iteration: 2, e:\temp\tmpetmzgh\X0_test.csv : l1 loss : 6.316344
[LightGBM] [Info] 0.048003 seconds elapsed, finished iteration 2
[LightGBM] [Info] Iteration: 3, e:\temp\tmpetmzgh\X0_test.csv : l1 loss : 5.684719
[LightGBM] [Info] 0.070004 seconds elapsed, finished iteration 3
[LightGBM] [Info] Iteration: 4, e:\temp\tmpetmzgh\X0_test.csv : l1 loss : 5.116239
[LightGBM] [Info] 0.090005 seconds elapsed, finished iteration 4
[LightGBM] [Info] Iteration: 5, e:\temp\tmpetmzgh\X0_test.csv : l1 loss : 4.604678
[LightGBM] [Info] 0.109006 seconds elapsed, finished iteration 5
[

    bagging_fraction score
    1                1149.56731238
    0.9              1149.96495733
    0.8              1151.75700055
    0.7              1153.61143432
    0.6              1154.60250299
    0.5              1155.6322919
    0.4              1161.04853734
    0.3              1163.06440411

In [21]:
bagging_fraction = 0.9

#### 5. Tune max_bin
* default=255, type=int
* max number of bin that feature values will bucket in. Small bin may reduce training accuracy but may increase general power (deal with over-fit).

In [23]:
rgr = GBMRegressor(
                   learning_rate=0.1,
                   metric = 'l1',
                   num_threads = 4, #The acutal cores of CPU
                   num_iterations=10000,
                   early_stopping_round=50,
                   num_leaves=num_leaves,
                   min_data_in_leaf=min_data_in_leaf,
                   feature_fraction = feature_fraction,
                   bagging_freq = 1,
                   bagging_fraction = bagging_fraction,
                   max_bin = 8191,
                   verbose = True)

rgr.fit(X_train,y_train,
        test_data=[(X_val,y_val)])

y_pred = rgr.predict(X_val)
print 'best_round: ', rgr.best_round
print 'max_bin: ', rgr.param['max_bin']
print "MAE: ", log_mae(y_val,y_pred, 200)

[LightGBM] [Info] Finished loading parameters
[LightGBM] [Info] Finished loading data in 2.331133 seconds
[LightGBM] [Info] Number of data: 150654, number of features: 130
[LightGBM] [Info] Finished initializing training
[LightGBM] [Info] Started training...
[LightGBM] [Info] Iteration: 1, e:\temp\tmpvmarof\X0_test.csv : l1 loss : 7.018015
[LightGBM] [Info] 0.031002 seconds elapsed, finished iteration 1
[LightGBM] [Info] Iteration: 2, e:\temp\tmpvmarof\X0_test.csv : l1 loss : 6.316273
[LightGBM] [Info] 0.079004 seconds elapsed, finished iteration 2
[LightGBM] [Info] Iteration: 3, e:\temp\tmpvmarof\X0_test.csv : l1 loss : 5.684672
[LightGBM] [Info] 0.114007 seconds elapsed, finished iteration 3
[LightGBM] [Info] Iteration: 4, e:\temp\tmpvmarof\X0_test.csv : l1 loss : 5.116426
[LightGBM] [Info] 0.142008 seconds elapsed, finished iteration 4
[LightGBM] [Info] Iteration: 5, e:\temp\tmpvmarof\X0_test.csv : l1 loss : 4.604871
[LightGBM] [Info] 0.167010 seconds elapsed, finished iteration 5
[

    max_bin score
    255     1149.96495733
    511     1151.06769767
    1023    1149.30236952
    2047    1148.34217195
    4095    1149.84096619
    6143    1149.82891523
    7167    1148.8171578
    8191    1148.13447774
    9215    1148.27304965
    10239   1148.83482642
    12287   1149.10247673
    16383   1148.17375539
    32767   1148.17375539
    65535   1148.17375539

In [24]:
max_bin = 8191

### Automated tuning - [Bayesian Optimization](https://github.com/fmfn/BayesianOptimization)

The idea is to set a range for each parameters, for which we can leverage the parameters from manual tuning, then let the bayersian optimization to seek best parameters.

It's more efficient than grid search but is still time consuming. Therefore knowing an approximate range of values for each parameter will greatly improve the performance.

In [None]:
def lgbm_cv(max_bin, num_leaves, min_data_in_leaf, feature_fraction,bagging_fraction, learning_rate=0.1):
    skf = list(KFold(len(train_y), 4))
    scores=[]
    for i, (train, val) in enumerate(skf):
        est=GBMRegressor(learning_rate = learning_rate,
                        max_bin=int(max_bin),
                        num_leaves=int(num_leaves),
                        min_data_in_leaf=int(min_data_in_leaf),
                        feature_fraction=feature_fraction,
                        bagging_fraction=bagging_fraction,
                        bagging_freq = 1,
                        num_threads=4,
                        )
        train_x_fold = train_x.iloc[train]
        train_y_fold = train_y[train]
        val_x_fold = train_x.iloc[val]
        val_y_fold = train_y[val]
        est.set_params( num_iterations=100000)
        est.set_params( early_stopping_round=50)
        est.set_params( metric='l1')
        est.set_params(verbose = False)
        est.fit(train_x_fold,train_y_fold,
                test_data=[(val_x_fold, val_y_fold)]
               )
        val_y_predict_fold = est.predict(val_x_fold)
        score = log_mae(val_y_fold, val_y_predict_fold,200)
        scores.append(score)
    return -np.mean(scores)
            


lgbm_BO = BayesianOptimization(lgbm_cv, 
                               {
                                'max_bin': (8167,10939),
                                'num_leaves': (31,155),
                                'min_data_in_leaf' :(170,250),
                                'feature_fraction': (0.4,0.8),
                                'bagging_fraction' : (0.9,1)})

lgbm_BO.maximize(init_points=5, n_iter=30)

(max_bin=9255,num_leaves=81,min_data_in_leaf=191,feature_fraction=0.300000,bagging_fraction=1.000000,bagging_freq=1)
                   score -1139.406737
(max_bin=9220,num_leaves=95,min_data_in_leaf=220,feature_fraction=0.261269,bagging_fraction=1.000000,bagging_freq=1)
                   score -1139.631716
(max_bin=9263,num_leaves=104,min_data_in_leaf=190,feature_fraction=0.300000,bagging_fraction=1.000000,bagging_freq=1)
                   score -1139.849854
(max_bin=9248,num_leaves=149,min_data_in_leaf=220,feature_fraction=0.300000,bagging_fraction=1.000000,bagging_freq=1)
                   score -1139.883523


# Model OHE Coding
####  Categorical features
1. Label Encoding (Factorizing)
2. One Hot Encoding (get dummies)

OHE can be done by either get_dummies() from Pandas package or OneHotEncoder from SK-Learn package. 

* get_dummies is easier to implement (can be used directly on raw categorical features, i.e. strings, but it takes longer time and is not memory efficient.

* OneHotEncoder requires the features being converted to numeric, which has already been done by LabelEncoder in previous step, and is much more efficient (7x faster).

* The OHE's results are converted to a sparse matrix which uses way less memory as compared to dense matrix.

In [26]:
# This step has been finished previously. Run this cell if start with fresh data.


# start = time.time() 
# train_data = pd.read_csv('../input/train.csv')
# train_size=train_data.shape[0]
# print ("Loading train data finished in %0.3fs" % (time.time() - start))

# start = time.time()
# test_data = pd.read_csv('../input/test.csv')
# print ("Loading test data finished in %0.3fs" % (time.time() - start))   

# full_data=pd.concat([train_data,test_data])
# del( train_data, test_data)
# print ("Full Data set created.")

# data_types = full_data.dtypes  
# cat_cols = list(data_types[data_types=='object'].index)
# num_cols = list(data_types[data_types=='int64'].index) + list(data_types[data_types=='float64'].index)

# id_col = 'id'
# target_col = 'loss'
# num_cols.remove('id')
# num_cols.remove('loss')

# LBL = preprocessing.LabelEncoder()
# start=time.time()
# for cat_col in cat_cols:
# #     print ("Factorize feature %s" % (cat))
#     full_data[cat_col] = LBL.fit_transform(full_data[cat_col])
# print ('Label enconding finished in %f seconds' % (time.time()-start))


In [27]:
OHE = preprocessing.OneHotEncoder(sparse=True)
start=time.time()
full_data_sparse=OHE.fit_transform(full_data[cat_cols])
print 'One-hot-encoding finished in %f seconds' % (time.time()-start)
print full_data_sparse.shape

## it should be (313864, 1176)

One-hot-encoding finished in 13.893000 seconds
(313864, 1176)


In [28]:
lift = 200

full_data_sparse = sparse.hstack((full_data_sparse
                                  ,full_data[num_cols])
                                 , format='csr'
                                 )
print full_data_sparse.shape
train_x = full_data_sparse[:train_size]
test_x = full_data_sparse[train_size:]
train_y = np.log(full_data[:train_size].loss.values + lift)
ID = full_data.id[:train_size].values

X_train, X_val, y_train, y_val = train_test_split(train_x, train_y, train_size=.80, random_state=1234)

(313864, 1190)


#### 1. Tune num_leaves
* default=127, type=int, alias=num_leaf
* number of leaves in one tree
* control overfit
    * Smaller: underfit
    * larger: overfit

In [None]:
rgr = GBMRegressor(
                   learning_rate=0.1,
                   metric = 'l1',
                   num_threads = 4, #The acutal cores of CPU
                   num_iterations=10000,
                   early_stopping_round=50,
                   num_leaves=140,
                   verbose = True)

rgr.fit(X_train,y_train,
        test_data=[(X_val,y_val)])

y_pred = rgr.predict(X_val)
print 'best_round: ', rgr.best_round
print 'num_leaves: ', rgr.param['num_leaves']
print "MAE: ", log_mae(y_val,y_pred, 200)

    num_leaves   score
    31           1151.94725257
    47           1150.73371432
    55           1151.81532037
    59           1152.5245158
    63           1149.94641211
    71           1153.30812757
    79           1152.26073335
    95           1150.97086856
    111          1152.72179151
    127          1151.8720877129881
    140          1152.47040075
    255          1152.8892655651075
    511          1157.8167005742778
    1023         1160.7959202467935

In [None]:
num_leaves = 63


#### 2. Tune min_data_in_leaf
* default=100, type=int, alias=min_data_per_leaf , min_data
* Minimal number of data in one leaf. Can use this to deal with over-fit.
* control overfit
    * Smaller: overfit
    * larger: underfit

In [None]:
rgr = GBMRegressor(
                   learning_rate=0.1,
                   metric = 'l1',
                   num_threads = 4, #The acutal cores of CPU
                   num_iterations=10000,
                   early_stopping_round=50,
                   num_leaves=num_leaves,
                   min_data_in_leaf=140,
                   verbose = True)

rgr.fit(X_train,y_train,
        test_data=[(X_val,y_val)])

y_pred = rgr.predict(X_val)
print 'best_round: ', rgr.best_round
print 'min_data_in_leaf: ', rgr.param['min_data_in_leaf']
print "MAE: ", log_mae(y_val,y_pred, 200)

    min_data_in_leaf score
    60               1152.96790945
    80               1151.2627321
    100              1149.94641211
    120              1152.14765434
    140              1150.90761217

In [None]:
min_data_in_leaf = 100


#### 3. Tune feature_fraction
* feature_fraction, default=1.0, type=double, 0.0 < feature_fraction < 1.0, alias=sub_feature
* LightGBM will random select part of features on each iteration if feature_fraction smaller than 1.0. For example, if * set to 0.8, will select 80% features before training each tree.
* Can use this to speed up training
* Can use this to deal with over-fit
    * Smaller: overfit
    * larger: underfit

In [None]:
rgr = GBMRegressor(
                   learning_rate=0.1,
                   metric = 'l1',
                   num_threads = 4, #The acutal cores of CPU
                   num_iterations=10000,
                   early_stopping_round=50,
                   num_leaves=num_leaves,
                   min_data_in_leaf=min_data_in_leaf,
                   feature_fraction = 0.1,
                   verbose = True)

rgr.fit(X_train,y_train,
        test_data=[(X_val,y_val)])

y_pred = rgr.predict(X_val)
print 'best_round: ', rgr.best_round
print 'feature_fraction: ', rgr.param['feature_fraction']
print "MAE: ", log_mae(y_val,y_pred, 200)

    feature_fraction score
    1                1149.94641211
    0.9              1150.87512247
    0.8              1149.95170199
    0.7              1151.47775643
    0.6              1150.25311542
    0.5              1148.62923317
    0.4              1149.33888349
    0.3              1149.41479317
    0.2              1148.7569325
    0.1              1150.94748942

In [None]:
feature_fraction = 0.5

#### 4. Tune bagging_freq
* default=0, type=int
* Frequency for bagging, 0 means disable bagging. k means will perform bagging at every k iteration.
* Note: To enable bagging, should set bagging_fraction as well (1 is default).

In [None]:
rgr = GBMRegressor(
                   learning_rate=0.1,
                   metric = 'l1',
                   num_threads = 4, #The acutal cores of CPU
                   num_iterations=10000,
                   early_stopping_round=50,
                   num_leaves=num_leaves,
                   min_data_in_leaf=min_data_in_leaf,
                   feature_fraction = feature_fraction,
                   bagging_freq = 1, # this has to be set to an integer greater than 0 to enable bagging
                   bagging_fraction = 0.2,
                   verbose = True)

rgr.fit(X_train,y_train,
        test_data=[(X_val,y_val)])

y_pred = rgr.predict(X_val)
print 'best_round: ', rgr.best_round
print 'bagging_fraction: ', rgr.param['bagging_fraction']
print "MAE: ", log_mae(y_val,y_pred, 200)

    bagging_fraction score
    1                1148.62923317
    0.9              1150.56707006
    0.8              1150.57950704
    0.7              1152.39659053
    0.6              1151.89247797
    0.5              1153.36985355
    0.4              1154.88549101
    0.3              1156.96329398
    0.2              1168.00746453

In [29]:
bagging_fraction = 1

#### 5. Tune max_bin
* default=255, type=int
* max number of bin that feature values will bucket in. Small bin may reduce training accuracy but may increase general power (deal with over-fit).

In [None]:
rgr = GBMRegressor(
                   learning_rate=0.1,
                   metric = 'l1',
                   num_threads = 4, #The acutal cores of CPU
                   num_iterations=10000,
                   early_stopping_round=50,
                   num_leaves=num_leaves,
                   min_data_in_leaf=min_data_in_leaf,
                   feature_fraction = feature_fraction,
                   bagging_freq = 1,
                   bagging_fraction = bagging_fraction,
                   max_bin = (383+511)/2,
                   verbose = True)

rgr.fit(X_train,y_train,
        test_data=[(X_val,y_val)])

y_pred = rgr.predict(X_val)
print 'best_round: ', rgr.best_round
print 'max_bin: ', rgr.param['max_bin']
print "MAE: ", log_mae(y_val,y_pred, 200)

    max_bin score
    255     1148.62923317
    383     1149.26766314
    447     1149.66850311
    511     1146.3082081
    527     1148.34723792
    543     1147.95546048
    575     1147.54997036
    639     1147.05453218
    767     1148.55547679
    1023    1147.79087855
    2047    1147.84012659
    4097    1148.39183983

In [None]:
max_bin = 511

### Automated tuning - [Bayesian Optimization](https://github.com/fmfn/BayesianOptimization)

The idea is to set a range for each parameters, for which we can leverage the parameters from manual tuning, then let the bayersian optimization to seek best parameters.

It's more efficient than grid search but is still time consuming. Therefore knowing an approximate range of values for each parameter will greatly improve the performance.

In [None]:
def lgbm_cv(max_bin, num_leaves, min_data_in_leaf, feature_fraction,bagging_fraction, learning_rate=0.1):
    skf = list(KFold(len(train_y), 4))
    scores=[]
    for i, (train, val) in enumerate(skf):
        est=GBMRegressor(learning_rate = learning_rate,
                        max_bin=int(max_bin),
                        num_leaves=int(num_leaves),
                        min_data_in_leaf=int(min_data_in_leaf),
                        feature_fraction=feature_fraction,
                        bagging_fraction=bagging_fraction,
                        bagging_freq = 1,
                        num_threads=4,
                        )
        train_x_fold = train_x[train]
        train_y_fold = train_y[train]
        val_x_fold = train_x[val]
        val_y_fold = train_y[val]
        est.set_params( num_iterations=100000)
        est.set_params( early_stopping_round=50)
        est.set_params( metric='l1')
        est.set_params(verbose = False)
        est.fit(train_x_fold,
                train_y_fold,
                test_data=[(val_x_fold, val_y_fold)]
               )
        val_y_predict_fold = est.predict(val_x_fold)
        score = log_mae(val_y_fold, val_y_predict_fold,200)
        scores.append(score)
    return -np.mean(scores)
            


lgbm_BO = BayesianOptimization(lgbm_cv, {
                                     'max_bin': (447,627),
                                     'num_leaves': (60,180),
                                     'min_data_in_leaf' :(60,140),
                                     'feature_fraction': (0.15,0.6),
                                     'bagging_fraction' : (0.6,1)})


lgbm_BO.maximize(init_points=5, n_iter=30)


(max_bin=526,num_leaves=68,min_data_in_leaf=127,feature_fraction=0.218683,bagging_fraction=0.961961,bagging_freq=1)
               score -1139.877375
(max_bin=457,num_leaves=54,min_data_in_leaf=125,feature_fraction=0.383468,bagging_fraction=0.949582,bagging_freq=1)
               score -1140.332236
(max_bin=514,num_leaves=40,min_data_in_leaf=126,feature_fraction=0.325435,bagging_fraction=0.923560,bagging_freq=1)
               score -1140.546101
(max_bin=514,num_leaves=40,min_data_in_leaf=127,feature_fraction=0.464765,bagging_fraction=0.464765,bagging_freq=1)
               score -1140.593041