In [1]:
import pandas as pd
import time
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.cross_validation import StratifiedKFold, KFold
from sklearn.metrics import mean_absolute_error
from sklearn import preprocessing
import lightgbm as lgb
import gc
from scipy.stats import skew, boxcox
from bayes_opt import BayesianOptimization
from scipy import sparse

%matplotlib inline



In [2]:
def lgbm_eval_mae(yhat, dtrain, lift=200):
    y = dtrain.get_label()
    return 'mae', mean_absolute_error(np.exp(y)-lift, np.exp(yhat)-lift), False

## Load Data

In [3]:
# Load data
start = time.time() 
train_data = pd.read_csv('../input/train.csv')
train_size=train_data.shape[0]
print ("Loading train data finished in %0.3fs" % (time.time() - start))        

test_data = pd.read_csv('../input/test.csv')
print ("Loading test data finished in %0.3fs" % (time.time() - start))      

Loading train data finished in 2.907s
Loading test data finished in 4.589s


#### Merge train and test

This will save our time on duplicating logics for train and test and will also ensure the transformations applied on train and test are the same.

In [4]:
full_data=pd.concat([train_data,test_data])
del( train_data, test_data)
print ("Full Data set created.")

Full Data set created.


In [5]:
data_types = full_data.dtypes  
cat_cols = list(data_types[data_types=='object'].index)
num_cols = list(data_types[data_types=='int64'].index) + list(data_types[data_types=='float64'].index)

id_col = 'id'
target_col = 'loss'
num_cols.remove('id')
num_cols.remove('loss')

### Numeric features

Two preprocessings on numeric features are applied:

   1. Apply box-cox transformations for skewed numeric features.

   2. Scale numeric features so they will fall in the range between 0 and 1.



In [6]:
skewed_cols = full_data[num_cols].apply(lambda x: skew(x.dropna()))

SSL = preprocessing.StandardScaler()
skewed_cols = skewed_cols[skewed_cols > 0.25].index.values
for skewed_col in skewed_cols:
    full_data[skewed_col], lam = boxcox(full_data[skewed_col] + 1)
for num_col in num_cols:
    full_data[num_col] = SSL.fit_transform(full_data[num_col].values.reshape(-1,1))

### Model LE Coding

#### Categorical features

   1. Label Encoding (Factorizing)


In [7]:
LBL = preprocessing.LabelEncoder()
start=time.time()
for cat_col in cat_cols:
    full_data[cat_col] = LBL.fit_transform(full_data[cat_col])
print ('Label enconding finished in %f seconds' % (time.time()-start))

Label enconding finished in 36.702984 seconds


In [8]:
lift = 200

full_cols = cat_cols+num_cols
train_x = full_data[full_cols][:train_size]
test_x = full_data[full_cols][train_size:]
train_y = np.log(full_data[:train_size].loss.values + lift)
ID = full_data.id[:train_size].values

X_train, X_val, y_train, y_val = train_test_split(train_x, train_y, train_size=.80, random_state=1234)

### Manual tuning 
1. [Bayersian Optimization](https://github.com/fmfn/BayesianOptimization) will be introduced to optimize each parameter. 
2. Based on holdout data (X_val, y_val) for validation for the sake of time. The range for each parameter can be estimated based on the manual tuning results. However, grid search can also be used if time allows.
3. A larger learning rate (0.1) is implemented since it requires less iterations to complete. A smaller one (0.01 or even smaller) will be used to train the model for better accuracy when the parameters are optimized.

### LightGBM Tuning

* [LightGBM](https://github.com/Microsoft/LightGBM)

LightGBM is a gradient boosting framework that uses tree based learning algorithms. It is designed to be distributed and efficient with the following advantages:

    * Faster training speed and higher efficiency
    * Lower memory usage
    * Better accuracy
    * Parallel learning supported
    * Capable of handling large-scale data
12/02/2016 : Release  beta version [python-package](https://github.com/Microsoft/LightGBM/tree/master/python-package)
    * The parameters were tuned with pyLightGBM in the Kaggle competition. Feel free to follow the instruction and tune them by yourself.
    * The MAE values were different between official python-package and pyLightGBM. However, the parameters were close.


* [pyLightGBM](https://github.com/ArdalanM/pyLightGBM)

pyLightGBM is a python binding for Microsoft LightGBM




#### 1. Tune num_leaves
* default=31, type=int, alias=num_leaf
* number of leaves in one tree
* control overfit
    * Smaller: underfit
    * larger: overfit

In [9]:
rgr = lgb.LGBMRegressor(learning_rate=0.1,                             
                        n_estimators=100000,
                        num_leaves=127)

rgr.fit(X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric=lgbm_eval_mae,
        early_stopping_rounds=50,
        verbose = False)

print 'best_round: ', rgr.best_iteration
print 'num_leaves: ', rgr.num_leaves
print 'MAE       : ', rgr.evals_result_.get('valid_0').get('mae')[rgr.best_iteration]

best_round:  122
num_leaves:  127
MAE       :  1157.29


In [10]:
num_leaves = 127

#### 2. Tune min_child_samples
* default=10, type=int, alias=min_data_per_leaf , min_data
* Minimal number of data in one leaf. Can use this to deal with over-fit.
* control overfit
    * Smaller: overfit
    * larger: underfit

In [11]:
rgr = lgb.LGBMRegressor(learning_rate=0.1,                             
                        n_estimators=100000,
                        num_leaves=num_leaves,
                        min_child_samples = 160)

rgr.fit(X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric=lgbm_eval_mae,
        early_stopping_rounds=50,
        verbose = False)

print 'best_round: ', rgr.best_iteration
print 'min_child_samples: ', rgr.min_child_samples
print 'MAE       : ', rgr.evals_result_.get('valid_0').get('mae')[rgr.best_iteration]

best_round:  128
min_child_samples:  160
MAE       :  1152.27


In [12]:
min_child_samples = 160

#### 3. Tune colsample_bytree
* colsample_bytree, default=1.0, type=double, 0.0 < colsample_bytree < 1.0, alias=sub_feature
* LightGBM will random select part of features on each iteration if feature_fraction smaller than 1.0. For example, if * set to 0.8, will select 80% features before training each tree.
* Can use this to speed up training
* Can use this to deal with over-fit
    * Smaller: overfit
    * larger: underfit

In [13]:
rgr = lgb.LGBMRegressor(learning_rate=0.1,                             
                        n_estimators=100000,
                        num_leaves=num_leaves,
                        min_child_samples = min_child_samples,
                        colsample_bytree = 0.3)

rgr.fit(X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric=lgbm_eval_mae,
        early_stopping_rounds=50,
        verbose = False)

print 'best_round: ', rgr.best_iteration
print 'colsample_bytree: ', rgr.colsample_bytree
print 'MAE       : ', rgr.evals_result_.get('valid_0').get('mae')[rgr.best_iteration]

best_round:  262
colsample_bytree:  0.3
MAE       :  1150.08


In [14]:
colsample_bytree = 0.3

#### 4. Tune subsample
* default=0, type=int
* Frequency for bagging, 0 means disable bagging. k means will perform bagging at every k iteration.
* Note: To enable bagging, should set subsample_freq as well (1 is recommended).

In [15]:
rgr = lgb.LGBMRegressor(learning_rate=0.1,                             
                        n_estimators=100000,
                        num_leaves=num_leaves,
                        min_child_samples = min_child_samples,
                        colsample_bytree = colsample_bytree,
                        subsample = 0.9,
                        subsample_freq=1)

rgr.fit(X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric=lgbm_eval_mae,
        early_stopping_rounds=50,
        verbose = False)

print 'best_round: ', rgr.best_iteration
print 'subsample: ', rgr.subsample
print 'MAE       : ', rgr.evals_result_.get('valid_0').get('mae')[rgr.best_iteration]

best_round:  179
subsample:  0.9
MAE       :  1150.57


In [16]:
subsample = 0.9

#### 5. Tune max_bin
* default=255, type=int
* max number of bin that feature values will bucket in. Small bin may reduce training accuracy but may increase general power (deal with over-fit).

In [17]:
rgr = lgb.LGBMRegressor(learning_rate=0.1,                             
                        n_estimators=100000,
                        num_leaves=num_leaves,
                        min_child_samples = min_child_samples,
                        colsample_bytree = colsample_bytree,
                        subsample = subsample,
                        subsample_freq=1,
                        max_bin = 8191 )

rgr.fit(X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric=lgbm_eval_mae,
        early_stopping_rounds=50,
        verbose = False)

y_pred = rgr.predict(X_val,num_iteration=rgr.best_iteration)
print 'best_round: ', rgr.best_iteration
print 'max_bin: ', rgr.max_bin
print 'MAE       : ', rgr.evals_result_.get('valid_0').get('mae')[rgr.best_iteration]

best_round:  164
max_bin:  8191
MAE       :  1150.81


In [18]:
max_bin = 8191

#### 6. Tune reg_alpha
* default=0, type=float
* L1 regularization term on weights (deal with over-fit).

In [19]:
rgr = lgb.LGBMRegressor(learning_rate=0.1,                             
                        n_estimators=100000,
                        num_leaves=num_leaves,
                        min_child_samples = min_child_samples,
                        colsample_bytree = colsample_bytree,
                        subsample = subsample,
                        subsample_freq=1,
                        max_bin = max_bin,
                        reg_alpha = 0.01)

rgr.fit(X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric=lgbm_eval_mae,
        early_stopping_rounds=50,
        verbose = False)

y_pred = rgr.predict(X_val,num_iteration=rgr.best_iteration)
print 'best_round: ', rgr.best_iteration
print 'reg_alpha: ', rgr.reg_alpha
print 'MAE       : ', rgr.evals_result_.get('valid_0').get('mae')[rgr.best_iteration]

best_round:  200
reg_alpha:  0.01
MAE       :  1150.29


In [20]:
reg_alpha = 0.01

### Automated tuning - [Bayesian Optimization](https://github.com/fmfn/BayesianOptimization)

The idea is to set a range for each parameters, for which we can leverage the parameters from manual tuning, then let the bayersian optimization to seek best parameters.

It's more efficient than grid search but is still time consuming. Therefore knowing an approximate range of values for each parameter will greatly improve the performance.

In [None]:
def lgbm_cv(max_bin, num_leaves, min_child_samples, colsample_bytree, subsample, reg_alpha, learning_rate=0.1):
    skf = list(KFold(len(train_y), 4))
    scores=[]
    for i, (train, val) in enumerate(skf):
        est=lgb.LGBMRegressor(learning_rate=0.1,
                        max_bin=int(max_bin),
                        num_leaves=int(num_leaves),
                        min_child_samples=int(min_child_samples),
                        colsample_bytree=colsample_bytree,
                        subsample=subsample,
                        subsample_freq = 1,
                        reg_alpha = reg_alpha
                        )
 
        train_x_fold = train_x.iloc[train]
        train_y_fold = train_y[train]
        val_x_fold = train_x.iloc[val]
        val_y_fold = train_y[val]
        est.set_params( n_estimators=100000)
        est.fit(train_x_fold,
                train_y_fold,
                eval_set=[(val_x_fold, val_y_fold)],
                eval_metric=lgbm_eval_mae,
                early_stopping_rounds=50,
                verbose = False
               )
        val_y_predict_fold = est.predict(val_x_fold)
        score = log_mae(val_y_fold, val_y_predict_fold,200)
        scores.append(score)
    return -np.mean(scores)


lgbm_BO = BayesianOptimization(lgbm_cv, 
                               {
                                'max_bin': (8167,10939),
                                'num_leaves': (31,155),
                                'min_child_samples' :(170,250),
                                'colsample_bytree': (0.4,0.8),
                                'subsample' : (0.9,1),
                                'reg_alpha': (0,0.01)})

lgbm_BO.maximize(init_points=5, n_iter=30)

# Model OHE Coding
####  Categorical features
1. Label Encoding (Factorizing)
2. One Hot Encoding (get dummies)

OHE can be done by either get_dummies() from Pandas package or OneHotEncoder from SK-Learn package. 

* get_dummies is easier to implement (can be used directly on raw categorical features, i.e. strings, but it takes longer time and is not memory efficient.

* OneHotEncoder requires the features being converted to numeric, which has already been done by LabelEncoder in previous step, and is much more efficient (7x faster).

* The OHE's results are converted to a sparse matrix which uses way less memory as compared to dense matrix.

In [26]:
# This step has been finished previously. Run this cell if start with fresh data.


# start = time.time() 
# train_data = pd.read_csv('../input/train.csv')
# train_size=train_data.shape[0]
# print ("Loading train data finished in %0.3fs" % (time.time() - start))

# start = time.time()
# test_data = pd.read_csv('../input/test.csv')
# print ("Loading test data finished in %0.3fs" % (time.time() - start))   

# full_data=pd.concat([train_data,test_data])
# del( train_data, test_data)
# print ("Full Data set created.")

# data_types = full_data.dtypes  
# cat_cols = list(data_types[data_types=='object'].index)
# num_cols = list(data_types[data_types=='int64'].index) + list(data_types[data_types=='float64'].index)

# id_col = 'id'
# target_col = 'loss'
# num_cols.remove('id')
# num_cols.remove('loss')

# LBL = preprocessing.LabelEncoder()
# start=time.time()
# for cat_col in cat_cols:
# #     print ("Factorize feature %s" % (cat))
#     full_data[cat_col] = LBL.fit_transform(full_data[cat_col])
# print ('Label enconding finished in %f seconds' % (time.time()-start))


In [27]:
OHE = preprocessing.OneHotEncoder(sparse=True)
start=time.time()
full_data_sparse=OHE.fit_transform(full_data[cat_cols])
print 'One-hot-encoding finished in %f seconds' % (time.time()-start)
print full_data_sparse.shape

## it should be (313864, 1176)

One-hot-encoding finished in 13.893000 seconds
(313864, 1176)


In [28]:
lift = 200

full_data_sparse = sparse.hstack((full_data_sparse
                                  ,full_data[num_cols])
                                 , format='csr'
                                 )
print full_data_sparse.shape
train_x = full_data_sparse[:train_size]
test_x = full_data_sparse[train_size:]
train_y = np.log(full_data[:train_size].loss.values + lift)
ID = full_data.id[:train_size].values

X_train, X_val, y_train, y_val = train_test_split(train_x, train_y, train_size=.80, random_state=1234)

(313864, 1190)


#### 1. Tune num_leaves
* default=31, type=int, alias=num_leaf
* number of leaves in one tree
* control overfit
    * Smaller: underfit
    * larger: overfit

In [None]:
rgr = lgb.LGBMRegressor(learning_rate=0.1,                             
                        n_estimators=100000,
                        num_leaves=63)

rgr.fit(X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric=lgbm_eval_mae,
        early_stopping_rounds=50,
        verbose = False)

print 'best_round: ', rgr.best_iteration
print 'num_leaves: ', rgr.num_leaves
print 'MAE       : ', rgr.evals_result_.get('valid_0').get('mae')[rgr.best_iteration]

In [None]:
num_leaves = 63


#### 2. Tune min_child_samples
* default=10, type=int, alias=min_data_per_leaf , min_data
* Minimal number of data in one leaf. Can use this to deal with over-fit.
* control overfit
    * Smaller: overfit
    * larger: underfit

In [None]:
rgr = lgb.LGBMRegressor(learning_rate=0.1,                             
                        n_estimators=100000,
                        num_leaves=num_leaves,
                        min_child_samples = 100)

rgr.fit(X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric=lgbm_eval_mae,
        early_stopping_rounds=50,
        verbose = False)

print 'best_round: ', rgr.best_iteration
print 'min_child_samples: ', rgr.min_child_samples
print 'MAE       : ', rgr.evals_result_.get('valid_0').get('mae')[rgr.best_iteration]

In [None]:
min_data_in_leaf = 100

#### 3. Tune colsample_bytree
* colsample_bytree, default=1.0, type=double, 0.0 < colsample_bytree < 1.0, alias=sub_feature
* LightGBM will random select part of features on each iteration if feature_fraction smaller than 1.0. For example, if * set to 0.8, will select 80% features before training each tree.
* Can use this to speed up training
* Can use this to deal with over-fit
    * Smaller: overfit
    * larger: underfit

In [None]:
rgr = lgb.LGBMRegressor(learning_rate=0.1,                             
                        n_estimators=100000,
                        num_leaves=num_leaves,
                        min_child_samples = min_child_samples,
                        colsample_bytree = 0.5)

rgr.fit(X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric=lgbm_eval_mae,
        early_stopping_rounds=50,
        verbose = False)

print 'best_round: ', rgr.best_iteration
print 'colsample_bytree: ', rgr.colsample_bytree
print 'MAE       : ', rgr.evals_result_.get('valid_0').get('mae')[rgr.best_iteration]

In [None]:
colsample_bytree = 0.5

#### 4. Tune subsample
* default=0, type=int
* Frequency for bagging, 0 means disable bagging. k means will perform bagging at every k iteration.
* Note: To enable bagging, should set subsample_freq as well (1 is recommended).

In [None]:
rgr = lgb.LGBMRegressor(learning_rate=0.1,                             
                        n_estimators=100000,
                        num_leaves=num_leaves,
                        min_child_samples = min_child_samples,
                        colsample_bytree = colsample_bytree,
                        subsample = 1,
                        subsample_freq=1)

rgr.fit(X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric=lgbm_eval_mae,
        early_stopping_rounds=50,
        verbose = False)

print 'best_round: ', rgr.best_iteration
print 'subsample: ', rgr.subsample
print 'MAE       : ', rgr.evals_result_.get('valid_0').get('mae')[rgr.best_iteration]

In [29]:
subsample = 1

#### 5. Tune max_bin
* default=255, type=int
* max number of bin that feature values will bucket in. Small bin may reduce training accuracy but may increase general power (deal with over-fit).

In [None]:
rgr = lgb.LGBMRegressor(learning_rate=0.1,                             
                        n_estimators=100000,
                        num_leaves=num_leaves,
                        min_child_samples = min_child_samples,
                        colsample_bytree = colsample_bytree,
                        subsample = subsample,
                        subsample_freq=1,
                        max_bin = 511 )

rgr.fit(X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric=lgbm_eval_mae,
        early_stopping_rounds=50,
        verbose = False)

y_pred = rgr.predict(X_val,num_iteration=rgr.best_iteration)
print 'best_round: ', rgr.best_iteration
print 'max_bin: ', rgr.max_bin
print 'MAE       : ', rgr.evals_result_.get('valid_0').get('mae')[rgr.best_iteration]

In [None]:
max_bin = 511

#### 6. Tune reg_alpha
* default=0, type=float
* L1 regularization term on weights (deal with over-fit).

In [None]:
rgr = lgb.LGBMRegressor(learning_rate=0.1,                             
                        n_estimators=100000,
                        num_leaves=num_leaves,
                        min_child_samples = min_child_samples,
                        colsample_bytree = colsample_bytree,
                        subsample = subsample,
                        subsample_freq=1,
                        max_bin = max_bin,
                        reg_alpha = 0.01)

rgr.fit(X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric=lgbm_eval_mae,
        early_stopping_rounds=50,
        verbose = False)

y_pred = rgr.predict(X_val,num_iteration=rgr.best_iteration)
print 'best_round: ', rgr.best_iteration
print 'reg_alpha: ', rgr.reg_alpha
print 'MAE       : ', rgr.evals_result_.get('valid_0').get('mae')[rgr.best_iteration]

In [None]:
reg_alpha = 0.01

### Automated tuning - [Bayesian Optimization](https://github.com/fmfn/BayesianOptimization)

The idea is to set a range for each parameters, for which we can leverage the parameters from manual tuning, then let the bayersian optimization to seek best parameters.

It's more efficient than grid search but is still time consuming. Therefore knowing an approximate range of values for each parameter will greatly improve the performance.

In [None]:
def lgbm_cv(max_bin, num_leaves, min_child_samples, colsample_bytree, subsample, reg_alpha, learning_rate=0.1):
    skf = list(KFold(len(train_y), 4))
    scores=[]
    for i, (train, val) in enumerate(skf):
        est=lgb.LGBMRegressor(learning_rate=0.1,
                        max_bin=int(max_bin),
                        num_leaves=int(num_leaves),
                        min_child_samples=int(min_child_samples),
                        colsample_bytree=colsample_bytree,
                        subsample=subsample,
                        subsample_freq = 1,
                        reg_alpha = reg_alpha
                        )
 
        train_x_fold = train_x.iloc[train]
        train_y_fold = train_y[train]
        val_x_fold = train_x.iloc[val]
        val_y_fold = train_y[val]
        est.set_params( n_estimators=100000)
        est.fit(train_x_fold,
                train_y_fold,
                eval_set=[(val_x_fold, val_y_fold)],
                eval_metric=lgbm_eval_mae,
                early_stopping_rounds=50,
                verbose = False
               )
        val_y_predict_fold = est.predict(val_x_fold)
        score = log_mae(val_y_fold, val_y_predict_fold,200)
        scores.append(score)
    return -np.mean(scores)
            


lgbm_BO = BayesianOptimization(lgbm_cv, {
                                     'max_bin': (447,627),
                                     'num_leaves': (60,180),
                                     'min_child_samples' :(60,140),
                                     'colsample_bytree': (0.15,0.6),
                                     'subsample' : (0.6,1),
                                     'reg_alpha': (0,0.01)})

lgbm_BO.maximize(init_points=5, n_iter=30)
