In [1]:
import pandas as pd
import time
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.feature_extraction.text import CountVectorizer
import random
from sklearn import preprocessing
import lightgbm as lgb
import gc
from scipy.stats import skew, boxcox
from bayes_opt import BayesianOptimization
from scipy import sparse
from sklearn.metrics import log_loss
from datetime import datetime
from scipy.stats.mstats import gmean

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

seed = 2017

# Load Data

In [2]:
data_path = "../input/"
train_X = pd.read_csv(data_path + 'train_BM_0401.csv')
test_X = pd.read_csv(data_path + 'test_BM_0401.csv')
train_y = np.ravel(pd.read_csv(data_path + 'labels_BrandenMurray.csv'))
sub_id = test_X.listing_id.astype('int32').values

# all_features = features_to_use + desc_sparse_cols + feat_sparse_cols
print train_X.shape, test_X.shape, train_y.shape

(49352, 428) (74659, 428) (49352L,)


In [3]:
X_train, X_val, y_train, y_val = train_test_split(train_X, train_y, train_size=.80, random_state=1234)
print X_train.shape
print X_val.shape

# import sys  
# stdi,stdo,stde=sys.stdin,sys.stdout,sys.stderr
# reload(sys)  
# sys.stdin,sys.stdout,sys.stderr=stdi,stdo,stde
# sys.setdefaultencoding('utf8')

(39481, 428)
(9871, 428)


In [5]:
clf = lgb.LGBMClassifier()
clf.set_params(learning_rate = 0.1)
clf.set_params(subsample_freq = 1)
clf.set_params(objective = 'multiclass')
clf.set_params(n_estimators = 100000)
        
clf = clf.fit(X_train, y_train,
              eval_set = [(X_val,y_val)],
              eval_metric = 'multi_logloss',
              early_stopping_rounds = 50,
              verbose = 25)

Train until valid scores didn't improve in 50 rounds.
[25]	valid_0's multi_logloss: 0.602854
[50]	valid_0's multi_logloss: 0.5569
[75]	valid_0's multi_logloss: 0.545154
[100]	valid_0's multi_logloss: 0.539556
[125]	valid_0's multi_logloss: 0.536975
[150]	valid_0's multi_logloss: 0.535236
[175]	valid_0's multi_logloss: 0.534269
[200]	valid_0's multi_logloss: 0.533852
[225]	valid_0's multi_logloss: 0.533521
[250]	valid_0's multi_logloss: 0.533808
Early stopping, best iteration is:
[223]	valid_0's multi_logloss: 0.533439


In [6]:
pred_y = clf.predict_proba(test_X, num_iteration = clf.best_iteration)
pred_y

array([[  4.87241777e-01,   4.59754244e-01,   5.30039791e-02],
       [  9.39061072e-01,   2.87975851e-02,   3.21413429e-02],
       [  9.00711190e-01,   8.74221884e-02,   1.18666212e-02],
       ..., 
       [  9.79164336e-01,   1.99874435e-02,   8.48220964e-04],
       [  9.85101415e-01,   1.46340209e-02,   2.64564034e-04],
       [  6.40233923e-01,   3.43340830e-01,   1.64252476e-02]])

In [7]:
clf

LGBMClassifier(boosting_type='gbdt', colsample_bytree=1, drop_rate=0.1,
        is_unbalance=False, learning_rate=0.1, max_bin=255, max_depth=-1,
        max_drop=50, min_child_samples=10, min_child_weight=5,
        min_split_gain=0, n_estimators=100000, nthread=-1, num_leaves=31,
        objective='multiclass', reg_alpha=0, reg_lambda=0,
        scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=1, subsample_for_bin=50000,
        subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)

In [9]:
# now = datetime.now()
# sub_name = '../output/sub_LightGBM_BM_0322_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

# out_df = pd.DataFrame(pred_y[:,:3])
# out_df.columns = ["low", "medium", "high"]
# out_df["listing_id"] = sub_id
# out_df.to_csv(sub_name, index=False)

# Tune LightGBM

In [8]:
clf = lgb.LGBMClassifier()
clf.set_params(learning_rate = 0.1)
clf.set_params(subsample_freq = 1)
clf.set_params(objective = 'multiclass')
clf.set_params(n_estimators = 100000)

tmp  = 1000

In [9]:
for x in [8,15,31,63,127,255]:
    clf.set_params(num_leaves = x)
    clf = clf.fit(X_train, y_train,
                  eval_set = [(X_val,y_val)],
                  eval_metric = 'multi_logloss',
                  early_stopping_rounds = 50,
                  verbose = False)
    if tmp > clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]:
        num_leaves = x
        tmp = clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]


    print x, ' \t', clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1], clf.best_iteration

8  	0.534184914604 614
15  	0.533866649924 384
31  	0.533438877335 223
63  	0.534966878101 128
127  	0.536485297438 84
255  	0.540957829211 64


In [10]:
print num_leaves
clf.set_params(num_leaves = num_leaves)

31


LGBMClassifier(boosting_type='gbdt', colsample_bytree=1, drop_rate=0.1,
        is_unbalance=False, learning_rate=0.1, max_bin=255, max_depth=-1,
        max_drop=50, min_child_samples=10, min_child_weight=5,
        min_split_gain=0, n_estimators=100000, nthread=-1, num_leaves=31,
        objective='multiclass', reg_alpha=0, reg_lambda=0,
        scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=1, subsample_for_bin=50000,
        subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)

In [11]:
min_child_samples = 10

for x in [20, 30, 50, 70, 80,90,100,110,120,150,170,200,230,260]:
    clf.set_params(min_child_samples = x)
    clf = clf.fit(X_train, y_train,
                  eval_set = [(X_val,y_val)],
                  eval_metric = 'multi_logloss',
                  early_stopping_rounds = 50,
                  verbose = False)
    if tmp > clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]:
        min_child_samples = x
        tmp = clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]


    print x, ' \t', clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1], clf.best_iteration

20  	0.532594819811 313
30  	0.53261320808 212
50  	0.533936088415 231
70  	0.533932672968 185
80  	0.53273199529 282
90  	0.533575376848 223
100  	0.532221825493 226
110  	0.531233211105 255
120  	0.532680049982 215
150  	0.532639182712 278
170  	0.532213958499 252
200  	0.532006011641 234
230  	0.532597043533 230
260  	0.531974482836 238


In [12]:
for x in [300,350,400,450,500]:
    clf.set_params(min_child_samples = x)
    clf = clf.fit(X_train, y_train,
                  eval_set = [(X_val,y_val)],
                  eval_metric = 'multi_logloss',
                  early_stopping_rounds = 50,
                  verbose = False)
    if tmp > clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]:
        min_child_samples = x
        tmp = clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]


    print x, ' \t', clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1], clf.best_iteration

300  	0.531377819839 282
350  	0.530773757817 239
400  	0.532280104266 217
450  	0.530762448236 275
500  	0.53200893953 228


In [13]:
for x in [550,600,650,700,800]:
    clf.set_params(min_child_samples = x)
    clf = clf.fit(X_train, y_train,
                  eval_set = [(X_val,y_val)],
                  eval_metric = 'multi_logloss',
                  early_stopping_rounds = 50,
                  verbose = False)
    if tmp > clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]:
        min_child_samples = x
        tmp = clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]


    print x, ' \t', clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1], clf.best_iteration

550  	0.535720729063 308
600  	0.535027375437 254
650  	0.536249586582 298
700  	0.534918784239 221
800  	0.53543830493 306


In [14]:
print min_child_samples
clf.set_params(min_child_samples = min_child_samples)


450


LGBMClassifier(boosting_type='gbdt', colsample_bytree=1, drop_rate=0.1,
        is_unbalance=False, learning_rate=0.1, max_bin=255, max_depth=-1,
        max_drop=50, min_child_samples=450, min_child_weight=5,
        min_split_gain=0, n_estimators=100000, nthread=-1, num_leaves=31,
        objective='multiclass', reg_alpha=0, reg_lambda=0,
        scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=1, subsample_for_bin=50000,
        subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)

In [15]:
colsample_bytree = 1
for x in [0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]:
    clf.set_params(colsample_bytree = x)
    clf = clf.fit(X_train, y_train,
                  eval_set = [(X_val,y_val)],
                  eval_metric = 'multi_logloss',
                  early_stopping_rounds = 50,
                  verbose = False)
    if tmp > clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]:
        colsample_bytree = x
        tmp = clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]


    print x, ' \t', clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1], clf.best_iteration

0.2  	0.528559521572 405
0.3  	0.529150420199 296
0.4  	0.527694767146 292
0.5  	0.529164261697 248
0.6  	0.530537690773 239
0.7  	0.531266177296 273
0.8  	0.531278855888 237
0.9  	0.532617743425 270


In [16]:
print colsample_bytree

clf.set_params(colsample_bytree = colsample_bytree)

0.4


LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.4, drop_rate=0.1,
        is_unbalance=False, learning_rate=0.1, max_bin=255, max_depth=-1,
        max_drop=50, min_child_samples=450, min_child_weight=5,
        min_split_gain=0, n_estimators=100000, nthread=-1, num_leaves=31,
        objective='multiclass', reg_alpha=0, reg_lambda=0,
        scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=1, subsample_for_bin=50000,
        subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)

In [17]:
subsample = 1.0
for x in [0.5,0.6,0.7,0.8,0.9]:
    clf.set_params(subsample = x)
    clf = clf.fit(X_train, y_train,
                  eval_set = [(X_val,y_val)],
                  eval_metric = 'multi_logloss',
                  early_stopping_rounds = 50,
                  verbose = False)
    if tmp > clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]:
        subsample = x
        tmp = clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]


    print x, ' \t', clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1], clf.best_iteration

0.5  	0.540031036293 277
0.6  	0.538408178072 253
0.7  	0.535956199754 280
0.8  	0.534108633235 278
0.9  	0.530413587987 213


In [18]:
print subsample
clf.set_params(subsample = subsample)

1.0


LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.4, drop_rate=0.1,
        is_unbalance=False, learning_rate=0.1, max_bin=255, max_depth=-1,
        max_drop=50, min_child_samples=450, min_child_weight=5,
        min_split_gain=0, n_estimators=100000, nthread=-1, num_leaves=31,
        objective='multiclass', reg_alpha=0, reg_lambda=0,
        scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=1.0, subsample_for_bin=50000,
        subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)

In [19]:
max_bin = 255

for x in [15,31,63, 127, 511, 1023, 2047]: #[200,300,400]:#
    clf.set_params(max_bin = x)
    clf = clf.fit(X_train, y_train,
                  eval_set = [(X_val,y_val)],
                  eval_metric = 'multi_logloss',
                  early_stopping_rounds = 50,
                  verbose = False)
    if tmp > clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]:
        max_bin = x
        tmp = clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]


    print x, ' \t', clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1], clf.best_iteration

15  	0.530507777215 374
31  	0.529957865758 378
63  	0.527968757557 281
127  	0.527666103496 352
511  	0.529702379124 244
1023  	0.528091154786 283
2047  	0.530109284354 233


In [20]:
for x in [50,80, 110, 150,180, 210, 300, 350,400,450, 550,600]:
    clf.set_params(max_bin = x)
    clf = clf.fit(X_train, y_train,
                  eval_set = [(X_val,y_val)],
                  eval_metric = 'multi_logloss',
                  early_stopping_rounds = 50,
                  verbose = False)
    if tmp > clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]:
        max_bin = x
        tmp = clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]


    print x, ' \t', clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1], clf.best_iteration

50  	0.529340700139 361
80  	0.528698390057 371
110  	0.527778555464 308
150  	0.528243765336 309
180  	0.52797289274 297
210  	0.528762026309 260
300  	0.529304104592 283
350  	0.530034916459 240
400  	0.527407117844 337
450  	0.53038124861 331
550  	0.529340949386 306
600  	0.529412443326 309


In [21]:
print max_bin
clf.set_params(max_bin = max_bin)

400


LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.4, drop_rate=0.1,
        is_unbalance=False, learning_rate=0.1, max_bin=400, max_depth=-1,
        max_drop=50, min_child_samples=450, min_child_weight=5,
        min_split_gain=0, n_estimators=100000, nthread=-1, num_leaves=31,
        objective='multiclass', reg_alpha=0, reg_lambda=0,
        scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=1.0, subsample_for_bin=50000,
        subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)

In [24]:
def lgbm_cv(max_bin, num_leaves, min_child_samples, colsample_bytree, subsample, learning_rate=0.1):
    skf = KFold(n_splits=5,random_state=seed)
    scores=[]
    for i, (train, val) in enumerate(skf.split(train_X)):
        est=lgb.LGBMClassifier(learning_rate=0.1,
                               max_bin=int(max_bin),
                               num_leaves=int(num_leaves),
                               min_child_samples=int(min_child_samples),
                               colsample_bytree=colsample_bytree,
                               subsample=subsample,
                               subsample_freq = 1
                              )
 
        train_x_fold = train_X.iloc[train]
        train_y_fold = train_y[train]
        val_x_fold = train_X.iloc[val]
        val_y_fold = train_y[val]
        est.set_params( n_estimators=100000)
        est.fit(train_x_fold,
                train_y_fold,
                eval_set=[(val_x_fold, val_y_fold)],
                eval_metric='multi_logloss',
                early_stopping_rounds=50,
                verbose = False
               )
        val_y_predict_fold = est.predict_proba(val_x_fold)
        score = log_loss(val_y_fold, val_y_predict_fold)
        scores.append(score)
    return -np.mean(scores)


lgbm_BO = BayesianOptimization(lgbm_cv, 
                               {
                                'max_bin': (200,800),
                                'num_leaves': (8,80),
                                'min_child_samples' :(60,500),
                                'colsample_bytree': (0.3,0.7),
                                'subsample' : (0.7,1)})

lgbm_BO.maximize(init_points=10, n_iter=40)

[31mInitialization[0m
[94m-----------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |   max_bin |   min_child_samples |   num_leaves |   subsample | 
    1 | 01m34s | [35m  -0.52729[0m | [32m            0.3133[0m | [32m 589.9460[0m | [32m           268.8357[0m | [32m     50.7648[0m | [32m     0.8742[0m | 
    2 | 02m29s |   -0.52932 |             0.6879 |  740.7174 |            405.9792 |      11.3948 |      0.7703 | 
    3 | 02m48s |   -0.52774 |             0.5401 |  598.6027 |            375.8580 |      24.1757 |      0.8293 | 
    4 | 02m28s | [35m  -0.52714[0m | [32m            0.6169[0m | [32m 749.5119[0m | [32m           266.9397[0m | [32m     53.4135[0m | [32m     0.8691[0m | 
    5 | 01m56s |   -0.52757 |             0.5187 |  666.6244 |             95.1477 |      48.5823 |      0.7286 | 
    6 | 02m04s |   -0.52721 |             0.5061 |  333

  " state: %s" % convergence_dict)


   14 | 01m44s | [35m  -0.52559[0m | [32m            0.4713[0m | [32m 207.0710[0m | [32m            62.3656[0m | [32m     19.4815[0m | [32m     0.7813[0m | 
   15 | 02m37s | [35m  -0.52546[0m | [32m            0.4535[0m | [32m 397.8426[0m | [32m           106.9241[0m | [32m     10.4720[0m | [32m     0.9240[0m | 
   16 | 02m59s |   -0.52699 |             0.5011 |  468.3995 |            287.2805 |      12.8670 |      0.8117 | 
   17 | 02m28s |   -0.52958 |             0.4877 |  215.4646 |            483.0462 |       8.0135 |      0.9121 | 
   18 | 03m00s |   -0.52708 |             0.4797 |  303.5365 |            227.8765 |      11.4661 |      0.8788 | 
   19 | 02m55s |   -0.52819 |             0.6839 |  487.7124 |            103.6519 |      69.0829 |      0.7694 | 
   20 | 03m24s |   -0.52557 |             0.4204 |  712.7941 |            182.3048 |      14.5081 |      0.9308 | 
   21 | 02m51s |   -0.52666 |             0.4651 |  320.2883 |            107.1398 |   

  " state: %s" % convergence_dict)


   22 | 02m59s |   -0.52574 |             0.6102 |  597.5082 |             93.1867 |      14.7655 |      0.8099 | 
   23 | 03m42s |   -0.52759 |             0.5549 |  593.9066 |            280.2198 |       9.8184 |      0.7990 | 
   24 | 02m11s |   -0.52586 |             0.5272 |  214.8638 |            166.8518 |      59.1982 |      0.9403 | 
   25 | 02m14s |   -0.52672 |             0.3630 |  381.0544 |            327.9845 |      50.2107 |      0.9700 | 


  " state: %s" % convergence_dict)


   26 | 02m57s | [35m  -0.52507[0m | [32m            0.4978[0m | [32m 559.9400[0m | [32m            64.2226[0m | [32m     17.1944[0m | [32m     0.9345[0m | 


  " state: %s" % convergence_dict)


   27 | 03m05s |   -0.52866 |             0.4651 |  728.8284 |            237.5077 |      12.7237 |      0.7018 | 
   28 | 02m09s |   -0.52598 |             0.4760 |  466.4442 |             78.4595 |      21.9301 |      0.7044 | 


  " state: %s" % convergence_dict)


   29 | 02m42s |   -0.52910 |             0.4290 |  657.2222 |            332.2029 |      72.5399 |      0.9146 | 
   30 | 02m10s |   -0.52796 |             0.5623 |  217.0109 |            183.0951 |      16.8085 |      0.7138 | 


  " state: %s" % convergence_dict)


   31 | 02m41s |   -0.52923 |             0.3964 |  218.4037 |            467.5303 |      73.4901 |      0.8638 | 
   32 | 03m23s |   -0.52569 |             0.4058 |  536.0100 |             68.1054 |      13.2079 |      0.9545 | 
   33 | 02m01s |   -0.52890 |             0.3984 |  270.7654 |            304.7278 |      79.0575 |      0.8079 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   34 | 03m40s |   -0.52642 |             0.4951 |  230.9936 |             94.5305 |      69.8697 |      0.8894 | 
   35 | 02m44s | [35m  -0.52425[0m | [32m            0.3070[0m | [32m 376.6407[0m | [32m            64.9630[0m | [32m     22.3820[0m | [32m     0.9476[0m | 
   36 | 02m26s |   -0.52871 |             0.6149 |  438.7484 |            456.7444 |      49.7697 |      0.8943 | 
   37 | 03m21s |   -0.52582 |             0.6680 |  798.6898 |             91.7745 |      14.1578 |      0.8281 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   38 | 03m02s |   -0.52684 |             0.5978 |  792.3995 |             65.6243 |      70.4431 |      0.9093 | 
   39 | 03m09s |   -0.52639 |             0.4983 |  449.7922 |            352.9653 |      14.2125 |      0.8703 | 
   40 | 03m27s |   -0.52619 |             0.6727 |  747.0193 |            148.8777 |      27.6480 |      0.8322 | 


  " state: %s" % convergence_dict)


   41 | 03m42s |   -0.52710 |             0.6931 |  468.2321 |            335.8678 |      73.0873 |      0.9586 | 
   42 | 02m06s |   -0.52489 |             0.4257 |  421.2139 |             66.0217 |      31.6154 |      0.8991 | 


  " state: %s" % convergence_dict)


   43 | 02m07s |   -0.52605 |             0.4459 |  715.5591 |             68.9222 |      17.5104 |      0.8731 | 


  " state: %s" % convergence_dict)


   44 | 01m57s |   -0.53350 |             0.4252 |  791.8792 |            457.1748 |      79.0675 |      0.7146 | 
   45 | 02m16s |   -0.52825 |             0.4205 |  374.5565 |            204.9129 |      56.4942 |      0.7647 | 
   46 | 02m47s |   -0.52777 |             0.6794 |  354.5173 |            366.8474 |      11.5122 |      0.8544 | 


  " state: %s" % convergence_dict)


   47 | 02m50s |   -0.52649 |             0.6697 |  717.5593 |            145.0487 |      77.6690 |      0.9634 | 


  " state: %s" % convergence_dict)


   48 | 02m25s |   -0.53328 |             0.6209 |  347.2920 |            497.5252 |      64.5198 |      0.7667 | 
   49 | 01m52s |   -0.52863 |             0.3022 |  511.6160 |            384.0916 |      18.7794 |      0.7010 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   50 | 02m43s |   -0.52509 |             0.6233 |  357.6424 |             66.2162 |      16.3892 |      0.9344 | 


In [None]:
#  	num_leaves 	 	min_child_samples 	max_bin 	colsample_bytree 	subsample 	score
# 34 	17.342582 	158.175569 	 	 	453.587691 	0.309807 	 	 	0.951246 	-0.523952
# 36 	34.809317 	175.689702 	 	 	431.124869 	0.420417 	 	 	0.980390 	-0.524356
# 16 	13.123686 	120.179447 	 	 	689.223522 	0.706641 	 	 	0.769943 	-0.524734
# 33 	18.297130 	121.709972 	 	 	452.154081 	0.467294 	 	 	0.913592 	-0.524832
# 24 	8.498056 	122.380717 	 	 	540.797144 	0.318956 	 	 	0.953308 	-0.524889

# 0331 data

In [25]:
gbm_bo_scores = pd.DataFrame([[s[0]['num_leaves'],
                               s[0]['min_child_samples'],
                               s[0]['max_bin'],
                               s[0]['colsample_bytree'],
                               s[0]['subsample'],
                               s[1]] for s in zip(lgbm_BO.res['all']['params'],lgbm_BO.res['all']['values'])],
                            columns = ['num_leaves',
                                       'min_child_samples',
                                       'max_bin',
                                       'colsample_bytree',
                                       'subsample',
                                       'score'])
gbm_bo_scores=gbm_bo_scores.sort_values('score',ascending=False)
gbm_bo_scores.head(10)

Unnamed: 0,num_leaves,min_child_samples,max_bin,colsample_bytree,subsample,score
24,22.382049,64.962952,376.64068,0.306975,0.947558,-0.524254
31,31.615417,66.021705,421.213915,0.425728,0.899097,-0.524894
15,17.194424,64.222584,559.939981,0.49779,0.934519,-0.525071
39,16.38924,66.21621,357.642431,0.623331,0.934423,-0.525089
4,10.47195,106.92412,397.84257,0.453457,0.924038,-0.525457
9,14.508117,182.304842,712.79409,0.420441,0.930807,-0.525571
3,19.481535,62.36555,207.070979,0.471294,0.781278,-0.52559
0,25.533528,117.166334,232.38312,0.450393,0.896,-0.525687
21,13.207873,68.105397,536.009959,0.405839,0.954524,-0.525693
11,14.765542,93.186696,597.508214,0.610208,0.809859,-0.525743


In [4]:
def lgbm_blend(estimators, train_x, train_y, test_x, fold, early_stopping_rounds=50):
    N_params = len(estimators)
    print ("Blend %d estimators for %d folds" % (N_params, fold))
    skf = KFold(n_splits=fold,random_state=seed)
    N_class = len(set(train_y))
    
#     train_blend_x = np.zeros((train_x.shape[0], N_class*N_params))
#     test_blend_x = np.zeros((test_x.shape[0], N_class*N_params))
#     scores = np.zeros ((fold,N_params))
#     best_rounds = np.zeros ((fold, N_params))
    train_blend_x = np.zeros((train_x.shape[0], N_class*N_params))
    test_blend_x_mean = np.zeros((test_x.shape[0], N_class*N_params))
    test_blend_x_gmean = np.zeros((test_x.shape[0], N_class*N_params))
    scores = np.zeros((fold,N_params))
    best_rounds = np.zeros((fold, N_params))    

    
    for j, est in enumerate(estimators):
        est.set_params(learning_rate = 0.005)
        est.set_params(subsample_freq = 1)
        est.set_params(objective = 'multiclass')
        est.set_params(n_estimators = 1000000)

        
        print ("Model %d: %s" %(j+1, est)) 

        
        test_blend_x_j = np.zeros((test_x.shape[0], N_class*fold))
        for i, (train_index, val_index) in enumerate(skf.split(train_x)):
            print ("Model %d fold %d" %(j+1,i+1))
            fold_start = time.time() 
            train_x_fold = train_x.iloc[train_index]
            train_y_fold = train_y[train_index]
            val_x_fold = train_x.iloc[val_index]
            val_y_fold = train_y[val_index]
            
            est.fit(train_x_fold, train_y_fold,
                   eval_set = [(val_x_fold,val_y_fold)],
                   eval_metric = 'multi_logloss',
                   early_stopping_rounds = early_stopping_rounds,
                   verbose = False)
            
            best_round=est.best_iteration
            best_rounds[i,j]=best_round
            print ("best round %d" % (best_round))
            
            val_y_predict_fold = est.predict_proba(val_x_fold,num_iteration = best_round)
            score = log_loss(val_y_fold, val_y_predict_fold)
            print ("Score: ", score)
            scores[i,j]=score   
            train_blend_x[val_index, (j*N_class):(j+1)*N_class] = val_y_predict_fold
            test_blend_x_j[:,(i*N_class):(i+1)*N_class] = est.predict_proba(test_x,num_iteration=best_round)
            
            print ("Model %d fold %d fitting finished in %0.3fs" % (j+1,i+1, time.time() - fold_start))            
            
#         test_blend_x[:,(j*N_class):(j+1)*N_class] = \
#                 np.stack([test_blend_x_j[:,range(0,N_class*fold,N_class)].mean(1),
#                           test_blend_x_j[:,range(1,N_class*fold,N_class)].mean(1),
#                           test_blend_x_j[:,range(2,N_class*fold,N_class)].mean(1)]).T
            
        test_blend_x_mean[:,(j*N_class):(j+1)*N_class] = \
                np.stack([test_blend_x_j[:,range(0,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(1,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(2,N_class*fold,N_class)].mean(1)]).T
        
        test_blend_x_gmean[:,(j*N_class):(j+1)*N_class] = \
                np.stack([gmean(test_blend_x_j[:,range(0,N_class*fold,N_class)], axis=1),
                          gmean(test_blend_x_j[:,range(1,N_class*fold,N_class)], axis=1),
                          gmean(test_blend_x_j[:,range(2,N_class*fold,N_class)], axis=1)]).T
            
        print ("Score for model %d is %f" % (j+1,np.mean(scores[:,j])))
    print ("Score for blended models is %f" % (np.mean(scores)))
    return (train_blend_x, test_blend_x_mean, test_blend_x_gmean, scores,best_rounds)

Unnamed: 0,num_loc_price_diff,num_price,num_loc_median_price
710,-49.5,2600,2649.5
779,-500.0,2750,3250.0
988,5700.0,8000,2300.0
1542,2800.0,10000,7200.0
2099,-1555.0,4195,5750.0
3447,-200.0,4200,4400.0
3697,-3464.0,2300,5764.0
4662,4200.0,6500,2300.0
4669,3275.0,6200,2925.0
4689,-1169.0,4595,5764.0


In [5]:
est =       [lgb.LGBMClassifier(num_leaves = 17,
                                min_child_samples = 101,
                                colsample_bytree = 0.317697,
                                subsample = 0.947757,
                                max_bin = 572),
             lgb.LGBMClassifier(num_leaves = 22,
                                min_child_samples = 64,
                                colsample_bytree = 0.306975,
                                subsample = 0.947558,
                                max_bin = 376),
             lgb.LGBMClassifier(num_leaves = 31,
                                min_child_samples = 66,
                                colsample_bytree = 0.425728,
                                subsample = 0.899097,
                                max_bin = 421),
             lgb.LGBMClassifier(num_leaves = 17,
                                min_child_samples = 64,
                                colsample_bytree = 0.497790,
                                subsample = 0.934519,
                                max_bin = 559),
             lgb.LGBMClassifier(num_leaves = 16,
                                min_child_samples = 66,
                                colsample_bytree = 0.623331,
                                subsample = 0.934423,
                                max_bin = 357)]

#  	 	num_leaves 	min_child_samples 	max_bin 	colsample_bytree 	subsample 	score
# 39 	17.827540 	101.915438 	 	 	572.750117 	0.317697 	 	 	0.947757 	-0.524479
# 24 	22.382049 	64.962952 	 	 	376.640680 	0.306975 	 	 	0.947558 	-0.524254
# 31 	31.615417 	66.021705 	 	 	421.213915 	0.425728 	 	 	0.899097 	-0.524894
# 15 	17.194424 	64.222584 	 	 	559.939981 	0.497790 	 	 	0.934519 	-0.525071
# 39 	16.389240 	66.216210 	 	 	357.642431 	0.623331 	 	 	0.934423 	-0.525089
# 36 	45.039551 	103.986754 	338.247631 	0.369964 	0.918298 	-0.525114
# 4 	10.471950 	106.924120 	397.842570 	0.453457 	0.924038 	-0.525457
# 9 	14.508117 	182.304842 	712.794090 	0.420441 	0.930807 	-0.525571
# 3 	19.481535 	62.365550 	207.070979 	0.471294 	0.781278 	-0.525590
# 9 	31.402499 	301.654857 	574.185729 	0.672997 	0.965712 	-0.525661
# 0 	25.533528 	117.166334 	232.383120 	0.450393 	0.896000 	-0.525687
# 21 	13.207873 	68.105397 	536.009959 	0.405839 	0.954524 	-0.525693
# 11 	14.765542 	93.186696 	597.508214 	0.610208 	0.809859 	-0.525743


(train_blend_x_gbm,
 test_blend_x_gbm_mean,
 test_blend_x_gbm_gmean,
 blend_scores_gbm,
 best_rounds_gbm)= lgbm_blend(est, 
                               train_X, train_y, 
                               test_X,
                               10,
                               1000) #as the learning rate decreases the number of stopping rounds need to be increased

Blend 5 estimators for 10 folds
Model 1: LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.317697, drop_rate=0.1,
        is_unbalance=False, learning_rate=0.005, max_bin=572, max_depth=-1,
        max_drop=50, min_child_samples=101, min_child_weight=5,
        min_split_gain=0, n_estimators=1000000, nthread=-1, num_leaves=17,
        objective='multiclass', reg_alpha=0, reg_lambda=0,
        scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=0.947757, subsample_for_bin=50000,
        subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)
Model 1 fold 1
best round 14207
('Score: ', 0.50722064493075603)
Model 1 fold 1 fitting finished in 676.972s
Model 1 fold 2
best round 19337
('Score: ', 0.49107678244137304)
Model 1 fold 2 fitting finished in 852.580s
Model 1 fold 3
best round 14296
('Score: ', 0.51806582706202187)
Model 1 fold 3 fitting finished in 645.320s
Model 1 fold 4
best round 15855
('Score: ', 0.49455983301319034)
Model 1 fold 

In [None]:
# data 0322


# Blend 5 estimators for 10 folds
# Model 1: LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.398779, drop_rate=0.1,
#         is_unbalance=False, learning_rate=0.005, max_bin=357, max_depth=-1,
#         max_drop=50, min_child_samples=168, min_child_weight=5,
#         min_split_gain=0, n_estimators=1000000, nthread=-1, num_leaves=16,
#         objective='multiclass', reg_alpha=0, reg_lambda=0,
#         scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
#         skip_drop=0.5, subsample=0.94605, subsample_for_bin=50000,
#         subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)
# Model 1 fold 1
# best round 13658
# ('Score: ', 0.51511027734945769)
# Model 1 fold 1 fitting finished in 666.358s
# Model 1 fold 2
# best round 18966
# ('Score: ', 0.49748491939132805)
# Model 1 fold 2 fitting finished in 933.243s
# Model 1 fold 3
# best round 16912
# ('Score: ', 0.52418540090167209)
# Model 1 fold 3 fitting finished in 744.659s
# Model 1 fold 4
# best round 17529
# ('Score: ', 0.49909289305104237)
# Model 1 fold 4 fitting finished in 671.642s
# Model 1 fold 5
# best round 12866
# ('Score: ', 0.53343311318041697)
# Model 1 fold 5 fitting finished in 568.086s
# Model 1 fold 6
# best round 13399
# ('Score: ', 0.52049272741295138)
# Model 1 fold 6 fitting finished in 505.250s
# Model 1 fold 7
# best round 13580
# ('Score: ', 0.52872157155539778)
# Model 1 fold 7 fitting finished in 472.583s
# Model 1 fold 8
# best round 14739
# ('Score: ', 0.54319696370850756)
# Model 1 fold 8 fitting finished in 526.231s
# Model 1 fold 9
# best round 15989
# ('Score: ', 0.53855626201600892)
# Model 1 fold 9 fitting finished in 591.713s
# Model 1 fold 10
# best round 15025
# ('Score: ', 0.53069150503223939)
# Model 1 fold 10 fitting finished in 553.884s
# Score for model 1 is 0.523097
# Model 2: LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.7164, drop_rate=0.1,
#         is_unbalance=False, learning_rate=0.005, max_bin=226, max_depth=-1,
#         max_drop=50, min_child_samples=87, min_child_weight=5,
#         min_split_gain=0, n_estimators=1000000, nthread=-1, num_leaves=18,
#         objective='multiclass', reg_alpha=0, reg_lambda=0,
#         scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
#         skip_drop=0.5, subsample=0.898679, subsample_for_bin=50000,
#         subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)
# Model 2 fold 1
# best round 10248
# ('Score: ', 0.51279527489887844)
# Model 2 fold 1 fitting finished in 463.937s
# Model 2 fold 2
# best round 15443
# ('Score: ', 0.4976452352207959)
# Model 2 fold 2 fitting finished in 638.788s
# Model 2 fold 3
# best round 12373
# ('Score: ', 0.52193792417675422)
# Model 2 fold 3 fitting finished in 523.411s
# Model 2 fold 4
# best round 12944
# ('Score: ', 0.4994116556253469)
# Model 2 fold 4 fitting finished in 548.260s
# Model 2 fold 5
# best round 10491
# ('Score: ', 0.53427000487297915)
# Model 2 fold 5 fitting finished in 480.181s
# Model 2 fold 6
# best round 11402
# ('Score: ', 0.52198833831101743)
# Model 2 fold 6 fitting finished in 446.453s
# Model 2 fold 7
# best round 11700
# ('Score: ', 0.52975063233231268)
# Model 2 fold 7 fitting finished in 508.264s
# Model 2 fold 8
# best round 13942
# ('Score: ', 0.54124823287843038)
# Model 2 fold 8 fitting finished in 579.405s
# Model 2 fold 9
# best round 13851
# ('Score: ', 0.53849899306996529)
# Model 2 fold 9 fitting finished in 575.462s
# Model 2 fold 10
# best round 12086
# ('Score: ', 0.52869565211129366)
# Model 2 fold 10 fitting finished in 512.622s
# Score for model 2 is 0.522624
# Model 3: LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.39677, drop_rate=0.1,
#         is_unbalance=False, learning_rate=0.005, max_bin=351, max_depth=-1,
#         max_drop=50, min_child_samples=171, min_child_weight=5,
#         min_split_gain=0, n_estimators=1000000, nthread=-1, num_leaves=15,
#         objective='multiclass', reg_alpha=0, reg_lambda=0,
#         scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
#         skip_drop=0.5, subsample=0.970258, subsample_for_bin=50000,
#         subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)
# Model 3 fold 1
# best round 12874
# ('Score: ', 0.51579970828744059)
# Model 3 fold 1 fitting finished in 460.611s
# Model 3 fold 2
# best round 18224
# ('Score: ', 0.49875315482793847)
# Model 3 fold 2 fitting finished in 617.800s
# Model 3 fold 3
# best round 16410
# ('Score: ', 0.52445295623291099)
# Model 3 fold 3 fitting finished in 556.717s
# Model 3 fold 4
# best round 15824
# ('Score: ', 0.50021228553164887)
# Model 3 fold 4 fitting finished in 564.615s
# Model 3 fold 5
# best round 15533
# ('Score: ', 0.53396460910535426)
# Model 3 fold 5 fitting finished in 532.299s
# Model 3 fold 6
# best round 12458
# ('Score: ', 0.52040391903309924)
# Model 3 fold 6 fitting finished in 421.335s
# Model 3 fold 7
# best round 15590
# ('Score: ', 0.52977879904304848)
# Model 3 fold 7 fitting finished in 538.673s
# Model 3 fold 8
# best round 15934
# ('Score: ', 0.54355795947963481)
# Model 3 fold 8 fitting finished in 552.762s
# Model 3 fold 9
# best round 16181
# ('Score: ', 0.53803790113040595)
# Model 3 fold 9 fitting finished in 553.873s
# Model 3 fold 10
# best round 15159
# ('Score: ', 0.53069840648618571)
# Model 3 fold 10 fitting finished in 531.805s
# Score for model 3 is 0.523566
# Model 4: LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.402, drop_rate=0.1,
#         is_unbalance=False, learning_rate=0.005, max_bin=338, max_depth=-1,
#         max_drop=50, min_child_samples=120, min_child_weight=5,
#         min_split_gain=0, n_estimators=1000000, nthread=-1, num_leaves=36,
#         objective='multiclass', reg_alpha=0, reg_lambda=0,
#         scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
#         skip_drop=0.5, subsample=0.9845, subsample_for_bin=50000,
#         subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)
# Model 4 fold 1
# best round 6780
# ('Score: ', 0.51297224026086863)
# Model 4 fold 1 fitting finished in 388.870s
# Model 4 fold 2
# best round 9795
# ('Score: ', 0.49493678490657994)
# Model 4 fold 2 fitting finished in 456.681s
# Model 4 fold 3
# best round 8109
# ('Score: ', 0.52283766823302491)
# Model 4 fold 3 fitting finished in 446.561s
# Model 4 fold 4
# best round 8156
# ('Score: ', 0.49875483248461955)
# Model 4 fold 4 fitting finished in 438.891s
# Model 4 fold 5
# best round 6897
# ('Score: ', 0.53322646425989406)
# Model 4 fold 5 fitting finished in 356.814s
# Model 4 fold 6
# best round 6370
# ('Score: ', 0.52000404633950559)
# Model 4 fold 6 fitting finished in 469.238s
# Model 4 fold 7
# best round 7291
# ('Score: ', 0.52866228253282754)
# Model 4 fold 7 fitting finished in 406.877s
# Model 4 fold 8
# best round 8043
# ('Score: ', 0.54111737889176825)
# Model 4 fold 8 fitting finished in 426.052s
# Model 4 fold 9
# best round 7127
# ('Score: ', 0.53782950085297831)
# Model 4 fold 9 fitting finished in 370.410s
# Model 4 fold 10
# best round 8311
# ('Score: ', 0.52849296178887739)
# Model 4 fold 10 fitting finished in 467.649s
# Score for model 4 is 0.521883
# Model 5: LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.562594, drop_rate=0.1,
#         is_unbalance=False, learning_rate=0.005, max_bin=232, max_depth=-1,
#         max_drop=50, min_child_samples=80, min_child_weight=5,
#         min_split_gain=0, n_estimators=1000000, nthread=-1, num_leaves=21,
#         objective='multiclass', reg_alpha=0, reg_lambda=0,
#         scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
#         skip_drop=0.5, subsample=0.886285, subsample_for_bin=50000,
#         subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)
# Model 5 fold 1
# best round 11339
# ('Score: ', 0.51246952947927349)
# Model 5 fold 1 fitting finished in 473.511s
# Model 5 fold 2
# best round 13498
# ('Score: ', 0.49665250024834595)
# Model 5 fold 2 fitting finished in 544.382s
# Model 5 fold 3
# best round 12945
# ('Score: ', 0.52162202839399208)
# Model 5 fold 3 fitting finished in 522.065s
# Model 5 fold 4
# best round 12176
# ('Score: ', 0.49930094273345088)
# Model 5 fold 4 fitting finished in 478.193s
# Model 5 fold 5
# best round 10737
# ('Score: ', 0.53347270474127595)
# Model 5 fold 5 fitting finished in 464.415s
# Model 5 fold 6
# best round 9892
# ('Score: ', 0.52071472372899585)
# Model 5 fold 6 fitting finished in 436.333s
# Model 5 fold 7
# best round 10177
# ('Score: ', 0.52835928323419556)
# Model 5 fold 7 fitting finished in 411.271s
# Model 5 fold 8
# best round 11398
# ('Score: ', 0.54099733268775996)
# Model 5 fold 8 fitting finished in 449.035s
# Model 5 fold 9
# best round 11651
# ('Score: ', 0.53796773146228249)
# Model 5 fold 9 fitting finished in 503.545s
# Model 5 fold 10
# best round 12079
# ('Score: ', 0.5282561706228871)
# Model 5 fold 10 fitting finished in 498.074s
# Score for model 5 is 0.521981
# Score for blended models is 0.522630

In [6]:
now = datetime.now()

name_train_blend = '../blend/train_blend_LightGBM_BM_0401_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_mean = '../blend/test_blend_LightGBM_mean_BM_0401_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_gmean = '../blend/test_blend_LightGBM_gmean_BM_0401_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'


print (np.mean(blend_scores_gbm,axis=0))
print (np.mean(best_rounds_gbm,axis=0))
np.savetxt(name_train_blend,train_blend_x_gbm, delimiter=",")
np.savetxt(name_test_blend_mean,test_blend_x_gbm_mean, delimiter=",")
np.savetxt(name_test_blend_gmean,test_blend_x_gbm_gmean, delimiter=",")

[ 0.51770578  0.51729681  0.51737595  0.51798511  0.518612  ]
[ 14772.2  12224.4   8384.3  13985.8  14431.3]


In [7]:
sub_name = '../output/sub_LightGBM_BM_0401_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

out_df = pd.DataFrame(test_blend_x_gbm_mean[:,6:9])
out_df.columns = ["low", "medium", "high"]
out_df["listing_id"] = sub_id
out_df.to_csv(sub_name, index=False)

In [None]:
# data 0322

# [ 0.52309656  0.52262419  0.52356597  0.52188342  0.52198129]
# [ 15266.3  12448.   15418.7   7687.9  11589.2]

# data 0331
# [ 0.51778446  0.51758745  0.51859108  0.51763268  0.51941101]
# [ 15053.7   8042.4  16049.9  13754.3  28486.6]


In [35]:
temp = (test_blend_x_gbm_mean[:,6:9] +test_blend_x_gbm_gmean[:,6:9])/2

In [37]:
sub_name = '../output/sub_LightGBM_BM_0331_total_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

out_df = pd.DataFrame(temp)
out_df.columns = ["low", "medium", "high"]
out_df["listing_id"] = sub_id
out_df.to_csv(sub_name, index=False)

Unnamed: 0,num_loc_price_diff,num_price,num_loc_median_price
710,,2600,2649.5
779,,2750,3250.0
988,,8000,2300.0
1542,,10000,7200.0
2099,,4195,5750.0
3447,,4200,4400.0
3697,,2300,5764.0
4662,,6500,2300.0
4669,,6200,2925.0
4689,,4595,5764.0
