In [2]:
import pandas as pd
import time
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.feature_extraction.text import CountVectorizer
import random
from sklearn import preprocessing
import lightgbm as lgb
import gc
from scipy.stats import skew, boxcox
from bayes_opt import BayesianOptimization
from scipy import sparse
from sklearn.metrics import log_loss
from datetime import datetime
from scipy.stats.mstats import gmean

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Global random seed; referenced as `random_state` by the KFold splitters in lgbm_cv and lgbm_blend.
seed = 2017

# Load Data

In [62]:
data_path = "../input/"

# Feature matrices for the labelled (train) and unlabelled (test) listings.
train_X = pd.read_csv(data_path + 'train_CV_statistics2.csv')
test_X = pd.read_csv(data_path + 'test_CV_statistics2.csv')

# The labels live in their own header-less file; flatten them to a 1-D array.
labels_frame = pd.read_csv(data_path + 'train_y_CV_statistics.csv', header=None)
train_y = labels_frame.values.ravel()

# Listing ids of the test rows, kept for the submission files.
sub_id = test_X['listing_id'].astype('int32').values

# all_features = features_to_use + desc_sparse_cols + feat_sparse_cols
print("%s %s %s" % (train_X.shape, test_X.shape, train_y.shape))

(49352, 398) (74659, 398) (49352L,)


In [63]:
# Fixed 80/20 hold-out split of the labelled data, used by all manual tuning cells.
split_parts = train_test_split(train_X, train_y, train_size=.80, random_state=1234)
X_train, X_val, y_train, y_val = split_parts
for part in (X_train, X_val):
    print(part.shape)

# import sys  
# stdi,stdo,stde=sys.stdin,sys.stdout,sys.stderr
# reload(sys)  
# sys.stdin,sys.stdout,sys.stderr=stdi,stdo,stde
# sys.setdefaultencoding('utf8')

(39481, 398)
(9871, 398)


In [64]:
# Baseline model: library defaults plus learning rate 0.1, with a very large tree
# budget so that early stopping on the hold-out set decides the number of rounds.
clf = lgb.LGBMClassifier()
clf.set_params(learning_rate=0.1,
               subsample_freq=1,
               objective='multiclass',
               n_estimators=100000)

# Stop once the hold-out multi-class log-loss has not improved for 50 rounds;
# log the metric every 25 rounds.
baseline_fit_kwargs = dict(eval_set=[(X_val, y_val)],
                           eval_metric='multi_logloss',
                           early_stopping_rounds=50,
                           verbose=25)
clf = clf.fit(X_train, y_train, **baseline_fit_kwargs)

Train until valid scores didn't improve in 50 rounds.
[25]	valid_0's multi_logloss: 0.600748
[50]	valid_0's multi_logloss: 0.553274
[75]	valid_0's multi_logloss: 0.539869
[100]	valid_0's multi_logloss: 0.533978
[125]	valid_0's multi_logloss: 0.531351
[150]	valid_0's multi_logloss: 0.528829
[175]	valid_0's multi_logloss: 0.52773
[200]	valid_0's multi_logloss: 0.527708
[225]	valid_0's multi_logloss: 0.527244
[250]	valid_0's multi_logloss: 0.527715
[275]	valid_0's multi_logloss: 0.527564
Early stopping, best iteration is:
[227]	valid_0's multi_logloss: 0.527176


In [None]:
# Reference result (presumably recorded from another run): [223]	valid_0's multi_logloss: 0.533439

In [6]:
# Class probabilities for the test listings, limited to the early-stopped best round.
best_iter = clf.best_iteration
pred_y = clf.predict_proba(test_X, num_iteration=best_iter)
pred_y

array([[  4.87241777e-01,   4.59754244e-01,   5.30039791e-02],
       [  9.39061072e-01,   2.87975851e-02,   3.21413429e-02],
       [  9.00711190e-01,   8.74221884e-02,   1.18666212e-02],
       ..., 
       [  9.79164336e-01,   1.99874435e-02,   8.48220964e-04],
       [  9.85101415e-01,   1.46340209e-02,   2.64564034e-04],
       [  6.40233923e-01,   3.43340830e-01,   1.64252476e-02]])

In [7]:
# Display the current classifier together with its full parameter set.
clf

LGBMClassifier(boosting_type='gbdt', colsample_bytree=1, drop_rate=0.1,
        is_unbalance=False, learning_rate=0.1, max_bin=255, max_depth=-1,
        max_drop=50, min_child_samples=10, min_child_weight=5,
        min_split_gain=0, n_estimators=100000, nthread=-1, num_leaves=31,
        objective='multiclass', reg_alpha=0, reg_lambda=0,
        scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=1, subsample_for_bin=50000,
        subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)

In [9]:
# now = datetime.now()
# sub_name = '../output/sub_LightGBM_BM_0322_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

# out_df = pd.DataFrame(pred_y[:,:3])
# out_df.columns = ["low", "medium", "high"]
# out_df["listing_id"] = sub_id
# out_df.to_csv(sub_name, index=False)

# Tune LightGBM

In [65]:
# Fresh classifier for the one-parameter-at-a-time sweeps that follow.
clf = lgb.LGBMClassifier()
clf.set_params(learning_rate=0.1,
               subsample_freq=1,
               objective='multiclass',
               n_estimators=100000)

# Best hold-out log-loss seen so far; starts well above any value the sweeps produce,
# so the first fit always becomes the incumbent.
tmp = 1000

In [66]:
# Coarse sweep of num_leaves; remember the value with the lowest hold-out log-loss so far.
for n_leaves in [8, 15, 31, 63, 127, 255]:
    clf.set_params(num_leaves=n_leaves)
    clf = clf.fit(X_train, y_train,
                  eval_set=[(X_val, y_val)],
                  eval_metric='multi_logloss',
                  early_stopping_rounds=50,
                  verbose=False)

    # Hold-out loss at the early-stopped round (history index = best_iteration - 1).
    best_iter = clf.best_iteration
    val_loss = list(clf.evals_result.values())[0]['multi_logloss'][best_iter - 1]

    if val_loss < tmp:
        num_leaves, tmp = n_leaves, val_loss

    print("%s  \t%s %s" % (n_leaves, val_loss, best_iter))

8  	0.527041278996 793
15  	0.527078513996 370
31  	0.527176416198 227
63  	0.527274346296 143
127  	0.531434596588 91
255  	0.537356896044 70


In [67]:
# Finer sweep of num_leaves in the range where the coarse sweep scored best.
for n_leaves in [20, 40, 50, 70, 80, 90, 100]:
    clf.set_params(num_leaves=n_leaves)
    clf = clf.fit(X_train, y_train,
                  eval_set=[(X_val, y_val)],
                  eval_metric='multi_logloss',
                  early_stopping_rounds=50,
                  verbose=False)

    # Hold-out loss at the early-stopped round (history index = best_iteration - 1).
    best_iter = clf.best_iteration
    val_loss = list(clf.evals_result.values())[0]['multi_logloss'][best_iter - 1]

    if val_loss < tmp:
        num_leaves, tmp = n_leaves, val_loss

    print("%s  \t%s %s" % (n_leaves, val_loss, best_iter))

20  	0.526385745858 393
40  	0.525487289967 202
50  	0.527927222495 152
70  	0.528879383675 135
80  	0.528587738266 140
90  	0.52881305334 135
100  	0.529835264449 112


In [68]:
# Report the winning num_leaves and pin it on the classifier before tuning the next parameter.
print(num_leaves)
clf.set_params(num_leaves=num_leaves)

40


LGBMClassifier(boosting_type='gbdt', colsample_bytree=1, drop_rate=0.1,
        is_unbalance=False, learning_rate=0.1, max_bin=255, max_depth=-1,
        max_drop=50, min_child_samples=10, min_child_weight=5,
        min_split_gain=0, n_estimators=100000, nthread=-1, num_leaves=40,
        objective='multiclass', reg_alpha=0, reg_lambda=0,
        scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=1, subsample_for_bin=50000,
        subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)

In [69]:
# Incumbent value: the setting the current best score (tmp) was obtained with.
min_child_samples = 10

# Sweep min_child_samples upwards; only a strictly better hold-out loss replaces the incumbent.
for min_samples in [20, 30, 50, 70, 80, 90, 100, 110, 120, 150, 170, 200, 230, 260]:
    clf.set_params(min_child_samples=min_samples)
    clf = clf.fit(X_train, y_train,
                  eval_set=[(X_val, y_val)],
                  eval_metric='multi_logloss',
                  early_stopping_rounds=50,
                  verbose=False)

    # Hold-out loss at the early-stopped round (history index = best_iteration - 1).
    best_iter = clf.best_iteration
    val_loss = list(clf.evals_result.values())[0]['multi_logloss'][best_iter - 1]

    if val_loss < tmp:
        min_child_samples, tmp = min_samples, val_loss

    print("%s  \t%s %s" % (min_samples, val_loss, best_iter))

20  	0.527344050351 245
30  	0.526075261377 244
50  	0.527824890434 167
70  	0.527667410327 181
80  	0.526741742421 178
90  	0.526054703678 221
100  	0.527322600072 205
110  	0.526716682857 220
120  	0.526160737465 173
150  	0.526137552135 206
170  	0.52639317896 220
200  	0.526716082566 229
230  	0.527000361857 228
260  	0.528410197917 147


In [70]:
# Extend the min_child_samples sweep to larger leaf sizes.
for min_samples in [300, 350, 400, 450, 500]:
    clf.set_params(min_child_samples=min_samples)
    clf = clf.fit(X_train, y_train,
                  eval_set=[(X_val, y_val)],
                  eval_metric='multi_logloss',
                  early_stopping_rounds=50,
                  verbose=False)

    # Hold-out loss at the early-stopped round (history index = best_iteration - 1).
    best_iter = clf.best_iteration
    val_loss = list(clf.evals_result.values())[0]['multi_logloss'][best_iter - 1]

    if val_loss < tmp:
        min_child_samples, tmp = min_samples, val_loss

    print("%s  \t%s %s" % (min_samples, val_loss, best_iter))

300  	0.526215358666 194
350  	0.526511677588 260
400  	0.53021506304 174
450  	0.527192693563 243
500  	0.528374245325 193


In [72]:
# Also probe small leaf sizes around the incumbent min_child_samples.
for min_samples in [2, 5, 15]:
    clf.set_params(min_child_samples=min_samples)
    clf = clf.fit(X_train, y_train,
                  eval_set=[(X_val, y_val)],
                  eval_metric='multi_logloss',
                  early_stopping_rounds=50,
                  verbose=False)

    # Hold-out loss at the early-stopped round (history index = best_iteration - 1).
    best_iter = clf.best_iteration
    val_loss = list(clf.evals_result.values())[0]['multi_logloss'][best_iter - 1]

    if val_loss < tmp:
        min_child_samples, tmp = min_samples, val_loss

    print("%s  \t%s %s" % (min_samples, val_loss, best_iter))

2  	0.525487289967 202
5  	0.525487289967 202
15  	0.527908054708 228


In [73]:
# Report the winning min_child_samples and restore it on the classifier
# (the sweeps leave the last value tried in place).
print(min_child_samples)
clf.set_params(min_child_samples=min_child_samples)


10


LGBMClassifier(boosting_type='gbdt', colsample_bytree=1, drop_rate=0.1,
        is_unbalance=False, learning_rate=0.1, max_bin=255, max_depth=-1,
        max_drop=50, min_child_samples=10, min_child_weight=5,
        min_split_gain=0, n_estimators=100000, nthread=-1, num_leaves=40,
        objective='multiclass', reg_alpha=0, reg_lambda=0,
        scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=1, subsample_for_bin=50000,
        subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)

In [74]:
# Incumbent value: the setting the current best score (tmp) was obtained with.
colsample_bytree = 1

# Sweep the per-tree feature sampling fraction.
for col_fraction in [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
    clf.set_params(colsample_bytree=col_fraction)
    clf = clf.fit(X_train, y_train,
                  eval_set=[(X_val, y_val)],
                  eval_metric='multi_logloss',
                  early_stopping_rounds=50,
                  verbose=False)

    # Hold-out loss at the early-stopped round (history index = best_iteration - 1).
    best_iter = clf.best_iteration
    val_loss = list(clf.evals_result.values())[0]['multi_logloss'][best_iter - 1]

    if val_loss < tmp:
        colsample_bytree, tmp = col_fraction, val_loss

    print("%s  \t%s %s" % (col_fraction, val_loss, best_iter))

0.2  	0.525192945714 321
0.3  	0.524793214907 233
0.4  	0.524509420866 209
0.5  	0.52526129895 279
0.6  	0.527018874265 254
0.7  	0.52695130558 234
0.8  	0.526803467182 246
0.9  	0.527618523417 232


In [75]:
# Report the winning colsample_bytree and pin it on the classifier.
print(colsample_bytree)

clf.set_params(colsample_bytree=colsample_bytree)

0.4


LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.4, drop_rate=0.1,
        is_unbalance=False, learning_rate=0.1, max_bin=255, max_depth=-1,
        max_drop=50, min_child_samples=10, min_child_weight=5,
        min_split_gain=0, n_estimators=100000, nthread=-1, num_leaves=40,
        objective='multiclass', reg_alpha=0, reg_lambda=0,
        scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=1, subsample_for_bin=50000,
        subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)

In [76]:
# Incumbent value: the setting the current best score (tmp) was obtained with.
subsample = 1.0

# Sweep the row sampling fraction (bagging is applied every round via subsample_freq=1).
for row_fraction in [0.5, 0.6, 0.7, 0.8, 0.9]:
    clf.set_params(subsample=row_fraction)
    clf = clf.fit(X_train, y_train,
                  eval_set=[(X_val, y_val)],
                  eval_metric='multi_logloss',
                  early_stopping_rounds=50,
                  verbose=False)

    # Hold-out loss at the early-stopped round (history index = best_iteration - 1).
    best_iter = clf.best_iteration
    val_loss = list(clf.evals_result.values())[0]['multi_logloss'][best_iter - 1]

    if val_loss < tmp:
        subsample, tmp = row_fraction, val_loss

    print("%s  \t%s %s" % (row_fraction, val_loss, best_iter))

0.5  	0.530614618756 217
0.6  	0.528097245047 166
0.7  	0.525933107883 256
0.8  	0.525863502725 209
0.9  	0.525326682139 225


In [77]:
# Report the winning subsample and restore it on the classifier
# (the sweep leaves the last value tried in place).
print(subsample)
clf.set_params(subsample=subsample)

1.0


LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.4, drop_rate=0.1,
        is_unbalance=False, learning_rate=0.1, max_bin=255, max_depth=-1,
        max_drop=50, min_child_samples=10, min_child_weight=5,
        min_split_gain=0, n_estimators=100000, nthread=-1, num_leaves=40,
        objective='multiclass', reg_alpha=0, reg_lambda=0,
        scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=1.0, subsample_for_bin=50000,
        subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)

In [78]:
# Incumbent value: the setting the current best score (tmp) was obtained with.
max_bin = 255

# Coarse sweep of the histogram bin count.
for n_bins in [15, 31, 63, 127, 511, 1023, 2047]:
    clf.set_params(max_bin=n_bins)
    clf = clf.fit(X_train, y_train,
                  eval_set=[(X_val, y_val)],
                  eval_metric='multi_logloss',
                  early_stopping_rounds=50,
                  verbose=False)

    # Hold-out loss at the early-stopped round (history index = best_iteration - 1).
    best_iter = clf.best_iteration
    val_loss = list(clf.evals_result.values())[0]['multi_logloss'][best_iter - 1]

    if val_loss < tmp:
        max_bin, tmp = n_bins, val_loss

    print("%s  \t%s %s" % (n_bins, val_loss, best_iter))

15  	0.526064061531 310
31  	0.52635679076 359
63  	0.523683112095 318
127  	0.523561569349 264
511  	0.525713325305 228
1023  	0.525161061192 234
2047  	0.52483700656 212


In [79]:
# Finer sweep of max_bin between the coarse grid points.
for n_bins in [50, 80, 110, 150, 180, 210, 300, 350, 400, 450, 550, 600]:
    clf.set_params(max_bin=n_bins)
    clf = clf.fit(X_train, y_train,
                  eval_set=[(X_val, y_val)],
                  eval_metric='multi_logloss',
                  early_stopping_rounds=50,
                  verbose=False)

    # Hold-out loss at the early-stopped round (history index = best_iteration - 1).
    best_iter = clf.best_iteration
    val_loss = list(clf.evals_result.values())[0]['multi_logloss'][best_iter - 1]

    if val_loss < tmp:
        max_bin, tmp = n_bins, val_loss

    print("%s  \t%s %s" % (n_bins, val_loss, best_iter))

50  	0.527117139211 230
80  	0.527459616846 262
110  	0.524080345071 217
150  	0.522634049854 284
180  	0.525442944145 274
210  	0.52506481323 261
300  	0.52534347792 244
350  	0.524350980286 245
400  	0.525151462678 268
450  	0.524252545145 260
550  	0.524723690999 195
600  	0.526557533148 215


In [80]:
# Extend the max_bin sweep to larger bin counts.
for n_bins in [700, 800, 900, 1200, 1300, 1500, 1700, 1900]:
    clf.set_params(max_bin=n_bins)
    clf = clf.fit(X_train, y_train,
                  eval_set=[(X_val, y_val)],
                  eval_metric='multi_logloss',
                  early_stopping_rounds=50,
                  verbose=False)

    # Hold-out loss at the early-stopped round (history index = best_iteration - 1).
    best_iter = clf.best_iteration
    val_loss = list(clf.evals_result.values())[0]['multi_logloss'][best_iter - 1]

    if val_loss < tmp:
        max_bin, tmp = n_bins, val_loss

    print("%s  \t%s %s" % (n_bins, val_loss, best_iter))

700  	0.525780854119 216
800  	0.524122312911 278
900  	0.524841964157 202
1200  	0.526485123393 234
1300  	0.524583789748 248
1500  	0.524620156821 242
1700  	0.524406246904 271
1900  	0.524610462787 288


In [81]:
# Report the winning max_bin and pin it on the classifier.
print(max_bin)
clf.set_params(max_bin=max_bin)

150


LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.4, drop_rate=0.1,
        is_unbalance=False, learning_rate=0.1, max_bin=150, max_depth=-1,
        max_drop=50, min_child_samples=10, min_child_weight=5,
        min_split_gain=0, n_estimators=100000, nthread=-1, num_leaves=40,
        objective='multiclass', reg_alpha=0, reg_lambda=0,
        scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=1.0, subsample_for_bin=50000,
        subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)

In [82]:
def lgbm_cv(max_bin, num_leaves, min_child_samples, colsample_bytree, subsample, learning_rate=0.1):
    """Return the negated mean 5-fold validation log-loss of one LightGBM configuration.

    Used as the objective handed to ``BayesianOptimization``, which maximises its
    target; hence the sign flip. Reads the module-level ``train_X`` (DataFrame) and
    ``train_y`` (array indexable by integer positions).

    Parameters
    ----------
    max_bin, num_leaves, min_child_samples : float
        Proposed values for the integer-valued LightGBM parameters; they are
        truncated with ``int()`` before being passed to the classifier.
    colsample_bytree, subsample : float
        Feature / row sampling fractions, passed through unchanged.
    learning_rate : float, default 0.1
        Boosting learning rate. (Previously this argument was accepted but
        ignored in favour of a hard-coded 0.1.)

    Returns
    -------
    float
        ``-mean(log_loss)`` over the 5 validation folds.
    """
    # Contiguous, unshuffled folds. `random_state` is intentionally not passed:
    # it has no effect when shuffle is off and newer scikit-learn releases
    # raise on that combination.
    skf = KFold(n_splits=5)
    scores = []
    for train, val in skf.split(train_X):
        est = lgb.LGBMClassifier(learning_rate=learning_rate,
                                 max_bin=int(max_bin),
                                 num_leaves=int(num_leaves),
                                 min_child_samples=int(min_child_samples),
                                 colsample_bytree=colsample_bytree,
                                 subsample=subsample,
                                 subsample_freq=1,
                                 # effectively unlimited; early stopping picks the round count
                                 n_estimators=100000)

        train_x_fold = train_X.iloc[train]
        train_y_fold = train_y[train]
        val_x_fold = train_X.iloc[val]
        val_y_fold = train_y[val]

        est.fit(train_x_fold,
                train_y_fold,
                eval_set=[(val_x_fold, val_y_fold)],
                eval_metric='multi_logloss',
                early_stopping_rounds=50,
                verbose=False)

        # Score at the early-stopped best round, the same way lgbm_blend does,
        # instead of relying on the library's default choice of iteration.
        val_y_predict_fold = est.predict_proba(val_x_fold,
                                               num_iteration=est.best_iteration)
        scores.append(log_loss(val_y_fold, val_y_predict_fold))
    return -np.mean(scores)


# Search ranges (lower, upper) for the five hyper-parameters explored by the optimiser.
param_bounds = {
    'max_bin': (80, 400),
    'num_leaves': (20, 63),
    'min_child_samples': (9, 90),
    'colsample_bytree': (0.2, 0.6),
    'subsample': (0.8, 1),
}
lgbm_BO = BayesianOptimization(lgbm_cv, param_bounds)

# 10 initial points, then 50 optimisation iterations; each evaluation is a full 5-fold CV.
lgbm_BO.maximize(init_points=10, n_iter=50)

[31mInitialization[0m
[94m-----------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |   max_bin |   min_child_samples |   num_leaves |   subsample | 
    1 | 01m26s | [35m  -0.53330[0m | [32m            0.2356[0m | [32m 341.1413[0m | [32m            89.1886[0m | [32m     43.9605[0m | [32m     0.8478[0m | 
    2 | 02m24s |   -0.53615 |             0.5326 |  370.5819 |             32.5007 |      55.0066 |      0.8930 | 
    3 | 02m11s |   -0.53495 |             0.5391 |  370.8547 |             39.4240 |      39.2428 |      0.8294 | 
    4 | 02m17s |   -0.53543 |             0.5711 |  203.1610 |             82.0747 |      50.5593 |      0.9160 | 
    5 | 03m19s |   -0.53624 |             0.3881 |  385.2582 |             66.5734 |      28.7896 |      0.9056 | 
    6 | 02m24s |   -0.53573 |             0.4685 |  308.1932 |             37.2609 |      34.0030 |      0.90

  " state: %s" % convergence_dict)


   15 | 02m59s |   -0.53639 |             0.2995 |  334.9221 |             85.2584 |      23.1761 |      0.9634 | 
   16 | 02m49s |   -0.53554 |             0.5825 |  347.8508 |             89.8336 |      62.9133 |      0.9289 | 
   17 | 01m41s |   -0.53509 |             0.3940 |  104.2733 |             85.5616 |      32.3929 |      0.8197 | 


  " state: %s" % convergence_dict)


   18 | 02m09s |   -0.53544 |             0.3238 |  371.6555 |             13.9422 |      31.5505 |      0.9743 | 


  " state: %s" % convergence_dict)


   19 | 02m37s |   -0.53665 |             0.2726 |  291.4607 |             13.0467 |      53.6573 |      0.9604 | 


  " state: %s" % convergence_dict)


   20 | 01m59s |   -0.53539 |             0.4969 |  178.9722 |             31.9216 |      55.2478 |      0.8347 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   21 | 01m50s |   -0.53498 |             0.3211 |  141.7267 |             62.8979 |      35.8699 |      0.8657 | 
   22 | 01m40s |   -0.53534 |             0.3522 |  129.9365 |             62.8356 |      60.7925 |      0.8375 | 
   23 | 02m02s |   -0.53613 |             0.3622 |  257.0582 |             75.3045 |      62.8147 |      0.9453 | 
   24 | 02m28s |   -0.53514 |             0.3001 |   93.8251 |              9.5577 |      20.4295 |      0.8180 | 


  " state: %s" % convergence_dict)


   25 | 02m23s |   -0.53660 |             0.5949 |  165.2242 |             76.0086 |      30.9940 |      0.9602 | 
   26 | 01m25s |   -0.53481 |             0.2711 |  116.2822 |             85.3056 |      61.4459 |      0.8505 | 


  " state: %s" % convergence_dict)


   27 | 01m48s |   -0.53614 |             0.2538 |  149.1975 |             11.9283 |      23.8183 |      0.9831 | 
   28 | 02m31s |   -0.53609 |             0.5810 |  399.6546 |             30.4571 |      43.3566 |      0.8426 | 
   29 | 02m28s |   -0.53469 |             0.3278 |  311.6200 |             75.4017 |      58.1006 |      0.8300 | 
   30 | 01m33s |   -0.53633 |             0.3505 |   82.8129 |             10.1692 |      58.6160 |      0.8570 | 


  " state: %s" % convergence_dict)


   31 | 01m43s |   -0.53478 |             0.3448 |   82.8083 |             84.2359 |      40.5581 |      0.8313 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   32 | 02m03s |   -0.53560 |             0.3600 |  333.2086 |             89.4848 |      53.1634 |      0.8196 | 
   33 | 01m59s |   -0.53424 |             0.2744 |  348.2830 |             81.2616 |      42.6914 |      0.9050 | 
   34 | 03m16s |   -0.53560 |             0.5112 |  308.9816 |             38.5077 |      62.5161 |      0.8364 | 


  " state: %s" % convergence_dict)


   35 | 02m14s |   -0.53619 |             0.3619 |  399.6125 |             87.3991 |      30.3797 |      0.8865 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   36 | 02m34s |   -0.53576 |             0.5630 |  326.4525 |             56.2440 |      44.9520 |      0.8095 | 
   37 | 01m55s |   -0.53390 |             0.2702 |  293.3899 |             55.7680 |      56.8832 |      0.9136 | 
   38 | 02m44s |   -0.53506 |             0.5004 |  174.6413 |             56.2852 |      60.8253 |      0.8256 | 


  " state: %s" % convergence_dict)


   39 | 02m06s |   -0.53642 |             0.4466 |  222.0290 |             19.4466 |      57.9596 |      0.8582 | 


  " state: %s" % convergence_dict)


   40 | 03m00s |   -0.53644 |             0.5496 |  395.4920 |             36.6357 |      21.5344 |      0.8070 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   41 | 01m58s |   -0.53534 |             0.2459 |  392.4199 |             88.7462 |      58.2734 |      0.8013 | 


  " state: %s" % convergence_dict)


   42 | 02m44s |   -0.53620 |             0.3995 |  189.6259 |              9.3057 |      60.2366 |      0.9995 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   43 | 01m54s |   -0.53518 |             0.2859 |  298.6996 |             63.1689 |      43.9520 |      0.8012 | 
   44 | 02m02s |   -0.53508 |             0.5167 |  141.4436 |             87.3187 |      41.9752 |      0.8718 | 
   45 | 02m46s |   -0.53726 |             0.5580 |  336.7110 |             10.8807 |      25.3556 |      0.9164 | 
   46 | 02m51s |   -0.53708 |             0.4413 |  191.5267 |             85.4909 |      58.7518 |      0.9767 | 
   47 | 01m50s |   -0.53473 |             0.2371 |  218.0256 |             79.4527 |      26.4683 |      0.8358 | 


  " state: %s" % convergence_dict)


   48 | 02m12s |   -0.53686 |             0.4967 |  210.2524 |             58.7173 |      39.5812 |      0.9544 | 
   49 | 01m56s |   -0.53571 |             0.2701 |  239.3193 |             80.7619 |      23.4652 |      0.8486 | 


  " state: %s" % convergence_dict)


   50 | 03m09s |   -0.53709 |             0.3999 |  396.6137 |              9.8225 |      25.4306 |      0.8048 | 


KeyboardInterrupt: 

In [None]:
#  	num_leaves 	 	min_child_samples 	max_bin 	colsample_bytree 	subsample 	score
# 34 	17.342582 	158.175569 	 	 	453.587691 	0.309807 	 	 	0.951246 	-0.523952
# 36 	34.809317 	175.689702 	 	 	431.124869 	0.420417 	 	 	0.980390 	-0.524356
# 16 	13.123686 	120.179447 	 	 	689.223522 	0.706641 	 	 	0.769943 	-0.524734
# 33 	18.297130 	121.709972 	 	 	452.154081 	0.467294 	 	 	0.913592 	-0.524832
# 24 	8.498056 	122.380717 	 	 	540.797144 	0.318956 	 	 	0.953308 	-0.524889

# 0331 data

In [25]:
# Collect every evaluated configuration and its score into one table, best score first.
param_cols = ['num_leaves',
              'min_child_samples',
              'max_bin',
              'colsample_bytree',
              'subsample']
bo_history = lgbm_BO.res['all']
bo_rows = [[params[col] for col in param_cols] + [value]
           for params, value in zip(bo_history['params'], bo_history['values'])]

gbm_bo_scores = pd.DataFrame(bo_rows, columns=param_cols + ['score'])
gbm_bo_scores = gbm_bo_scores.sort_values('score', ascending=False)
gbm_bo_scores.head(10)

Unnamed: 0,num_leaves,min_child_samples,max_bin,colsample_bytree,subsample,score
24,22.382049,64.962952,376.64068,0.306975,0.947558,-0.524254
31,31.615417,66.021705,421.213915,0.425728,0.899097,-0.524894
15,17.194424,64.222584,559.939981,0.49779,0.934519,-0.525071
39,16.38924,66.21621,357.642431,0.623331,0.934423,-0.525089
4,10.47195,106.92412,397.84257,0.453457,0.924038,-0.525457
9,14.508117,182.304842,712.79409,0.420441,0.930807,-0.525571
3,19.481535,62.36555,207.070979,0.471294,0.781278,-0.52559
0,25.533528,117.166334,232.38312,0.450393,0.896,-0.525687
21,13.207873,68.105397,536.009959,0.405839,0.954524,-0.525693
11,14.765542,93.186696,597.508214,0.610208,0.809859,-0.525743


In [83]:
def lgbm_blend(estimators, train_x, train_y, test_x, fold, early_stopping_rounds=50, learning_rate=0.005):
    """Build out-of-fold and test-set class-probability features from LightGBM models.

    Each estimator is refitted once per fold with early stopping on that fold's
    validation part. Its validation predictions fill the out-of-fold matrix, and
    its test predictions are aggregated over the folds by arithmetic and by
    geometric mean.

    Parameters
    ----------
    estimators : list of lgb.LGBMClassifier
        Models to blend. NOTE: they are modified in place (learning rate,
        subsample_freq, objective and n_estimators are overwritten).
    train_x : pandas.DataFrame
        Training features (indexed positionally with ``.iloc``).
    train_y : array-like
        Training labels, indexable by integer position arrays.
    test_x : pandas.DataFrame
        Test features.
    fold : int
        Number of K-fold splits.
    early_stopping_rounds : int, default 50
        Early-stopping patience passed to ``fit``.
    learning_rate : float, default 0.005
        Learning rate forced onto every estimator (previously hard-coded).

    Returns
    -------
    tuple
        ``(train_blend_x, test_blend_x_mean, test_blend_x_gmean, scores, best_rounds)``.
        The three blend matrices have ``N_class`` columns per estimator, in
        estimator order; ``scores`` and ``best_rounds`` have shape
        ``(fold, n_estimators)``.
    """
    N_params = len(estimators)
    print ("Blend %d estimators for %d folds" % (N_params, fold))
    # Contiguous, unshuffled folds. `random_state` is intentionally not passed:
    # it has no effect when shuffle is off and newer scikit-learn releases
    # raise on that combination.
    skf = KFold(n_splits=fold)
    N_class = len(set(train_y))
    n_test = test_x.shape[0]

    train_blend_x = np.zeros((train_x.shape[0], N_class*N_params))
    test_blend_x_mean = np.zeros((n_test, N_class*N_params))
    test_blend_x_gmean = np.zeros((n_test, N_class*N_params))
    scores = np.zeros((fold, N_params))
    best_rounds = np.zeros((fold, N_params))

    for j, est in enumerate(estimators):
        est.set_params(learning_rate = learning_rate)
        est.set_params(subsample_freq = 1)
        est.set_params(objective = 'multiclass')
        # effectively unlimited; early stopping decides the real number of rounds
        est.set_params(n_estimators = 1000000)

        print ("Model %d: %s" %(j+1, est))

        # Columns of the blend matrices that belong to this estimator.
        model_cols = slice(j*N_class, (j+1)*N_class)
        # Test predictions of every fold model: (test rows, fold, class).
        test_fold_preds = np.zeros((n_test, fold, N_class))

        for i, (train_index, val_index) in enumerate(skf.split(train_x)):
            print ("Model %d fold %d" %(j+1,i+1))
            fold_start = time.time()
            train_x_fold = train_x.iloc[train_index]
            train_y_fold = train_y[train_index]
            val_x_fold = train_x.iloc[val_index]
            val_y_fold = train_y[val_index]

            est.fit(train_x_fold, train_y_fold,
                    eval_set = [(val_x_fold,val_y_fold)],
                    eval_metric = 'multi_logloss',
                    early_stopping_rounds = early_stopping_rounds,
                    verbose = False)

            best_round = est.best_iteration
            best_rounds[i,j] = best_round
            print ("best round %d" % (best_round))

            val_y_predict_fold = est.predict_proba(val_x_fold, num_iteration = best_round)
            score = log_loss(val_y_fold, val_y_predict_fold)
            # Single formatted string: the former two-argument form printed a
            # tuple under Python 2.
            print ("Score: %f" % score)
            scores[i,j] = score
            train_blend_x[val_index, model_cols] = val_y_predict_fold
            test_fold_preds[:, i, :] = est.predict_proba(test_x, num_iteration = best_round)

            print ("Model %d fold %d fitting finished in %0.3fs" % (j+1,i+1, time.time() - fold_start))

        # Aggregate over the fold axis for every class; works for any number
        # of classes (this used to be written out for exactly three).
        test_blend_x_mean[:, model_cols] = test_fold_preds.mean(axis=1)
        test_blend_x_gmean[:, model_cols] = gmean(test_fold_preds, axis=1)

        print ("Score for model %d is %f" % (j+1,np.mean(scores[:,j])))
    print ("Score for blended models is %f" % (np.mean(scores)))
    return (train_blend_x, test_blend_x_mean, test_blend_x_gmean, scores,best_rounds)

In [84]:
# Estimators handed to lgbm_blend. Only the first configuration is active; its values
# correspond to step 1 of the Bayesian-optimisation log (integer parameters truncated).
# The commented-out entries are alternative configurations kept for reference.
est =       [lgb.LGBMClassifier(num_leaves = 43,
                                min_child_samples = 89,
                                colsample_bytree = 0.2356,
                                subsample = 0.8478,
                                max_bin = 341),
#              lgb.LGBMClassifier(num_leaves = 22,
#                                 min_child_samples = 64,
#                                 colsample_bytree = 0.306975,
#                                 subsample = 0.947558,
#                                 max_bin = 376),
#              lgb.LGBMClassifier(num_leaves = 31,
#                                 min_child_samples = 66,
#                                 colsample_bytree = 0.425728,
#                                 subsample = 0.899097,
#                                 max_bin = 421),
#              lgb.LGBMClassifier(num_leaves = 17,
#                                 min_child_samples = 64,
#                                 colsample_bytree = 0.497790,
#                                 subsample = 0.934519,
#                                 max_bin = 559),
#              lgb.LGBMClassifier(num_leaves = 16,
#                                 min_child_samples = 66,
#                                 colsample_bytree = 0.623331,
#                                 subsample = 0.934423,
#                                 max_bin = 357)
            ]

#  Step |   Time |      Value |   colsample_bytree |   max_bin |   min_child_samples |   num_leaves |   subsample | 
#     1 | 01m26s |   -0.53330 |             0.2356 |  341.1413 |             89.1886 |      43.9605 |      0.8478 | 


# 10-fold blend. Early-stopping patience is raised to 1000 rounds because, as the
# learning rate decreases, the number of stopping rounds needs to be increased.
blend_outputs = lgbm_blend(est,
                           train_X, train_y,
                           test_X,
                           fold=10,
                           early_stopping_rounds=1000)

(train_blend_x_gbm,
 test_blend_x_gbm_mean,
 test_blend_x_gbm_gmean,
 blend_scores_gbm,
 best_rounds_gbm) = blend_outputs

Blend 1 estimators for 10 folds
Model 1: LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.2356, drop_rate=0.1,
        is_unbalance=False, learning_rate=0.005, max_bin=341, max_depth=-1,
        max_drop=50, min_child_samples=89, min_child_weight=5,
        min_split_gain=0, n_estimators=1000000, nthread=-1, num_leaves=43,
        objective='multiclass', reg_alpha=0, reg_lambda=0,
        scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=0.8478, subsample_for_bin=50000,
        subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)
Model 1 fold 1
best round 7464
('Score: ', 0.54215349038012606)
Model 1 fold 1 fitting finished in 708.631s
Model 1 fold 2
best round 7872
('Score: ', 0.52898572392866161)
Model 1 fold 2 fitting finished in 803.146s
Model 1 fold 3
best round 8149
('Score: ', 0.51244088700520318)
Model 1 fold 3 fitting finished in 713.988s
Model 1 fold 4
best round 6860
('Score: ', 0.52543760552092333)
Model 1 fold 4 fitting

KeyboardInterrupt: 

In [6]:
# Timestamp shared by the artefacts written from this run.
now = datetime.now()
stamp = str(now.strftime("%Y-%m-%d-%H-%M"))

name_train_blend = '../blend/train_blend_LightGBM_BM_0401_' + stamp + '.csv'
name_test_blend_mean = '../blend/test_blend_LightGBM_mean_BM_0401_' + stamp + '.csv'
name_test_blend_gmean = '../blend/test_blend_LightGBM_gmean_BM_0401_' + stamp + '.csv'

# Per-model averages over the folds: validation log-loss first, then best boosting round.
for fold_stats in (blend_scores_gbm, best_rounds_gbm):
    print(np.mean(fold_stats, axis=0))

# Write the out-of-fold matrix and both test aggregations as plain CSV.
blend_outputs_to_save = ((name_train_blend, train_blend_x_gbm),
                         (name_test_blend_mean, test_blend_x_gbm_mean),
                         (name_test_blend_gmean, test_blend_x_gbm_gmean))
for out_path, blend_matrix in blend_outputs_to_save:
    np.savetxt(out_path, blend_matrix, delimiter=",")

[ 0.51770578  0.51729681  0.51737595  0.51798511  0.518612  ]
[ 14772.2  12224.4   8384.3  13985.8  14431.3]


In [7]:
# Submission from the fold-averaged test predictions of a single blended model.
#
# The model is chosen by lowest mean CV log-loss rather than by a hard-coded column
# slice. The former `[:, 6:9]` always took the third estimator, which fails when
# fewer than three estimators were blended and ignores the CV scores.
class_labels = ["low", "medium", "high"]
n_class = len(class_labels)
best_model = int(np.argmin(np.mean(blend_scores_gbm, axis=0)))
best_cols = slice(best_model * n_class, (best_model + 1) * n_class)

sub_name = '../output/sub_LightGBM_BM_0401_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

out_df = pd.DataFrame(test_blend_x_gbm_mean[:, best_cols])
out_df.columns = class_labels
out_df["listing_id"] = sub_id
out_df.to_csv(sub_name, index=False)

In [None]:
# data 0322

# [ 0.52309656  0.52262419  0.52356597  0.52188342  0.52198129]
# [ 15266.3  12448.   15418.7   7687.9  11589.2]

# data 0331
# [ 0.51778446  0.51758745  0.51859108  0.51763268  0.51941101]
# [ 15053.7   8042.4  16049.9  13754.3  28486.6]


In [35]:
# Average the arithmetic-mean and geometric-mean fold aggregations of one model.
#
# The model is chosen by lowest mean CV log-loss instead of the former hard-coded
# `[:, 6:9]` slice (always the third estimator), which fails when fewer than three
# estimators were blended. The class count is derived from the matrix width.
n_models = blend_scores_gbm.shape[1]
n_class = test_blend_x_gbm_mean.shape[1] // n_models
best_model = int(np.argmin(np.mean(blend_scores_gbm, axis=0)))
best_cols = slice(best_model * n_class, (best_model + 1) * n_class)
temp = (test_blend_x_gbm_mean[:, best_cols] + test_blend_x_gbm_gmean[:, best_cols]) / 2

In [37]:
# Submission built from the averaged (mean + gmean) predictions held in `temp`.
run_stamp = str(now.strftime("%Y-%m-%d-%H-%M"))
sub_name = '../output/sub_LightGBM_BM_0331_total_' + run_stamp + '.csv'

# One probability column per class, followed by the listing id.
out_df = pd.DataFrame(temp, columns=["low", "medium", "high"])
out_df["listing_id"] = sub_id
out_df.to_csv(sub_name, index=False)