In [1]:
import pandas as pd
import time
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.feature_extraction.text import CountVectorizer
import random
from sklearn import preprocessing
import lightgbm as lgb
import gc
from scipy.stats import skew, boxcox
from bayes_opt import BayesianOptimization
from scipy import sparse
from sklearn.metrics import log_loss
from datetime import datetime
from scipy.stats.mstats import gmean

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

seed = 2017

# Load Data

In [2]:
data_path = "../input/"
train_X = pd.read_csv(data_path + 'train_BM_0331.csv')
test_X = pd.read_csv(data_path + 'test_BM_0331.csv')
train_y = np.ravel(pd.read_csv(data_path + 'labels_BrandenMurray.csv'))
sub_id = test_X.listing_id.astype('int32').values

null_ind = test_X.num_loc_price_diff.isnull()
test_X['num_loc_price_diff'] = test_X['num_price'] - test_X['num_loc_median_price']
# test_X[null_ind][['num_loc_price_diff','num_price','num_loc_median_price']]

# all_features = features_to_use + desc_sparse_cols + feat_sparse_cols
print train_X.shape, test_X.shape, train_y.shape

(49352, 412) (74659, 412) (49352L,)


In [4]:
X_train, X_val, y_train, y_val = train_test_split(train_X, train_y, train_size=.80, random_state=1234)
print X_train.shape
print X_val.shape

# import sys  
# stdi,stdo,stde=sys.stdin,sys.stdout,sys.stderr
# reload(sys)  
# sys.stdin,sys.stdout,sys.stderr=stdi,stdo,stde
# sys.setdefaultencoding('utf8')

(39481, 412)
(9871, 412)


In [5]:
clf = lgb.LGBMClassifier()
clf.set_params(learning_rate = 0.1)
clf.set_params(subsample_freq = 1)
clf.set_params(objective = 'multiclass')
clf.set_params(n_estimators = 100000)
        
clf = clf.fit(X_train, y_train,
              eval_set = [(X_val,y_val)],
              eval_metric = 'multi_logloss',
              early_stopping_rounds = 50,
              verbose = 25)

Train until valid scores didn't improve in 50 rounds.
[25]	valid_0's multi_logloss: 0.604683
[50]	valid_0's multi_logloss: 0.556987
[75]	valid_0's multi_logloss: 0.544693
[100]	valid_0's multi_logloss: 0.539017
[125]	valid_0's multi_logloss: 0.536772
[150]	valid_0's multi_logloss: 0.535167
[175]	valid_0's multi_logloss: 0.534229
[200]	valid_0's multi_logloss: 0.533892
[225]	valid_0's multi_logloss: 0.533346
[250]	valid_0's multi_logloss: 0.533719
Early stopping, best iteration is:
[215]	valid_0's multi_logloss: 0.533274


In [8]:
pred_y = clf.predict_proba(test_X, num_iteration = clf.best_iteration)
pred_y

array([[  4.49234624e-01,   4.70735749e-01,   8.00296264e-02],
       [  9.78020218e-01,   1.22130189e-02,   9.76676308e-03],
       [  8.96809228e-01,   9.19339499e-02,   1.12568216e-02],
       ..., 
       [  9.77052516e-01,   2.18812615e-02,   1.06622257e-03],
       [  9.73375177e-01,   2.56800302e-02,   9.44793072e-04],
       [  6.67708476e-01,   3.23468858e-01,   8.82266579e-03]])

In [9]:
# now = datetime.now()
# sub_name = '../output/sub_LightGBM_BM_0322_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

# out_df = pd.DataFrame(pred_y[:,:3])
# out_df.columns = ["low", "medium", "high"]
# out_df["listing_id"] = sub_id
# out_df.to_csv(sub_name, index=False)

# Tune LightGBM

In [10]:
clf = lgb.LGBMClassifier()
clf.set_params(learning_rate = 0.1)
clf.set_params(subsample_freq = 1)
clf.set_params(objective = 'multiclass')
clf.set_params(n_estimators = 100000)

tmp  = 1000

In [11]:
for x in [8,15,31,63,127,255]:
    clf.set_params(num_leaves = x)
    clf = clf.fit(X_train, y_train,
                  eval_set = [(X_val,y_val)],
                  eval_metric = 'multi_logloss',
                  early_stopping_rounds = 50,
                  verbose = False)
    if tmp > clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]:
        num_leaves = x
        tmp = clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]


    print x, ' \t', clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1], clf.best_iteration

8  	0.533202507807 760
15  	0.533380582035 407
31  	0.533273698004 215
63  	0.532785716326 167
127  	0.537808911467 83
255  	0.541784028451 63


In [12]:
print num_leaves
clf.set_params(num_leaves = num_leaves)

LGBMClassifier(boosting_type='gbdt', colsample_bytree=1, drop_rate=0.1,
        is_unbalance=False, learning_rate=0.1, max_bin=255, max_depth=-1,
        max_drop=50, min_child_samples=10, min_child_weight=5,
        min_split_gain=0, n_estimators=100000, nthread=-1, num_leaves=63,
        objective='multiclass', reg_alpha=0, reg_lambda=0,
        scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=1, subsample_for_bin=50000,
        subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)

In [13]:
min_child_samples = 10

for x in [20, 30, 50, 70, 80,90,100,110,120,150,170,200,230,260]:
    clf.set_params(min_child_samples = x)
    clf = clf.fit(X_train, y_train,
                  eval_set = [(X_val,y_val)],
                  eval_metric = 'multi_logloss',
                  early_stopping_rounds = 50,
                  verbose = False)
    if tmp > clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]:
        min_child_samples = x
        tmp = clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]


    print x, ' \t', clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1], clf.best_iteration

20  	0.534607432529 181
30  	0.5338408244 168
50  	0.534499336841 154
70  	0.532744830038 176
80  	0.534183574303 158
90  	0.533888731728 170
100  	0.534698093143 157
110  	0.533009200404 146
120  	0.532805048572 141
150  	0.532923835951 168
170  	0.532091685442 197
200  	0.532975844929 187
230  	0.531032983749 174
260  	0.532318413446 158


In [14]:
for x in [300,350,400,450,500]:
    clf.set_params(min_child_samples = x)
    clf = clf.fit(X_train, y_train,
                  eval_set = [(X_val,y_val)],
                  eval_metric = 'multi_logloss',
                  early_stopping_rounds = 50,
                  verbose = False)
    if tmp > clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]:
        min_child_samples = x
        tmp = clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]


    print x, ' \t', clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1], clf.best_iteration

300  	0.531014922999 182
350  	0.533646237117 179
400  	0.532047579469 163
450  	0.533212823642 193
500  	0.532004110603 174


In [15]:
print min_child_samples
clf.set_params(min_child_samples = min_child_samples)


300


LGBMClassifier(boosting_type='gbdt', colsample_bytree=1, drop_rate=0.1,
        is_unbalance=False, learning_rate=0.1, max_bin=255, max_depth=-1,
        max_drop=50, min_child_samples=300, min_child_weight=5,
        min_split_gain=0, n_estimators=100000, nthread=-1, num_leaves=63,
        objective='multiclass', reg_alpha=0, reg_lambda=0,
        scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=1, subsample_for_bin=50000,
        subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)

In [16]:
colsample_bytree = 1
for x in [0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]:
    clf.set_params(colsample_bytree = x)
    clf = clf.fit(X_train, y_train,
                  eval_set = [(X_val,y_val)],
                  eval_metric = 'multi_logloss',
                  early_stopping_rounds = 50,
                  verbose = False)
    if tmp > clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]:
        colsample_bytree = x
        tmp = clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]


    print x, ' \t', clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1], clf.best_iteration

0.2  	0.53054947819 246
0.3  	0.530053128272 188
0.4  	0.529074280746 207
0.5  	0.530259562988 168
0.6  	0.529655608372 149
0.7  	0.531195321452 201
0.8  	0.530592604433 171
0.9  	0.532657872604 147


In [17]:
print colsample_bytree

clf.set_params(colsample_bytree = colsample_bytree)

0.4


LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.4, drop_rate=0.1,
        is_unbalance=False, learning_rate=0.1, max_bin=255, max_depth=-1,
        max_drop=50, min_child_samples=300, min_child_weight=5,
        min_split_gain=0, n_estimators=100000, nthread=-1, num_leaves=63,
        objective='multiclass', reg_alpha=0, reg_lambda=0,
        scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=1, subsample_for_bin=50000,
        subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)

In [18]:
subsample = 1.0
for x in [0.5,0.6,0.7,0.8,0.9]:
    clf.set_params(subsample = x)
    clf = clf.fit(X_train, y_train,
                  eval_set = [(X_val,y_val)],
                  eval_metric = 'multi_logloss',
                  early_stopping_rounds = 50,
                  verbose = False)
    if tmp > clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]:
        subsample = x
        tmp = clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]


    print x, ' \t', clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1], clf.best_iteration

0.5  	0.537608174333 189
0.6  	0.535259431507 191
0.7  	0.533627915798 184
0.8  	0.531106184833 215
0.9  	0.531171144842 164


In [19]:
print subsample
clf.set_params(subsample = subsample)

1.0


LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.4, drop_rate=0.1,
        is_unbalance=False, learning_rate=0.1, max_bin=255, max_depth=-1,
        max_drop=50, min_child_samples=300, min_child_weight=5,
        min_split_gain=0, n_estimators=100000, nthread=-1, num_leaves=63,
        objective='multiclass', reg_alpha=0, reg_lambda=0,
        scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=1.0, subsample_for_bin=50000,
        subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)

In [20]:
max_bin = 255

for x in [15,31,63, 127, 511, 1023, 2047]: #[200,300,400]:#
    clf.set_params(max_bin = x)
    clf = clf.fit(X_train, y_train,
                  eval_set = [(X_val,y_val)],
                  eval_metric = 'multi_logloss',
                  early_stopping_rounds = 50,
                  verbose = False)
    if tmp > clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]:
        max_bin = x
        tmp = clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]


    print x, ' \t', clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1], clf.best_iteration

15  	0.531162807544 189
31  	0.530269914319 200
63  	0.529735792077 238
127  	0.529204712417 174
511  	0.528204412632 188
1023  	0.528746638861 186
2047  	0.528513074537 225


In [21]:
for x in [350,400,550,600,700,1300,1500,1800,2100,2400]:
    clf.set_params(max_bin = x)
    clf = clf.fit(X_train, y_train,
                  eval_set = [(X_val,y_val)],
                  eval_metric = 'multi_logloss',
                  early_stopping_rounds = 50,
                  verbose = False)
    if tmp > clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]:
        max_bin = x
        tmp = clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]


    print x, ' \t', clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1], clf.best_iteration

350  	0.529161914318 204
400  	0.52807891238 240
550  	0.529057888889 160
600  	0.52838262318 173
700  	0.528477930237 215
1300  	0.528727048419 164
1500  	0.529334495305 193
1800  	0.52907526234 181
2100  	0.529803681644 168
2400  	0.528498643638 150


In [22]:
print max_bin
clf.set_params(max_bin = max_bin)

400


LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.4, drop_rate=0.1,
        is_unbalance=False, learning_rate=0.1, max_bin=400, max_depth=-1,
        max_drop=50, min_child_samples=300, min_child_weight=5,
        min_split_gain=0, n_estimators=100000, nthread=-1, num_leaves=63,
        objective='multiclass', reg_alpha=0, reg_lambda=0,
        scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=1.0, subsample_for_bin=50000,
        subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)

In [25]:
def lgbm_cv(max_bin, num_leaves, min_child_samples, colsample_bytree, subsample, learning_rate=0.1):
    skf = KFold(n_splits=5,random_state=seed)
    scores=[]
    for i, (train, val) in enumerate(skf.split(train_X)):
        est=lgb.LGBMClassifier(learning_rate=0.1,
                               max_bin=int(max_bin),
                               num_leaves=int(num_leaves),
                               min_child_samples=int(min_child_samples),
                               colsample_bytree=colsample_bytree,
                               subsample=subsample,
                               subsample_freq = 1
                              )
 
        train_x_fold = train_X.iloc[train]
        train_y_fold = train_y[train]
        val_x_fold = train_X.iloc[val]
        val_y_fold = train_y[val]
        est.set_params( n_estimators=100000)
        est.fit(train_x_fold,
                train_y_fold,
                eval_set=[(val_x_fold, val_y_fold)],
                eval_metric='multi_logloss',
                early_stopping_rounds=50,
                verbose = False
               )
        val_y_predict_fold = est.predict_proba(val_x_fold)
        score = log_loss(val_y_fold, val_y_predict_fold)
        scores.append(score)
    return -np.mean(scores)


lgbm_BO = BayesianOptimization(lgbm_cv, 
                               {
                                'max_bin': (255,700),
                                'num_leaves': (8,80),
                                'min_child_samples' :(120,500),
                                'colsample_bytree': (0.3,0.8),
                                'subsample' : (0.7,1)})

lgbm_BO.maximize(init_points=10, n_iter=40)

[31mInitialization[0m
[94m-----------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |   max_bin |   min_child_samples |   num_leaves |   subsample | 
    1 | 01m48s | [35m  -0.52698[0m | [32m            0.6292[0m | [32m 655.4609[0m | [32m           196.0590[0m | [32m     58.7500[0m | [32m     0.7726[0m | 
    2 | 01m36s |   -0.52792 |             0.4858 |  420.2007 |            456.8544 |      55.5553 |      0.8656 | 
    3 | 01m57s |   -0.53286 |             0.6414 |  572.6798 |            466.0000 |      52.2623 |      0.7147 | 
    4 | 01m29s | [35m  -0.52568[0m | [32m            0.3994[0m | [32m 444.5476[0m | [32m           221.5861[0m | [32m     50.6166[0m | [32m     0.8743[0m | 
    5 | 01m57s |   -0.52687 |             0.4622 |  314.4790 |            327.1643 |      11.0416 |      0.8433 | 
    6 | 02m16s |   -0.52727 |             0.3405 |  333

  " state: %s" % convergence_dict)


   15 | 03m01s |   -0.52851 |             0.7158 |  551.4119 |            220.7177 |      78.7243 |      0.7649 | 
   16 | 02m46s |   -0.52601 |             0.7614 |  695.9861 |            160.7555 |      30.0256 |      0.9576 | 
   17 | 02m46s | [35m  -0.52504[0m | [32m            0.7168[0m | [32m 347.8642[0m | [32m           186.9693[0m | [32m     17.8923[0m | [32m     0.9671[0m | 
   18 | 02m59s |   -0.52593 |             0.5840 |  271.7977 |            217.2170 |      11.7845 |      0.8581 | 


  " state: %s" % convergence_dict)


   19 | 02m45s |   -0.52578 |             0.6508 |  380.6639 |            148.0683 |      10.7668 |      0.8918 | 
   20 | 03m08s |   -0.52641 |             0.7199 |  663.2364 |            245.4941 |      12.6071 |      0.9776 | 


  " state: %s" % convergence_dict)


   21 | 01m22s |   -0.52709 |             0.4273 |  405.7218 |            221.0959 |      26.3853 |      0.7211 | 


  " state: %s" % convergence_dict)


   22 | 03m14s |   -0.52610 |             0.6826 |  650.6457 |            161.3106 |      12.5134 |      0.7609 | 
   23 | 03m25s |   -0.52615 |             0.7912 |  318.1193 |            193.0608 |      10.6307 |      0.7718 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   24 | 02m44s |   -0.52742 |             0.4388 |  489.4472 |            411.6182 |       9.5533 |      0.9017 | 
   25 | 02m21s |   -0.52614 |             0.7183 |  681.3723 |            124.2430 |      49.5388 |      0.8366 | 


  " state: %s" % convergence_dict)


   26 | 02m20s |   -0.52737 |             0.5185 |  270.8408 |            133.8418 |      65.0826 |      0.7725 | 
   27 | 02m30s | [35m  -0.52473[0m | [32m            0.7066[0m | [32m 689.2235[0m | [32m           120.1794[0m | [32m     13.1237[0m | [32m     0.7699[0m | 
   28 | 01m41s |   -0.52707 |             0.3471 |  460.8146 |            313.7642 |      74.7979 |      0.9864 | 
   29 | 02m16s |   -0.52537 |             0.4538 |  699.2984 |            289.4009 |      16.4335 |      0.9253 | 


  " state: %s" % convergence_dict)


   30 | 03m02s |   -0.52746 |             0.6559 |  695.7992 |            370.7342 |      12.4807 |      0.7927 | 
   31 | 02m10s |   -0.52528 |             0.6567 |  349.7725 |            161.3051 |      26.1966 |      0.9721 | 
   32 | 01m40s |   -0.52669 |             0.4055 |  438.2872 |            139.6188 |      67.6488 |      0.7992 | 
   33 | 02m37s |   -0.52668 |             0.7951 |  258.8889 |            283.6357 |      12.3130 |      0.8895 | 
   34 | 02m26s |   -0.52874 |             0.5685 |  690.8754 |            252.6525 |      78.3750 |      0.7904 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   35 | 02m26s |   -0.52489 |             0.3190 |  540.7971 |            122.3807 |       8.4981 |      0.9533 | 
   36 | 01m37s |   -0.52580 |             0.4154 |  257.0006 |            231.9925 |      45.0309 |      0.9206 | 


  " state: %s" % convergence_dict)


   37 | 02m01s |   -0.52528 |             0.4218 |  600.9347 |            129.0075 |      13.3609 |      0.9141 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   38 | 01m50s |   -0.52640 |             0.4491 |  426.9602 |            366.7376 |      22.8055 |      0.8766 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   39 | 03m22s |   -0.52510 |             0.7421 |  472.4078 |            120.1093 |      16.1651 |      0.9315 | 
   40 | 01m25s |   -0.52738 |             0.3558 |  330.3053 |            252.5348 |      49.3628 |      0.8111 | 
   41 | 02m15s |   -0.52566 |             0.6987 |  544.2726 |            121.7552 |      49.7833 |      0.9015 | 


  " state: %s" % convergence_dict)


   42 | 02m13s |   -0.52591 |             0.4619 |  670.6837 |            303.2779 |      18.1906 |      0.9554 | 
   43 | 02m46s |   -0.53124 |             0.7114 |  426.0716 |            498.9683 |      17.6822 |      0.8162 | 


  " state: %s" % convergence_dict)


   44 | 01m59s |   -0.52483 |             0.4673 |  452.1541 |            121.7100 |      18.2971 |      0.9136 | 


  " state: %s" % convergence_dict)


   45 | 01m35s | [35m  -0.52395[0m | [32m            0.3098[0m | [32m 453.5877[0m | [32m           158.1756[0m | [32m     17.3426[0m | [32m     0.9512[0m | 
   46 | 01m49s |   -0.52534 |             0.4146 |  461.4220 |            171.9331 |      18.7899 |      0.9245 | 


  " state: %s" % convergence_dict)


   47 | 01m44s |   -0.52436 |             0.4204 |  431.1249 |            175.6897 |      34.8093 |      0.9804 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   48 | 02m17s |   -0.52541 |             0.4077 |  434.8329 |            129.9591 |      16.9366 |      0.9998 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   49 | 02m26s |   -0.53201 |             0.5063 |  689.9913 |            483.8530 |      28.7450 |      0.7858 | 
   50 | 01m47s |   -0.52677 |             0.4601 |  429.3074 |            167.6773 |      19.2457 |      0.7308 | 


In [None]:
#  	 	num_leaves 	min_child_samples 	max_bin 	colsample_bytree 	subsample 	score
# 7 	16.783674 	168.394127 	 	 	357.870498 	0.398779 	 	 	0.946050 	-0.528477
# 0 	18.426665 	87.476334 	 	 	226.334635 	0.716400 	 	 	0.898679 	-0.528612
# 38 	15.042746 	171.830790 	 	 	351.539184 	0.396770 	 	 	0.970258 	-0.528655
# 18 	36.8216 	120.8350	 	 	338.2488 	0.4020 	 	 	 	0.9845 		-0.52866
# 24 	21.760862 	80.973547 	 	 	232.334088 	0.562594 	 	 	0.886285 	-0.528852
# 34 	23.050184 	82.442485 	 	 	215.680602 	0.375532 	 	 	0.905054 	-0.528863

# 0322 data

In [26]:
gbm_bo_scores = pd.DataFrame([[s[0]['num_leaves'],
                               s[0]['min_child_samples'],
                               s[0]['max_bin'],
                               s[0]['colsample_bytree'],
                               s[0]['subsample'],
                               s[1]] for s in zip(lgbm_BO.res['all']['params'],lgbm_BO.res['all']['values'])],
                            columns = ['num_leaves',
                                       'min_child_samples',
                                       'max_bin',
                                       'colsample_bytree',
                                       'subsample',
                                       'score'])
gbm_bo_scores=gbm_bo_scores.sort_values('score',ascending=False)
gbm_bo_scores.head(10)

Unnamed: 0,num_leaves,min_child_samples,max_bin,colsample_bytree,subsample,score
34,17.342582,158.175569,453.587691,0.309807,0.951246,-0.523952
36,34.809317,175.689702,431.124869,0.420417,0.98039,-0.524356
16,13.123686,120.179447,689.223522,0.706641,0.769943,-0.524734
33,18.29713,121.709972,452.154081,0.467294,0.913592,-0.524832
24,8.498056,122.380717,540.797144,0.318956,0.953308,-0.524889
6,17.89227,186.969297,347.864169,0.716843,0.9671,-0.525036
28,16.165073,120.109258,472.407759,0.742136,0.931532,-0.525098
20,26.196643,161.305052,349.772457,0.656672,0.972073,-0.525277
26,13.360864,129.007466,600.934654,0.421792,0.914086,-0.525285
35,18.789924,171.933123,461.422015,0.41457,0.924544,-0.525342


In [27]:
def lgbm_blend(estimators, train_x, train_y, test_x, fold, early_stopping_rounds=50):
    N_params = len(estimators)
    print ("Blend %d estimators for %d folds" % (N_params, fold))
    skf = KFold(n_splits=fold,random_state=seed)
    N_class = len(set(train_y))
    
#     train_blend_x = np.zeros((train_x.shape[0], N_class*N_params))
#     test_blend_x = np.zeros((test_x.shape[0], N_class*N_params))
#     scores = np.zeros ((fold,N_params))
#     best_rounds = np.zeros ((fold, N_params))
    train_blend_x = np.zeros((train_x.shape[0], N_class*N_params))
    test_blend_x_mean = np.zeros((test_x.shape[0], N_class*N_params))
    test_blend_x_gmean = np.zeros((test_x.shape[0], N_class*N_params))
    scores = np.zeros((fold,N_params))
    best_rounds = np.zeros((fold, N_params))    

    
    for j, est in enumerate(estimators):
        est.set_params(learning_rate = 0.005)
        est.set_params(subsample_freq = 1)
        est.set_params(objective = 'multiclass')
        est.set_params(n_estimators = 1000000)

        
        print ("Model %d: %s" %(j+1, est)) 

        
        test_blend_x_j = np.zeros((test_x.shape[0], N_class*fold))
        for i, (train_index, val_index) in enumerate(skf.split(train_x)):
            print ("Model %d fold %d" %(j+1,i+1))
            fold_start = time.time() 
            train_x_fold = train_x.iloc[train_index]
            train_y_fold = train_y[train_index]
            val_x_fold = train_x.iloc[val_index]
            val_y_fold = train_y[val_index]
            
            est.fit(train_x_fold, train_y_fold,
                   eval_set = [(val_x_fold,val_y_fold)],
                   eval_metric = 'multi_logloss',
                   early_stopping_rounds = early_stopping_rounds,
                   verbose = False)
            
            best_round=est.best_iteration
            best_rounds[i,j]=best_round
            print ("best round %d" % (best_round))
            
            val_y_predict_fold = est.predict_proba(val_x_fold,num_iteration = best_round)
            score = log_loss(val_y_fold, val_y_predict_fold)
            print ("Score: ", score)
            scores[i,j]=score   
            train_blend_x[val_index, (j*N_class):(j+1)*N_class] = val_y_predict_fold
            test_blend_x_j[:,(i*N_class):(i+1)*N_class] = est.predict_proba(test_x,num_iteration=best_round)
            
            print ("Model %d fold %d fitting finished in %0.3fs" % (j+1,i+1, time.time() - fold_start))            
            
#         test_blend_x[:,(j*N_class):(j+1)*N_class] = \
#                 np.stack([test_blend_x_j[:,range(0,N_class*fold,N_class)].mean(1),
#                           test_blend_x_j[:,range(1,N_class*fold,N_class)].mean(1),
#                           test_blend_x_j[:,range(2,N_class*fold,N_class)].mean(1)]).T
            
        test_blend_x_mean[:,(j*N_class):(j+1)*N_class] = \
                np.stack([test_blend_x_j[:,range(0,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(1,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(2,N_class*fold,N_class)].mean(1)]).T
        
        test_blend_x_gmean[:,(j*N_class):(j+1)*N_class] = \
                np.stack([gmean(test_blend_x_j[:,range(0,N_class*fold,N_class)], axis=1),
                          gmean(test_blend_x_j[:,range(1,N_class*fold,N_class)], axis=1),
                          gmean(test_blend_x_j[:,range(2,N_class*fold,N_class)], axis=1)]).T
            
        print ("Score for model %d is %f" % (j+1,np.mean(scores[:,j])))
    print ("Score for blended models is %f" % (np.mean(scores)))
    return (train_blend_x, test_blend_x_mean, test_blend_x_gmean, scores,best_rounds)

In [None]:
est =       [lgb.LGBMClassifier(num_leaves = 17,
                                min_child_samples = 158,
                                colsample_bytree = 0.309807,
                                subsample = 0.951246,
                                max_bin = 453),
             lgb.LGBMClassifier(num_leaves = 34,
                                min_child_samples = 175,
                                colsample_bytree = 0.420417,
                                subsample = 0.980390,
                                max_bin = 431),
             lgb.LGBMClassifier(num_leaves = 13,
                                min_child_samples = 120,
                                colsample_bytree = 0.706641,
                                subsample = 0.769943,
                                max_bin = 689),
             lgb.LGBMClassifier(num_leaves = 18,
                                min_child_samples = 121,
                                colsample_bytree = 0.467294,
                                subsample = 0.913592,
                                max_bin = 452),
             lgb.LGBMClassifier(num_leaves = 8,
                                min_child_samples = 122,
                                colsample_bytree = 0.318956,
                                subsample = 0.953308,
                                max_bin = 540)]

#  	num_leaves 	 	min_child_samples 	max_bin 	colsample_bytree 	subsample 	score
# 34 	17.342582 	158.175569 	 	 	453.587691 	0.309807 	 	 	0.951246 	-0.523952
# 36 	34.809317 	175.689702 	 	 	431.124869 	0.420417 	 	 	0.980390 	-0.524356
# 16 	13.123686 	120.179447 	 	 	689.223522 	0.706641 	 	 	0.769943 	-0.524734
# 33 	18.297130 	121.709972 	 	 	452.154081 	0.467294 	 	 	0.913592 	-0.524832
# 24 	8.498056 	122.380717 	 	 	540.797144 	0.318956 	 	 	0.953308 	-0.524889



(train_blend_x_gbm,
 test_blend_x_gbm_mean,
 test_blend_x_gbm_gmean,
 blend_scores_gbm,
 best_rounds_gbm)= lgbm_blend(est, 
                               train_X, train_y, 
                               test_X,
                               10,
                               1000) #as the learning rate decreases the number of stopping rounds need to be increased

Exception KeyboardInterrupt in <bound method Booster.__del__ of <lightgbm.basic.Booster object at 0x0000000016AA6208>> ignored
Exception KeyboardInterrupt in <bound method Booster.__del__ of <lightgbm.basic.Booster object at 0x0000000016ACCE10>> ignored


In [None]:
# data 0322


# Blend 5 estimators for 10 folds
# Model 1: LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.398779, drop_rate=0.1,
#         is_unbalance=False, learning_rate=0.005, max_bin=357, max_depth=-1,
#         max_drop=50, min_child_samples=168, min_child_weight=5,
#         min_split_gain=0, n_estimators=1000000, nthread=-1, num_leaves=16,
#         objective='multiclass', reg_alpha=0, reg_lambda=0,
#         scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
#         skip_drop=0.5, subsample=0.94605, subsample_for_bin=50000,
#         subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)
# Model 1 fold 1
# best round 13658
# ('Score: ', 0.51511027734945769)
# Model 1 fold 1 fitting finished in 666.358s
# Model 1 fold 2
# best round 18966
# ('Score: ', 0.49748491939132805)
# Model 1 fold 2 fitting finished in 933.243s
# Model 1 fold 3
# best round 16912
# ('Score: ', 0.52418540090167209)
# Model 1 fold 3 fitting finished in 744.659s
# Model 1 fold 4
# best round 17529
# ('Score: ', 0.49909289305104237)
# Model 1 fold 4 fitting finished in 671.642s
# Model 1 fold 5
# best round 12866
# ('Score: ', 0.53343311318041697)
# Model 1 fold 5 fitting finished in 568.086s
# Model 1 fold 6
# best round 13399
# ('Score: ', 0.52049272741295138)
# Model 1 fold 6 fitting finished in 505.250s
# Model 1 fold 7
# best round 13580
# ('Score: ', 0.52872157155539778)
# Model 1 fold 7 fitting finished in 472.583s
# Model 1 fold 8
# best round 14739
# ('Score: ', 0.54319696370850756)
# Model 1 fold 8 fitting finished in 526.231s
# Model 1 fold 9
# best round 15989
# ('Score: ', 0.53855626201600892)
# Model 1 fold 9 fitting finished in 591.713s
# Model 1 fold 10
# best round 15025
# ('Score: ', 0.53069150503223939)
# Model 1 fold 10 fitting finished in 553.884s
# Score for model 1 is 0.523097
# Model 2: LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.7164, drop_rate=0.1,
#         is_unbalance=False, learning_rate=0.005, max_bin=226, max_depth=-1,
#         max_drop=50, min_child_samples=87, min_child_weight=5,
#         min_split_gain=0, n_estimators=1000000, nthread=-1, num_leaves=18,
#         objective='multiclass', reg_alpha=0, reg_lambda=0,
#         scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
#         skip_drop=0.5, subsample=0.898679, subsample_for_bin=50000,
#         subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)
# Model 2 fold 1
# best round 10248
# ('Score: ', 0.51279527489887844)
# Model 2 fold 1 fitting finished in 463.937s
# Model 2 fold 2
# best round 15443
# ('Score: ', 0.4976452352207959)
# Model 2 fold 2 fitting finished in 638.788s
# Model 2 fold 3
# best round 12373
# ('Score: ', 0.52193792417675422)
# Model 2 fold 3 fitting finished in 523.411s
# Model 2 fold 4
# best round 12944
# ('Score: ', 0.4994116556253469)
# Model 2 fold 4 fitting finished in 548.260s
# Model 2 fold 5
# best round 10491
# ('Score: ', 0.53427000487297915)
# Model 2 fold 5 fitting finished in 480.181s
# Model 2 fold 6
# best round 11402
# ('Score: ', 0.52198833831101743)
# Model 2 fold 6 fitting finished in 446.453s
# Model 2 fold 7
# best round 11700
# ('Score: ', 0.52975063233231268)
# Model 2 fold 7 fitting finished in 508.264s
# Model 2 fold 8
# best round 13942
# ('Score: ', 0.54124823287843038)
# Model 2 fold 8 fitting finished in 579.405s
# Model 2 fold 9
# best round 13851
# ('Score: ', 0.53849899306996529)
# Model 2 fold 9 fitting finished in 575.462s
# Model 2 fold 10
# best round 12086
# ('Score: ', 0.52869565211129366)
# Model 2 fold 10 fitting finished in 512.622s
# Score for model 2 is 0.522624
# Model 3: LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.39677, drop_rate=0.1,
#         is_unbalance=False, learning_rate=0.005, max_bin=351, max_depth=-1,
#         max_drop=50, min_child_samples=171, min_child_weight=5,
#         min_split_gain=0, n_estimators=1000000, nthread=-1, num_leaves=15,
#         objective='multiclass', reg_alpha=0, reg_lambda=0,
#         scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
#         skip_drop=0.5, subsample=0.970258, subsample_for_bin=50000,
#         subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)
# Model 3 fold 1
# best round 12874
# ('Score: ', 0.51579970828744059)
# Model 3 fold 1 fitting finished in 460.611s
# Model 3 fold 2
# best round 18224
# ('Score: ', 0.49875315482793847)
# Model 3 fold 2 fitting finished in 617.800s
# Model 3 fold 3
# best round 16410
# ('Score: ', 0.52445295623291099)
# Model 3 fold 3 fitting finished in 556.717s
# Model 3 fold 4
# best round 15824
# ('Score: ', 0.50021228553164887)
# Model 3 fold 4 fitting finished in 564.615s
# Model 3 fold 5
# best round 15533
# ('Score: ', 0.53396460910535426)
# Model 3 fold 5 fitting finished in 532.299s
# Model 3 fold 6
# best round 12458
# ('Score: ', 0.52040391903309924)
# Model 3 fold 6 fitting finished in 421.335s
# Model 3 fold 7
# best round 15590
# ('Score: ', 0.52977879904304848)
# Model 3 fold 7 fitting finished in 538.673s
# Model 3 fold 8
# best round 15934
# ('Score: ', 0.54355795947963481)
# Model 3 fold 8 fitting finished in 552.762s
# Model 3 fold 9
# best round 16181
# ('Score: ', 0.53803790113040595)
# Model 3 fold 9 fitting finished in 553.873s
# Model 3 fold 10
# best round 15159
# ('Score: ', 0.53069840648618571)
# Model 3 fold 10 fitting finished in 531.805s
# Score for model 3 is 0.523566
# Model 4: LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.402, drop_rate=0.1,
#         is_unbalance=False, learning_rate=0.005, max_bin=338, max_depth=-1,
#         max_drop=50, min_child_samples=120, min_child_weight=5,
#         min_split_gain=0, n_estimators=1000000, nthread=-1, num_leaves=36,
#         objective='multiclass', reg_alpha=0, reg_lambda=0,
#         scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
#         skip_drop=0.5, subsample=0.9845, subsample_for_bin=50000,
#         subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)
# Model 4 fold 1
# best round 6780
# ('Score: ', 0.51297224026086863)
# Model 4 fold 1 fitting finished in 388.870s
# Model 4 fold 2
# best round 9795
# ('Score: ', 0.49493678490657994)
# Model 4 fold 2 fitting finished in 456.681s
# Model 4 fold 3
# best round 8109
# ('Score: ', 0.52283766823302491)
# Model 4 fold 3 fitting finished in 446.561s
# Model 4 fold 4
# best round 8156
# ('Score: ', 0.49875483248461955)
# Model 4 fold 4 fitting finished in 438.891s
# Model 4 fold 5
# best round 6897
# ('Score: ', 0.53322646425989406)
# Model 4 fold 5 fitting finished in 356.814s
# Model 4 fold 6
# best round 6370
# ('Score: ', 0.52000404633950559)
# Model 4 fold 6 fitting finished in 469.238s
# Model 4 fold 7
# best round 7291
# ('Score: ', 0.52866228253282754)
# Model 4 fold 7 fitting finished in 406.877s
# Model 4 fold 8
# best round 8043
# ('Score: ', 0.54111737889176825)
# Model 4 fold 8 fitting finished in 426.052s
# Model 4 fold 9
# best round 7127
# ('Score: ', 0.53782950085297831)
# Model 4 fold 9 fitting finished in 370.410s
# Model 4 fold 10
# best round 8311
# ('Score: ', 0.52849296178887739)
# Model 4 fold 10 fitting finished in 467.649s
# Score for model 4 is 0.521883
# Model 5: LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.562594, drop_rate=0.1,
#         is_unbalance=False, learning_rate=0.005, max_bin=232, max_depth=-1,
#         max_drop=50, min_child_samples=80, min_child_weight=5,
#         min_split_gain=0, n_estimators=1000000, nthread=-1, num_leaves=21,
#         objective='multiclass', reg_alpha=0, reg_lambda=0,
#         scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
#         skip_drop=0.5, subsample=0.886285, subsample_for_bin=50000,
#         subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)
# Model 5 fold 1
# best round 11339
# ('Score: ', 0.51246952947927349)
# Model 5 fold 1 fitting finished in 473.511s
# Model 5 fold 2
# best round 13498
# ('Score: ', 0.49665250024834595)
# Model 5 fold 2 fitting finished in 544.382s
# Model 5 fold 3
# best round 12945
# ('Score: ', 0.52162202839399208)
# Model 5 fold 3 fitting finished in 522.065s
# Model 5 fold 4
# best round 12176
# ('Score: ', 0.49930094273345088)
# Model 5 fold 4 fitting finished in 478.193s
# Model 5 fold 5
# best round 10737
# ('Score: ', 0.53347270474127595)
# Model 5 fold 5 fitting finished in 464.415s
# Model 5 fold 6
# best round 9892
# ('Score: ', 0.52071472372899585)
# Model 5 fold 6 fitting finished in 436.333s
# Model 5 fold 7
# best round 10177
# ('Score: ', 0.52835928323419556)
# Model 5 fold 7 fitting finished in 411.271s
# Model 5 fold 8
# best round 11398
# ('Score: ', 0.54099733268775996)
# Model 5 fold 8 fitting finished in 449.035s
# Model 5 fold 9
# best round 11651
# ('Score: ', 0.53796773146228249)
# Model 5 fold 9 fitting finished in 503.545s
# Model 5 fold 10
# best round 12079
# ('Score: ', 0.5282561706228871)
# Model 5 fold 10 fitting finished in 498.074s
# Score for model 5 is 0.521981
# Score for blended models is 0.522630

In [29]:
now = datetime.now()

name_train_blend = '../blend/train_blend_LightGBM_BM_0331_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_mean = '../blend/test_blend_LightGBM_mean_BM_0331_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_gmean = '../blend/test_blend_LightGBM_gmean_BM_0331_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'


print (np.mean(blend_scores_gbm,axis=0))
print (np.mean(best_rounds_gbm,axis=0))
np.savetxt(name_train_blend,train_blend_x_gbm, delimiter=",")
np.savetxt(name_test_blend_mean,test_blend_x_gbm_mean, delimiter=",")
np.savetxt(name_test_blend_gmean,test_blend_x_gbm_gmean, delimiter=",")

[ 0.51778446  0.51758745  0.51859108  0.51763268  0.51941101]
[ 15053.7   8042.4  16049.9  13754.3  28486.6]


In [32]:
sub_name = '../output/sub_LightGBM_BM_0331_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

out_df = pd.DataFrame(test_blend_x_gbm_mean[:,6:9])
out_df.columns = ["low", "medium", "high"]
out_df["listing_id"] = sub_id
out_df.to_csv(sub_name, index=False)

In [None]:
# data 0322

# [ 0.52309656  0.52262419  0.52356597  0.52188342  0.52198129]
# [ 15266.3  12448.   15418.7   7687.9  11589.2]

In [35]:
temp = (test_blend_x_gbm_mean[:,6:9] +test_blend_x_gbm_gmean[:,6:9])/2

In [37]:
sub_name = '../output/sub_LightGBM_BM_0331_total_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

out_df = pd.DataFrame(temp)
out_df.columns = ["low", "medium", "high"]
out_df["listing_id"] = sub_id
out_df.to_csv(sub_name, index=False)