In [1]:
import pandas as pd
import time
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.feature_extraction.text import CountVectorizer
import random
from sklearn import preprocessing
import lightgbm as lgb
import gc
from scipy.stats import skew, boxcox
from bayes_opt import BayesianOptimization
from scipy import sparse
from sklearn.metrics import log_loss
from datetime import datetime
from scipy.stats.mstats import gmean

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

seed = 2017

# Load Data

In [43]:
data_path = "../input/"
train_X = pd.read_csv(data_path + 'train_BM_0401.csv')
test_X = pd.read_csv(data_path + 'test_BM_0401.csv')
train_y = np.ravel(pd.read_csv(data_path + 'labels_BrandenMurray.csv'))
sub_id = test_X.listing_id.astype('int32').values

# all_features = features_to_use + desc_sparse_cols + feat_sparse_cols
print train_X.shape, test_X.shape, train_y.shape

(49352, 428) (74659, 428) (49352L,)


In [44]:
train_y[:20]

array([1, 0, 1, 1, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0], dtype=int64)

In [45]:
train_X['target'] = train_y
train_X.head(20)

Unnamed: 0,building_id_mean_med,building_id_mean_high,manager_id_mean_med,manager_id_mean_high,median_price_bed,ratio_bed,compound,neg,neu,pos,...,feature_washer_&_dryer,feature_washer_in_unit,feature_wheelchair_access,feature_wheelchair_ramp,feature_wifi,feature_wifi_access,feature_wood-burning_fireplace,feature_yard,feature_yoga_classes,target
0,0.299643,0.05661408,0.403788,0.0594723,2900.0,0.827586,2.183865,-0.779948,0.099537,0.887005,...,0,0,0,0,0,0,0,0,0,1
1,0.409172,0.04557453,0.650389,0.0500421,3350.0,1.134328,1.138077,-0.779948,0.176361,0.723724,...,0,0,0,0,0,0,0,0,0,0
2,0.389029,0.04096503,0.194666,0.04161561,3350.0,1.043284,0.253714,-0.779948,0.053088,0.981956,...,0,0,0,0,0,0,0,0,0,1
3,0.222325,0.0766081,0.248812,6.214142e-31,4500.0,0.666667,-0.793989,-0.207931,0.603349,-0.426116,...,0,0,0,0,0,0,0,0,0,1
4,0.19315,0.04818481,0.140153,2.440795e-18,2400.0,1.164583,-1.34078,1.414587,0.558216,-0.768148,...,0,0,0,0,0,0,0,0,0,0
5,0.241598,0.1102479,0.429327,0.2268608,4500.0,1.6,0.183656,-0.779948,0.344683,0.336462,...,0,0,0,0,0,0,0,0,0,0
6,0.401675,0.02848683,0.428874,0.01460819,4500.0,1.333333,1.460108,1.036594,-0.923395,2.330864,...,0,0,0,0,0,0,0,0,0,0
7,0.251185,7.053514e-05,0.302649,0.1244522,2400.0,0.810417,-1.085039,1.046004,0.582473,-0.661079,...,0,0,0,0,0,0,0,0,0,2
8,0.227796,0.07754304,0.28017,0.1253419,2900.0,0.839655,0.775417,-0.779948,0.208093,0.653913,...,0,0,0,0,0,0,0,0,0,0
9,0.241791,0.07644576,0.038478,5.911671e-11,4500.0,1.522222,-0.085358,-0.334245,0.762564,-0.903934,...,0,0,0,0,0,0,0,0,0,0


In [46]:
train_X = train_X.sort_values(by='listing_id').reset_index(drop=True)
# train_X.reset_index
train_X.head(20)

Unnamed: 0,building_id_mean_med,building_id_mean_high,manager_id_mean_med,manager_id_mean_high,median_price_bed,ratio_bed,compound,neg,neu,pos,...,feature_washer_&_dryer,feature_washer_in_unit,feature_wheelchair_access,feature_wheelchair_ramp,feature_wifi,feature_wifi_access,feature_wood-burning_fireplace,feature_yard,feature_yoga_classes,target
0,0.209004,1.6471e-20,0.4977719,0.1662864,2900.0,1.101724,-1.34078,-0.779948,0.993037,-1.653623,...,0,0,0,0,0,0,0,0,0,2
1,0.223571,0.07631393,0.2848296,0.524041,2400.0,0.833333,-0.105235,1.693842,-0.314646,1.189324,...,0,0,0,0,0,0,0,0,0,1
2,0.539652,0.306711,0.3042367,0.5637824,4500.0,1.3,1.290802,-0.779948,0.000723,1.085749,...,0,0,0,0,0,0,0,0,0,2
3,0.182525,0.02091337,0.1834092,0.0209099,2900.0,0.946552,0.622219,-0.155228,0.343788,0.266971,...,0,0,0,0,0,0,0,0,0,1
4,0.262472,0.05285947,0.4941058,0.09875175,2900.0,0.827586,-0.37031,-0.779948,0.382525,0.24335,...,0,0,0,0,0,0,0,0,0,1
5,0.058923,0.02424558,0.2001526,0.06825954,2900.0,1.258621,0.92786,-0.779948,0.079705,0.927884,...,0,0,0,0,0,0,0,0,0,0
6,0.060274,0.02358336,0.2174423,0.07419695,3350.0,0.940299,-1.34078,-0.779948,0.993037,-1.653623,...,0,0,0,0,0,0,0,0,0,0
7,0.060676,0.0232558,4.9144259999999996e-30,1.694175e-30,2900.0,1.146552,-1.677115,0.947474,0.788451,-1.311848,...,0,0,0,0,0,0,0,0,0,0
8,0.058769,0.0242394,0.223737,0.09420263,2900.0,0.963793,-1.238563,1.674565,0.519566,-0.873793,...,0,0,0,0,0,0,0,0,0,0
9,0.060529,0.02337462,0.1134886,0.03877598,3350.0,0.716418,-0.828807,1.319638,0.041084,0.661329,...,0,0,0,0,0,0,0,0,0,0


In [47]:
train_y = train_X['target'].values
train_y[:20]

array([2, 1, 2, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [48]:
train_X = train_X.drop('target',axis = 1)

In [49]:
train_X.head()

Unnamed: 0,building_id_mean_med,building_id_mean_high,manager_id_mean_med,manager_id_mean_high,median_price_bed,ratio_bed,compound,neg,neu,pos,...,feature_washer/dryer_in_unit,feature_washer_&_dryer,feature_washer_in_unit,feature_wheelchair_access,feature_wheelchair_ramp,feature_wifi,feature_wifi_access,feature_wood-burning_fireplace,feature_yard,feature_yoga_classes
0,0.209004,1.6471e-20,0.497772,0.166286,2900.0,1.101724,-1.34078,-0.779948,0.993037,-1.653623,...,0,0,0,0,0,0,0,0,0,0
1,0.223571,0.07631393,0.28483,0.524041,2400.0,0.833333,-0.105235,1.693842,-0.314646,1.189324,...,0,0,0,0,0,0,0,0,0,0
2,0.539652,0.306711,0.304237,0.563782,4500.0,1.3,1.290802,-0.779948,0.000723,1.085749,...,0,0,0,0,0,0,0,0,0,0
3,0.182525,0.02091337,0.183409,0.02091,2900.0,0.946552,0.622219,-0.155228,0.343788,0.266971,...,0,0,0,0,0,0,0,0,0,0
4,0.262472,0.05285947,0.494106,0.098752,2900.0,0.827586,-0.37031,-0.779948,0.382525,0.24335,...,0,0,0,0,0,0,0,0,0,0


In [21]:
print train_X.isnull().values.any()

False


In [22]:
print test_X.isnull().values.any()

False


In [50]:
X_train, X_val, y_train, y_val = train_test_split(train_X, train_y, train_size=.80, random_state=1234)
print X_train.shape
print X_val.shape

(39481, 428)
(9871, 428)


In [51]:
clf = lgb.LGBMClassifier()
clf.set_params(learning_rate = 0.1)
clf.set_params(subsample_freq = 1)
clf.set_params(objective = 'multiclass')
clf.set_params(n_estimators = 100000)
        
clf = clf.fit(X_train, y_train,
              eval_set = [(X_val,y_val)],
              eval_metric = 'multi_logloss',
              early_stopping_rounds = 50,
              verbose = 25)

Train until valid scores didn't improve in 50 rounds.
[25]	valid_0's multi_logloss: 0.596546
[50]	valid_0's multi_logloss: 0.548529
[75]	valid_0's multi_logloss: 0.536572
[100]	valid_0's multi_logloss: 0.531046
[125]	valid_0's multi_logloss: 0.528372
[150]	valid_0's multi_logloss: 0.527127
[175]	valid_0's multi_logloss: 0.526887
[200]	valid_0's multi_logloss: 0.526267
[225]	valid_0's multi_logloss: 0.52549
[250]	valid_0's multi_logloss: 0.52519
[275]	valid_0's multi_logloss: 0.524247
[300]	valid_0's multi_logloss: 0.524606
Early stopping, best iteration is:
[272]	valid_0's multi_logloss: 0.524245


In [52]:
clf = lgb.LGBMClassifier()
clf.set_params(learning_rate = 0.1)
clf.set_params(subsample_freq = 1)
clf.set_params(objective = 'multiclass')
clf.set_params(n_estimators = 100000)

tmp  = 1000

In [53]:
for x in [8,15,31,63,127,255]:
    clf.set_params(num_leaves = x)
    clf = clf.fit(X_train, y_train,
                  eval_set = [(X_val,y_val)],
                  eval_metric = 'multi_logloss',
                  early_stopping_rounds = 50,
                  verbose = False)
    if tmp > clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]:
        num_leaves = x
        tmp = clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]


    print x, ' \t', clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1], clf.best_iteration

8  	0.523447639565 875
15  	0.522912525382 420
31  	0.524245132735 272
63  	0.524991927516 151
127  	0.527521218102 105
255  	0.534582199981 69


In [54]:
print num_leaves
clf.set_params(num_leaves = num_leaves)

15


LGBMClassifier(boosting_type='gbdt', colsample_bytree=1, drop_rate=0.1,
        is_unbalance=False, learning_rate=0.1, max_bin=255, max_depth=-1,
        max_drop=50, min_child_samples=10, min_child_weight=5,
        min_split_gain=0, n_estimators=100000, nthread=-1, num_leaves=15,
        objective='multiclass', reg_alpha=0, reg_lambda=0,
        scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=1, subsample_for_bin=50000,
        subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)

In [55]:
min_child_samples = 10

for x in [20, 30, 50, 70, 80,90,100,110,120,150,170,200,230,260]:
    clf.set_params(min_child_samples = x)
    clf = clf.fit(X_train, y_train,
                  eval_set = [(X_val,y_val)],
                  eval_metric = 'multi_logloss',
                  early_stopping_rounds = 50,
                  verbose = False)
    if tmp > clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]:
        min_child_samples = x
        tmp = clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]


    print x, ' \t', clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1], clf.best_iteration

20  	0.52380419891 487
30  	0.523905051926 536
50  	0.524536082485 504
70  	0.524777343749 432
80  	0.523260827651 415
90  	0.524723643548 352
100  	0.523072862878 491
110  	0.52484500751 474
120  	0.522249019108 455
150  	0.523104253988 417
170  	0.523137432607 481
200  	0.524790252619 445
230  	0.524761489481 479
260  	0.524733385745 400


In [56]:
for x in [300,350,400,450,500]:
    clf.set_params(min_child_samples = x)
    clf = clf.fit(X_train, y_train,
                  eval_set = [(X_val,y_val)],
                  eval_metric = 'multi_logloss',
                  early_stopping_rounds = 50,
                  verbose = False)
    if tmp > clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]:
        min_child_samples = x
        tmp = clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]


    print x, ' \t', clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1], clf.best_iteration

300  	0.522738726212 484
350  	0.526449033979 355
400  	0.525850893049 308
450  	0.525448032641 362
500  	0.523463602125 443


In [57]:
for x in [550,600,650,700,800]:
    clf.set_params(min_child_samples = x)
    clf = clf.fit(X_train, y_train,
                  eval_set = [(X_val,y_val)],
                  eval_metric = 'multi_logloss',
                  early_stopping_rounds = 50,
                  verbose = False)
    if tmp > clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]:
        min_child_samples = x
        tmp = clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]


    print x, ' \t', clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1], clf.best_iteration

550  	0.525388614134 485
600  	0.525510141911 526
650  	0.52553170707 486
700  	0.524184667943 467
800  	0.525769343021 433


In [58]:
print min_child_samples
clf.set_params(min_child_samples = min_child_samples)


120


LGBMClassifier(boosting_type='gbdt', colsample_bytree=1, drop_rate=0.1,
        is_unbalance=False, learning_rate=0.1, max_bin=255, max_depth=-1,
        max_drop=50, min_child_samples=120, min_child_weight=5,
        min_split_gain=0, n_estimators=100000, nthread=-1, num_leaves=15,
        objective='multiclass', reg_alpha=0, reg_lambda=0,
        scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=1, subsample_for_bin=50000,
        subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)

In [59]:
colsample_bytree = 1
for x in [0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]:
    clf.set_params(colsample_bytree = x)
    clf = clf.fit(X_train, y_train,
                  eval_set = [(X_val,y_val)],
                  eval_metric = 'multi_logloss',
                  early_stopping_rounds = 50,
                  verbose = False)
    if tmp > clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]:
        colsample_bytree = x
        tmp = clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]


    print x, ' \t', clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1], clf.best_iteration

0.2  	0.519869784299 554
0.3  	0.521271379637 558
0.4  	0.521401743122 485
0.5  	0.521179851153 480
0.6  	0.522457816553 528
0.7  	0.521238716306 496
0.8  	0.520888647177 497
0.9  	0.523197240089 475


In [64]:
colsample_bytree = 0.2
for x in [0.05,0.1]:
    clf.set_params(colsample_bytree = x)
    clf = clf.fit(X_train, y_train,
                  eval_set = [(X_val,y_val)],
                  eval_metric = 'multi_logloss',
                  early_stopping_rounds = 50,
                  verbose = False)
    if tmp > clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]:
        colsample_bytree = x
        tmp = clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]


    print x, ' \t', clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1], clf.best_iteration

0.05  	0.526687710672 1324
0.1  	0.524147788866 673


In [65]:
print colsample_bytree

clf.set_params(colsample_bytree = colsample_bytree)

0.2


LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.2, drop_rate=0.1,
        is_unbalance=False, learning_rate=0.1, max_bin=15, max_depth=-1,
        max_drop=50, min_child_samples=120, min_child_weight=5,
        min_split_gain=0, n_estimators=100000, nthread=-1, num_leaves=15,
        objective='multiclass', reg_alpha=0, reg_lambda=0,
        scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=1.0, subsample_for_bin=50000,
        subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)

In [66]:
subsample = 1.0
for x in [0.5,0.6,0.7,0.8,0.9]:
    clf.set_params(subsample = x)
    clf = clf.fit(X_train, y_train,
                  eval_set = [(X_val,y_val)],
                  eval_metric = 'multi_logloss',
                  early_stopping_rounds = 50,
                  verbose = False)
    if tmp > clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]:
        subsample = x
        tmp = clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]


    print x, ' \t', clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1], clf.best_iteration

0.5  	0.528421595081 637
0.6  	0.528364172063 383
0.7  	0.526122407828 600
0.8  	0.523388586702 660
0.9  	0.52271243082 628


In [67]:
print subsample
clf.set_params(subsample = subsample)

1.0


LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.2, drop_rate=0.1,
        is_unbalance=False, learning_rate=0.1, max_bin=15, max_depth=-1,
        max_drop=50, min_child_samples=120, min_child_weight=5,
        min_split_gain=0, n_estimators=100000, nthread=-1, num_leaves=15,
        objective='multiclass', reg_alpha=0, reg_lambda=0,
        scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=1.0, subsample_for_bin=50000,
        subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)

In [68]:
max_bin = 255

for x in [15,31,63, 127, 511, 1023, 2047]: #[200,300,400]:#
    clf.set_params(max_bin = x)
    clf = clf.fit(X_train, y_train,
                  eval_set = [(X_val,y_val)],
                  eval_metric = 'multi_logloss',
                  early_stopping_rounds = 50,
                  verbose = False)
    if tmp > clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]:
        max_bin = x
        tmp = clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]


    print x, ' \t', clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1], clf.best_iteration

15  	0.522213717443 864
31  	0.523198247165 691
63  	0.520753717986 635
127  	0.520232551802 498
511  	0.519190662898 759
1023  	0.520922233368 508
2047  	0.520050347558 617


In [69]:
for x in [110, 150,180, 210, 300, 350,400,450, 550,600,650,700,800,900]:
    clf.set_params(max_bin = x)
    clf = clf.fit(X_train, y_train,
                  eval_set = [(X_val,y_val)],
                  eval_metric = 'multi_logloss',
                  early_stopping_rounds = 50,
                  verbose = False)
    if tmp > clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]:
        max_bin = x
        tmp = clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]


    print x, ' \t', clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1], clf.best_iteration

110  	0.522053943788 633
150  	0.520780359971 626
180  	0.5198894751 653
210  	0.520989454107 586
300  	0.519787284522 528
350  	0.520464351886 545
400  	0.520515812175 637
450  	0.520687859183 648
550  	0.520988863694 549
600  	0.520354342361 650
650  	0.519570048811 611
700  	0.52000395968 588
800  	0.521005882578 700
900  	0.521934735309 653


In [70]:
print max_bin
clf.set_params(max_bin = max_bin)

511


LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.2, drop_rate=0.1,
        is_unbalance=False, learning_rate=0.1, max_bin=511, max_depth=-1,
        max_drop=50, min_child_samples=120, min_child_weight=5,
        min_split_gain=0, n_estimators=100000, nthread=-1, num_leaves=15,
        objective='multiclass', reg_alpha=0, reg_lambda=0,
        scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=1.0, subsample_for_bin=50000,
        subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)

In [72]:
def lgbm_cv(max_bin, num_leaves, min_child_samples, colsample_bytree, subsample, learning_rate=0.1):
    skf = KFold(n_splits=5,random_state=seed)
    scores=[]
    for i, (train, val) in enumerate(skf.split(train_X)):
        est=lgb.LGBMClassifier(learning_rate=0.1,
                               max_bin=int(max_bin),
                               num_leaves=int(num_leaves),
                               min_child_samples=int(min_child_samples),
                               colsample_bytree=colsample_bytree,
                               subsample=subsample,
                               subsample_freq = 1
                              )
 
        train_x_fold = train_X.iloc[train]
        train_y_fold = train_y[train]
        val_x_fold = train_X.iloc[val]
        val_y_fold = train_y[val]
        est.set_params( n_estimators=100000)
        est.fit(train_x_fold,
                train_y_fold,
                eval_set=[(val_x_fold, val_y_fold)],
                eval_metric='multi_logloss',
                early_stopping_rounds=50,
                verbose = False
               )
        val_y_predict_fold = est.predict_proba(val_x_fold)
        score = log_loss(val_y_fold, val_y_predict_fold)
        scores.append(score)
    return -np.mean(scores)


lgbm_BO = BayesianOptimization(lgbm_cv, 
                               {
                                'max_bin': (63,1000),
                                'num_leaves': (8,80),
                                'min_child_samples' :(60,500),
                                'colsample_bytree': (0.1,0.9),
                                'subsample' : (0.7,1)})

lgbm_BO.maximize(init_points=10, n_iter=50)

[31mInitialization[0m
[94m-----------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |   max_bin |   min_child_samples |   num_leaves |   subsample | 
    1 | 01m37s | [35m  -0.53455[0m | [32m            0.3972[0m | [32m 335.1068[0m | [32m           197.3932[0m | [32m     42.2940[0m | [32m     0.9286[0m | 
    2 | 01m40s |   -0.53631 |             0.1528 |  278.1771 |            249.9185 |      36.1665 |      0.9593 | 
    3 | 02m22s |   -0.53979 |             0.1298 |  574.6983 |            309.2746 |      79.9042 |      0.9485 | 
    4 | 02m20s |   -0.53496 |             0.6291 |  244.8545 |            409.2169 |      76.2128 |      0.9338 | 
    5 | 02m45s | [35m  -0.53440[0m | [32m            0.7582[0m | [32m 415.4839[0m | [32m           295.8915[0m | [32m     36.9249[0m | [32m     1.0000[0m | 
    6 | 02m14s |   -0.53685 |             0.3990 |  850

  " state: %s" % convergence_dict)


[31mBayesian Optimization[0m
[94m-----------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |   max_bin |   min_child_samples |   num_leaves |   subsample | 
   11 | 03m30s |   -0.53554 |             0.6055 |  987.0283 |            270.9801 |      10.7650 |      0.8168 | 
   12 | 01m53s |   -0.53507 |             0.3345 |  949.7900 |             84.7068 |      23.3041 |      0.9200 | 
   13 | 02m31s |   -0.53527 |             0.8447 |  147.8238 |             71.8293 |      67.8771 |      0.7623 | 
   14 | 02m27s |   -0.53473 |             0.7459 |  427.2431 |             68.0999 |      28.5079 |      0.7307 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   15 | 02m20s |   -0.53900 |             0.4829 |  982.7768 |            482.6601 |      48.9826 |      0.7836 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   16 | 01m39s |   -0.53725 |             0.3491 |  320.3157 |            451.5372 |      29.5362 |      0.7992 | 
   17 | 03m33s |   -0.53548 |             0.7188 |  957.9129 |            253.0793 |      75.7861 |      0.9585 | 
   18 | 02m04s |   -0.53597 |             0.3828 |  890.7508 |            180.3190 |      10.8158 |      0.7921 | 
   19 | 03m36s |   -0.53517 |             0.7035 |  453.9726 |            150.7810 |      72.0485 |      0.9091 | 
   20 | 02m28s |   -0.53542 |             0.7692 |  290.7798 |             68.0198 |      74.0213 |      0.8386 | 
   21 | 01m51s |   -0.53465 |             0.2525 |  836.1215 |             61.4188 |      50.7367 |      0.8980 | 


  " state: %s" % convergence_dict)


   22 | 01m28s |   -0.53664 |             0.1457 |   69.3069 |            329.9957 |      37.4253 |      0.8778 | 
   23 | 04m18s |   -0.53708 |             0.8304 |  918.5980 |             71.2561 |      69.9413 |      0.7458 | 


  " state: %s" % convergence_dict)


   24 | 01m49s |   -0.53632 |             0.6325 |   63.0317 |            155.2661 |      67.4792 |      0.7909 | 


  " state: %s" % convergence_dict)


   25 | 02m54s |   -0.53575 |             0.7254 |  746.7933 |            159.2006 |      51.4805 |      0.7575 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   26 | 01m37s |   -0.53723 |             0.2194 |  848.1631 |             92.0648 |      46.4313 |      0.7168 | 


  " state: %s" % convergence_dict)


   27 | 03m12s |   -0.53388 |             0.3397 |  825.0653 |             80.7654 |      70.9759 |      0.9785 | 
   28 | 02m25s |   -0.53501 |             0.4705 |  766.7361 |            308.8398 |      57.2894 |      0.9614 | 
   29 | 01m42s |   -0.54001 |             0.4461 |   72.1440 |            475.0500 |      72.6065 |      0.7570 | 


  " state: %s" % convergence_dict)


   30 | 01m52s |   -0.53519 |             0.3011 |  990.0439 |            147.0767 |      14.2016 |      0.9394 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   31 | 03m17s |   -0.53935 |             0.6608 |  669.3812 |            467.9600 |      14.3093 |      0.7040 | 
   32 | 03m02s |   -0.53939 |             0.8712 |  499.5160 |            495.0878 |      75.7846 |      0.7145 | 
   33 | 03m34s |   -0.53528 |             0.7099 |  890.1592 |            289.1036 |      68.2002 |      0.8998 | 
   34 | 01m28s |   -0.53622 |             0.1430 |  107.5720 |            121.1598 |      20.4682 |      0.7697 | 
   35 | 02m23s |   -0.53825 |             0.1491 |  567.7630 |             90.9891 |      50.6571 |      0.7748 | 
   36 | 02m54s |   -0.53647 |             0.8672 |  151.5483 |            281.2266 |      75.2219 |      0.8788 | 


  " state: %s" % convergence_dict)


   37 | 02m51s |   -0.53930 |             0.8362 |  192.7165 |            499.3348 |      10.1408 |      0.7462 | 
   38 | 01m42s |   -0.54045 |             0.1266 |  796.6862 |            198.0553 |      76.5914 |      0.8209 | 
   39 | 02m03s |   -0.53631 |             0.5203 |  401.3755 |            147.4150 |      13.7352 |      0.7260 | 


  " state: %s" % convergence_dict)


   40 | 02m34s |   -0.53736 |             0.6449 |  382.9687 |            325.3132 |      75.8212 |      0.7666 | 


  " state: %s" % convergence_dict)


   41 | 02m50s |   -0.53396 |             0.5175 |   64.0132 |             73.6541 |      11.4813 |      0.8887 | 


  " state: %s" % convergence_dict)


   42 | 02m26s |   -0.53676 |             0.4759 |  876.8490 |            336.7889 |      10.6877 |      0.7037 | 
   43 | 02m28s |   -0.53592 |             0.5163 |  666.3633 |            187.0051 |      21.9445 |      0.7373 | 
   44 | 02m14s |   -0.53721 |             0.5509 |  414.9060 |             65.8363 |      76.9343 |      0.7217 | 


  " state: %s" % convergence_dict)


   45 | 03m26s |   -0.53405 |             0.8569 |  166.7589 |             60.1644 |      12.7236 |      0.9535 | 
   46 | 03m21s |   -0.53575 |             0.7337 |  255.4097 |            138.7666 |      10.0094 |      0.8033 | 


  " state: %s" % convergence_dict)


   47 | 01m43s |   -0.53607 |             0.2538 |  203.3084 |            376.6512 |      15.6626 |      0.7876 | 


  " state: %s" % convergence_dict)


   48 | 03m27s |   -0.53553 |             0.8523 |  496.9743 |            207.2984 |      13.4535 |      0.9888 | 
   49 | 03m29s |   -0.53374 |             0.8920 |  351.3001 |             64.7570 |      14.6907 |      0.8868 | 
   50 | 02m32s |   -0.53939 |             0.5780 |  779.5202 |            495.0038 |      79.4391 |      0.7839 | 
   51 | 02m49s |   -0.53484 |             0.5394 |  752.7225 |            358.8743 |      12.7207 |      0.8524 | 
   52 | 02m19s |   -0.53413 |             0.4471 |  483.3408 |            370.0730 |      19.3280 |      0.9625 | 


  " state: %s" % convergence_dict)


   53 | 02m02s |   -0.53580 |             0.6487 |   69.6294 |             67.9062 |      74.4751 |      0.8529 | 


  " state: %s" % convergence_dict)


   54 | 03m30s |   -0.53461 |             0.7768 |  713.1285 |            315.3862 |      12.5283 |      0.9357 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   55 | 03m06s |   -0.53549 |             0.8471 |  420.3840 |            405.3900 |      13.3197 |      0.8568 | 
   56 | 02m59s |   -0.53534 |             0.7055 |  471.8181 |            267.3971 |      14.8765 |      0.9853 | 
   57 | 03m21s |   -0.53554 |             0.3868 |  972.7902 |            324.5050 |      73.1005 |      0.9193 | 


  " state: %s" % convergence_dict)


   58 | 03m01s |   -0.53479 |             0.8804 |  289.1428 |             64.3399 |      13.6808 |      0.8719 | 
   59 | 02m02s |   -0.53943 |             0.6065 |  234.6067 |            493.8118 |      75.6841 |      0.7175 | 


  " state: %s" % convergence_dict)


   60 | 02m22s |   -0.53781 |             0.5563 |  800.1224 |            370.3717 |      72.1752 |      0.7148 | 


In [73]:
gbm_bo_scores = pd.DataFrame([[s[0]['num_leaves'],
                               s[0]['min_child_samples'],
                               s[0]['max_bin'],
                               s[0]['colsample_bytree'],
                               s[0]['subsample'],
                               s[1]] for s in zip(lgbm_BO.res['all']['params'],lgbm_BO.res['all']['values'])],
                            columns = ['num_leaves',
                                       'min_child_samples',
                                       'max_bin',
                                       'colsample_bytree',
                                       'subsample',
                                       'score'])
gbm_bo_scores=gbm_bo_scores.sort_values('score',ascending=False)
gbm_bo_scores.head(10)

Unnamed: 0,num_leaves,min_child_samples,max_bin,colsample_bytree,subsample,score
38,14.690687,64.757042,351.300118,0.892044,0.886818,-0.533743
16,70.975862,80.765399,825.065274,0.339679,0.978516,-0.533882
30,11.481279,73.654074,64.013241,0.517505,0.888681,-0.533957
34,12.723564,60.164362,166.758938,0.856873,0.953502,-0.534054
41,19.328042,370.073023,483.340785,0.44709,0.962529,-0.534129
43,12.528268,315.386235,713.128543,0.776829,0.935694,-0.534608
10,50.736712,61.418815,836.121489,0.252467,0.898009,-0.534649
3,28.507858,68.099928,427.243121,0.74593,0.730671,-0.534735
47,13.680802,64.339886,289.1428,0.880381,0.871935,-0.534792
40,12.720713,358.874296,752.722504,0.539404,0.852383,-0.534844


In [75]:
def lgbm_blend(estimators, train_x, train_y, test_x, fold, early_stopping_rounds=50):
    N_params = len(estimators)
    print ("Blend %d estimators for %d folds" % (N_params, fold))
    skf = KFold(n_splits=fold,shuffle = True, random_state=seed)
    N_class = len(set(train_y))

    train_blend_x = np.zeros((train_x.shape[0], N_class*N_params))
    test_blend_x_mean = np.zeros((test_x.shape[0], N_class*N_params))
    test_blend_x_gmean = np.zeros((test_x.shape[0], N_class*N_params))
    scores = np.zeros((fold,N_params))
    best_rounds = np.zeros((fold, N_params))    

    
    for j, est in enumerate(estimators):
        est.set_params(learning_rate = 0.005)
        est.set_params(subsample_freq = 1)
        est.set_params(objective = 'multiclass')
        est.set_params(n_estimators = 1000000)

        
        print ("Model %d: %s" %(j+1, est)) 

        
        test_blend_x_j = np.zeros((test_x.shape[0], N_class*fold))
        for i, (train_index, val_index) in enumerate(skf.split(train_x)):
            print ("Model %d fold %d" %(j+1,i+1))
            fold_start = time.time() 
            train_x_fold = train_x.iloc[train_index]
            train_y_fold = train_y[train_index]
            val_x_fold = train_x.iloc[val_index]
            val_y_fold = train_y[val_index]
            
            est.fit(train_x_fold, train_y_fold,
                   eval_set = [(val_x_fold,val_y_fold)],
                   eval_metric = 'multi_logloss',
                   early_stopping_rounds = early_stopping_rounds,
                   verbose = False)
            
            best_round=est.best_iteration
            best_rounds[i,j]=best_round
            print ("best round %d" % (best_round))
            
            val_y_predict_fold = est.predict_proba(val_x_fold,num_iteration = best_round)
            score = log_loss(val_y_fold, val_y_predict_fold)
            print ("Score: ", score)
            scores[i,j]=score   
            train_blend_x[val_index, (j*N_class):(j+1)*N_class] = val_y_predict_fold
            test_blend_x_j[:,(i*N_class):(i+1)*N_class] = est.predict_proba(test_x,num_iteration=best_round)
            
            print ("Model %d fold %d fitting finished in %0.3fs" % (j+1,i+1, time.time() - fold_start))            
            
#         test_blend_x[:,(j*N_class):(j+1)*N_class] = \
#                 np.stack([test_blend_x_j[:,range(0,N_class*fold,N_class)].mean(1),
#                           test_blend_x_j[:,range(1,N_class*fold,N_class)].mean(1),
#                           test_blend_x_j[:,range(2,N_class*fold,N_class)].mean(1)]).T
            
        test_blend_x_mean[:,(j*N_class):(j+1)*N_class] = \
                np.stack([test_blend_x_j[:,range(0,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(1,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(2,N_class*fold,N_class)].mean(1)]).T
        
        test_blend_x_gmean[:,(j*N_class):(j+1)*N_class] = \
                np.stack([gmean(test_blend_x_j[:,range(0,N_class*fold,N_class)], axis=1),
                          gmean(test_blend_x_j[:,range(1,N_class*fold,N_class)], axis=1),
                          gmean(test_blend_x_j[:,range(2,N_class*fold,N_class)], axis=1)]).T
            
        print ("Score for model %d is %f" % (j+1,np.mean(scores[:,j])))
    print ("Score for blended models is %f" % (np.mean(scores)))
    return (train_blend_x, test_blend_x_mean, test_blend_x_gmean, scores,best_rounds)

In [76]:
est =       [
#     lgb.LGBMClassifier(num_leaves = 14,
#                                 min_child_samples = 64,
#                                 colsample_bytree = 0.892044,
#                                 subsample = 0.886818,
#                                 max_bin = 351),
             lgb.LGBMClassifier(num_leaves = 70,
                                min_child_samples = 80,
                                colsample_bytree = 0.339679,
                                subsample = 0.978516,
                                max_bin = 825),
#              lgb.LGBMClassifier(num_leaves = 31,
#                                 min_child_samples = 66,
#                                 colsample_bytree = 0.425728,
#                                 subsample = 0.899097,
#                                 max_bin = 421),
#              lgb.LGBMClassifier(num_leaves = 17,
#                                 min_child_samples = 64,
#                                 colsample_bytree = 0.497790,
#                                 subsample = 0.934519,
#                                 max_bin = 559),
#              lgb.LGBMClassifier(num_leaves = 16,
#                                 min_child_samples = 66,
#                                 colsample_bytree = 0.623331,
#                                 subsample = 0.934423,
#                                 max_bin = 357)
            ]

#  	 	num_leaves 	min_child_samples 	max_bin 	colsample_bytree 	subsample 	score
# 38 	14.690687 	64.757042 	 	 	351.300118 	0.892044 	 	 	0.886818 	-0.533743
# 16 	70.975862 	80.765399 	 	 	825.065274 	0.339679 	 	 	0.978516 	-0.533882


(train_blend_x_gbm,
 test_blend_x_gbm_mean,
 test_blend_x_gbm_gmean,
 blend_scores_gbm,
 best_rounds_gbm)= lgbm_blend(est, 
                               train_X, train_y, 
                               test_X,
                               10,
                               1000) #as the learning rate decreases the number of stopping rounds need to be increased

Blend 1 estimators for 10 folds
Model 1: LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.339679, drop_rate=0.1,
        is_unbalance=False, learning_rate=0.005, max_bin=825, max_depth=-1,
        max_drop=50, min_child_samples=80, min_child_weight=5,
        min_split_gain=0, n_estimators=1000000, nthread=-1, num_leaves=70,
        objective='multiclass', reg_alpha=0, reg_lambda=0,
        scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=0.978516, subsample_for_bin=50000,
        subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)
Model 1 fold 1
best round 3993
('Score: ', 0.51410061879901192)
Model 1 fold 1 fitting finished in 967.043s
Model 1 fold 2
best round 4219
('Score: ', 0.53072953974363501)
Model 1 fold 2 fitting finished in 992.226s
Model 1 fold 3
best round 4436
('Score: ', 0.52116742359637547)
Model 1 fold 3 fitting finished in 909.220s
Model 1 fold 4
best round 4677
('Score: ', 0.50288131181666806)
Model 1 fold 4 fit

In [87]:
tmp = pd.DataFrame(train_blend_x_gbm,columns=["low", "medium", "high"])
tmp['listing_id'] = train_X['listing_id'].values
tmp_X = pd.read_csv(data_path + 'train_BM_0401.csv')
tmp_X = tmp_X.merge(tmp,on='listing_id',how='left')

Unnamed: 0,low,medium,high,listing_id
0,0.478539,0.442273,0.079188,6811957
1,0.045494,0.183336,0.771171,6811965
2,0.046062,0.551995,0.401944,6811966
3,0.424488,0.515695,0.059817,6811973
4,0.549026,0.384421,0.066553,6811975


In [94]:
train_blend_x_gbm = tmp_X[["low", "medium", "high"]].values

In [95]:
train_blend_x_gbm

array([[ 0.56978898,  0.39050017,  0.03971085],
       [ 0.47319149,  0.48903452,  0.03777399],
       [ 0.65699981,  0.3284372 ,  0.014563  ],
       ..., 
       [ 0.48368995,  0.44174765,  0.0745624 ],
       [ 0.40629347,  0.50903604,  0.08467048],
       [ 0.11609501,  0.39095831,  0.49294668]])

In [96]:
now = datetime.now()

name_train_blend = '../blend/train_blend_LightGBM_sortbylistingid_0401_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_mean = '../blend/test_blend_LightGBM_mean_sortbylistingid_0401_sortbylistingid_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_gmean = '../blend/test_blend_LightGBM_gmean_sortbylistingid_0401_sortbylistingid_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'


print (np.mean(blend_scores_gbm,axis=0))
print (np.mean(best_rounds_gbm,axis=0))
np.savetxt(name_train_blend,train_blend_x_gbm, delimiter=",")
np.savetxt(name_test_blend_mean,test_blend_x_gbm_mean, delimiter=",")
np.savetxt(name_test_blend_gmean,test_blend_x_gbm_gmean, delimiter=",")

[ 0.5175872]
[ 4335.9]


In [7]:
sub_name = '../output/sub_LightGBM_BM_0401_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

out_df = pd.DataFrame(test_blend_x_gbm_mean[:,6:9])
out_df.columns = ["low", "medium", "high"]
out_df["listing_id"] = sub_id
out_df.to_csv(sub_name, index=False)

In [None]:
# data 0322

# [ 0.52309656  0.52262419  0.52356597  0.52188342  0.52198129]
# [ 15266.3  12448.   15418.7   7687.9  11589.2]

# data 0331
# [ 0.51778446  0.51758745  0.51859108  0.51763268  0.51941101]
# [ 15053.7   8042.4  16049.9  13754.3  28486.6]


In [35]:
temp = (test_blend_x_gbm_mean[:,6:9] +test_blend_x_gbm_gmean[:,6:9])/2

In [37]:
sub_name = '../output/sub_LightGBM_BM_0331_total_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

out_df = pd.DataFrame(temp)
out_df.columns = ["low", "medium", "high"]
out_df["listing_id"] = sub_id
out_df.to_csv(sub_name, index=False)