In [1]:
import pandas as pd
import time
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.feature_extraction.text import CountVectorizer
import random
from sklearn import preprocessing
import lightgbm as lgb
import gc
from scipy.stats import skew, boxcox
from bayes_opt import BayesianOptimization
from scipy import sparse
from sklearn.metrics import log_loss
from datetime import datetime
from scipy.stats.mstats import gmean

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

seed = 2017

# Load Data

In [2]:
#input data
train_df=pd.read_json('../input/train.json').reset_index(drop = True)
test_df=pd.read_json('../input/test.json').reset_index(drop = True)

print train_df.shape
print test_df.shape

(49352, 15)
(74659, 14)


In [3]:
data_path = "../input/"
train_X_0322 = pd.read_csv(data_path + 'train_BM_MB_add03052240.csv')
test_X_0322 = pd.read_csv(data_path + 'test_BM_MB_add03052240.csv')

print train_X_0322.shape
print test_X_0322.shape

(49352, 322)
(74659, 322)


In [4]:
target_num_map = {'high':2, 'medium':1, 'low':0}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))

In [5]:
data_path = "../input/"
train_X = pd.read_csv(data_path + 'train_CV_MS_52571.csv')
test_X = pd.read_csv(data_path + 'test_CV_MS_52571.csv')

In [6]:
time_feature = pd.read_csv(data_path + 'listing_image_time.csv')
time_feature.columns = ['listing_id','time_stamp']
train_X = train_X.merge(time_feature,on='listing_id',how='left')
test_X = test_X.merge(time_feature,on='listing_id',how='left')

print train_X.shape
print test_X.shape

(49352, 223)
(74659, 223)


In [7]:
X_train, X_val, y_train, y_val = train_test_split(train_X, train_y, train_size=.80, random_state=1234)
print X_train.shape
print X_val.shape

# import sys  
# stdi,stdo,stde=sys.stdin,sys.stdout,sys.stderr
# reload(sys)  
# sys.stdin,sys.stdout,sys.stderr=stdi,stdo,stde
# sys.setdefaultencoding('utf8')

(39481, 223)
(9871, 223)


In [8]:
clf = lgb.LGBMClassifier()
clf.set_params(learning_rate = 0.1)
clf.set_params(subsample_freq = 1)
clf.set_params(objective = 'multiclass')
clf.set_params(n_estimators = 100000)
        
clf = clf.fit(X_train, y_train,
              eval_set = [(X_val,y_val)],
              eval_metric = 'multi_logloss',
              early_stopping_rounds = 50,
              verbose = 25)

Train until valid scores didn't improve in 50 rounds.
[25]	valid_0's multi_logloss: 0.588823
[50]	valid_0's multi_logloss: 0.538591
[75]	valid_0's multi_logloss: 0.524487
[100]	valid_0's multi_logloss: 0.517198
[125]	valid_0's multi_logloss: 0.51425
[150]	valid_0's multi_logloss: 0.512217
[175]	valid_0's multi_logloss: 0.510664
[200]	valid_0's multi_logloss: 0.509768
[225]	valid_0's multi_logloss: 0.509511
[250]	valid_0's multi_logloss: 0.509082
[275]	valid_0's multi_logloss: 0.509147
[300]	valid_0's multi_logloss: 0.50957
[325]	valid_0's multi_logloss: 0.509903
Early stopping, best iteration is:
[286]	valid_0's multi_logloss: 0.508955


In [9]:
pred_y = clf.predict_proba(test_X, num_iteration = clf.best_iteration)
pred_y

array([[ 0.14490733,  0.62461861,  0.23047406],
       [ 0.95660061,  0.03789534,  0.00550405],
       [ 0.8776459 ,  0.11560843,  0.00674567],
       ..., 
       [ 0.7207035 ,  0.23590706,  0.04338944],
       [ 0.10608034,  0.45213807,  0.44178159],
       [ 0.96449493,  0.03358761,  0.00191746]])

In [9]:
# now = datetime.now()
# sub_name = '../output/sub_LightGBM_BM_0322_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

# out_df = pd.DataFrame(pred_y[:,:3])
# out_df.columns = ["low", "medium", "high"]
# out_df["listing_id"] = sub_id
# out_df.to_csv(sub_name, index=False)

# Tune LightGBM

In [10]:
clf = lgb.LGBMClassifier()
clf.set_params(learning_rate = 0.1)
clf.set_params(subsample_freq = 1)
clf.set_params(objective = 'multiclass')
clf.set_params(n_estimators = 100000)

tmp  = 1000

In [11]:
for x in [8,15,31,63,127,255]:
    clf.set_params(num_leaves = x)
    clf = clf.fit(X_train, y_train,
                  eval_set = [(X_val,y_val)],
                  eval_metric = 'multi_logloss',
                  early_stopping_rounds = 50,
                  verbose = False)
    if tmp > clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]:
        num_leaves = x
        tmp = clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]


    print x, ' \t', clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1], clf.best_iteration

8  	0.510622313281 751
15  	0.510599053691 445
31  	0.508955275492 286
63  	0.509733744621 157
127  	0.512473910556 105
255  	0.52244729458 69


In [12]:
print num_leaves
clf.set_params(num_leaves = num_leaves)

31


LGBMClassifier(boosting_type='gbdt', colsample_bytree=1, drop_rate=0.1,
        is_unbalance=False, learning_rate=0.1, max_bin=255, max_depth=-1,
        max_drop=50, min_child_samples=10, min_child_weight=5,
        min_split_gain=0, n_estimators=100000, nthread=-1, num_leaves=31,
        objective='multiclass', reg_alpha=0, reg_lambda=0,
        scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=1, subsample_for_bin=50000,
        subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)

In [13]:
min_child_samples = 10

for x in [20, 30, 50, 70, 80,90,100,110,120,150,170,200,230,260]:
    clf.set_params(min_child_samples = x)
    clf = clf.fit(X_train, y_train,
                  eval_set = [(X_val,y_val)],
                  eval_metric = 'multi_logloss',
                  early_stopping_rounds = 50,
                  verbose = False)
    if tmp > clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]:
        min_child_samples = x
        tmp = clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]


    print x, ' \t', clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1], clf.best_iteration

20  	0.508801127766 300
30  	0.510046803883 237
50  	0.508428019615 253
70  	0.50986653128 217
80  	0.507873251903 313
90  	0.50960123034 250
100  	0.508001037864 245
110  	0.507422421625 269
120  	0.508982438783 290
150  	0.507776221775 246
170  	0.507834646192 259
200  	0.508524307346 241
230  	0.508264263442 293
260  	0.509057569341 204


In [14]:
for x in [300,350,400,450,500]:
    clf.set_params(min_child_samples = x)
    clf = clf.fit(X_train, y_train,
                  eval_set = [(X_val,y_val)],
                  eval_metric = 'multi_logloss',
                  early_stopping_rounds = 50,
                  verbose = False)
    if tmp > clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]:
        min_child_samples = x
        tmp = clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]


    print x, ' \t', clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1], clf.best_iteration

300  	0.509475978131 280
350  	0.508225751106 235
400  	0.508510220635 265
450  	0.509242192358 227
500  	0.510873003257 266


In [15]:
print min_child_samples
clf.set_params(min_child_samples = min_child_samples)


110


LGBMClassifier(boosting_type='gbdt', colsample_bytree=1, drop_rate=0.1,
        is_unbalance=False, learning_rate=0.1, max_bin=255, max_depth=-1,
        max_drop=50, min_child_samples=110, min_child_weight=5,
        min_split_gain=0, n_estimators=100000, nthread=-1, num_leaves=31,
        objective='multiclass', reg_alpha=0, reg_lambda=0,
        scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=1, subsample_for_bin=50000,
        subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)

In [16]:
colsample_bytree = 1
for x in [0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]:
    clf.set_params(colsample_bytree = x)
    clf = clf.fit(X_train, y_train,
                  eval_set = [(X_val,y_val)],
                  eval_metric = 'multi_logloss',
                  early_stopping_rounds = 50,
                  verbose = False)
    if tmp > clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]:
        colsample_bytree = x
        tmp = clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]


    print x, ' \t', clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1], clf.best_iteration

0.2  	0.507058612673 325
0.3  	0.505958981181 285
0.4  	0.507203992338 313
0.5  	0.505348103064 324
0.6  	0.508285337677 334
0.7  	0.507455853861 298
0.8  	0.509746019328 237
0.9  	0.507877264001 270


In [17]:
print colsample_bytree

clf.set_params(colsample_bytree = colsample_bytree)

0.5


LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.5, drop_rate=0.1,
        is_unbalance=False, learning_rate=0.1, max_bin=255, max_depth=-1,
        max_drop=50, min_child_samples=110, min_child_weight=5,
        min_split_gain=0, n_estimators=100000, nthread=-1, num_leaves=31,
        objective='multiclass', reg_alpha=0, reg_lambda=0,
        scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=1, subsample_for_bin=50000,
        subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)

In [18]:
subsample = 1.0
for x in [0.5,0.6,0.7,0.8,0.9]:
    clf.set_params(subsample = x)
    clf = clf.fit(X_train, y_train,
                  eval_set = [(X_val,y_val)],
                  eval_metric = 'multi_logloss',
                  early_stopping_rounds = 50,
                  verbose = False)
    if tmp > clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]:
        subsample = x
        tmp = clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]


    print x, ' \t', clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1], clf.best_iteration

0.5  	0.514180096443 257
0.6  	0.511188112068 280
0.7  	0.509958029638 271
0.8  	0.509227886965 256
0.9  	0.507082987952 315


In [19]:
print subsample
clf.set_params(subsample = subsample)

1.0


LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.5, drop_rate=0.1,
        is_unbalance=False, learning_rate=0.1, max_bin=255, max_depth=-1,
        max_drop=50, min_child_samples=110, min_child_weight=5,
        min_split_gain=0, n_estimators=100000, nthread=-1, num_leaves=31,
        objective='multiclass', reg_alpha=0, reg_lambda=0,
        scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=1.0, subsample_for_bin=50000,
        subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)

In [20]:
max_bin = 255

for x in [15,31,63, 127, 511, 1023, 2047]: #[200,300,400]:#
    clf.set_params(max_bin = x)
    clf = clf.fit(X_train, y_train,
                  eval_set = [(X_val,y_val)],
                  eval_metric = 'multi_logloss',
                  early_stopping_rounds = 50,
                  verbose = False)
    if tmp > clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]:
        max_bin = x
        tmp = clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]


    print x, ' \t', clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1], clf.best_iteration

15  	0.514500520046 259
31  	0.509383120011 305
63  	0.507052514588 283
127  	0.50902541054 250
511  	0.506975171613 256
1023  	0.507322946675 257
2047  	0.50721377153 300


In [21]:
for x in [150,200,300,350,400]:
    clf.set_params(max_bin = x)
    clf = clf.fit(X_train, y_train,
                  eval_set = [(X_val,y_val)],
                  eval_metric = 'multi_logloss',
                  early_stopping_rounds = 50,
                  verbose = False)
    if tmp > clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]:
        max_bin = x
        tmp = clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]


    print x, ' \t', clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1], clf.best_iteration

150  	0.50844991167 287
200  	0.508507936091 253
300  	0.507923013597 317
350  	0.507106259968 275
400  	0.506939260004 257


In [23]:
for x in [550,600,650, 700,750,800]:
    clf.set_params(max_bin = x)
    clf = clf.fit(X_train, y_train,
                  eval_set = [(X_val,y_val)],
                  eval_metric = 'multi_logloss',
                  early_stopping_rounds = 50,
                  verbose = False)
    if tmp > clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]:
        max_bin = x
        tmp = clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]


    print x, ' \t', clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1], clf.best_iteration

550  	0.508498522582 263
600  	0.507616451545 288
650  	0.50817674737 266
700  	0.508281832035 236
750  	0.507573919401 291
800  	0.506912912043 294


In [22]:
print max_bin
clf.set_params(max_bin = max_bin)

255


LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.5, drop_rate=0.1,
        is_unbalance=False, learning_rate=0.1, max_bin=255, max_depth=-1,
        max_drop=50, min_child_samples=110, min_child_weight=5,
        min_split_gain=0, n_estimators=100000, nthread=-1, num_leaves=31,
        objective='multiclass', reg_alpha=0, reg_lambda=0,
        scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=1.0, subsample_for_bin=50000,
        subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)

In [25]:
def lgbm_cv(max_bin, num_leaves, min_child_samples, colsample_bytree, subsample, learning_rate=0.1):
    skf = KFold(n_splits=5,random_state=seed)
    scores=[]
    for i, (train, val) in enumerate(skf.split(train_X)):
        est=lgb.LGBMClassifier(learning_rate=0.1,
                               max_bin=int(max_bin),
                               num_leaves=int(num_leaves),
                               min_child_samples=int(min_child_samples),
                               colsample_bytree=colsample_bytree,
                               subsample=subsample,
                               subsample_freq = 1
                              )
 
        train_x_fold = train_X.iloc[train]
        train_y_fold = train_y[train]
        val_x_fold = train_X.iloc[val]
        val_y_fold = train_y[val]
        est.set_params( n_estimators=100000)
        est.fit(train_x_fold,
                train_y_fold,
                eval_set=[(val_x_fold, val_y_fold)],
                eval_metric='multi_logloss',
                early_stopping_rounds=50,
                verbose = False
               )
        val_y_predict_fold = est.predict_proba(val_x_fold,num_iteration=est.best_iteration)
        score = log_loss(val_y_fold, val_y_predict_fold)
        scores.append(score)
    return -np.mean(scores)


lgbm_BO = BayesianOptimization(lgbm_cv, 
                               {
                                'max_bin': (127,511),
                                'num_leaves': (15,127),
                                'min_child_samples' :(70,200),
                                'colsample_bytree': (0.2,0.7),
                                'subsample' : (0.8,1)})

lgbm_BO.maximize(init_points=10, n_iter=40)

[31mInitialization[0m
[94m-----------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |   max_bin |   min_child_samples |   num_leaves |   subsample | 
    1 | 02m15s | [35m  -0.51984[0m | [32m            0.2696[0m | [32m 453.5988[0m | [32m           148.6024[0m | [32m     84.7992[0m | [32m     0.9807[0m | 
    2 | 01m29s |   -0.52106 |             0.2047 |  185.9080 |             89.7965 |      79.6393 |      0.9058 | 
    3 | 02m30s |   -0.52396 |             0.2121 |  195.1548 |            112.2907 |     116.8766 |      0.8611 | 
    4 | 01m52s |   -0.52125 |             0.5756 |  234.2694 |            161.3905 |      82.9552 |      0.8089 | 
    5 | 02m12s | [35m  -0.51947[0m | [32m            0.4735[0m | [32m 308.4164[0m | [32m           198.0203[0m | [32m     68.6747[0m | [32m     0.8521[0m | 
    6 | 01m54s |   -0.51991 |             0.3404 |  434

  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   12 | 02m47s | [35m  -0.51724[0m | [32m            0.3303[0m | [32m 149.8989[0m | [32m            72.4408[0m | [32m     17.7380[0m | [32m     0.8735[0m | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   13 | 02m05s |   -0.51876 |             0.3449 |  202.5646 |            197.6180 |      21.1409 |      0.9445 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   14 | 02m38s |   -0.51897 |             0.6535 |  179.2739 |             72.1150 |      16.9902 |      0.9282 | 
   15 | 02m23s |   -0.51731 |             0.4360 |  305.3458 |             70.4679 |      16.6436 |      0.8941 | 
   16 | 03m10s |   -0.51816 |             0.5796 |  138.5868 |            188.4178 |      46.6201 |      0.9784 | 
   17 | 01m56s |   -0.51834 |             0.2207 |  133.6819 |            131.8981 |      20.1165 |      0.9064 | 
   18 | 01m35s |   -0.52034 |             0.2316 |  133.9744 |             73.2643 |      39.4984 |      0.8586 | 
   19 | 02m49s |   -0.51954 |             0.6212 |  420.9165 |            197.2228 |      17.2634 |      0.8751 | 
   20 | 03m09s |   -0.51812 |             0.3611 |  414.6929 |             70.3574 |      24.6119 |      0.9383 | 
   21 | 03m34s |   -0.52259 |             0.6633 |  419.9823 |             70.0671 |     126.5746 |      0.9258 | 
   22 | 02m38s |   -0.51891 |             0.4223 |  496.0241 |             89.37

  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   24 | 04m11s |   -0.51982 |             0.6629 |  336.1426 |             79.7485 |      16.3997 |      0.9399 | 


  " state: %s" % convergence_dict)


   25 | 01m43s |   -0.51999 |             0.4739 |  162.5283 |            195.4038 |     126.6713 |      0.9867 | 
   26 | 02m06s |   -0.51793 |             0.5297 |  226.2377 |             72.3611 |      15.0273 |      0.9315 | 
   27 | 02m49s |   -0.51841 |             0.3509 |  280.6198 |             72.6335 |      22.1573 |      0.9398 | 
   28 | 01m49s |   -0.51933 |             0.2206 |  392.8786 |            199.5551 |      69.7294 |      0.9912 | 


  " state: %s" % convergence_dict)


   29 | 02m25s |   -0.52126 |             0.5859 |  282.0271 |            194.5213 |     116.2780 |      0.9746 | 


  " state: %s" % convergence_dict)


   30 | 01m35s |   -0.51770 |             0.3235 |  165.9935 |            181.3839 |      47.9315 |      0.9002 | 
   31 | 02m58s |   -0.51846 |             0.3318 |  466.5652 |             75.9761 |      21.9407 |      0.9581 | 


  " state: %s" % convergence_dict)


   32 | 02m10s |   -0.51897 |             0.6926 |  161.9012 |            155.1353 |      51.5838 |      0.9390 | 
   33 | 02m15s |   -0.51920 |             0.2695 |  449.2952 |            196.1942 |      61.5250 |      0.8778 | 
   34 | 03m36s |   -0.52256 |             0.6963 |  508.0199 |             73.3694 |     118.5023 |      0.9655 | 
   35 | 02m00s |   -0.51856 |             0.3360 |  195.0799 |            194.4718 |      62.0153 |      0.9848 | 


  " state: %s" % convergence_dict)


   36 | 02m00s |   -0.51784 |             0.2664 |  152.5018 |            170.0339 |      25.0632 |      0.8795 | 
   37 | 02m30s |   -0.51977 |             0.4207 |  438.3189 |            149.2793 |      17.8988 |      0.9311 | 


  " state: %s" % convergence_dict)


   38 | 02m47s |   -0.51813 |             0.6632 |  142.8504 |            194.8516 |      20.3053 |      0.9043 | 


  " state: %s" % convergence_dict)


   39 | 02m28s |   -0.52006 |             0.5692 |  370.7397 |            193.7954 |     118.9099 |      0.9159 | 
   40 | 02m33s |   -0.52069 |             0.2700 |  131.0887 |            199.2556 |      79.2522 |      0.8067 | 
   41 | 01m24s |   -0.51952 |             0.2348 |  133.8214 |            161.4403 |      31.6986 |      0.8055 | 
   42 | 02m30s |   -0.51916 |             0.5266 |  327.4980 |            182.1242 |      26.7996 |      0.9831 | 
   43 | 01m55s |   -0.51826 |             0.3772 |  183.3714 |            137.9335 |      20.2016 |      0.8503 | 
   44 | 02m30s |   -0.51902 |             0.3446 |  171.9214 |             99.4287 |      16.7945 |      0.9639 | 
   45 | 03m07s |   -0.51826 |             0.3943 |  149.4050 |            199.1900 |      39.1987 |      0.8583 | 
   46 | 02m45s |   -0.51859 |             0.4529 |  265.4177 |            160.5125 |      15.3743 |      0.8353 | 
   47 | 03m13s |   -0.52098 |             0.6557 |  413.9109 |            199.75

  " state: %s" % convergence_dict)


   48 | 03m05s | [35m  -0.51676[0m | [32m            0.4170[0m | [32m 150.3986[0m | [32m            86.0221[0m | [32m     16.1275[0m | [32m     0.8598[0m | 


  " state: %s" % convergence_dict)


   49 | 02m45s |   -0.52063 |             0.5935 |  316.0501 |            196.7603 |      18.5986 |      0.9865 | 
   50 | 02m04s |   -0.51855 |             0.4869 |  507.4570 |            149.6155 |      47.6210 |      0.9752 | 


In [26]:
gbm_bo_scores = pd.DataFrame([[s[0]['num_leaves'],
                               s[0]['min_child_samples'],
                               s[0]['max_bin'],
                               s[0]['colsample_bytree'],
                               s[0]['subsample'],
                               s[1]] for s in zip(lgbm_BO.res['all']['params'],lgbm_BO.res['all']['values'])],
                            columns = ['num_leaves',
                                       'min_child_samples',
                                       'max_bin',
                                       'colsample_bytree',
                                       'subsample',
                                       'score'])
gbm_bo_scores=gbm_bo_scores.sort_values('score',ascending=False)
gbm_bo_scores.head(10)

Unnamed: 0,num_leaves,min_child_samples,max_bin,colsample_bytree,subsample,score
37,16.127511,86.022102,150.398557,0.416957,0.859761,-0.516757
1,17.737979,72.440783,149.898851,0.330273,0.873543,-0.517244
4,16.643581,70.467945,305.345802,0.436046,0.894116,-0.517307
19,47.931491,181.383895,165.993511,0.323495,0.900225,-0.517696
25,25.063164,170.033934,152.501751,0.266445,0.879513,-0.517844
15,15.027258,72.361097,226.237672,0.52969,0.931497,-0.517932
9,24.611915,70.357354,414.69293,0.361137,0.9383,-0.518121
27,20.305277,194.851596,142.85037,0.663189,0.904251,-0.518133
5,46.620119,188.417768,138.586753,0.57962,0.978396,-0.518157
34,39.198748,199.190016,149.404955,0.394342,0.858294,-0.51826


In [7]:
def lgbm_blend(estimators, train_x, train_y, test_x, fold, early_stopping_rounds=50, randomseed = 1234):
    N_params = len(estimators)
    print ("Blend %d estimators for %d folds" % (N_params, fold))
    skf = KFold(n_splits=fold,shuffle = True, random_state=randomseed)
    N_class = len(set(train_y))

    train_blend_x = np.zeros((train_x.shape[0], N_class*N_params))
    test_blend_x_mean = np.zeros((test_x.shape[0], N_class*N_params))
    test_blend_x_gmean = np.zeros((test_x.shape[0], N_class*N_params))
    scores = np.zeros((fold,N_params))
    best_rounds = np.zeros((fold, N_params))    

    
    for j, est in enumerate(estimators):
        est.set_params(learning_rate = 0.01)
        est.set_params(subsample_freq = 1)
        est.set_params(objective = 'multiclass')
        est.set_params(n_estimators = 100000)

        
#         print ("Model %d: %s" %(j+1, est)) 

        
        test_blend_x_j = np.zeros((test_x.shape[0], N_class*fold))
        for i, (train_index, val_index) in enumerate(skf.split(train_x)):
            print ("Model %d fold %d" %(j+1,i+1))
            fold_start = time.time() 
            train_x_fold = train_x.iloc[train_index]
            train_y_fold = train_y[train_index]
            val_x_fold = train_x.iloc[val_index]
            val_y_fold = train_y[val_index]
            
            est.fit(train_x_fold, train_y_fold,
                   eval_set = [(val_x_fold,val_y_fold)],
                   eval_metric = 'multi_logloss',
                   early_stopping_rounds = early_stopping_rounds,
                   verbose = False)
            
            best_round=est.best_iteration
            best_rounds[i,j]=best_round
            print ("best round %d" % (best_round))
            
            val_y_predict_fold = est.predict_proba(val_x_fold,num_iteration = best_round)
            score = log_loss(val_y_fold, val_y_predict_fold)
            print ("Score: ", score)
            scores[i,j]=score   
            train_blend_x[val_index, (j*N_class):(j+1)*N_class] = val_y_predict_fold
            test_blend_x_j[:,(i*N_class):(i+1)*N_class] = est.predict_proba(test_x,num_iteration=best_round)
            
            print ("Model %d fold %d fitting finished in %0.3fs" % (j+1,i+1, time.time() - fold_start))            
            
           
        test_blend_x_mean[:,(j*N_class):(j+1)*N_class] = \
                np.stack([test_blend_x_j[:,range(0,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(1,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(2,N_class*fold,N_class)].mean(1)]).T
        
#         test_blend_x_gmean[:,(j*N_class):(j+1)*N_class] = \
#                 np.stack([gmean(test_blend_x_j[:,range(0,N_class*fold,N_class)], axis=1),
#                           gmean(test_blend_x_j[:,range(1,N_class*fold,N_class)], axis=1),
#                           gmean(test_blend_x_j[:,range(2,N_class*fold,N_class)], axis=1)]).T
            
        print ("Score for model %d is %f" % (j+1,np.mean(scores[:,j])))
    print "Score for blended models is %f" % (np.mean(scores))
    return (train_blend_x, test_blend_x_mean, test_blend_x_gmean, scores,best_rounds)

In [8]:
train_total = np.zeros((train_X.shape[0], 3))
test_total = np.zeros((test_X.shape[0], 3))
name_train_blend = '../tmp/train_lightgbm.csv'
name_test_blend = '../tmp/test_lightgbm.csv'
score_total = 0
count = 10

for n in range(count):
    randomseed = n + 42745
    est =       [lgb.LGBMClassifier(num_leaves = 16,
                                    min_child_samples = 86,
                                    colsample_bytree = 0.416957,
                                    subsample = 0.859761,
                                    max_bin = 150)]
    
#  	 	num_leaves 	min_child_samples 	max_bin 	colsample_bytree 	subsample 	score
# 37 	16.127511 	86.022102 	 	 	150.398557 	0.416957 	 	 	0.859761 	-0.516757

    (train_blend_x_gbm,
     test_blend_x_gbm_mean,
     test_blend_x_gbm_gmean,
     blend_scores_gbm,
     best_rounds_gbm)= lgbm_blend(est,
                                  train_X, train_y,
                                  test_X,
                                  5,
                                  300, randomseed)
    
    train_total += train_blend_x_gbm
    test_total += test_blend_x_gbm_mean
    score_total += np.mean(blend_scores_gbm)
    
    np.savetxt(name_train_blend,train_total, delimiter=",")
    np.savetxt(name_test_blend,test_total, delimiter=",")
    
train_total = train_total / count
test_total = test_total / count
score_total = score_total / count

Blend 1 estimators for 5 folds
Model 1 fold 1
best round 5727
('Score: ', 0.51204396688238618)
Model 1 fold 1 fitting finished in 249.140s
Model 1 fold 2
best round 5581
('Score: ', 0.5104832430369759)
Model 1 fold 2 fitting finished in 218.344s
Model 1 fold 3
best round 5738
('Score: ', 0.50602478747063229)
Model 1 fold 3 fitting finished in 222.025s
Model 1 fold 4
best round 6407
('Score: ', 0.51403399520257476)
Model 1 fold 4 fitting finished in 297.210s
Model 1 fold 5
best round 5850
('Score: ', 0.51980381725085945)
Model 1 fold 5 fitting finished in 241.291s
Score for model 1 is 0.512478
Score for blended models is 0.512478
Blend 1 estimators for 5 folds
Model 1 fold 1
best round 5957
('Score: ', 0.5192722189842166)
Model 1 fold 1 fitting finished in 280.374s
Model 1 fold 2
best round 5151
('Score: ', 0.52322885895018023)
Model 1 fold 2 fitting finished in 224.218s
Model 1 fold 3
best round 6284
('Score: ', 0.50965059886113395)
Model 1 fold 3 fitting finished in 245.084s
Model 1 f

In [9]:
train_lightgbm = pd.DataFrame(train_total)
train_lightgbm.columns = ["low", "medium", "high"]
train_lightgbm["listing_id"] = train_X.listing_id.values

test_lightgbm_mean = pd.DataFrame(test_total)
test_lightgbm_mean.columns = ["low", "medium", "high"]
test_lightgbm_mean["listing_id"] = test_X.listing_id.values

In [10]:
tmp_train = train_X_0322[['listing_id']].merge(train_lightgbm,on = 'listing_id', how = 'left')[["low", "medium", "high"]].values
tmp_test_mean = test_X_0322[['listing_id']].merge(test_lightgbm_mean,on = 'listing_id', how = 'left')[["low", "medium", "high"]].values

In [11]:
now = datetime.now()

name_train_blend = '../blend/train_blend_LightGBM_last_10bagging_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_mean = '../blend/test_blend_LightGBM_mean_last_10bagging_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
# name_test_blend_gmean = '../blend/test_blend_LightGBM_gmean_last_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'


print score_total
# print (np.mean(best_rounds_gbm,axis=0))
np.savetxt(name_train_blend,tmp_train, delimiter=",")
np.savetxt(name_test_blend_mean,tmp_test_mean, delimiter=",")
# np.savetxt(name_test_blend_gmean,test_blend_x_gbm_gmean, delimiter=",")

0.513497233685


In [13]:
sub_name = '../output/sub_LightGBM_last_10bagging_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

out_df = pd.DataFrame(tmp_test_mean)
out_df.columns = ["low", "medium", "high"]
out_df["listing_id"] = test_X_0322.listing_id.values
out_df.to_csv(sub_name, index=False)

In [14]:
y = np.ravel(pd.read_csv(data_path + 'labels_BrandenMurray.csv'))

print log_loss(y,tmp_train)

0.508599719699
