In [1]:
import pandas as pd
import time
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.feature_extraction.text import CountVectorizer
import random
from sklearn import preprocessing
import lightgbm as lgb
import gc
from scipy.stats import skew, boxcox
from bayes_opt import BayesianOptimization
from scipy import sparse
from sklearn.metrics import log_loss
from datetime import datetime

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

seed = 2017

# Load Data

In [2]:
data_path = "../input/"

train_df = pd.read_pickle(data_path + 'train_2017-03-05-22-40.pkl')
train_y = pd.read_pickle(data_path + 'y_2017-03-05-22-40.pkl')
test_df = pd.read_pickle(data_path + 'test_2017-03-05-22-40.pkl')
features_to_use = pd.read_pickle(data_path + 'featurestouse_2017-03-05-22-40.pkl')

tr_desc_sparse = pd.read_pickle(data_path + 'tr_desc_sparse_2017-03-05-22-40.pkl')
tr_feat_sparse = pd.read_pickle(data_path + 'tr_feat_sparse_2017-03-05-22-40.pkl')
te_desc_sparse = pd.read_pickle(data_path + 'te_desc_sparse_2017-03-05-22-40.pkl')
te_feat_sparse = pd.read_pickle(data_path + 'te_feat_sparse_2017-03-05-22-40.pkl')

desc_sparse_cols = pd.read_pickle(data_path + 'desc_sparse_cols_2017-03-05-22-40.pkl')
feat_sparse_cols = pd.read_pickle(data_path + 'feat_sparse_cols_2017-03-05-22-40.pkl')

In [3]:
train_X = sparse.hstack([train_df[features_to_use], tr_desc_sparse, tr_feat_sparse]).tocsr()
test_X = sparse.hstack([test_df[features_to_use], te_desc_sparse, te_feat_sparse]).tocsr()


all_features = features_to_use + desc_sparse_cols + feat_sparse_cols
print train_X.shape, test_X.shape

(49352, 457) (74659, 457)


In [9]:
X_train, X_val, y_train, y_val = train_test_split(train_X, train_y, train_size=.80, random_state=1234)

In [64]:
clf = lgb.LGBMClassifier()
clf.set_params(learning_rate = 0.1)
clf.set_params(subsample_freq = 1)
clf.set_params(objective = 'multiclass')
clf.set_params(n_estimators = 100000)
        
clf = clf.fit(X_train, y_train,
              eval_set = [(X_val,y_val)],
              eval_metric = 'multi_logloss',
              early_stopping_rounds = 50,
              verbose = 25)

Train until valid scores didn't improve in 50 rounds.
[25]	valid_0's multi_logloss: 0.627417
[50]	valid_0's multi_logloss: 0.582572
[75]	valid_0's multi_logloss: 0.568281
[100]	valid_0's multi_logloss: 0.560913
[125]	valid_0's multi_logloss: 0.557244
[150]	valid_0's multi_logloss: 0.554635
[175]	valid_0's multi_logloss: 0.553015
[200]	valid_0's multi_logloss: 0.552212
[225]	valid_0's multi_logloss: 0.55191
[250]	valid_0's multi_logloss: 0.551563
[275]	valid_0's multi_logloss: 0.551774
[300]	valid_0's multi_logloss: 0.552159
Early stopping, best iteration is:
[251]	valid_0's multi_logloss: 0.551424


In [17]:
pred_y = clf.predict_proba(test_X, num_iteration = clf.best_iteration)

In [19]:
now = datetime.now()
sub_name = '../output/sub_LightGBM_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

out_df = pd.DataFrame(pred_y[:,:3])
out_df.columns = ["high", "medium", "low"]
out_df["listing_id"] = test_df.listing_id.values
out_df.to_csv(sub_name, index=False)

In [20]:
# cv_dataset = lgb.Dataset(train_X, train_y,
#                         free_raw_data = False,
#                         feature_name = all_features)

In [72]:
clf.evals_result.values()[0]['multi_logloss'][250]

0.55142379592804802

# Tune LightGBM

In [73]:
clf = lgb.LGBMClassifier()
clf.set_params(learning_rate = 0.1)
clf.set_params(subsample_freq = 1)
clf.set_params(objective = 'multiclass')
clf.set_params(n_estimators = 100000)
        

tmp  = 1000
for x in [8,15,31,63,127,255]:
    clf.set_params(num_leaves = x)
    clf = clf.fit(X_train, y_train,
                  eval_set = [(X_val,y_val)],
                  eval_metric = 'multi_logloss',
                  early_stopping_rounds = 50,
                  verbose = False)
    if tmp > clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]:
        num_leaves = x
        tmp = clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]


    print x, ' \t', clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1], clf.best_iteration

8  	0.554749035506 727
15  	0.550121665474 581
31  	0.551423795928 251
63  	0.551923841631 149
127  	0.55330551502 102
255  	0.562214075515 70


In [75]:
clf.set_params(num_leaves = num_leaves)

LGBMClassifier(boosting_type='gbdt', colsample_bytree=1, drop_rate=0.1,
        is_unbalance=False, learning_rate=0.1, max_bin=255, max_depth=-1,
        max_drop=50, min_child_samples=10, min_child_weight=5,
        min_split_gain=0, n_estimators=100000, nthread=-1, num_leaves=15,
        objective='multiclass', reg_alpha=0, reg_lambda=0,
        scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=1, subsample_for_bin=50000,
        subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)

In [77]:
tmp  = 1000
for x in [10, 20, 30, 50, 70, 80,90,100,110,120,150,170,200,230,260]:
    clf.set_params(min_child_samples = x)
    clf = clf.fit(X_train, y_train,
                  eval_set = [(X_val,y_val)],
                  eval_metric = 'multi_logloss',
                  early_stopping_rounds = 50,
                  verbose = False)
    if tmp > clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]:
        min_child_samples = x
        tmp = clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]


    print x, ' \t', clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1], clf.best_iteration

10  	0.550121665474 581
20  	0.549912688816 512
30  	0.550117278371 473
50  	0.550495059319 580
70  	0.549630570323 490
80  	0.550301382549 583
90  	0.549211688569 541
100  	0.549800126976 521
110  	0.550536072369 452
120  	0.548761755286 521
150  	0.549187250993 554
170  	0.548417297631 453
200  	0.547889637762 559
230  	0.548951807203 465
260  	0.549699494886 506


In [78]:
clf.set_params(min_child_samples = min_child_samples)

LGBMClassifier(boosting_type='gbdt', colsample_bytree=1, drop_rate=0.1,
        is_unbalance=False, learning_rate=0.1, max_bin=255, max_depth=-1,
        max_drop=50, min_child_samples=200, min_child_weight=5,
        min_split_gain=0, n_estimators=100000, nthread=-1, num_leaves=15,
        objective='multiclass', reg_alpha=0, reg_lambda=0,
        scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=1, subsample_for_bin=50000,
        subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)

In [79]:
tmp  = 1000
for x in [0.4,0.5,0.6,0.7,0.8,0.9,1]:
    clf.set_params(colsample_bytree = x)
    clf = clf.fit(X_train, y_train,
                  eval_set = [(X_val,y_val)],
                  eval_metric = 'multi_logloss',
                  early_stopping_rounds = 50,
                  verbose = False)
    if tmp > clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]:
        colsample_bytree = x
        tmp = clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]


    print x, ' \t', clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1], clf.best_iteration

0.4  	0.549440682317 661
0.5  	0.548130940147 646
0.6  	0.547785619614 488
0.7  	0.546980258999 454
0.8  	0.549510061419 581
0.9  	0.548454288363 526
1  	0.547889637762 559


In [80]:
clf.set_params(colsample_bytree = colsample_bytree)

LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.7, drop_rate=0.1,
        is_unbalance=False, learning_rate=0.1, max_bin=255, max_depth=-1,
        max_drop=50, min_child_samples=200, min_child_weight=5,
        min_split_gain=0, n_estimators=100000, nthread=-1, num_leaves=15,
        objective='multiclass', reg_alpha=0, reg_lambda=0,
        scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=1, subsample_for_bin=50000,
        subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)

In [81]:
tmp  = 1000
for x in [0.5,0.6,0.7,0.8,0.9,1]:
    clf.set_params(subsample = x)
    clf = clf.fit(X_train, y_train,
                  eval_set = [(X_val,y_val)],
                  eval_metric = 'multi_logloss',
                  early_stopping_rounds = 50,
                  verbose = False)
    if tmp > clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]:
        subsample = x
        tmp = clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]


    print x, ' \t', clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1], clf.best_iteration

0.5  	0.553313333624 532
0.6  	0.552616940574 402
0.7  	0.550063288956 491
0.8  	0.5499600939 498
0.9  	0.547960666237 508
1  	0.546980258999 454


In [84]:
clf.set_params(subsample = subsample)

LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.7, drop_rate=0.1,
        is_unbalance=False, learning_rate=0.1, max_bin=15, max_depth=-1,
        max_drop=50, min_child_samples=200, min_child_weight=5,
        min_split_gain=0, n_estimators=100000, nthread=-1, num_leaves=15,
        objective='multiclass', reg_alpha=0, reg_lambda=0,
        scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=1, subsample_for_bin=50000,
        subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)

In [86]:
tmp  = 1000
for x in [200,300,400]:#[15,31,63, 127, 255, 511, 1023, 2047]:
    clf.set_params(max_bin = x)
    clf = clf.fit(X_train, y_train,
                  eval_set = [(X_val,y_val)],
                  eval_metric = 'multi_logloss',
                  early_stopping_rounds = 50,
                  verbose = False)
    if tmp > clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]:
        max_bin = x
        tmp = clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1]


    print x, ' \t', clf.evals_result.values()[0]['multi_logloss'][clf.best_iteration -1], clf.best_iteration

200  	0.546597489793 675
300  	0.547176672163 574
400  	0.548544964529 574


In [None]:
15  	0.551604274677 627
31  	0.553090529644 464
63  	0.549920964586 639
127  	0.549448332103 560
200  	0.546597489793 675
255  	0.546980258999 454
300  	0.547176672163 574
400  	0.548544964529 574
511  	0.549596013107 411
1023  	0.548102295097 566
2047  	0.548225477116 552

In [88]:
def lgbm_cv(max_bin, num_leaves, min_child_samples, colsample_bytree, subsample, learning_rate=0.1):
    skf = KFold(n_splits=5,random_state=seed)
    scores=[]
    for i, (train, val) in enumerate(skf.split(train_X)):
        est=lgb.LGBMClassifier(learning_rate=0.1,
                               max_bin=int(max_bin),
                               num_leaves=int(num_leaves),
                               min_child_samples=int(min_child_samples),
                               colsample_bytree=colsample_bytree,
                               subsample=subsample,
                               subsample_freq = 1
                              )
 
        train_x_fold = train_X[train]
        train_y_fold = train_y[train]
        val_x_fold = train_X[val]
        val_y_fold = train_y[val]
        est.set_params( n_estimators=100000)
        est.fit(train_x_fold,
                train_y_fold,
                eval_set=[(val_x_fold, val_y_fold)],
                eval_metric='multi_logloss',
                early_stopping_rounds=50,
                verbose = False
               )
        val_y_predict_fold = est.predict_proba(val_x_fold)
        score = log_loss(val_y_fold, val_y_predict_fold)
        scores.append(score)
    return -np.mean(scores)


lgbm_BO = BayesianOptimization(lgbm_cv, 
                               {
                                'max_bin': (127,300),
                                'num_leaves': (8,31),
                                'min_child_samples' :(120,230),
                                'colsample_bytree': (0.6,1.0),
                                'subsample' : (0.8,1)})

lgbm_BO.maximize(init_points=10, n_iter=40)

[31mInitialization[0m
[94m-----------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |   max_bin |   min_child_samples |   num_leaves |   subsample | 
    1 | 08m01s | [35m  -0.54963[0m | [32m            0.9460[0m | [32m 211.6904[0m | [32m           217.7480[0m | [32m     20.4129[0m | [32m     0.8806[0m | 
    2 | 03m03s |   -0.55004 |             0.7666 |  234.1145 |            130.0934 |      27.0889 |      0.9055 | 
    3 | 04m16s |   -0.54991 |             0.8285 |  275.2507 |            168.6393 |      12.8018 |      0.9677 | 
    4 | 03m32s | [35m  -0.54932[0m | [32m            0.8229[0m | [32m 261.7395[0m | [32m           157.2411[0m | [32m     17.8669[0m | [32m     0.8453[0m | 
    5 | 03m35s |   -0.55102 |             0.7590 |  173.4688 |            132.5745 |      14.9747 |      0.9379 | 
    6 | 02m55s |   -0.54977 |             0.6249 |  270

  " state: %s" % convergence_dict)


   12 | 03m18s |   -0.55000 |             0.7579 |  201.4824 |            174.6327 |      30.6962 |      0.8784 | 
   13 | 03m47s | [35m  -0.54864[0m | [32m            0.9206[0m | [32m 288.7490[0m | [32m           124.6431[0m | [32m     24.1693[0m | [32m     0.8716[0m | 


  " state: %s" % convergence_dict)


   14 | 03m12s |   -0.55034 |             0.6142 |  132.9146 |            168.0585 |      28.6169 |      0.9421 | 


  " state: %s" % convergence_dict)


   15 | 03m30s |   -0.54901 |             0.8155 |  286.7125 |            153.2362 |      26.3706 |      0.9325 | 
   16 | 04m53s |   -0.55237 |             0.8832 |  289.6937 |            124.1972 |       8.0902 |      0.8075 | 
   17 | 03m18s |   -0.55015 |             0.7765 |  283.6394 |            133.5231 |      29.8266 |      0.8444 | 
   18 | 03m30s |   -0.54995 |             0.8383 |  299.8306 |            194.4852 |      30.3670 |      0.8682 | 
   19 | 05m08s |   -0.55130 |             0.9380 |  176.2912 |            229.5021 |      12.5143 |      0.9461 | 


  " state: %s" % convergence_dict)


   20 | 03m40s | [35m  -0.54847[0m | [32m            0.8531[0m | [32m 295.1750[0m | [32m           124.7518[0m | [32m     30.5231[0m | [32m     0.9252[0m | 


  " state: %s" % convergence_dict)


   21 | 03m53s |   -0.54975 |             0.9705 |  156.4544 |            201.3598 |      28.0879 |      0.9061 | 


  " state: %s" % convergence_dict)


   22 | 03m06s |   -0.55062 |             0.6619 |  139.1473 |            120.8136 |      27.0732 |      0.9462 | 
   23 | 03m29s |   -0.55110 |             0.8387 |  246.5130 |            222.5992 |      30.6178 |      0.8588 | 
   24 | 03m13s |   -0.54990 |             0.6149 |  174.4775 |            169.5950 |      28.4826 |      0.8611 | 
   25 | 03m49s |   -0.55009 |             0.7837 |  128.7771 |            199.4040 |      16.6673 |      0.8735 | 
   26 | 03m38s |   -0.55013 |             0.8313 |  263.1905 |            172.2646 |      27.7548 |      0.8922 | 
   27 | 04m45s |   -0.55129 |             0.7480 |  214.8425 |            136.6220 |      10.1192 |      0.9577 | 
   28 | 03m46s |   -0.54951 |             0.8868 |  297.3956 |            222.4299 |      29.8218 |      0.8842 | 
   29 | 03m48s |   -0.55017 |             0.7577 |  299.9109 |            148.0573 |      19.8152 |      0.9457 | 
   30 | 05m45s |   -0.55116 |             0.8396 |  140.1333 |            124.36

  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   32 | 05m44s |   -0.55105 |             0.9660 |  153.9076 |            163.3903 |       9.9017 |      0.9103 | 
   33 | 03m09s |   -0.55066 |             0.6215 |  129.4628 |            221.8249 |      29.5196 |      0.8512 | 
   34 | 03m44s |   -0.54971 |             0.7369 |  274.4785 |            120.6243 |      24.2176 |      0.8231 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   35 | 03m48s |   -0.54906 |             0.9571 |  285.2068 |            164.9676 |      29.6011 |      0.8406 | 


  " state: %s" % convergence_dict)


   36 | 04m30s |   -0.55034 |             0.8858 |  226.8172 |            170.4293 |      14.4001 |      0.8586 | 
   37 | 03m05s |   -0.55148 |             0.6120 |  184.8332 |            226.6471 |      29.6228 |      0.8342 | 
   38 | 03m15s |   -0.54989 |             0.6624 |  196.0310 |            145.2668 |      29.9471 |      0.8057 | 


  " state: %s" % convergence_dict)


   39 | 05m36s |   -0.55134 |             0.9794 |  213.6993 |            225.1597 |       9.0249 |      0.9152 | 


  " state: %s" % convergence_dict)


   40 | 03m44s |   -0.55037 |             0.8811 |  198.7085 |            121.4965 |      26.3915 |      0.9500 | 
   41 | 03m53s |   -0.54956 |             0.7665 |  298.0426 |            212.8628 |      18.6427 |      0.8351 | 
   42 | 03m52s |   -0.55089 |             0.8129 |  291.2952 |            122.8679 |      22.8399 |      0.8387 | 
   43 | 04m52s |   -0.54974 |             0.8611 |  254.4196 |            140.8148 |      14.4113 |      0.9423 | 
   44 | 04m23s |   -0.55170 |             0.6287 |  294.1900 |            229.7629 |      10.4736 |      0.8196 | 
   45 | 05m33s |   -0.55045 |             0.9916 |  272.8526 |            142.5846 |      12.9358 |      0.9678 | 
   46 | 05m59s |   -0.55032 |             0.9979 |  287.4494 |            198.5770 |      11.5600 |      0.9247 | 
   47 | 05m49s |   -0.55077 |             0.9919 |  252.9265 |            175.9670 |      10.7541 |      0.9384 | 


  " state: %s" % convergence_dict)


   48 | 07m00s |   -0.55098 |             0.9629 |  186.0242 |            175.2644 |       8.1343 |      0.9797 | 
   49 | 03m46s |   -0.55017 |             0.9731 |  167.5765 |            127.8197 |      30.4875 |      0.8551 | 


  " state: %s" % convergence_dict)


   50 | 04m12s |   -0.54998 |             0.9323 |  240.9843 |            154.7499 |      30.9518 |      0.9867 | 


In [89]:
gbm_bo_scores = pd.DataFrame([[s[0]['num_leaves'],
                               s[0]['min_child_samples'],
                               s[0]['max_bin'],
                               s[0]['colsample_bytree'],
                               s[0]['subsample'],
                               s[1]] for s in zip(lgbm_BO.res['all']['params'],lgbm_BO.res['all']['values'])],
                            columns = ['num_leaves',
                                       'min_child_samples',
                                       'max_bin',
                                       'colsample_bytree',
                                       'subsample',
                                       'score'])
gbm_bo_scores=gbm_bo_scores.sort_values('score',ascending=False)
gbm_bo_scores.head()

Unnamed: 0,num_leaves,min_child_samples,max_bin,colsample_bytree,subsample,score
9,30.523147,124.751807,295.175031,0.853059,0.925184,-0.548472
2,24.169263,124.643095,288.749046,0.920628,0.871552,-0.548643
4,26.370628,153.236247,286.712464,0.815477,0.932513,-0.549005
24,29.601053,164.967606,285.206768,0.957067,0.840632,-0.549058
17,29.821756,222.429922,297.395617,0.886827,0.884196,-0.549511


In [90]:
def lgbm_blend(estimators, train_x, train_y, test_x, fold, early_stopping_rounds=0):
    N_params = len(estimators)
    print ("Blend %d estimators for %d folds" % (N_params, fold))
    skf = KFold(n_splits=fold,random_state=seed)
    N_class = len(set(train_y))
    
    train_blend_x = np.zeros((train_x.shape[0], N_class*N_params))
    test_blend_x = np.zeros((test_x.shape[0], N_class*N_params))
    scores = np.zeros ((fold,N_params))
    best_rounds = np.zeros ((fold, N_params))
    

    
    for j, est in enumerate(estimators):
        est.set_params(learning_rate = 0.01)
        est.set_params(subsample_freq = 1)
        est.set_params(objective = 'multiclass')
        est.set_params(n_estimators = 100000)

        
        print ("Model %d: %s" %(j+1, est)) 

        
        test_blend_x_j = np.zeros((test_x.shape[0], N_class*fold))
        for i, (train_index, val_index) in enumerate(skf.split(train_x)):
            print ("Model %d fold %d" %(j+1,i+1))
            fold_start = time.time() 
            train_x_fold = train_x[train_index]
            train_y_fold = train_y[train_index]
            val_x_fold = train_x[val_index]
            val_y_fold = train_y[val_index]
            
            est.fit(train_x_fold, train_y_fold,
                   eval_set = [(val_x_fold,val_y_fold)],
                   eval_metric = 'multi_logloss',
                   early_stopping_rounds = early_stopping_rounds,
                   verbose = False)
            
            best_round=est.best_iteration
            best_rounds[i,j]=best_round
            print ("best round %d" % (best_round))
            
            val_y_predict_fold = est.predict_proba(val_x_fold,num_iteration = best_round)
            score = log_loss(val_y_fold, val_y_predict_fold)
            print ("Score: ", score)
            scores[i,j]=score   
            train_blend_x[val_index, (j*N_class):(j+1)*N_class] = val_y_predict_fold
            test_blend_x_j[:,(i*N_class):(i+1)*N_class] = est.predict_proba(test_x,num_iteration=best_round)
            
            print ("Model %d fold %d fitting finished in %0.3fs" % (j+1,i+1, time.time() - fold_start))            
            
        test_blend_x[:,(j*N_class):(j+1)*N_class] = \
                np.stack([test_blend_x_j[:,range(0,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(1,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(2,N_class*fold,N_class)].mean(1)]).T
            
        print ("Score for model %d is %f" % (j+1,np.mean(scores[:,j])))
    print ("Score for blended models is %f" % (np.mean(scores)))
    return (train_blend_x, test_blend_x, scores,best_rounds)

In [91]:
lgb_params = [lgb.LGBMClassifier(num_leaves = 30,
                                min_child_samples = 124,
                                colsample_bytree = 0.853059,
                                subsample = 0.925184,
                                max_bin = 295),
             lgb.LGBMClassifier(num_leaves = 24,
                                min_child_samples = 124,
                                colsample_bytree = 0.920628,
                                subsample = 0.871552,
                                max_bin = 288),
             lgb.LGBMClassifier(num_leaves = 26,
                                min_child_samples = 153,
                                colsample_bytree = 0.815477,
                                subsample = 0.932513,
                                max_bin = 286),
             lgb.LGBMClassifier(num_leaves = 29,
                                min_child_samples = 164,
                                colsample_bytree = 0.957067,
                                subsample = 0.840632,
                                max_bin = 285),
             lgb.LGBMClassifier(num_leaves = 29,
                                min_child_samples = 222,
                                colsample_bytree = 0.886827,
                                subsample = 0.884196,
                                max_bin = 297)]

#  	num_leaves 	min_child_samples 	max_bin 	colsample_bytree 	subsample 	score
# 9 	30.523147 	124.751807 			295.175031 	0.853059 			0.925184 	-0.548472
# 2 	24.169263 	124.643095 			288.749046 	0.920628 			0.871552 	-0.548643
# 4 	26.370628 	153.236247 			286.712464 	0.815477 			0.932513 	-0.549005
# 24 	29.601053 	164.967606 			285.206768 	0.957067 			0.840632 	-0.549058
# 17 	29.821756 	222.429922 			297.395617 	0.886827 			0.884196 	-0.549511

(train_blend_x_gbm,
 test_blend_x_gbm,
 blend_scores_gbm,
 best_rounds_gbm) = lgbm_blend(lgb_params, 
                               train_X, train_y, 
                               test_X,
                               10,
                               500) #as the learning rate decreases the number of stopping rounds need to be increased

Blend 5 estimators for 10 folds
Model 1: LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.853059, drop_rate=0.1,
        is_unbalance=False, learning_rate=0.01, max_bin=295, max_depth=-1,
        max_drop=50, min_child_samples=124, min_child_weight=5,
        min_split_gain=0, n_estimators=100000, nthread=-1, num_leaves=30,
        objective='multiclass', reg_alpha=0, reg_lambda=0,
        scale_pos_weight=1, seed=0, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=0.925184, subsample_for_bin=50000,
        subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)
Model 1 fold 1
best round 3918
('Score: ', 0.55298554008510625)
Model 1 fold 1 fitting finished in 543.753s
Model 1 fold 2
best round 3920
('Score: ', 0.5403799269476991)
Model 1 fold 2 fitting finished in 517.663s
Model 1 fold 3
best round 4274
('Score: ', 0.52314649078651643)
Model 1 fold 3 fitting finished in 568.412s
Model 1 fold 4
best round 3564
('Score: ', 0.53809719083518437)
Model 1 fold 4 fitti

In [92]:
now = datetime.now()


name_train_blend = '../blend/train_blend_LightGBM_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend = '../blend/test_blend_LightGBM_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'


print (np.mean(blend_scores_gbm,axis=0))
print (np.mean(best_rounds_gbm,axis=0))

np.savetxt(name_train_blend,train_blend_x_gbm, delimiter=",")
np.savetxt(name_test_blend,test_blend_x_gbm, delimiter=",")



[ 0.54197122  0.54179449  0.54177244  0.54184673  0.54176743]
[ 3787.8  4556.3  4393.2  3773.1  3854.6]


In [94]:
now

datetime.datetime(2017, 3, 6, 17, 11, 34, 262468)

In [95]:
sub_name = '../output/sub_LightGBM_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

out_df = pd.DataFrame(test_blend_x_gbm[:,:3])
out_df.columns = ["high", "medium", "low"]
out_df["listing_id"] = test_df.listing_id.values
out_df.to_csv(sub_name, index=False)