In [3]:
import time
import numpy as np
import pandas as pd

from scipy import sparse
from scipy.stats.mstats import gmean
from datetime import datetime
from sklearn import preprocessing
from scipy.stats import skew, boxcox,boxcox_normmax
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from bayes_opt import BayesianOptimization
from sklearn.metrics import log_loss

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import xgboost as xgb

seed = 1234



# Load Data

In [3]:
data_path = "../input/"
train_X = pd.read_csv(data_path + 'train_BM_MB_add03052240.csv')
test_X = pd.read_csv(data_path + 'test_BM_MB_add03052240.csv')
train_y = np.ravel(pd.read_csv(data_path + 'labels_BrandenMurray.csv'))
sub_id = test_X.listing_id.astype('int32').values
# all_features = features_to_use + desc_sparse_cols + feat_sparse_cols
print train_X.shape, test_X.shape, train_y.shape

(49352, 322) (74659, 322) (49352,)


In [6]:
X_train, X_val, y_train, y_val = train_test_split(train_X, train_y, train_size=.80, random_state=1234)
print X_train.shape
print X_val.shape
# xgtrain = xgb.DMatrix(X_train, label=y_train)

(39481, 322)
(9871, 322)


In [8]:
rgr = xgb.XGBClassifier(objective = 'multi:softprob',
                       learning_rate = 0.1,
                       n_estimators = 10000,
                       nthread = -1)

rgr.fit(X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
#         num_class = 3,
        early_stopping_rounds=50,
        verbose=25
       )

[0]	validation_0-mlogloss:1.03729
Will train until validation_0-mlogloss hasn't improved in 50 rounds.
[25]	validation_0-mlogloss:0.639405
[50]	validation_0-mlogloss:0.599529
[75]	validation_0-mlogloss:0.584213
[100]	validation_0-mlogloss:0.574866
[125]	validation_0-mlogloss:0.568737
[150]	validation_0-mlogloss:0.563895
[175]	validation_0-mlogloss:0.560445
[200]	validation_0-mlogloss:0.557579
[225]	validation_0-mlogloss:0.555358
[250]	validation_0-mlogloss:0.553443
[275]	validation_0-mlogloss:0.551671
[300]	validation_0-mlogloss:0.550225
[325]	validation_0-mlogloss:0.548876
[350]	validation_0-mlogloss:0.547644
[375]	validation_0-mlogloss:0.546672
[400]	validation_0-mlogloss:0.545659
[425]	validation_0-mlogloss:0.545018
[450]	validation_0-mlogloss:0.544299
[475]	validation_0-mlogloss:0.543861
[500]	validation_0-mlogloss:0.543206
[525]	validation_0-mlogloss:0.542953
[550]	validation_0-mlogloss:0.542268
[575]	validation_0-mlogloss:0.541983
[600]	validation_0-mlogloss:0.541475
[625]	valida

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=10000, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [9]:
pred_y = rgr.predict_proba(test_X, ntree_limit = rgr.best_iteration)

In [10]:
now = datetime.now()
sub_name = '../output/sub_xgb_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

out_df = pd.DataFrame(pred_y[:,:3])
out_df.columns = ["low", "medium", "high"]
out_df["listing_id"] = sub_id
out_df.to_csv(sub_name, index=False)

# Tune XGBoost

In [11]:
learning_rate = 0.1
best_score = 1000
train_param = 0
for x in [3,4,5,6,7]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= x,
        nthread = -1,
        silent = False
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )
    
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x

    print x, '\t', rgr.best_score, rgr.best_iteration

3 	0.538056 999
4 	0.535551 682
5 	0.535388 504
6 	0.537389 281
7 	0.53739 228


In [12]:
for x in [8,9]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= x,
        nthread = -1,
        silent = False
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )
    
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x

    print x, '\t', rgr.best_score, rgr.best_iteration

8 	0.539197 171
9 	0.544561 123


In [13]:
max_depth = train_param
print max_depth

5


In [14]:
train_param = 1
for x in [2,4,8,12,16,20]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )
    
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

2 	0.535875 451
4 	0.53768 468
8 	0.535414 541
12 	0.535852 382
16 	0.534811 502
20 	0.535808 417


In [15]:
for x in [24,28,32]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )
    
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

24 	0.534658 465
28 	0.535242 387
32 	0.535742 501


In [16]:
min_child_weight = train_param
print min_child_weight

24


In [17]:
train_param = 1
for x in [0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = min_child_weight,
        colsample_bytree = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )

    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

0.05 	0.543347 1102
0.1 	0.537411 558
0.2 	0.532129 548
0.3 	0.530955 594
0.4 	0.532481 472
0.5 	0.533489 439
0.6 	0.53406 442
0.7 	0.533872 533
0.8 	0.535258 458
0.9 	0.535175 532


In [18]:
colsample_bytree = train_param
print colsample_bytree

0.3


In [19]:
train_param = 1
for x in [0.5,0.6,0.7,0.8,0.9]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = min_child_weight,
        colsample_bytree = colsample_bytree,
        subsample = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

0.5 	0.53808 445
0.6 	0.53512 453
0.7 	0.534725 432
0.8 	0.534234 482
0.9 	0.531949 483


In [20]:
subsample = train_param
print subsample

1


In [21]:
train_param = 0
for x in [0.3, 0.6, 0.9, 1.2, 1.5, 1.8, 2.1, 2.4, 2.7, 3.0]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = min_child_weight,
        colsample_bytree = colsample_bytree,
        subsample = subsample,
        gamma = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )

    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

0.3 	0.530912 593
0.6 	0.531407 607
0.9 	0.532852 469
1.2 	0.530871 677
1.5 	0.530909 555
1.8 	0.532441 586
2.1 	0.532689 696
2.4 	0.531855 654
2.7 	0.534031 864
3.0 	0.533408 804


In [22]:
gamma = train_param
print gamma

1.2


In [None]:
# 0.3 	0.528756 371
# 0.6 	0.530068 353
# 0.9 	0.530043 275
# 1.2 	0.530065 388
# 1.5 	0.529657 331
# 1.8 	0.529906 328
# 2.1 	0.528338 393
# 2.4 	0.529364 372
# 2.7 	0.527919 456
# 3.0 	0.528962 417

In [5]:
xgtrain = xgb.DMatrix(train_X, label=train_y) 

def xgb_evaluate(min_child_weight, colsample_bytree, max_depth, subsample, gamma):
    params = dict()
    params['objective']='multi:softprob'
    params['eval_metric']='mlogloss',
    params['num_class']=3
    params['silent']=1
    params['eta'] = 0.1
    params['verbose_eval'] = True
    params['min_child_weight'] = int(min_child_weight)
    params['colsample_bytree'] = max(min(colsample_bytree, 1), 0)
    params['max_depth'] = int(max_depth)
    params['subsample'] = max(min(subsample, 1), 0)
    params['gamma'] = max(gamma, 0)
    
    cv_result = xgb.cv(
        params, xgtrain, 
        num_boost_round=10000, nfold=5,
        metrics = 'mlogloss',
        seed=seed,callbacks=[xgb.callback.early_stop(50)]
    )
    
    return -cv_result['test-mlogloss-mean'].values[-1]


xgb_BO = BayesianOptimization(
    xgb_evaluate, 
    {
        'max_depth': (4,10),
        'min_child_weight': (12,28),
        'colsample_bytree': (0.2,0.6),
        'subsample': (0.8,1),
        'gamma': (0,3)
    }
)

xgb_BO.maximize(init_points=10, n_iter=40)

[31mInitialization[0m
[94m---------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |     gamma |   max_depth |   min_child_weight |   subsample | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[725]	train-mlogloss:0.400678+0.00155655	test-mlogloss:0.529636+0.00537507

    1 | 21m34s | [35m  -0.52964[0m | [32m            0.5829[0m | [32m   0.5937[0m | [32m     4.5959[0m | [32m           22.1669[0m | [32m     0.9258[0m | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[436]	train-mlogloss:0.329219+0.000572654	test-mlogloss:0.529528+0.00450463

    2 | 13m13s | [35m  -0.52953[0m | [32m            0.2249[0m |

  " state: %s" % convergence_dict)


[31mBayesian Optimization[0m
[94m---------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |     gamma |   max_depth |   min_child_weight |   subsample | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[184]	train-mlogloss:0.291984+0.00238293	test-mlogloss:0.529962+0.00405876

   11 | 14m49s |   -0.52996 |             0.5553 |    0.3280 |      9.9388 |            12.1396 |      0.9189 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[925]	train-mlogloss:0.382127+0.000735958	test-mlogloss:0.529125+0.00451081





   12 | 19m22s |   -0.52913 |             0.3726 |    0.2058 |      4.0259 |            12.2460 |      0.9922 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[172]	train-mlogloss:0.32246+0.00132542	test-mlogloss:0.529665+0.00446974

   13 | 14m17s |   -0.52966 |             0.5996 |    0.1170 |      9.9857 |            19.8379 |      0.9344 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[1026]	train-mlogloss:0.375292+0.00130248	test-mlogloss:0.527378+0.00533473



  " state: %s" % convergence_dict)


   14 | 21m56s |   -0.52738 |             0.2040 |    2.9926 |      7.4795 |            12.1574 |      0.9932 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[473]	train-mlogloss:0.350935+0.00143367	test-mlogloss:0.526713+0.00522667

   15 | 19m10s | [35m  -0.52671[0m | [32m            0.3269[0m | [32m   2.9211[0m | [32m     9.8785[0m | [32m           15.6469[0m | [32m     0.9999[0m | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[885]	train-mlogloss:0.406373+0.00130509	test-mlogloss:0.530122+0.00552415



  " state: %s" % convergence_dict)


   16 | 12m59s |   -0.53012 |             0.2128 |    0.3522 |      4.0512 |            27.9874 |      0.9984 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[430]	train-mlogloss:0.33863+0.00047786	test-mlogloss:0.526687+0.004655

   17 | 22m18s | [35m  -0.52669[0m | [32m            0.4301[0m | [32m   2.9819[0m | [32m     9.9223[0m | [32m           12.3224[0m | [32m     0.9849[0m | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[342]	train-mlogloss:0.345625+0.00123	test-mlogloss:0.527731+0.00503383

   18 | 22m57s |   -0.52773 |             0.5676 |    2.9370 |      9.9507 |            18.8677 |      0.9769 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until 

  " state: %s" % convergence_dict)


   19 | 27m45s |   -0.52709 |             0.5698 |    2.9517 |      8.5407 |            14.3097 |      0.9893 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[1075]	train-mlogloss:0.408316+0.000779373	test-mlogloss:0.529536+0.00573271

   20 | 32m21s |   -0.52954 |             0.5745 |    2.9787 |      4.0570 |            12.2167 |      0.8481 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[671]	train-mlogloss:0.34419+0.00122427	test-mlogloss:0.526417+0.00519199

   21 | 20m21s | [35m  -0.52642[0m | [32m            0.2345[0m | [32m   2.9557[0m | [32m     9.8936[0m | [32m           13.9868[0m | [32m     0.9924[0m | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train

  " state: %s" % convergence_dict)


   25 | 24m00s |   -0.53158 |             0.3279 |    2.9467 |      4.0821 |            25.2351 |      0.9964 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[674]	train-mlogloss:0.339162+0.00168842	test-mlogloss:0.527123+0.00407488

   26 | 27m28s |   -0.52712 |             0.3474 |    2.9096 |      9.8901 |            22.4212 |      0.9922 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[350]	train-mlogloss:0.315887+0.00212657	test-mlogloss:0.528965+0.0057493

   27 | 09m56s |   -0.52897 |             0.2094 |    1.3550 |      8.1596 |            15.1486 |      0.9990 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopp

  " state: %s" % convergence_dict)


   30 | 16m39s |   -0.52764 |             0.2059 |    2.9171 |      7.9178 |            16.7649 |      0.9963 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[607]	train-mlogloss:0.34644+0.00153277	test-mlogloss:0.527363+0.0058374

   31 | 17m39s |   -0.52736 |             0.2147 |    2.9002 |      9.0786 |            13.1329 |      0.9820 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[413]	train-mlogloss:0.345909+0.00148099	test-mlogloss:0.526405+0.00424134



  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   32 | 21m24s | [35m  -0.52641[0m | [32m            0.4333[0m | [32m   2.9445[0m | [32m     9.8583[0m | [32m           13.2500[0m | [32m     0.9998[0m | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[691]	train-mlogloss:0.381477+0.00135683	test-mlogloss:0.527726+0.00447741



  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   33 | 29m05s |   -0.52773 |             0.5670 |    2.9543 |      6.0514 |            14.7623 |      0.9909 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[412]	train-mlogloss:0.333635+0.00144647	test-mlogloss:0.526878+0.00511223



  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   34 | 23m45s |   -0.52688 |             0.4896 |    2.8750 |      9.9790 |            13.0506 |      0.9895 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[374]	train-mlogloss:0.368002+0.00102855	test-mlogloss:0.528896+0.00498313



  " state: %s" % convergence_dict)


   35 | 17m14s |   -0.52890 |             0.5711 |    0.0176 |      6.7121 |            23.6963 |      0.9993 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[620]	train-mlogloss:0.374049+0.00316101	test-mlogloss:0.527624+0.00446495

   36 | 28m54s |   -0.52762 |             0.5291 |    2.9870 |      7.4630 |            19.1157 |      0.9975 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[447]	train-mlogloss:0.342849+0.00109567	test-mlogloss:0.528036+0.00503764



  " state: %s" % convergence_dict)


   37 | 26m55s |   -0.52804 |             0.5906 |    2.8966 |      8.6491 |            12.0881 |      0.9900 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[484]	train-mlogloss:0.340231+0.000564369	test-mlogloss:0.52791+0.0048331



  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   38 | 32m18s |   -0.52791 |             0.5915 |    2.9325 |      9.6668 |            27.3338 |      0.9887 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[644]	train-mlogloss:0.350561+0.000690743	test-mlogloss:0.527347+0.00468315



  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   39 | 18m15s |   -0.52735 |             0.2036 |    2.9937 |      9.7210 |            13.6505 |      0.9663 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[1647]	train-mlogloss:0.433856+0.00316512	test-mlogloss:0.529726+0.00557642



  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   40 | 28m32s |   -0.52973 |             0.2936 |    2.7764 |      4.0773 |            18.3647 |      0.9934 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[208]	train-mlogloss:0.294488+0.00167102	test-mlogloss:0.528641+0.00366348

   41 | 14m58s |   -0.52864 |             0.5352 |    0.0016 |      9.4992 |            16.3353 |      0.9942 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[1558]	train-mlogloss:0.45136+0.00192101	test-mlogloss:0.530909+0.0056377

   42 | 24m25s |   -0.53091 |             0.2486 |    2.9264 |      4.0958 |            14.2523 |      1.0000 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopp

  " state: %s" % convergence_dict)


   43 | 17m41s |   -0.52928 |             0.5938 |    0.0263 |      6.0669 |            12.0085 |      0.9943 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[273]	train-mlogloss:0.351369+0.00163731	test-mlogloss:0.528499+0.00402837



  " state: %s" % convergence_dict)


   44 | 15m46s |   -0.52850 |             0.5737 |    0.7939 |      7.0712 |            17.5950 |      0.9991 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[1000]	train-mlogloss:0.400203+0.00356582	test-mlogloss:0.528327+0.0052068

   45 | 35m34s |   -0.52833 |             0.5396 |    2.9644 |      5.7632 |            12.0119 |      0.9866 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[739]	train-mlogloss:0.411229+0.00135623	test-mlogloss:0.529978+0.00485103

   46 | 16m34s |   -0.52998 |             0.3732 |    0.0361 |      4.2472 |            24.7241 |      0.9915 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stop

  " state: %s" % convergence_dict)


   47 | 15m45s |   -0.52817 |             0.4709 |    0.0424 |      9.8735 |            27.6846 |      0.9867 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[983]	train-mlogloss:0.405389+0.00263627	test-mlogloss:0.528608+0.00500426

   48 | 32m51s |   -0.52861 |             0.5346 |    2.9289 |      5.8091 |            17.3035 |      0.9933 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[411]	train-mlogloss:0.371359+0.000881543	test-mlogloss:0.528633+0.00477758



  " state: %s" % convergence_dict)


   49 | 11m44s |   -0.52863 |             0.2818 |    0.0935 |      6.3628 |            26.3586 |      0.9974 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[1691]	train-mlogloss:0.440857+0.00397754	test-mlogloss:0.530862+0.00564943

   50 | 47m40s |   -0.53086 |             0.5715 |    2.8924 |      4.7404 |            27.7970 |      0.9955 | 


need improve

In [6]:
xgb_bo_scores = pd.DataFrame([[s[0]['max_depth'],
                               s[0]['min_child_weight'],
                               s[0]['colsample_bytree'],
                               s[0]['subsample'],
                               s[0]['gamma'],
                               s[1]] for s in zip(xgb_BO.res['all']['params'],xgb_BO.res['all']['values'])],
                            columns = ['max_depth',
                                       'min_child_weight',
                                       'colsample_bytree',
                                       'subsample',
                                       'gamma',
                                       'score'])
xgb_bo_scores=xgb_bo_scores.sort_values('score',ascending=False)
xgb_bo_scores.head(10)

Unnamed: 0,max_depth,min_child_weight,colsample_bytree,subsample,gamma,score
21,9.858298,13.249991,0.43326,0.999819,2.944467,-0.526405
10,9.893582,13.98679,0.234479,0.992427,2.955693,-0.526417
6,9.922328,12.32241,0.430052,0.984938,2.981919,-0.526687
4,9.878492,15.646924,0.326926,0.999855,2.921145,-0.526713
23,9.979036,13.050578,0.489616,0.989539,2.874991,-0.526878
8,8.540735,14.309724,0.569809,0.989256,2.951742,-0.52709
15,9.8901,22.421182,0.34737,0.992242,2.909582,-0.527123
18,9.892799,26.129075,0.317373,0.976754,2.990347,-0.527206
28,9.721016,13.650527,0.203559,0.966315,2.99367,-0.527347
20,9.078562,13.132861,0.214693,0.981959,2.900196,-0.527363


In [1]:
def xgb_blend(estimators, train_x, train_y, test_x, fold, early_stopping_rounds=0):
    N_params = len(estimators)
    print ("Blend %d estimators for %d folds" % (N_params, fold))
    skf = KFold(n_splits=fold,random_state=5555)
    N_class = len(set(train_y))
        
    train_blend_x = np.zeros((train_x.shape[0], N_class*N_params))
    test_blend_x_mean = np.zeros((test_x.shape[0], N_class*N_params))
    test_blend_x_gmean = np.zeros((test_x.shape[0], N_class*N_params))
    scores = np.zeros ((fold,N_params))
    best_rounds = np.zeros ((fold, N_params))
    
    for j, est in enumerate(estimators):
        est.set_params(objective = 'multi:softprob')
        est.set_params(silent = False)
        est.set_params(learning_rate = 0.02)
        est.set_params(n_estimators=100000)
        
        print ("Model %d: %s" %(j+1, est))

        test_blend_x_j = np.zeros((test_x.shape[0], N_class*fold))
    
        for i, (train_index, val_index) in enumerate(skf.split(train_x)):
            print ("Model %d fold %d" %(j+1,i+1))
            fold_start = time.time() 
            train_x_fold = train_x.iloc[train_index]
            train_y_fold = train_y[train_index]
            val_x_fold = train_x.iloc[val_index]
            val_y_fold = train_y[val_index]      

            est.fit(train_x_fold,train_y_fold,
                    eval_set = [(val_x_fold, val_y_fold)],
                    eval_metric = 'mlogloss',
                    early_stopping_rounds=early_stopping_rounds,
                    verbose=False)
            best_round=est.best_iteration
            best_rounds[i,j]=best_round
            print ("best round %d" % (best_round))
            val_y_predict_fold = est.predict_proba(val_x_fold,ntree_limit=best_round)
            score = log_loss(val_y_fold, val_y_predict_fold)
            print ("Score: ", score)
            scores[i,j]=score
            train_blend_x[val_index, (j*N_class):(j+1)*N_class] = val_y_predict_fold
            
            test_blend_x_j[:,(i*N_class):(i+1)*N_class] = est.predict_proba(test_x,ntree_limit=best_round)
            print ("Model %d fold %d fitting finished in %0.3fs" % (j+1,i+1, time.time() - fold_start))
            
        test_blend_x_mean[:,(j*N_class):(j+1)*N_class] = \
                np.stack([test_blend_x_j[:,range(0,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(1,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(2,N_class*fold,N_class)].mean(1)]).T
        
        test_blend_x_gmean[:,(j*N_class):(j+1)*N_class] = \
                np.stack([gmean(test_blend_x_j[:,range(0,N_class*fold,N_class)], axis=1),
                          gmean(test_blend_x_j[:,range(1,N_class*fold,N_class)], axis=1),
                          gmean(test_blend_x_j[:,range(2,N_class*fold,N_class)], axis=1)]).T
            
        print ("Score for model %d is %f" % (j+1,np.mean(scores[:,j])))
    print ("Score for blended models is %f" % (np.mean(scores)))
    return (train_blend_x, test_blend_x_mean, test_blend_x_gmean, scores,best_rounds)


In [2]:
estimators = [
            xgb.XGBClassifier(max_depth = 9,
                              min_child_weight = 13,
                              colsample_bytree = 0.433260 ,
                              subsample = 0.999819 ,
                              gamma = 2.944467),
             xgb.XGBClassifier(max_depth = 9,
                              min_child_weight = 13,
                              colsample_bytree = 0.234479,
                              subsample = 0.992427,
                              gamma = 2.955693),
             xgb.XGBClassifier(max_depth = 9,
                              min_child_weight = 12,
                              colsample_bytree = 0.430052,
                              subsample = 0.984938,
                              gamma = 2.981919),         
             xgb.XGBClassifier(max_depth = 9,
                              min_child_weight = 15,
                              colsample_bytree = 0.326926,
                              subsample = 0.999855,
                              gamma = 2.921145),  
             xgb.XGBClassifier(max_depth = 9,
                              min_child_weight = 13,
                              colsample_bytree = 0.489616,
                              subsample = 0.989539,
                              gamma = 2.874991)              
             ]

#  	 	max_depth 	min_child_weight 	colsample_bytree 	subsample 	gamma 	 	score
# 21 	9.858298 	13.249991 	 	 	0.433260 	 	 	0.999819 	2.944467 	-0.526405
# 10 	9.893582 	13.986790 	 	 	0.234479 	 	 	0.992427 	2.955693 	-0.526417
# 6 	9.922328 	12.322410 	 	 	0.430052 	 	 	0.984938 	2.981919 	-0.526687
# 4 	9.878492 	15.646924 	 	 	0.326926 	 	 	0.999855 	2.921145 	-0.526713
# 23 	9.979036 	13.050578 	 	 	0.489616 	 	 	0.989539 	2.874991 	-0.526878

# 8 	8.540735 	14.309724 	 	 	0.569809 	 	 	0.989256 	2.951742 	-0.527090
# 15 	9.890100 	22.421182 	 	 	0.347370 	 	 	0.992242 	2.909582 	-0.527123
# 18 	9.892799 	26.129075 	 	 	0.317373 	 	 	0.976754 	2.990347 	-0.527206
# 28 	9.721016 	13.650527 	 	 	0.203559 	 	 	0.966315 	2.993670 	-0.527347
# 20 	9.078562 	13.132861 	 	 	0.214693 	 	 	0.981959 	2.900196 	-0.527363
# 25 	5.983142 	15.522353 	 	 	0.365431 	 	 	0.894089 	1.799546 	-0.527589
# 23 	5.974873 	16.538875 	 	 	0.485034 	 	 	0.938367 	0.114823 	-0.527952
# 2 	5.963303 	12.117000 	 	 	0.433634 	 	 	0.978037 	1.715329 	-0.528031
# 36 	5.975359 	18.867714 	 	 	0.475715 	 	 	0.976875 	1.762219 	-0.528089
# 27 	5.160960 	15.607619 	 	 	0.492978 	 	 	0.913682 	1.794400 	-0.528097
# 26 	5.953624 	15.884306 	 	 	0.490160 	 	 	0.931286 	1.172554 	-0.528127
# 8 	5.980293 	12.262870 	 	 	0.382213 	 	 	0.985894 	0.013603 	-0.528166
# 18 	5.969315 	27.978419 	 	 	0.242325 	 	 	0.972658 	1.665230 	-0.528323
# 5 	5.833405 	27.333782 	 	 	0.493605 	 	 	0.988708 	1.759493 	-0.528409
# 37 	5.991305 	12.978177 	 	 	0.244283 	 	 	0.907146 	1.750585 	-0.528450

(train_blend_x_xgb,
 test_blend_x_xgb_mean,
 test_blend_x_xgb_gmean,
 blend_scores_xgb,
 best_rounds_xgb) = xgb_blend(estimators,
                              train_X,train_y,
                              test_X,
                              10,
                              500)


NameError: name 'xgb' is not defined

In [10]:
now = datetime.now()

name_train_blend = '../output/train_blend_xgb_BM_0322_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_mean = '../output/test_blend_xgb_mean_BM_0322_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_gmean = '../output/test_blend_xgb_gmean_BM_0322_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'


print (np.mean(blend_scores_xgb,axis=0))
print (np.mean(best_rounds_xgb,axis=0))
np.savetxt(name_train_blend,train_blend_x_xgb, delimiter=",")
np.savetxt(name_test_blend_mean,test_blend_x_xgb_mean, delimiter=",")
np.savetxt(name_test_blend_gmean,test_blend_x_xgb_gmean, delimiter=",")

[ 0.52128253  0.52168419  0.52109504  0.52076998  0.52132577]
[ 3811.9  3832.7  3687.   4127.2  2683.2]


In [None]:
# [ 0.52385999  0.52420308  0.52429754  0.52366222  0.52450185]
# [ 2866.7  3979.7  3102.9  2783.1  4450.5]

In [11]:
# now = datetime.now()
sub_name = '../output/sub_XGB_mean_BM_MB_add_desc_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

out_df = pd.DataFrame(test_blend_x_xgb_mean[:,9:12])
out_df.columns = ["low", "medium", "high"]
out_df["listing_id"] = test_X.listing_id.values
out_df.to_csv(sub_name, index=False)


# ypreds.columns = cols

# df = pd.read_json(open("../input/test.json", "r"))
# ypreds['listing_id'] = df["listing_id"]

# ypreds.to_csv('my_preds.csv', index=None)

In [48]:
test_blend_x_xgb_gmean[:,9:12]


array([[  3.20544260e-01,   6.10888687e-01,   6.56750820e-02],
       [  9.66028463e-01,   2.30937453e-02,   1.06505838e-02],
       [  9.53579737e-01,   4.13955017e-02,   3.97691225e-03],
       ..., 
       [  9.78252564e-01,   2.03241753e-02,   1.01263996e-03],
       [  9.71474749e-01,   2.74180665e-02,   5.09567844e-04],
       [  5.87161787e-01,   3.92164954e-01,   1.94305868e-02]])

In [49]:
test_blend_x_xgb_gmean[:,:3]

array([[  3.20994298e-01,   6.04517100e-01,   7.12524217e-02],
       [  9.57150480e-01,   3.01748109e-02,   1.18646473e-02],
       [  9.58178383e-01,   3.74607705e-02,   3.46537444e-03],
       ..., 
       [  9.80787885e-01,   1.80516908e-02,   8.23179767e-04],
       [  9.70638017e-01,   2.80394743e-02,   5.14503672e-04],
       [  5.81934901e-01,   3.95535699e-01,   2.09051621e-02]])