In [1]:
import time
import numpy as np
import pandas as pd

from scipy import sparse
from scipy.stats.mstats import gmean
from datetime import datetime
from sklearn import preprocessing
from scipy.stats import skew, boxcox,boxcox_normmax
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from bayes_opt import BayesianOptimization
from sklearn.metrics import log_loss

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import xgboost as xgb

seed = 1234



# Load Data

In [2]:
data_path = "../input/"
train_X = pd.read_csv(data_path + 'train_CV_statistics1.csv')
test_X = pd.read_csv(data_path + 'test_CV_statistics1.csv')
train_y = np.ravel(pd.read_csv(data_path + 'train_y_CV_statistics.csv',header=None))
sub_id = test_X.listing_id.astype('int32').values

# all_features = features_to_use + desc_sparse_cols + feat_sparse_cols
print train_X.shape, test_X.shape, train_y.shape

(49352, 137) (74659, 137) (49352,)


In [11]:
X_train, X_val, y_train, y_val = train_test_split(train_X, train_y, train_size=.80, random_state=1234)
print X_train.shape
print X_val.shape
# xgtrain = xgb.DMatrix(X_train, label=y_train)

(39481, 137)
(9871, 137)


In [12]:
# rgr = xgb.XGBClassifier(objective = 'multi:softprob',
#                        learning_rate = 0.1,
#                        n_estimators = 10000,
#                        nthread = -1,
#                        max_depth=10)

# rgr.fit(X_train,y_train,
#         eval_set=[(X_val,y_val)],
#         eval_metric='mlogloss',
# #         num_class = 3,
#         early_stopping_rounds=50,
#         verbose=25
#        )

In [13]:
# pred_y = rgr.predict_proba(test_X, ntree_limit = rgr.best_iteration)
# pred_y

# Tune XGBoost

In [15]:
learning_rate = 0.1
best_score = 1000
train_param = 0
for x in [3,4,5,6,7,8,9,10]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= x,
        nthread = -1,
        silent = False
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )
    
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x

    print x, '\t', rgr.best_score, rgr.best_iteration

3 	0.531135 621
4 	0.528788 436
5 	0.528401 334
6 	0.529342 216
7 	0.532007 141
8 	0.532342 134
9 	0.536764 97
10 	0.538813 76


In [16]:
max_depth = train_param
print max_depth

5


In [17]:
train_param = 1
for x in [2,4,8,12,16,20,24,28,32,40,48,64,128]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )
    
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

2 	0.527937 309
4 	0.528707 243
8 	0.529375 381
12 	0.528663 316
16 	0.528997 323
20 	0.52832 333
24 	0.528435 398
28 	0.528545 315
32 	0.528707 339
40 	0.527865 337
48 	0.529024 336
64 	0.528953 376
128 	0.531708 381


In [18]:
for x in [50,75,80,100,120,140,160,200]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )
    
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

50 	0.527787 391
75 	0.528922 367
80 	0.529824 473
100 	0.530926 383
120 	0.531655 449
140 	0.530849 445
160 	0.531763 498
200 	0.533395 648


In [19]:
min_child_weight = train_param
print min_child_weight

50


In [20]:
train_param = 1
for x in [0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = min_child_weight,
        colsample_bytree = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )

    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

0.05 	0.533177 678
0.1 	0.528575 489
0.2 	0.527417 305
0.3 	0.527616 336
0.4 	0.527998 362
0.5 	0.526321 366
0.6 	0.527867 387
0.7 	0.52753 435
0.8 	0.527293 383
0.9 	0.529772 324


In [21]:
colsample_bytree = train_param
print colsample_bytree

0.5


In [22]:
train_param = 1
for x in [0.5,0.6,0.7,0.8,0.9]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = min_child_weight,
        colsample_bytree = colsample_bytree,
        subsample = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

0.5 	0.532237 396
0.6 	0.52943 395
0.7 	0.528151 379
0.8 	0.528136 420
0.9 	0.52825 357


In [23]:
subsample = train_param
print subsample

1


In [24]:
train_param = 0
for x in [0.3, 0.6, 0.9, 1.2, 1.5, 1.8, 2.1, 2.4, 2.7, 3.0]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = min_child_weight,
        colsample_bytree = colsample_bytree,
        subsample = subsample,
        gamma = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )

    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

0.3 	0.527882 329
0.6 	0.527404 389
0.9 	0.528133 318
1.2 	0.526418 497
1.5 	0.528352 345
1.8 	0.528509 354
2.1 	0.527858 409
2.4 	0.52787 549
2.7 	0.525604 525
3.0 	0.527605 393


In [25]:
gamma = train_param
print gamma

2.7


In [None]:
# 0.3 	0.52431 436

In [26]:
xgtrain = xgb.DMatrix(train_X, label=train_y) 

def xgb_evaluate(min_child_weight, colsample_bytree, max_depth, subsample, gamma):
    params = dict()
    params['objective']='multi:softprob'
    params['eval_metric']='mlogloss',
    params['num_class']=3
    params['silent']=1
    params['eta'] = 0.1
    params['verbose_eval'] = True
    params['min_child_weight'] = int(min_child_weight)
    params['colsample_bytree'] = max(min(colsample_bytree, 1), 0)
    params['max_depth'] = int(max_depth)
    params['subsample'] = max(min(subsample, 1), 0)
    params['gamma'] = max(gamma, 0)
    
    cv_result = xgb.cv(
        params, xgtrain, 
        num_boost_round=10000, nfold=5,
        metrics = 'mlogloss',
        seed=seed,callbacks=[xgb.callback.early_stop(50)]
    )
    
    return -cv_result['test-mlogloss-mean'].values[-1]


xgb_BO = BayesianOptimization(
    xgb_evaluate, 
    {
        'max_depth': (4,10),
        'min_child_weight': (30,100),
        'colsample_bytree': (0.3,0.8),
        'subsample': (0.7,1),
        'gamma': (2,3)
    }
)

xgb_BO.maximize(init_points=10, n_iter=40)

[31mInitialization[0m
[94m---------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |     gamma |   max_depth |   min_child_weight |   subsample | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[800]	train-mlogloss:0.359776+0.00266422	test-mlogloss:0.524082+0.00630427

    1 | 14m49s | [35m  -0.52408[0m | [32m            0.3699[0m | [32m   2.7645[0m | [32m     5.2078[0m | [32m           45.4868[0m | [32m     0.8879[0m | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[285]	train-mlogloss:0.325537+0.0016814	test-mlogloss:0.524654+0.00677044

    2 | 14m57s |   -0.52465 |             0.6851 |    2.4539 |      8.



   11 | 26m10s |   -0.52478 |             0.7417 |    2.8924 |      4.6909 |            30.0294 |      0.9526 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[1019]	train-mlogloss:0.403717+0.00185238	test-mlogloss:0.526458+0.00635421

   12 | 21m50s |   -0.52646 |             0.5782 |    2.8339 |      4.0838 |            99.9071 |      0.9376 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[1056]	train-mlogloss:0.409918+0.00266758	test-mlogloss:0.52689+0.00596366

   13 | 27m46s |   -0.52689 |             0.7657 |    2.9047 |      4.3252 |            75.1984 |      0.9944 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Sto

  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   14 | 11m22s | [35m  -0.52318[0m | [32m            0.3500[0m | [32m   2.4217[0m | [32m     9.9616[0m | [32m           30.0457[0m | [32m     0.9948[0m | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[407]	train-mlogloss:0.319775+0.00179118	test-mlogloss:0.52196+0.00636937

   15 | 13m48s | [35m  -0.52196[0m | [32m            0.3431[0m | [32m   2.9852[0m | [32m     9.8936[0m | [32m           38.6922[0m | [32m     0.9886[0m | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[527]	train-mlogloss:0.325852+0.00169995	test-mlogloss:0.523349+0.00587314



  " state: %s" % convergence_dict)


   16 | 15m21s |   -0.52335 |             0.3117 |    2.5705 |      9.8625 |            83.4601 |      0.9881 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[243]	train-mlogloss:0.288896+0.00319662	test-mlogloss:0.523907+0.00612811



  " state: %s" % convergence_dict)


   17 | 17m09s |   -0.52391 |             0.7668 |    2.0165 |      9.3644 |            36.4631 |      0.9824 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[388]	train-mlogloss:0.282784+0.00140062	test-mlogloss:0.523435+0.00518146

   18 | 12m26s |   -0.52343 |             0.3160 |    2.0538 |      9.9078 |            42.3049 |      0.9979 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[443]	train-mlogloss:0.347299+0.000955969	test-mlogloss:0.523667+0.00616503



  " state: %s" % convergence_dict)


   19 | 17m23s |   -0.52367 |             0.4510 |    2.9674 |      9.6916 |            89.0906 |      0.9825 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[415]	train-mlogloss:0.314453+0.00197418	test-mlogloss:0.522546+0.00639241

   20 | 13m38s |   -0.52255 |             0.3289 |    2.9662 |      9.4430 |            33.0534 |      0.9911 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[417]	train-mlogloss:0.334694+0.00199873	test-mlogloss:0.523154+0.00626636



  " state: %s" % convergence_dict)


   21 | 17m51s |   -0.52315 |             0.4872 |    2.9812 |      9.9585 |            70.2075 |      0.9688 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[390]	train-mlogloss:0.325184+0.00175253	test-mlogloss:0.522479+0.00583294



  " state: %s" % convergence_dict)


   22 | 13m26s |   -0.52248 |             0.3550 |    2.8788 |      9.7572 |            47.8806 |      0.9858 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[1111]	train-mlogloss:0.392529+0.00268541	test-mlogloss:0.525521+0.00659181



  " state: %s" % convergence_dict)


   23 | 26m05s |   -0.52552 |             0.6639 |    2.9345 |      4.1309 |            51.0818 |      0.9879 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[1384]	train-mlogloss:0.401367+0.00477641	test-mlogloss:0.525243+0.00638665



  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   24 | 18m30s |   -0.52524 |             0.3105 |    2.9130 |      4.5316 |            38.7214 |      0.9996 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[1250]	train-mlogloss:0.419666+0.00303129	test-mlogloss:0.526337+0.00621042



  " state: %s" % convergence_dict)


   25 | 16m49s |   -0.52634 |             0.3076 |    2.8708 |      4.0273 |            93.2869 |      0.9934 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[430]	train-mlogloss:0.325488+0.00255702	test-mlogloss:0.523205+0.00662472

   26 | 16m00s |   -0.52320 |             0.4217 |    2.1147 |      9.5000 |            99.9846 |      0.9990 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[404]	train-mlogloss:0.311997+0.002719	test-mlogloss:0.523532+0.00662383

   27 | 13m03s |   -0.52353 |             0.3322 |    2.0162 |      9.7306 |            76.0949 |      0.9801 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stoppi

  " state: %s" % convergence_dict)


   28 | 14m42s |   -0.52266 |             0.3077 |    2.7993 |      9.9480 |            35.9872 |      0.9760 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[395]	train-mlogloss:0.332722+0.00233335	test-mlogloss:0.523675+0.00629095



  " state: %s" % convergence_dict)


   29 | 13m05s |   -0.52368 |             0.3406 |    2.7163 |      9.9466 |            63.6880 |      0.9916 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[424]	train-mlogloss:0.321994+0.0016189	test-mlogloss:0.523886+0.00600027



  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   30 | 14m00s |   -0.52389 |             0.3525 |    2.0270 |      9.9218 |            93.3794 |      0.9899 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[499]	train-mlogloss:0.322601+0.00186049	test-mlogloss:0.522958+0.00598764

   31 | 15m14s |   -0.52296 |             0.3203 |    2.9330 |      9.2927 |            51.2480 |      0.9928 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[396]	train-mlogloss:0.32169+0.00280418	test-mlogloss:0.522762+0.00677553

   32 | 14m25s |   -0.52276 |             0.3757 |    2.9831 |      9.3205 |            40.7963 |      0.9942 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopp

  " state: %s" % convergence_dict)


   33 | 14m36s |   -0.52214 |             0.3036 |    2.9123 |      8.0124 |            47.3447 |      0.9935 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[563]	train-mlogloss:0.329555+0.00272212	test-mlogloss:0.523007+0.00633885



  " state: %s" % convergence_dict)


   34 | 15m06s |   -0.52301 |             0.3080 |    2.9096 |      8.2670 |            49.2890 |      0.9955 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[271]	train-mlogloss:0.331888+0.00343583	test-mlogloss:0.523843+0.00606784

   35 | 12m00s |   -0.52384 |             0.4469 |    2.0094 |      9.7599 |            68.5086 |      0.9940 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[1248]	train-mlogloss:0.408691+0.0032489	test-mlogloss:0.525518+0.00615553

   36 | 18m16s |   -0.52552 |             0.3439 |    2.9296 |      4.1428 |            63.8607 |      0.9949 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stop

  " state: %s" % convergence_dict)


   37 | 15m53s |   -0.52242 |             0.3050 |    2.9975 |      7.4795 |            30.6884 |      0.9898 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[455]	train-mlogloss:0.32923+0.00185983	test-mlogloss:0.522154+0.00654895

   38 | 12m38s |   -0.52215 |             0.3041 |    2.8941 |      8.7357 |            37.7799 |      0.9874 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[529]	train-mlogloss:0.332931+0.00176829	test-mlogloss:0.523188+0.00578049



  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   39 | 15m45s |   -0.52319 |             0.3103 |    2.8375 |      9.6975 |            72.8310 |      0.9928 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[913]	train-mlogloss:0.380035+0.00234085	test-mlogloss:0.524865+0.0066144



  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)



Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[875]	train-mlogloss:0.398898+0.00195013	test-mlogloss:0.524852+0.00578102



  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   41 | 14m45s |   -0.52485 |             0.3042 |    2.9897 |      5.7696 |            81.2279 |      0.9923 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[546]	train-mlogloss:0.340599+0.0023427	test-mlogloss:0.523179+0.00643942

   42 | 12m13s |   -0.52318 |             0.3284 |    2.4206 |      6.9817 |            33.0168 |      0.9909 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[495]	train-mlogloss:0.33266+0.00159504	test-mlogloss:0.522333+0.00656059



  " state: %s" % convergence_dict)


   43 | 13m35s |   -0.52233 |             0.3018 |    2.8663 |      8.0424 |            46.9486 |      0.9958 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[311]	train-mlogloss:0.329929+0.00202065	test-mlogloss:0.523588+0.00648347

   44 | 11m02s |   -0.52359 |             0.3237 |    2.8720 |      9.9537 |            38.2804 |      0.9071 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[337]	train-mlogloss:0.343478+0.00299261	test-mlogloss:0.524647+0.00534775



  " state: %s" % convergence_dict)


   45 | 21m18s |   -0.52465 |             0.7460 |    2.9599 |      9.5553 |            79.9405 |      0.9984 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[528]	train-mlogloss:0.344857+0.00165415	test-mlogloss:0.524154+0.00757224



  " state: %s" % convergence_dict)


   46 | 22m45s |   -0.52415 |             0.7843 |    2.9799 |      6.0434 |            34.6694 |      0.9908 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[288]	train-mlogloss:0.332665+0.00313116	test-mlogloss:0.524482+0.0062659

   47 | 19m37s |   -0.52448 |             0.7885 |    2.0738 |      9.4162 |            86.3951 |      0.9749 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[452]	train-mlogloss:0.355489+0.00215729	test-mlogloss:0.523887+0.00647073

   48 | 15m32s |   -0.52389 |             0.3705 |    2.9251 |      9.9386 |            99.9056 |      0.9590 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopp

  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   49 | 20m27s |   -0.52435 |             0.7165 |    2.9591 |      7.7461 |            44.1374 |      0.9968 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[848]	train-mlogloss:0.366512+0.00208806	test-mlogloss:0.52401+0.00637071



  " state: %s" % convergence_dict)


   50 | 16m46s |   -0.52401 |             0.3021 |    2.9954 |      6.0283 |            69.7415 |      0.9776 | 


need improve

In [27]:
xgb_bo_scores = pd.DataFrame([[s[0]['max_depth'],
                               s[0]['min_child_weight'],
                               s[0]['colsample_bytree'],
                               s[0]['subsample'],
                               s[0]['gamma'],
                               s[1]] for s in zip(xgb_BO.res['all']['params'],xgb_BO.res['all']['values'])],
                            columns = ['max_depth',
                                       'min_child_weight',
                                       'colsample_bytree',
                                       'subsample',
                                       'gamma',
                                       'score'])
xgb_bo_scores=xgb_bo_scores.sort_values('score',ascending=False)
xgb_bo_scores.head(10)

Unnamed: 0,max_depth,min_child_weight,colsample_bytree,subsample,gamma,score
4,9.893582,38.692208,0.343099,0.988641,2.985231,-0.52196
22,8.012421,47.344712,0.303623,0.99348,2.912287,-0.522142
27,8.735716,37.779927,0.304125,0.987374,2.894111,-0.522154
32,8.042435,46.948556,0.301839,0.995753,2.866326,-0.522333
26,7.479458,30.688415,0.305018,0.98982,2.99753,-0.522417
11,9.757235,47.88063,0.355007,0.985846,2.878814,-0.522479
9,9.442971,33.053379,0.328882,0.991066,2.966176,-0.522546
17,9.947977,35.987158,0.307668,0.976048,2.799263,-0.522665
21,9.320539,40.796274,0.375652,0.994171,2.983123,-0.522762
20,9.292661,51.247953,0.320341,0.992831,2.933025,-0.522958


In [34]:
test_X.isnull().values.any()

True

In [35]:
tmp = test_X.copy()

In [36]:
null_ind = test_X.num_loc_price_diff.isnull()
test_X['num_loc_price_diff'] = test_X['num_price'] - test_X['num_loc_median_price']
test_X[null_ind][['num_loc_price_diff','num_price','num_loc_median_price']]

Unnamed: 0,num_loc_price_diff,num_price,num_loc_median_price
710,-49.5,2600,2649.5
779,-500.0,2750,3250.0
988,5700.0,8000,2300.0
1542,2800.0,10000,7200.0
2099,-1555.0,4195,5750.0
3447,-200.0,4200,4400.0
3697,-3464.0,2300,5764.0
4662,4200.0,6500,2300.0
4669,3275.0,6200,2925.0
4689,-1169.0,4595,5764.0


In [37]:
test_X.isnull().values.any()

False

In [3]:
def xgb_blend(estimators, train_x, train_y, test_x, fold, early_stopping_rounds=0):
    N_params = len(estimators)
    print ("Blend %d estimators for %d folds" % (N_params, fold))
    skf = KFold(n_splits=fold,random_state=5555)
    N_class = len(set(train_y))
        
    train_blend_x = np.zeros((train_x.shape[0], N_class*N_params))
    test_blend_x_mean = np.zeros((test_x.shape[0], N_class*N_params))
    test_blend_x_gmean = np.zeros((test_x.shape[0], N_class*N_params))
    scores = np.zeros ((fold,N_params))
    best_rounds = np.zeros ((fold, N_params))
    
    for j, est in enumerate(estimators):
        est.set_params(objective = 'multi:softprob')
        est.set_params(silent = False)
        est.set_params(learning_rate = 0.02)
        est.set_params(n_estimators=100000)
        
        print ("Model %d: %s" %(j+1, est))

        test_blend_x_j = np.zeros((test_x.shape[0], N_class*fold))
    
        for i, (train_index, val_index) in enumerate(skf.split(train_x)):
            print ("Model %d fold %d" %(j+1,i+1))
            fold_start = time.time() 
            train_x_fold = train_x.iloc[train_index]
            train_y_fold = train_y[train_index]
            val_x_fold = train_x.iloc[val_index]
            val_y_fold = train_y[val_index]      

            est.fit(train_x_fold,train_y_fold,
                    eval_set = [(val_x_fold, val_y_fold)],
                    eval_metric = 'mlogloss',
                    early_stopping_rounds=early_stopping_rounds,
                    verbose=False)
            best_round=est.best_iteration
            best_rounds[i,j]=best_round
            print ("best round %d" % (best_round))
            val_y_predict_fold = est.predict_proba(val_x_fold,ntree_limit=best_round)
            score = log_loss(val_y_fold, val_y_predict_fold)
            print ("Score: ", score)
            scores[i,j]=score
            train_blend_x[val_index, (j*N_class):(j+1)*N_class] = val_y_predict_fold
            
            test_blend_x_j[:,(i*N_class):(i+1)*N_class] = est.predict_proba(test_x,ntree_limit=best_round)
            print ("Model %d fold %d fitting finished in %0.3fs" % (j+1,i+1, time.time() - fold_start))
            
        test_blend_x_mean[:,(j*N_class):(j+1)*N_class] = \
                np.stack([test_blend_x_j[:,range(0,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(1,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(2,N_class*fold,N_class)].mean(1)]).T
        
        test_blend_x_gmean[:,(j*N_class):(j+1)*N_class] = \
                np.stack([gmean(test_blend_x_j[:,range(0,N_class*fold,N_class)], axis=1),
                          gmean(test_blend_x_j[:,range(1,N_class*fold,N_class)], axis=1),
                          gmean(test_blend_x_j[:,range(2,N_class*fold,N_class)], axis=1)]).T
            
        print ("Score for model %d is %f" % (j+1,np.mean(scores[:,j])))
    print ("Score for blended models is %f" % (np.mean(scores)))
    return (train_blend_x, test_blend_x_mean, test_blend_x_gmean, scores,best_rounds)


In [None]:
estimators = [
#             xgb.XGBClassifier(max_depth = 7,
#                               min_child_weight = 5,
#                               colsample_bytree = 0.293906 ,
#                               subsample = 0.947733 ,
#                               gamma = 2.983057),
#              xgb.XGBClassifier(max_depth = 6,
#                               min_child_weight = 2,
#                               colsample_bytree = 0.200079,
#                               subsample = 0.976483,
#                               gamma = 2.872736),
#              xgb.XGBClassifier(max_depth = 6,
#                               min_child_weight = 25,
#                               colsample_bytree = 0.273249,
#                               subsample = 0.983080,
#                               gamma = 2.978747),         
#              xgb.XGBClassifier(max_depth = 7,
#                               min_child_weight = 4,
#                               colsample_bytree = 0.219052,
#                               subsample = 0.741765,
#                               gamma = 2.649557),  
             xgb.XGBClassifier(max_depth = 8,
                              min_child_weight = 37,
                              colsample_bytree = 0.304125,
                              subsample = 0.987374,
                              gamma = 2.894111)              
             ]

#  	 	max_depth 	min_child_weight 	colsample_bytree 	subsample 	gamma 	score
# 4 	9.893582 	38.692208 	0.343099 	0.988641 	2.985231 	-0.521960  52041029
# 22 	8.012421 	47.344712 	0.303623 	0.993480 	2.912287 	-0.522142  52057958
# 27 	8.735716 	37.779927 	0.304125 	0.987374 	2.894111 	-0.522154
(train_blend_x_xgb,
 test_blend_x_xgb_mean,
 test_blend_x_xgb_gmean,
 blend_scores_xgb,
 best_rounds_xgb) = xgb_blend(estimators,
                              train_X,train_y,
                              test_X,
                              5,
                              500)


Blend 1 estimators for 5 folds
Model 1: XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.304125,
       gamma=2.894111, learning_rate=0.02, max_delta_step=0, max_depth=8,
       min_child_weight=37, missing=None, n_estimators=100000, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=False, subsample=0.987374)
Model 1 fold 1


In [33]:
# # now = datetime.now()

# name_train_blend = '../output/train_blend_xgb_cv137_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
# name_test_blend_mean = '../output/test_blend_xgb_mean_cv137_5blend_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
# name_test_blend_gmean = '../output/test_blend_xgb_gmean_cv137_5blend_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'


# print (np.mean(blend_scores_xgb,axis=0))
# print (np.mean(best_rounds_xgb,axis=0))
# np.savetxt(name_train_blend,train_blend_x_xgb, delimiter=",")
# np.savetxt(name_test_blend_mean,test_blend_x_xgb_mean, delimiter=",")
# np.savetxt(name_test_blend_gmean,test_blend_x_xgb_gmean, delimiter=",")

[ 0.52041029]
[ 2541.4]


In [None]:
# data 0322
# [ 0.52385999  0.52420308  0.52429754  0.52366222  0.52450185]
# [ 2866.7  3979.7  3102.9  2783.1  4450.5]

# data 0331 seed = 2017
# [ 0.5161796   0.51727863  0.51867825  0.517129    0.51732854]
# [ 4857.5  6379.5  5516.4  3337.9  1674.5]

In [5]:
train_blend_x_xgb = pd.DataFrame(train_blend_x_xgb[:,:3])
train_blend_x_xgb.columns = ["low", "medium", "high"]
train_blend_x_xgb["listing_id"] = train_X.listing_id.values

train_blend_x_xgb.head()

Unnamed: 0,low,medium,high,listing_id
0,0.9382,0.058592,0.003208,7211212
1,0.993766,0.005808,0.000426,7150865
2,0.391082,0.443861,0.165057,6887163
3,0.95468,0.042502,0.002818,6888711
4,0.994358,0.005169,0.000473,6934781


In [6]:
test_blend_x_xgb_mean = pd.DataFrame(test_blend_x_xgb_mean[:,:3])
test_blend_x_xgb_mean.columns = ["low", "medium", "high"]
test_blend_x_xgb_mean["listing_id"] = test_X.listing_id.values

test_blend_x_xgb_mean.head()

Unnamed: 0,low,medium,high,listing_id
0,0.401541,0.525831,0.072628,7142618
1,0.986221,0.009393,0.004387,7210040
2,0.668526,0.283461,0.048012,7103890
3,0.49694,0.472054,0.031006,7143442
4,0.74212,0.234169,0.023711,6860601


In [7]:
test_blend_x_xgb_gmean = pd.DataFrame(test_blend_x_xgb_gmean[:,:3])
test_blend_x_xgb_gmean.columns = ["low", "medium", "high"]
test_blend_x_xgb_gmean["listing_id"] = test_X.listing_id.values

test_blend_x_xgb_gmean.head()

Unnamed: 0,low,medium,high,listing_id
0,0.398646,0.524051,0.0724,7142618
1,0.986218,0.009124,0.004201,7210040
2,0.66686,0.280832,0.045953,7103890
3,0.49636,0.471526,0.030715,7143442
4,0.741411,0.231883,0.023414,6860601


In [8]:
data_path = "../input/"
train_X_BM = pd.read_csv(data_path + 'train_BM_0331.csv')
test_X_BM = pd.read_csv(data_path + 'test_BM_0331.csv')

# all_features = features_to_use + desc_sparse_cols + feat_sparse_cols
print train_X_BM.shape, test_X_BM.shape, train_y.shape

(49352, 412) (74659, 412) (49352,)


In [9]:
tmp_train = train_X_BM[['listing_id']].merge(train_blend_x_xgb,on = 'listing_id', 
                                             how = 'left')[["low", "medium", "high"]].values
tmp_test_mean = test_X_BM[['listing_id']].merge(test_blend_x_xgb_mean,on = 'listing_id', 
                                                how = 'left')[["low", "medium", "high"]].values
tmp_test_gmean = test_X_BM[['listing_id']].merge(test_blend_x_xgb_gmean,on = 'listing_id', 
                                                 how = 'left')[["low", "medium", "high"]].values

In [10]:
tmp_train[:10]

array([[  5.45454741e-01,   4.35612619e-01,   1.89326406e-02],
       [  2.38422796e-01,   7.03407645e-01,   5.81695400e-02],
       [  7.36379206e-01,   2.52045274e-01,   1.15755545e-02],
       [  9.38200235e-01,   5.85920289e-02,   3.20768380e-03],
       [  9.71339464e-01,   2.82589309e-02,   4.01635072e-04],
       [  8.02813649e-01,   1.44351289e-01,   5.28350845e-02],
       [  8.38758707e-01,   1.45681798e-01,   1.55594582e-02],
       [  2.11300716e-01,   2.94703752e-01,   4.93995577e-01],
       [  9.66512680e-01,   2.57101785e-02,   7.77717493e-03],
       [  9.55001295e-01,   4.42494452e-02,   7.49233819e-04]])

In [11]:
now = datetime.now()

name_train_blend = '../output/train_blend_xgb_cv137_BM_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_mean = '../output/test_blend_xgb_mean_cv137_5blend_BM_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_gmean = '../output/test_blend_xgb_gmean_cv137_5blend_BM_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'


print (np.mean(blend_scores_xgb,axis=0))
print (np.mean(best_rounds_xgb,axis=0))
np.savetxt(name_train_blend,tmp_train, delimiter=",")
np.savetxt(name_test_blend_mean,tmp_test_mean, delimiter=",")
np.savetxt(name_test_blend_gmean,tmp_test_gmean, delimiter=",")

[ 0.52057958]
[ 3222.6]


In [32]:
# now = datetime.now()
sub_name = '../output/sub_XGB_mean_BM_cv137_5blend_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

out_df = pd.DataFrame(tmp_test_mean[:,:3])
out_df.columns = ["low", "medium", "high"]
out_df["listing_id"] = test_X_BM.listing_id.values
out_df.to_csv(sub_name, index=False)

In [44]:
test_blend_x_xgb_gmean[["low", "medium", "high"]].values

array([[ 0.39300703,  0.5333155 ,  0.06647976],
       [ 0.98636611,  0.00922575,  0.00406247],
       [ 0.65414999,  0.29349853,  0.04406867],
       ..., 
       [ 0.51356489,  0.42521578,  0.04422116],
       [ 0.21262641,  0.46016629,  0.31441058],
       [ 0.96598009,  0.03009712,  0.00326856]])