In [1]:
import time
import numpy as np
import pandas as pd

from scipy import sparse
from scipy.stats.mstats import gmean
from datetime import datetime
from sklearn import preprocessing
from scipy.stats import skew, boxcox,boxcox_normmax
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from bayes_opt import BayesianOptimization
from sklearn.metrics import log_loss

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import xgboost as xgb

seed = 1234



# Load Data

In [2]:
data_path = "../input/"
train_X = pd.read_csv(data_path + 'train_BM_0331.csv')
test_X = pd.read_csv(data_path + 'test_BM_0331.csv')
train_y = np.ravel(pd.read_csv(data_path + 'labels_BrandenMurray.csv'))
sub_id = test_X.listing_id.astype('int32').values
# all_features = features_to_use + desc_sparse_cols + feat_sparse_cols
print train_X.shape, test_X.shape, train_y.shape

(49352, 412) (74659, 412) (49352,)


In [3]:
X_train, X_val, y_train, y_val = train_test_split(train_X, train_y, train_size=.80, random_state=1234)
print X_train.shape
print X_val.shape
# xgtrain = xgb.DMatrix(X_train, label=y_train)

(39481, 412)
(9871, 412)


In [4]:
# rgr = xgb.XGBClassifier(objective = 'multi:softprob',
#                        learning_rate = 0.1,
#                        n_estimators = 10000,
#                        nthread = -1,
#                        max_depth=10)

# rgr.fit(X_train,y_train,
#         eval_set=[(X_val,y_val)],
#         eval_metric='mlogloss',
# #         num_class = 3,
#         early_stopping_rounds=50,
#         verbose=25
#        )

In [5]:
# pred_y = rgr.predict_proba(test_X, ntree_limit = rgr.best_iteration)
# pred_y

# Tune XGBoost

In [6]:
learning_rate = 0.1
best_score = 1000
train_param = 0
for x in [2,3,4,5,6,7,8,9,10]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= x,
        nthread = -1,
        silent = False
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )
    
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x

    print x, '\t', rgr.best_score, rgr.best_iteration

2 	0.53415 1712
3 	0.533524 757
4 	0.534241 466
5 	0.53388 367
6 	0.533953 271
7 	0.535565 189
8 	0.538649 153
9 	0.541866 125
10 	0.548011 76


In [9]:
# max_depth = train_param
max_depth = 6
print max_depth

6


In [10]:
train_param = 1
for x in [2,4,8,12,16,20,24,28,32,40,48,64,128]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )
    
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

2 	0.53235 282
4 	0.532919 288
8 	0.53183 305
12 	0.531764 320
16 	0.532015 323
20 	0.531898 353
24 	0.532541 383
28 	0.533103 328
32 	0.53122 318
40 	0.532665 371
48 	0.531682 422
64 	0.530637 492
128 	0.532116 480


In [18]:
train_param = 64
for x in [75,80,100,120,140,160,200]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )
    
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

75 	0.532236 423
80 	0.531871 301
100 	0.532396 484
120 	0.532189 424
140 	0.53347 487
160 	0.533486 528
200 	0.5388 490


In [19]:
min_child_weight = train_param
print min_child_weight

64


In [20]:
train_param = 1
for x in [0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = min_child_weight,
        colsample_bytree = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )

    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

0.05 	0.533157 871
0.1 	0.530213 621
0.2 	0.526989 589
0.3 	0.527614 604
0.4 	0.52638 567
0.5 	0.528794 399
0.6 	0.529739 392
0.7 	0.528932 440
0.8 	0.530149 469
0.9 	0.530505 418


In [27]:
colsample_bytree = train_param
print colsample_bytree

0.4


In [28]:
train_param = 1
for x in [0.5,0.6,0.7,0.8,0.9]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = min_child_weight,
        colsample_bytree = colsample_bytree,
        subsample = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

0.5 	0.535153 344
0.6 	0.533371 357
0.7 	0.532252 469
0.8 	0.531656 333
0.9 	0.530366 319


In [29]:
subsample = train_param
print subsample

1


In [30]:
train_param = 0
for x in [0.3, 0.6, 0.9, 1.2, 1.5, 1.8, 2.1, 2.4, 2.7, 3.0]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = min_child_weight,
        colsample_bytree = colsample_bytree,
        subsample = subsample,
        gamma = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )

    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

0.3 	0.527861 404
0.6 	0.528877 450
0.9 	0.527787 475
1.2 	0.529479 487
1.5 	0.528439 606
1.8 	0.528744 403
2.1 	0.52782 477
2.4 	0.529459 492
2.7 	0.528578 621
3.0 	0.530566 482


In [31]:
gamma = train_param
print gamma

0


In [None]:
# 0.3 	0.52431 436

In [32]:
xgtrain = xgb.DMatrix(train_X, label=train_y) 

def xgb_evaluate(min_child_weight, colsample_bytree, max_depth, subsample, gamma):
    params = dict()
    params['objective']='multi:softprob'
    params['eval_metric']='mlogloss',
    params['num_class']=3
    params['silent']=1
    params['eta'] = 0.1
    params['verbose_eval'] = True
    params['min_child_weight'] = int(min_child_weight)
    params['colsample_bytree'] = max(min(colsample_bytree, 1), 0)
    params['max_depth'] = int(max_depth)
    params['subsample'] = max(min(subsample, 1), 0)
    params['gamma'] = max(gamma, 0)
    
    cv_result = xgb.cv(
        params, xgtrain, 
        num_boost_round=10000, nfold=5,
        metrics = 'mlogloss',
        seed=seed,callbacks=[xgb.callback.early_stop(50)]
    )
    
    return -cv_result['test-mlogloss-mean'].values[-1]


xgb_BO = BayesianOptimization(
    xgb_evaluate, 
    {
        'max_depth': (4,10),
        'min_child_weight': (1,100),
        'colsample_bytree': (0.2,0.8),
        'subsample': (0.7,1),
        'gamma': (0,3)
    }
)

xgb_BO.maximize(init_points=10, n_iter=40)

[31mInitialization[0m
[94m---------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |     gamma |   max_depth |   min_child_weight |   subsample | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[491]	train-mlogloss:0.369143+0.00141556	test-mlogloss:0.524025+0.00535169

    1 | 19m03s | [35m  -0.52402[0m | [32m            0.2598[0m | [32m   2.6639[0m | [32m     8.4837[0m | [32m           54.9284[0m | [32m     0.9771[0m | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[299]	train-mlogloss:0.319295+0.00100848	test-mlogloss:0.523601+0.00596716

    2 | 13m47s | [35m  -0.52360[0m | [32m            0.3543[0m | 

  " state: %s" % convergence_dict)


   11 | 19m48s |   -0.52728 |             0.2192 |    0.3522 |      4.0512 |            99.9218 |      0.9976 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[376]	train-mlogloss:0.297039+0.00132814	test-mlogloss:0.523402+0.00485786

   12 | 17m26s | [35m  -0.52340[0m | [32m            0.2558[0m | [32m   2.7017[0m | [32m     9.6630[0m | [32m            1.4767[0m | [32m     0.8498[0m | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[412]	train-mlogloss:0.356789+0.00118658	test-mlogloss:0.524028+0.0059227

   13 | 15m51s |   -0.52403 |             0.2068 |    2.9092 |      9.9406 |            29.4807 |      0.8315 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train u

  " state: %s" % convergence_dict)


   18 | 19m40s |   -0.52688 |             0.3053 |    0.0029 |      9.8138 |            98.5699 |      0.9853 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[332]	train-mlogloss:0.343878+0.00116347	test-mlogloss:0.525954+0.00522726



  " state: %s" % convergence_dict)


   19 | 36m37s |   -0.52595 |             0.7514 |    2.9370 |      9.9507 |            43.4940 |      0.9653 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[424]	train-mlogloss:0.354516+0.00142327	test-mlogloss:0.524079+0.00535335

   20 | 15m50s |   -0.52408 |             0.2414 |    2.8622 |      8.8889 |            18.1170 |      0.9892 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[699]	train-mlogloss:0.435319+0.00117557	test-mlogloss:0.528053+0.00611008

   21 | 13m08s |   -0.52805 |             0.2108 |    0.0327 |      4.9744 |            89.0565 |      0.9509 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stop

  " state: %s" % convergence_dict)


   29 | 08m25s |   -0.52803 |             0.2196 |    0.1880 |      8.9031 |             1.0870 |      0.9877 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[562]	train-mlogloss:0.339802+0.000969745	test-mlogloss:0.522373+0.00486769

   30 | 21m19s | [35m  -0.52237[0m | [32m            0.2939[0m | [32m   2.9831[0m | [32m     7.3638[0m | [32m            5.2056[0m | [32m     0.9477[0m | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[239]	train-mlogloss:0.303319+0.000729387	test-mlogloss:0.525761+0.00488401



  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   31 | 12m10s |   -0.52576 |             0.2635 |    0.0687 |      9.7298 |            23.5484 |      0.7600 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[443]	train-mlogloss:0.379603+0.00106819	test-mlogloss:0.5258+0.00542867



  " state: %s" % convergence_dict)


   32 | 15m31s |   -0.52580 |             0.2110 |    2.9076 |      8.9744 |            38.1256 |      0.7151 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[755]	train-mlogloss:0.36593+0.00161783	test-mlogloss:0.526411+0.00510749

   33 | 33m16s |   -0.52641 |             0.2847 |    2.7754 |      9.9386 |            99.8665 |      0.9590 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[1049]	train-mlogloss:0.347335+0.000984919	test-mlogloss:0.523264+0.00510022

   34 | 25m06s |   -0.52326 |             0.2001 |    2.8727 |      6.8038 |             2.3888 |      0.9765 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Sto

  " state: %s" % convergence_dict)


   38 | 14m26s |   -0.52587 |             0.2309 |    0.0019 |      9.9000 |            78.4521 |      0.9415 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[293]	train-mlogloss:0.330222+0.00111667	test-mlogloss:0.525889+0.00588314



  " state: %s" % convergence_dict)


   39 | 12m02s |   -0.52589 |             0.2041 |    0.3891 |      9.8814 |            47.7581 |      0.8912 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[1337]	train-mlogloss:0.39829+0.000434908	test-mlogloss:0.524267+0.0053944



  " state: %s" % convergence_dict)


   40 | 23m44s |   -0.52427 |             0.2051 |    2.9612 |      4.0368 |             4.6387 |      0.8971 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[1181]	train-mlogloss:0.423+0.00120629	test-mlogloss:0.526558+0.00549919



  " state: %s" % convergence_dict)


   41 | 22m37s |   -0.52656 |             0.2235 |    2.9391 |      4.0231 |            46.5912 |      0.7962 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[623]	train-mlogloss:0.388245+0.00132764	test-mlogloss:0.523853+0.00510554



  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   42 | 19m14s |   -0.52385 |             0.2732 |    2.9787 |      6.1874 |            25.2938 |      0.9831 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[919]	train-mlogloss:0.406833+0.00171946	test-mlogloss:0.526366+0.00536774



  " state: %s" % convergence_dict)


   43 | 17m00s |   -0.52637 |             0.2148 |    0.0822 |      4.3978 |            67.2873 |      0.9938 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[854]	train-mlogloss:0.374183+0.00122259	test-mlogloss:0.524666+0.00570887

   44 | 15m26s |   -0.52467 |             0.2012 |    0.0196 |      4.8042 |             7.7232 |      0.9656 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[835]	train-mlogloss:0.395393+0.00129872	test-mlogloss:0.524719+0.00562002

   45 | 19m34s |   -0.52472 |             0.2124 |    2.9960 |      5.7975 |            19.2861 |      0.7872 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stop

  " state: %s" % convergence_dict)


   46 | 16m05s |   -0.52391 |             0.2089 |    2.8316 |      7.7883 |             8.0460 |      0.8411 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[1274]	train-mlogloss:0.389239+0.00207845	test-mlogloss:0.524654+0.00553646

   47 | 29m47s |   -0.52465 |             0.2025 |    2.9862 |      6.0283 |            57.2058 |      0.9776 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[760]	train-mlogloss:0.403392+0.00220186	test-mlogloss:0.5259+0.00552248

   48 | 18m24s |   -0.52590 |             0.3094 |    0.2867 |      4.1067 |            40.2533 |      0.9836 | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopp

need improve

In [33]:
xgb_bo_scores = pd.DataFrame([[s[0]['max_depth'],
                               s[0]['min_child_weight'],
                               s[0]['colsample_bytree'],
                               s[0]['subsample'],
                               s[0]['gamma'],
                               s[1]] for s in zip(xgb_BO.res['all']['params'],xgb_BO.res['all']['values'])],
                            columns = ['max_depth',
                                       'min_child_weight',
                                       'colsample_bytree',
                                       'subsample',
                                       'gamma',
                                       'score'])
xgb_bo_scores=xgb_bo_scores.sort_values('score',ascending=False)
xgb_bo_scores.head(10)

Unnamed: 0,max_depth,min_child_weight,colsample_bytree,subsample,gamma,score
19,7.363771,5.205648,0.293906,0.947733,2.983057,-0.522373
23,6.803827,2.388761,0.200079,0.976483,2.872736,-0.523264
1,9.663005,1.476686,0.25575,0.849778,2.70171,-0.523402
31,6.187359,25.293805,0.273249,0.98308,2.978747,-0.523853
35,7.788349,8.04604,0.208883,0.841063,2.831562,-0.523912
38,7.295389,4.728571,0.219052,0.741765,2.649557,-0.523989
2,9.940602,29.480668,0.2068,0.8315,2.909245,-0.524028
9,8.888892,18.117018,0.241398,0.989176,2.862189,-0.524079
17,9.472113,33.943859,0.262672,0.980347,0.186261,-0.524086
29,4.036754,4.638651,0.205099,0.897103,2.961195,-0.524267


In [34]:
test_X.isnull().values.any()

True

In [35]:
tmp = test_X.copy()

In [36]:
null_ind = test_X.num_loc_price_diff.isnull()
test_X['num_loc_price_diff'] = test_X['num_price'] - test_X['num_loc_median_price']
test_X[null_ind][['num_loc_price_diff','num_price','num_loc_median_price']]

Unnamed: 0,num_loc_price_diff,num_price,num_loc_median_price
710,-49.5,2600,2649.5
779,-500.0,2750,3250.0
988,5700.0,8000,2300.0
1542,2800.0,10000,7200.0
2099,-1555.0,4195,5750.0
3447,-200.0,4200,4400.0
3697,-3464.0,2300,5764.0
4662,4200.0,6500,2300.0
4669,3275.0,6200,2925.0
4689,-1169.0,4595,5764.0


In [37]:
test_X.isnull().values.any()

False

In [42]:
def xgb_blend(estimators, train_x, train_y, test_x, fold, early_stopping_rounds=0):
    N_params = len(estimators)
    print ("Blend %d estimators for %d folds" % (N_params, fold))
    skf = KFold(n_splits=fold,random_state=5555)
    N_class = len(set(train_y))
        
    train_blend_x = np.zeros((train_x.shape[0], N_class*N_params))
    test_blend_x_mean = np.zeros((test_x.shape[0], N_class*N_params))
    test_blend_x_gmean = np.zeros((test_x.shape[0], N_class*N_params))
    scores = np.zeros ((fold,N_params))
    best_rounds = np.zeros ((fold, N_params))
    
    for j, est in enumerate(estimators):
        est.set_params(objective = 'multi:softprob')
        est.set_params(silent = False)
        est.set_params(learning_rate = 0.02)
        est.set_params(n_estimators=100000)
        
        print ("Model %d: %s" %(j+1, est))

        test_blend_x_j = np.zeros((test_x.shape[0], N_class*fold))
    
        for i, (train_index, val_index) in enumerate(skf.split(train_x)):
            print ("Model %d fold %d" %(j+1,i+1))
            fold_start = time.time() 
            train_x_fold = train_x.iloc[train_index]
            train_y_fold = train_y[train_index]
            val_x_fold = train_x.iloc[val_index]
            val_y_fold = train_y[val_index]      

            est.fit(train_x_fold,train_y_fold,
                    eval_set = [(val_x_fold, val_y_fold)],
                    eval_metric = 'mlogloss',
                    early_stopping_rounds=early_stopping_rounds,
                    verbose=False)
            best_round=est.best_iteration
            best_rounds[i,j]=best_round
            print ("best round %d" % (best_round))
            val_y_predict_fold = est.predict_proba(val_x_fold,ntree_limit=best_round)
            score = log_loss(val_y_fold, val_y_predict_fold)
            print ("Score: ", score)
            scores[i,j]=score
            train_blend_x[val_index, (j*N_class):(j+1)*N_class] = val_y_predict_fold
            
            test_blend_x_j[:,(i*N_class):(i+1)*N_class] = est.predict_proba(test_x,ntree_limit=best_round)
            print ("Model %d fold %d fitting finished in %0.3fs" % (j+1,i+1, time.time() - fold_start))
            
        test_blend_x_mean[:,(j*N_class):(j+1)*N_class] = \
                np.stack([test_blend_x_j[:,range(0,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(1,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(2,N_class*fold,N_class)].mean(1)]).T
        
        test_blend_x_gmean[:,(j*N_class):(j+1)*N_class] = \
                np.stack([gmean(test_blend_x_j[:,range(0,N_class*fold,N_class)], axis=1),
                          gmean(test_blend_x_j[:,range(1,N_class*fold,N_class)], axis=1),
                          gmean(test_blend_x_j[:,range(2,N_class*fold,N_class)], axis=1)]).T
            
        print ("Score for model %d is %f" % (j+1,np.mean(scores[:,j])))
    print ("Score for blended models is %f" % (np.mean(scores)))
    return (train_blend_x, test_blend_x_mean, test_blend_x_gmean, scores,best_rounds)


In [46]:
estimators = [
#             xgb.XGBClassifier(max_depth = 7,
#                               min_child_weight = 5,
#                               colsample_bytree = 0.293906 ,
#                               subsample = 0.947733 ,
#                               gamma = 2.983057),
#              xgb.XGBClassifier(max_depth = 6,
#                               min_child_weight = 2,
#                               colsample_bytree = 0.200079,
#                               subsample = 0.976483,
#                               gamma = 2.872736),
#              xgb.XGBClassifier(max_depth = 6,
#                               min_child_weight = 25,
#                               colsample_bytree = 0.273249,
#                               subsample = 0.983080,
#                               gamma = 2.978747),         
#              xgb.XGBClassifier(max_depth = 7,
#                               min_child_weight = 4,
#                               colsample_bytree = 0.219052,
#                               subsample = 0.741765,
#                               gamma = 2.649557),  
             xgb.XGBClassifier(max_depth = 9,
                              min_child_weight = 33,
                              colsample_bytree = 0.262672,
                              subsample = 0.980347,
                              gamma = 0.186261)              
             ]

#  	 	max_depth 	min_child_weight 	colsample_bytree 	subsample 	gamma 	score
# 19 	7.363771 	5.205648 	 	 	0.293906 	 	 	0.947733 	2.983057 	-0.522373
# 23 	6.803827 	2.388761 	 	 	0.200079 	 	 	0.976483 	2.872736 	-0.523264
####### 1 	9.663005 	1.476686 	 	 	0.255750 	 	 	0.849778 	2.701710 	-0.523402
# 31 	6.187359 	25.293805 	 	 	0.273249 	 	 	0.983080 	2.978747 	-0.523853
####### 35 	7.788349 	8.046040 	 	 	0.208883 	 	 	0.841063 	2.831562 	-0.523912
# 38 	7.295389 	4.728571 	 	 	0.219052 	 	 	0.741765 	2.649557 	-0.523989
####### 2 	9.940602 	29.480668 	 	 	0.206800 	 	 	0.831500 	2.909245 	-0.524028
####### 9 	8.888892 	18.117018 	 	 	0.241398 	 	 	0.989176 	2.862189 	-0.524079
# 17 	9.472113 	33.943859 	 	 	0.262672 	 	 	0.980347 	0.186261 	-0.524086

(train_blend_x_xgb,
 test_blend_x_xgb_mean,
 test_blend_x_xgb_gmean,
 blend_scores_xgb,
 best_rounds_xgb) = xgb_blend(estimators,
                              train_X,train_y,
                              test_X,
                              30,
                              500)


Blend 1 estimators for 30 folds
Model 1: XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.262672,
       gamma=0.186261, learning_rate=0.02, max_delta_step=0, max_depth=9,
       min_child_weight=33, missing=None, n_estimators=100000, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=False, subsample=0.980347)
Model 1 fold 1
best round 1710
('Score: ', 0.52962694755729511)
Model 1 fold 1 fitting finished in 1241.869s
Model 1 fold 2
best round 1391
('Score: ', 0.50391127368957611)
Model 1 fold 2 fitting finished in 1057.409s
Model 1 fold 3
best round 2524
('Score: ', 0.47359197539021541)
Model 1 fold 3 fitting finished in 1677.893s
Model 1 fold 4
best round 2337
('Score: ', 0.492760497293112)
Model 1 fold 4 fitting finished in 1574.285s
Model 1 fold 5
best round 2885
('Score: ', 0.48593347557063327)
Model 1 fold 5 fitting finished in 1895.388s
Model 1 fold 6
best round 2480
('Score: ', 0.4772433693131

In [47]:
now = datetime.now()

name_train_blend = '../output/train_blend_xgb_BM_0331_30blend_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_mean = '../output/test_blend_xgb_mean_BM_0331_30blend_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_gmean = '../output/test_blend_xgb_gmean_BM_0331_30blend_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'


print (np.mean(blend_scores_xgb,axis=0))
print (np.mean(best_rounds_xgb,axis=0))
np.savetxt(name_train_blend,train_blend_x_xgb, delimiter=",")
np.savetxt(name_test_blend_mean,test_blend_x_xgb_mean, delimiter=",")
np.savetxt(name_test_blend_gmean,test_blend_x_xgb_gmean, delimiter=",")

[ 0.51441484]
[ 1930.33333333]


In [None]:
# data 0322
# [ 0.52385999  0.52420308  0.52429754  0.52366222  0.52450185]
# [ 2866.7  3979.7  3102.9  2783.1  4450.5]

# data 0331 seed = 2017
# [ 0.5161796   0.51727863  0.51867825  0.517129    0.51732854]
# [ 4857.5  6379.5  5516.4  3337.9  1674.5]

In [48]:
# now = datetime.now()
sub_name = '../output/sub_XGB_mean_BM_0331_30blend_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

out_df = pd.DataFrame(test_blend_x_xgb_mean[:,:3])
out_df.columns = ["low", "medium", "high"]
out_df["listing_id"] = test_X.listing_id.values
out_df.to_csv(sub_name, index=False)


# ypreds.columns = cols

# df = pd.read_json(open("../input/test.json", "r"))
# ypreds['listing_id'] = df["listing_id"]

# ypreds.to_csv('my_preds.csv', index=None)

In [48]:
test_blend_x_xgb_gmean[:,9:12]


array([[  3.20544260e-01,   6.10888687e-01,   6.56750820e-02],
       [  9.66028463e-01,   2.30937453e-02,   1.06505838e-02],
       [  9.53579737e-01,   4.13955017e-02,   3.97691225e-03],
       ..., 
       [  9.78252564e-01,   2.03241753e-02,   1.01263996e-03],
       [  9.71474749e-01,   2.74180665e-02,   5.09567844e-04],
       [  5.87161787e-01,   3.92164954e-01,   1.94305868e-02]])

In [49]:
test_blend_x_xgb_gmean[:,:3]

array([[  3.20994298e-01,   6.04517100e-01,   7.12524217e-02],
       [  9.57150480e-01,   3.01748109e-02,   1.18646473e-02],
       [  9.58178383e-01,   3.74607705e-02,   3.46537444e-03],
       ..., 
       [  9.80787885e-01,   1.80516908e-02,   8.23179767e-04],
       [  9.70638017e-01,   2.80394743e-02,   5.14503672e-04],
       [  5.81934901e-01,   3.95535699e-01,   2.09051621e-02]])