In [3]:
import time
import numpy as np
import pandas as pd

from scipy import sparse
from scipy.stats.mstats import gmean
from datetime import datetime
from sklearn import preprocessing
from scipy.stats import skew, boxcox,boxcox_normmax
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from bayes_opt import BayesianOptimization
from sklearn.metrics import log_loss

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import xgboost as xgb

seed = 1234



# Load Data

In [4]:
data_path = "../input/"
train_X = pd.read_csv(data_path + 'train_BM_MB_add03052240.csv')
test_X = pd.read_csv(data_path + 'test_BM_MB_add03052240.csv')
train_y = np.ravel(pd.read_csv(data_path + 'labels_BrandenMurray.csv'))
sub_id = test_X.listing_id.astype('int32').values
# all_features = features_to_use + desc_sparse_cols + feat_sparse_cols
print train_X.shape, test_X.shape, train_y.shape

(49352, 322) (74659, 322) (49352,)


In [5]:
y_low =[]
for i in range(train_X.shape[0]):
    y_low.append(1 if train_y[i] == 0 else 0)
    
y_low = np.array(y_low)  
print np.sum(y_low)

34284


In [6]:
X_train, X_val, y_train, y_val = train_test_split(train_X, y_low, train_size=.80, random_state=1234)
print X_train.shape
print X_val.shape
# xgtrain = xgb.DMatrix(X_train, label=y_train)

(39481, 322)
(9871, 322)


# Tune XGBoost

In [7]:
learning_rate = 0.1
best_score = 1000
train_param = 0
for x in [3,4,5,6,7]:
    rgr = xgb.XGBClassifier(
        objective='binary:logistic',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= x,
        nthread = -1,
        silent = False
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='logloss',
        early_stopping_rounds=50,
        verbose=False
    )
    
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x

    print x, '\t', rgr.best_score, rgr.best_iteration

3 	0.389768 775
4 	0.387766 647
5 	0.386815 470
6 	0.388625 244
7 	0.388481 284


In [8]:
for x in [8,9]:
    rgr = xgb.XGBClassifier(
        objective='binary:logistic',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= x,
        nthread = -1,
        silent = False
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='logloss',
        early_stopping_rounds=50,
        verbose=False
    )
    
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x

    print x, '\t', rgr.best_score, rgr.best_iteration

8 	0.39298 164
9 	0.394179 167


In [9]:
max_depth = train_param
print max_depth

5


In [10]:
train_param = 1
for x in [2,4,8,12,16,20]:
    rgr = xgb.XGBClassifier(
        objective='binary:logistic',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='logloss',
        early_stopping_rounds=50,
        verbose=False
    )
    
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

2 	0.388387 458
4 	0.387306 390
8 	0.386451 364
12 	0.387574 431
16 	0.3882 401
20 	0.388992 430


In [11]:
for x in [24,28,32]:
    rgr = xgb.XGBClassifier(
        objective='binary:logistic',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='logloss',
        early_stopping_rounds=50,
        verbose=False
    )
    
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

24 	0.387114 494
28 	0.387909 409
32 	0.389136 560


In [12]:
min_child_weight = train_param
print min_child_weight

8


In [13]:
train_param = 1
for x in [0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]:
    rgr = xgb.XGBClassifier(
        objective='binary:logistic',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = min_child_weight,
        colsample_bytree = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='logloss',
        early_stopping_rounds=50,
        verbose=False
    )

    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

0.05 	0.393649 811
0.1 	0.386903 795
0.2 	0.38595 460
0.3 	0.383998 407
0.4 	0.385639 486
0.5 	0.385972 483
0.6 	0.387992 388
0.7 	0.385474 553
0.8 	0.384517 494
0.9 	0.385831 405


In [14]:
colsample_bytree = train_param
print colsample_bytree

0.3


In [15]:
train_param = 1
for x in [0.5,0.6,0.7,0.8,0.9]:
    rgr = xgb.XGBClassifier(
        objective='binary:logistic',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = min_child_weight,
        colsample_bytree = colsample_bytree,
        subsample = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='logloss',
        early_stopping_rounds=50,
        verbose=False
    )
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

0.5 	0.391043 386
0.6 	0.388263 325
0.7 	0.385446 449
0.8 	0.38811 448
0.9 	0.385152 452


In [16]:
subsample = train_param
print subsample

1


In [17]:
train_param = 0
for x in [0.3, 0.6, 0.9, 1.2, 1.5, 1.8, 2.1, 2.4, 2.7, 3.0]:
    rgr = xgb.XGBClassifier(
        objective='binary:logistic',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = min_child_weight,
        colsample_bytree = colsample_bytree,
        subsample = subsample,
        gamma = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='logloss',
        early_stopping_rounds=50,
        verbose=False
    )

    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

0.3 	0.38524 566
0.6 	0.385303 444
0.9 	0.386186 409
1.2 	0.385752 465
1.5 	0.385982 407
1.8 	0.384223 498
2.1 	0.384146 462
2.4 	0.38645 536
2.7 	0.386064 420
3.0 	0.387325 346


In [18]:
gamma = train_param
print gamma

0


In [None]:
# 0.3 	0.528756 371
# 0.6 	0.530068 353
# 0.9 	0.530043 275
# 1.2 	0.530065 388
# 1.5 	0.529657 331
# 1.8 	0.529906 328
# 2.1 	0.528338 393
# 2.4 	0.529364 372
# 2.7 	0.527919 456
# 3.0 	0.528962 417

In [28]:
xgtrain = xgb.DMatrix(train_X, label=y_low) 

def xgb_evaluate(min_child_weight, colsample_bytree, max_depth, subsample, gamma):
    params = dict()
    params['objective']='binary:logistic'
    params['eval_metric']='logloss',
    params['silent']=1
    params['eta'] = 0.1
    params['verbose_eval'] = True
    params['min_child_weight'] = int(min_child_weight)
    params['colsample_bytree'] = max(min(colsample_bytree, 1), 0)
    params['max_depth'] = int(max_depth)
    params['subsample'] = max(min(subsample, 1), 0)
    params['gamma'] = max(gamma, 0)
    
    cv_result = xgb.cv(
        params, xgtrain, 
        num_boost_round=100000, nfold=5,
        metrics = 'logloss',
        seed=seed,callbacks=[xgb.callback.early_stop(50)]
    )
    
    return -cv_result['test-logloss-mean'].values[-1]

xgb_BO = BayesianOptimization(
    xgb_evaluate, 
    {
        'max_depth': (4,8),
        'min_child_weight': (4,32),
        'colsample_bytree': (0.2,0.9),
        'subsample': (0.6,1),
        'gamma': (0,2.4)
    }
)

xgb_BO.maximize(init_points=5, n_iter=30)

[31mInitialization[0m
[94m---------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |     gamma |   max_depth |   min_child_weight |   subsample | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[728]	train-logloss:0.318965+0.000298937	test-logloss:0.38847+0.0024191

    1 | 05m06s | [35m  -0.38847[0m | [32m            0.3385[0m | [32m   1.4743[0m | [32m     4.3969[0m | [32m           30.8001[0m | [32m     0.8516[0m | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[376]	train-logloss:0.29796+0.00110782	test-logloss:0.387419+0.00227476

    2 | 05m50s | [35m  -0.38742[0m | [32m            0.6816[0m | [32m   1.

  " state: %s" % convergence_dict)


[31mBayesian Optimization[0m
[94m---------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |     gamma |   max_depth |   min_child_weight |   subsample | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[324]	train-logloss:0.248693+0.00337335	test-logloss:0.388228+0.00273798

    6 | 04m07s |   -0.38823 |             0.2570 |    0.6062 |      7.4735 |             5.5443 |      0.9632 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[262]	train-logloss:0.299482+0.00129517	test-logloss:0.391794+0.00165781

    7 | 07m49s |   -0.39179 |             0.8926 |    0.3489 |      7.8022 |            31.8117 |      0.6087 | 
Multiple ev

  " state: %s" % convergence_dict)


    8 | 09m43s | [35m  -0.38704[0m | [32m            0.8620[0m | [32m   0.0115[0m | [32m     4.2145[0m | [32m           22.5101[0m | [32m     0.9581[0m | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[803]	train-logloss:0.300608+0.00140029	test-logloss:0.387062+0.00253108



  " state: %s" % convergence_dict)


    9 | 04m55s |   -0.38706 |             0.2262 |    0.3218 |      4.0504 |             4.1954 |      0.7397 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[497]	train-logloss:0.328352+0.000756206	test-logloss:0.390853+0.00235392

   10 | 08m11s |   -0.39085 |             0.8824 |    2.3783 |      4.0122 |            22.3863 |      0.6186 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[639]	train-logloss:0.306761+0.000471879	test-logloss:0.386068+0.0025347

   11 | 08m41s | [35m  -0.38607[0m | [32m            0.7687[0m | [32m   0.0030[0m | [32m     4.3620[0m | [32m            8.3544[0m | [32m     0.9866[0m | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until te

  " state: %s" % convergence_dict)


   16 | 07m31s |   -0.38889 |             0.8174 |    0.0011 |      4.1763 |            12.0946 |      0.6670 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[820]	train-logloss:0.300991+0.00034811	test-logloss:0.386418+0.00201665

   17 | 07m25s |   -0.38642 |             0.4661 |    0.0772 |      4.0678 |            17.6464 |      0.9901 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[782]	train-logloss:0.303133+0.000998155	test-logloss:0.386062+0.00210512

   18 | 05m13s | [35m  -0.38606[0m | [32m            0.2904[0m | [32m   0.0751[0m | [32m     4.6980[0m | [32m            6.4395[0m | [32m     0.9887[0m | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until te

  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   23 | 07m38s |   -0.38776 |             0.8433 |    2.3496 |      7.9671 |            16.0185 |      0.9537 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[600]	train-logloss:0.303791+0.00104357	test-logloss:0.388908+0.00201635

   24 | 09m54s |   -0.38891 |             0.8989 |    0.2711 |      4.0468 |             6.4124 |      0.6144 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[423]	train-logloss:0.283377+0.00199258	test-logloss:0.387398+0.00301243

   25 | 03m47s |   -0.38740 |             0.2267 |    0.0808 |      6.2003 |            15.4702 |      0.9930 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best

  " state: %s" % convergence_dict)


   26 | 04m54s |   -0.38747 |             0.2614 |    2.2310 |      4.0952 |            17.5443 |      0.9932 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[796]	train-logloss:0.305274+0.000468474	test-logloss:0.386951+0.00225619

   27 | 11m50s |   -0.38695 |             0.8990 |    2.2081 |      4.9470 |            28.3733 |      0.9805 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[744]	train-logloss:0.312185+0.000751908	test-logloss:0.386993+0.00176623



  " state: %s" % convergence_dict)


   28 | 04m31s |   -0.38699 |             0.2275 |    0.0220 |      4.6321 |             9.6112 |      0.9972 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[782]	train-logloss:0.29954+0.000548602	test-logloss:0.386928+0.00187042



  " state: %s" % convergence_dict)


   29 | 06m03s |   -0.38693 |             0.3463 |    2.2058 |      4.0017 |             4.2505 |      0.9322 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[788]	train-logloss:0.308055+0.000789245	test-logloss:0.387922+0.00258852

   30 | 10m51s |   -0.38792 |             0.7876 |    2.3949 |      4.5742 |            31.9361 |      0.9433 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[939]	train-logloss:0.308478+0.0035556	test-logloss:0.386348+0.00196064



  " state: %s" % convergence_dict)


   31 | 06m08s |   -0.38635 |             0.2850 |    2.3411 |      4.0639 |             7.9416 |      0.9999 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[843]	train-logloss:0.300769+0.00395044	test-logloss:0.386569+0.00222769

   32 | 09m58s |   -0.38657 |             0.6673 |    2.3495 |      4.1148 |            12.9803 |      0.9987 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[597]	train-logloss:0.321669+0.000558272	test-logloss:0.387577+0.00287125



  " state: %s" % convergence_dict)


   33 | 09m09s |   -0.38758 |             0.8573 |    0.0118 |      4.3225 |            30.9361 |      0.9911 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[490]	train-logloss:0.292768+0.000522952	test-logloss:0.38615+0.00245278

   34 | 09m33s |   -0.38615 |             0.8838 |    0.1358 |      5.0210 |            19.8662 |      0.9847 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[396]	train-logloss:0.274398+0.000938045	test-logloss:0.386871+0.00202166



  " state: %s" % convergence_dict)


   35 | 09m24s |   -0.38687 |             0.8933 |    0.7168 |      6.0019 |            17.1897 |      0.9977 | 


In [29]:
xgb_bo_scores = pd.DataFrame([[s[0]['max_depth'],
                               s[0]['min_child_weight'],
                               s[0]['colsample_bytree'],
                               s[0]['subsample'],
                               s[0]['gamma'],
                               s[1]] for s in zip(xgb_BO.res['all']['params'],xgb_BO.res['all']['values'])],
                            columns = ['max_depth',
                                       'min_child_weight',
                                       'colsample_bytree',
                                       'subsample',
                                       'gamma',
                                       'score'])
xgb_bo_scores=xgb_bo_scores.sort_values('score',ascending=False)
xgb_bo_scores.head(10)

Unnamed: 0,max_depth,min_child_weight,colsample_bytree,subsample,gamma,score
12,4.697965,6.439472,0.290381,0.988734,0.075128,-0.386062
5,4.361961,8.354384,0.768667,0.986623,0.003024,-0.386068
28,5.020978,19.866215,0.883846,0.984659,0.135751,-0.38615
25,4.063896,7.941608,0.285013,0.999906,2.341089,-0.386348
11,4.067804,17.646352,0.466116,0.990075,0.077247,-0.386418
26,4.114844,12.98027,0.667338,0.998706,2.349501,-0.386569
29,6.001897,17.189702,0.89332,0.997712,0.716788,-0.386871
23,4.001672,4.250499,0.346327,0.932208,2.20577,-0.386928
21,4.946982,28.373279,0.899002,0.980545,2.208064,-0.386951
22,4.632058,9.611172,0.22746,0.997227,0.02204,-0.386993


In [40]:
def xgb_blend(estimators, train_x, train_y, test_x, fold, early_stopping_rounds=0):
    N_params = len(estimators)
    print ("Blend %d estimators for %d folds" % (N_params, fold))
    skf = KFold(n_splits=fold,random_state=seed)
    N_class = len(set(train_y))
        
    train_blend_x = np.zeros((train_x.shape[0], N_class*N_params))
    test_blend_x_mean = np.zeros((test_x.shape[0], N_class*N_params))
    test_blend_x_gmean = np.zeros((test_x.shape[0], N_class*N_params))
    scores = np.zeros ((fold,N_params))
    best_rounds = np.zeros ((fold, N_params))
    
    for j, est in enumerate(estimators):
        est.set_params(objective = 'binary:logistic')
        est.set_params(silent = False)
        est.set_params(learning_rate = 0.01)
        est.set_params(n_estimators=1000000)
        
        print ("Model %d: %s" %(j+1, est))

        test_blend_x_j = np.zeros((test_x.shape[0], N_class*fold))
    
        for i, (train_index, val_index) in enumerate(skf.split(train_x)):
            print ("Model %d fold %d" %(j+1,i+1))
            fold_start = time.time() 
            train_x_fold = train_x.iloc[train_index]
            train_y_fold = train_y[train_index]
            val_x_fold = train_x.iloc[val_index]
            val_y_fold = train_y[val_index]      

            est.fit(train_x_fold,train_y_fold,
                    eval_set = [(val_x_fold, val_y_fold)],
                    eval_metric = 'logloss',
                    early_stopping_rounds=early_stopping_rounds,
                    verbose=False)
            best_round=est.best_iteration
            best_rounds[i,j]=best_round
            print ("best round %d" % (best_round))
            val_y_predict_fold = est.predict_proba(val_x_fold,ntree_limit=best_round)
            score = log_loss(val_y_fold, val_y_predict_fold)
            print ("Score: ", score)
            scores[i,j]=score
            train_blend_x[val_index, (j*N_class):(j+1)*N_class] = val_y_predict_fold
            
            test_blend_x_j[:,(i*N_class):(i+1)*N_class] = est.predict_proba(test_x,ntree_limit=best_round)
            print ("Model %d fold %d fitting finished in %0.3fs" % (j+1,i+1, time.time() - fold_start))
            
        test_blend_x_mean[:,(j*N_class):(j+1)*N_class] = \
                np.stack([test_blend_x_j[:,range(0,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(1,N_class*fold,N_class)].mean(1)]).T
        
        test_blend_x_gmean[:,(j*N_class):(j+1)*N_class] = \
                np.stack([gmean(test_blend_x_j[:,range(0,N_class*fold,N_class)], axis=1),
                          gmean(test_blend_x_j[:,range(1,N_class*fold,N_class)], axis=1)]).T
            
        print ("Score for model %d is %f" % (j+1,np.mean(scores[:,j])))
    print ("Score for blended models is %f" % (np.mean(scores)))
    return (train_blend_x, test_blend_x_mean, test_blend_x_gmean, scores,best_rounds)


In [41]:
estimators = [
            xgb.XGBClassifier(max_depth = 4,
                              min_child_weight = 6,
                              colsample_bytree = 0.290381 ,
                              subsample = 0.988734 ,
                              gamma = 0.075128)          
             ]
#  	 	max_depth 	min_child_weight 	colsample_bytree 	subsample 	gamma 	 	score
# 12 	4.697965 	6.439472 	 	 	0.290381 	 	 	0.988734 	0.075128 	-0.386062
# 5 	4.361961 	8.354384 	 	 	0.768667 	 	 	0.986623 	0.003024 	-0.386068
# 28 	5.020978 	19.866215 	 	 	0.883846 	 	 	0.984659 	0.135751 	-0.386150


(train_blend_x_xgb_low,
 test_blend_x_xgb_mean_low,
 test_blend_x_xgb_gmean_low,
 blend_scores_xgb_low,
 best_rounds_xgb_low) = xgb_blend(estimators,
                              train_X,y_low,
                              test_X,
                              10,
                              300)


Blend 1 estimators for 10 folds
Model 1: XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.290381,
       gamma=0.075128, learning_rate=0.01, max_delta_step=0, max_depth=4,
       min_child_weight=6, missing=None, n_estimators=1000000, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=False, subsample=0.988734)
Model 1 fold 1
best round 8144
('Score: ', 0.3789331444729267)
Model 1 fold 1 fitting finished in 639.601s
Model 1 fold 2
best round 7940
('Score: ', 0.36865721507830884)
Model 1 fold 2 fitting finished in 623.079s
Model 1 fold 3
best round 7523
('Score: ', 0.38109002739399661)
Model 1 fold 3 fitting finished in 593.384s
Model 1 fold 4
best round 10223
('Score: ', 0.36724477967345648)
Model 1 fold 4 fitting finished in 798.468s
Model 1 fold 5
best round 7520
('Score: ', 0.38253252385543984)
Model 1 fold 5 fitting finished in 588.520s
Model 1 fold 6
best round 8744
('Score: ', 0.382967509038668

In [37]:
test_blend_x_xgb_gmean_low

array([[ 0.46335066,  0.53660144],
       [ 0.35060727,  0.64935041],
       [ 0.37533208,  0.62466067],
       ..., 
       [ 0.37050161,  0.62949031],
       [ 0.33564321,  0.66435498],
       [ 0.44256736,  0.55742242]])

In [44]:
now = datetime.now()

name_train_blend = '../output/train_blend_xgb_low_BM_0322_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_mean = '../output/test_blend_xgb_low_mean_BM_0322_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_gmean = '../output/test_blend_xgb_low_gmean_BM_0322_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'


print (np.mean(blend_scores_xgb_low,axis=0))
print (np.mean(best_rounds_xgb_low,axis=0))
np.savetxt(name_train_blend,train_blend_x_xgb_low, delimiter=",")
np.savetxt(name_test_blend_mean,test_blend_x_xgb_mean_low, delimiter=",")
np.savetxt(name_test_blend_gmean,test_blend_x_xgb_gmean_low, delimiter=",")

[ 0.3835474]
[ 8421.4]


In [57]:
y_medium =[]
for i in range(train_X.shape[0]):
    y_medium.append(1 if train_y[i] == 1 else 0)
    
y_medium = np.array(y_medium)  
print np.sum(y_medium)

11229


In [58]:
X_train, X_val, y_train, y_val = train_test_split(train_X, y_medium, train_size=.80, random_state=1234)
print X_train.shape
print X_val.shape
# xgtrain = xgb.DMatrix(X_train, label=y_train)

(39481, 322)
(9871, 322)


In [59]:
learning_rate = 0.1
best_score = 1000
train_param = 0
for x in [3,4,5,6,7]:
    rgr = xgb.XGBClassifier(
        objective='binary:logistic',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= x,
        nthread = -1,
        silent = False
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='logloss',
        early_stopping_rounds=50,
        verbose=False
    )
    
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x

    print x, '\t', rgr.best_score, rgr.best_iteration

3 	0.427114 556
4 	0.426936 326
5 	0.425979 341
6 	0.425824 214
7 	0.428035 175


In [61]:
for x in [8,9,10]:
    rgr = xgb.XGBClassifier(
        objective='binary:logistic',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= x,
        nthread = -1,
        silent = False
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='logloss',
        early_stopping_rounds=50,
        verbose=False
    )
    
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x

    print x, '\t', rgr.best_score, rgr.best_iteration

8 	0.428427 138
9 	0.429057 75
10 	0.431596 79


In [62]:
max_depth = train_param
print max_depth

6


In [63]:
train_param = 1
for x in [2,4,8,12,16,20]:
    rgr = xgb.XGBClassifier(
        objective='binary:logistic',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='logloss',
        early_stopping_rounds=50,
        verbose=False
    )
    
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

2 	0.427491 207
4 	0.426616 192
8 	0.426349 213
12 	0.425813 235
16 	0.426269 225
20 	0.425559 261


In [64]:
for x in [24,28,32,36,40]:
    rgr = xgb.XGBClassifier(
        objective='binary:logistic',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='logloss',
        early_stopping_rounds=50,
        verbose=False
    )
    
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

24 	0.425494 285
28 	0.425878 249
32 	0.425939 250
36 	0.426459 222
40 	0.425027 228


In [66]:
for x in [50,60,70,80]:
    rgr = xgb.XGBClassifier(
        objective='binary:logistic',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='logloss',
        early_stopping_rounds=50,
        verbose=False
    )
    
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

50 	0.425912 321
60 	0.424979 284
70 	0.426524 395
80 	0.427186 208


In [74]:
min_child_weight = train_param
print min_child_weight

60


In [75]:
train_param = 1
for x in [0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]:
    rgr = xgb.XGBClassifier(
        objective='binary:logistic',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = min_child_weight,
        colsample_bytree = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='logloss',
        early_stopping_rounds=50,
        verbose=False
    )

    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

0.05 	0.431676 711
0.1 	0.427319 487
0.2 	0.425365 444
0.3 	0.424966 430
0.4 	0.42466 241
0.5 	0.424707 380
0.6 	0.425919 229
0.7 	0.426333 258
0.8 	0.425071 339
0.9 	0.425589 184


In [76]:
colsample_bytree = train_param
print colsample_bytree

0.4


In [77]:
train_param = 1
for x in [0.5,0.6,0.7,0.8,0.9]:
    rgr = xgb.XGBClassifier(
        objective='binary:logistic',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = min_child_weight,
        colsample_bytree = colsample_bytree,
        subsample = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='logloss',
        early_stopping_rounds=50,
        verbose=False
    )
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

0.5 	0.431324 235
0.6 	0.429724 241
0.7 	0.428098 292
0.8 	0.426616 378
0.9 	0.425089 271


In [78]:
subsample = train_param
print subsample

1


In [79]:
train_param = 0
for x in [0.3, 0.6, 0.9, 1.2, 1.5, 1.8, 2.1, 2.4, 2.7, 3.0]:
    rgr = xgb.XGBClassifier(
        objective='binary:logistic',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = min_child_weight,
        colsample_bytree = colsample_bytree,
        subsample = subsample,
        gamma = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='logloss',
        early_stopping_rounds=50,
        verbose=False
    )

    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

0.3 	0.424238 325
0.6 	0.424873 362
0.9 	0.424794 293
1.2 	0.425611 362
1.5 	0.424802 375
1.8 	0.424455 352
2.1 	0.426117 230
2.4 	0.425576 374
2.7 	0.425873 312
3.0 	0.425292 303


In [80]:
gamma = train_param
print gamma

0.3


In [81]:
xgtrain = xgb.DMatrix(train_X, label=y_medium) 

def xgb_evaluate(min_child_weight, colsample_bytree, max_depth, subsample, gamma):
    params = dict()
    params['objective']='binary:logistic'
    params['eval_metric']='logloss',
    params['silent']=1
    params['eta'] = 0.1
    params['verbose_eval'] = True
    params['min_child_weight'] = int(min_child_weight)
    params['colsample_bytree'] = max(min(colsample_bytree, 1), 0)
    params['max_depth'] = int(max_depth)
    params['subsample'] = max(min(subsample, 1), 0)
    params['gamma'] = max(gamma, 0)
    
    cv_result = xgb.cv(
        params, xgtrain, 
        num_boost_round=100000, nfold=5,
        metrics = 'logloss',
        seed=seed,callbacks=[xgb.callback.early_stop(50)]
    )
    
    return -cv_result['test-logloss-mean'].values[-1]

xgb_BO = BayesianOptimization(
    xgb_evaluate, 
    {
        'max_depth': (4,8),
        'min_child_weight': (8,70),
        'colsample_bytree': (0.2,1),
        'subsample': (0.6,1),
        'gamma': (0,2.1)
    }
)

xgb_BO.maximize(init_points=5, n_iter=30)

[31mInitialization[0m
[94m---------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |     gamma |   max_depth |   min_child_weight |   subsample | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[388]	train-logloss:0.358773+0.000627986	test-logloss:0.426161+0.00431845

    1 | 07m03s | [35m  -0.42616[0m | [32m            0.3466[0m | [32m   1.6306[0m | [32m     5.5204[0m | [32m           28.0399[0m | [32m     0.9024[0m | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[183]	train-logloss:0.35721+0.0014735	test-logloss:0.426745+0.0036933

    2 | 04m56s |   -0.42675 |             0.3380 |    0.5470 |      7.8976 |   

  " state: %s" % convergence_dict)


    8 | 03m36s | [35m  -0.42565[0m | [32m            0.2508[0m | [32m   0.0278[0m | [32m     4.5962[0m | [32m            8.3928[0m | [32m     0.9973[0m | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[643]	train-logloss:0.377441+0.000992163	test-logloss:0.427881+0.00354449



  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


    9 | 03m57s |   -0.42788 |             0.2256 |    0.2465 |      4.0341 |            69.9510 |      0.9968 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[502]	train-logloss:0.371334+0.000583189	test-logloss:0.425545+0.00245458

   10 | 03m14s | [35m  -0.42555[0m | [32m            0.2146[0m | [32m   2.0760[0m | [32m     4.2445[0m | [32m           14.4548[0m | [32m     0.9245[0m | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[661]	train-logloss:0.368665+0.000838283	test-logloss:0.426622+0.0032114



  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   11 | 04m14s |   -0.42662 |             0.2436 |    2.0778 |      4.0620 |            41.3385 |      0.9412 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[272]	train-logloss:0.347235+0.00163009	test-logloss:0.424918+0.00358875

   12 | 03m08s | [35m  -0.42492[0m | [32m            0.2476[0m | [32m   0.4003[0m | [32m     7.8826[0m | [32m           36.3112[0m | [32m     0.9922[0m | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[204]	train-logloss:0.349396+0.00150577	test-logloss:0.426233+0.00364197

   13 | 06m35s |   -0.42623 |             0.9352 |    2.0559 |      7.9671 |            34.6124 |      0.9537 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until tes

  " state: %s" % convergence_dict)


   16 | 05m26s |   -0.42585 |             0.7221 |    0.0770 |      7.9884 |            43.0463 |      0.9828 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[325]	train-logloss:0.352969+0.00194674	test-logloss:0.427305+0.00371601



  " state: %s" % convergence_dict)


   17 | 03m54s |   -0.42730 |             0.2840 |    0.0568 |      7.9479 |            64.1360 |      0.9865 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[435]	train-logloss:0.367616+0.000803206	test-logloss:0.424737+0.003349

   18 | 06m44s | [35m  -0.42474[0m | [32m            0.8499[0m | [32m   0.0026[0m | [32m     4.3620[0m | [32m           17.6419[0m | [32m     0.9866[0m | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[199]	train-logloss:0.344222+0.00119354	test-logloss:0.424839+0.00362764



  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   19 | 03m33s |   -0.42484 |             0.4236 |    1.8921 |      7.9035 |            18.4537 |      0.9963 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[441]	train-logloss:0.364617+0.00145002	test-logloss:0.424751+0.00325523



  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   20 | 07m47s |   -0.42475 |             0.9746 |    2.0829 |      4.6732 |             8.5282 |      0.9946 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[654]	train-logloss:0.369632+0.000920136	test-logloss:0.426423+0.00389196

   21 | 06m53s |   -0.42642 |             0.4559 |    2.0627 |      4.0547 |            59.2859 |      0.9929 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[179]	train-logloss:0.339394+0.00122724	test-logloss:0.424844+0.00317741

   22 | 06m08s |   -0.42484 |             0.9731 |    1.8626 |      7.7017 |            15.2944 |      0.9987 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Bes

  " state: %s" % convergence_dict)


   24 | 03m57s |   -0.42681 |             0.2893 |    0.0796 |      4.0172 |            48.0601 |      0.9857 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[439]	train-logloss:0.36498+0.00123905	test-logloss:0.424943+0.00341856

   25 | 07m39s |   -0.42494 |             0.9795 |    2.0901 |      4.7571 |            17.7530 |      0.9777 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[334]	train-logloss:0.344931+0.00230646	test-logloss:0.426215+0.00318002



  " state: %s" % convergence_dict)


   26 | 03m34s |   -0.42621 |             0.2299 |    2.0216 |      7.6593 |            46.7562 |      0.9772 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[214]	train-logloss:0.348093+0.00166594	test-logloss:0.425047+0.0030231



  " state: %s" % convergence_dict)


   27 | 05m14s |   -0.42505 |             0.6706 |    0.0098 |      7.9210 |            31.6615 |      0.9905 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[423]	train-logloss:0.367148+0.00102646	test-logloss:0.424996+0.00295159

   28 | 07m05s |   -0.42500 |             0.9299 |    0.1161 |      4.1818 |            13.6859 |      0.9938 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[507]	train-logloss:0.38268+0.00110085	test-logloss:0.427509+0.00354797

   29 | 04m04s |   -0.42751 |             0.3297 |    2.0656 |      4.3555 |            65.5540 |      0.9967 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best 

  " state: %s" % convergence_dict)


   33 | 07m58s |   -0.42496 |             0.9323 |    0.1641 |      4.6704 |            16.4485 |      0.9914 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[333]	train-logloss:0.351626+0.00115566	test-logloss:0.426568+0.00304313

   34 | 03m42s |   -0.42657 |             0.2255 |    1.4992 |      7.9005 |            58.3629 |      0.9993 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[600]	train-logloss:0.36381+0.000675577	test-logloss:0.425611+0.0029427



  " state: %s" % convergence_dict)


   35 | 04m08s |   -0.42561 |             0.2441 |    0.2925 |      4.2201 |            20.7939 |      0.9994 | 


In [82]:
xgb_bo_scores = pd.DataFrame([[s[0]['max_depth'],
                               s[0]['min_child_weight'],
                               s[0]['colsample_bytree'],
                               s[0]['subsample'],
                               s[0]['gamma'],
                               s[1]] for s in zip(xgb_BO.res['all']['params'],xgb_BO.res['all']['values'])],
                            columns = ['max_depth',
                                       'min_child_weight',
                                       'colsample_bytree',
                                       'subsample',
                                       'gamma',
                                       'score'])
xgb_bo_scores=xgb_bo_scores.sort_values('score',ascending=False)
xgb_bo_scores.head(10)

Unnamed: 0,max_depth,min_child_weight,colsample_bytree,subsample,gamma,score
25,5.313812,16.438783,0.986761,0.986039,0.034931,-0.424444
12,4.361961,17.641851,0.849905,0.986623,0.002646,-0.424737
14,4.673238,8.528189,0.974644,0.994641,2.082919,-0.424751
13,7.903491,18.453651,0.423628,0.996315,1.892103,-0.424839
16,7.701706,15.294353,0.973098,0.998693,1.862606,-0.424844
6,7.882566,36.311186,0.247632,0.992209,0.400293,-0.424918
19,4.757087,17.753006,0.979509,0.977735,2.090111,-0.424943
27,4.670405,16.448451,0.932301,0.991362,0.164141,-0.424964
22,4.181783,13.685928,0.929936,0.993809,0.116063,-0.424996
17,7.187237,18.001715,0.906303,0.996229,0.203309,-0.425028


In [83]:
estimators = [
            xgb.XGBClassifier(max_depth = 5,
                              min_child_weight = 16,
                              colsample_bytree = 0.986761 ,
                              subsample = 0.986039 ,
                              gamma = 0.034931)          
             ]



(train_blend_x_xgb_medium,
 test_blend_x_xgb_mean_medium,
 test_blend_x_xgb_gmean_medium,
 blend_scores_xgb_medium,
 best_rounds_xgb_medium) = xgb_blend(estimators,
                              train_X,y_medium,
                              test_X,
                              10,
                              300)


Blend 1 estimators for 10 folds
Model 1: XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.986761,
       gamma=0.034931, learning_rate=0.01, max_delta_step=0, max_depth=5,
       min_child_weight=16, missing=None, n_estimators=1000000, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=False, subsample=0.986039)
Model 1 fold 1
best round 3668
('Score: ', 0.42200046767533278)
Model 1 fold 1 fitting finished in 943.465s
Model 1 fold 2
best round 4558
('Score: ', 0.40364797993256535)
Model 1 fold 2 fitting finished in 1149.859s
Model 1 fold 3
best round 3037
('Score: ', 0.4236135890042792)
Model 1 fold 3 fitting finished in 787.461s
Model 1 fold 4
best round 4561
('Score: ', 0.41215656720418348)
Model 1 fold 4 fitting finished in 1145.537s
Model 1 fold 5
best round 2795
('Score: ', 0.42871615508426603)
Model 1 fold 5 fitting finished in 730.797s
Model 1 fold 6
best round 2622
('Score: ', 0.4190467267906

In [84]:
name_train_blend = '../output/train_blend_xgb_medium_BM_0322_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_mean = '../output/test_blend_xgb_medium_mean_BM_0322_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_gmean = '../output/test_blend_xgb_medium_gmean_BM_0322_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'


print (np.mean(blend_scores_xgb_medium,axis=0))
print (np.mean(best_rounds_xgb_medium,axis=0))
np.savetxt(name_train_blend,train_blend_x_xgb_medium, delimiter=",")
np.savetxt(name_test_blend_mean,test_blend_x_xgb_mean_medium, delimiter=",")
np.savetxt(name_test_blend_gmean,test_blend_x_xgb_gmean_medium, delimiter=",")

[ 0.42222178]
[ 3580.4]


In [85]:
y_high =[]
for i in range(train_X.shape[0]):
    y_high.append(1 if train_y[i] == 2 else 0)
    
y_high = np.array(y_high)  
print np.sum(y_high)

3839


In [86]:
X_train, X_val, y_train, y_val = train_test_split(train_X, y_high, train_size=.80, random_state=1234)
print X_train.shape
print X_val.shape
# xgtrain = xgb.DMatrix(X_train, label=y_train)

(39481, 322)
(9871, 322)


In [87]:
learning_rate = 0.1
best_score = 1000
train_param = 0
for x in [3,4,5,6,7]:
    rgr = xgb.XGBClassifier(
        objective='binary:logistic',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= x,
        nthread = -1,
        silent = False
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='logloss',
        early_stopping_rounds=50,
        verbose=False
    )
    
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x

    print x, '\t', rgr.best_score, rgr.best_iteration

3 	0.177956 678
4 	0.176987 375
5 	0.178364 301
6 	0.179676 151
7 	0.179587 172


In [88]:
max_depth = train_param
print max_depth

4


In [89]:
train_param = 1
for x in [2,4,8,12,16,20]:
    rgr = xgb.XGBClassifier(
        objective='binary:logistic',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='logloss',
        early_stopping_rounds=50,
        verbose=False
    )
    
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

2 	0.176475 476
4 	0.177699 451
8 	0.178257 433
12 	0.180064 479
16 	0.179598 329
20 	0.179727 313


In [90]:
min_child_weight = train_param
print min_child_weight

2


In [91]:
train_param = 1
for x in [0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]:
    rgr = xgb.XGBClassifier(
        objective='binary:logistic',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = min_child_weight,
        colsample_bytree = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='logloss',
        early_stopping_rounds=50,
        verbose=False
    )

    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

0.05 	0.178819 940
0.1 	0.178555 481
0.2 	0.177601 371
0.3 	0.175319 479
0.4 	0.175341 517
0.5 	0.176634 492
0.6 	0.17636 555
0.7 	0.177471 524
0.8 	0.178216 345
0.9 	0.177681 500


In [92]:
colsample_bytree = train_param
print colsample_bytree

0.3


In [93]:
train_param = 1
for x in [0.5,0.6,0.7,0.8,0.9]:
    rgr = xgb.XGBClassifier(
        objective='binary:logistic',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = min_child_weight,
        colsample_bytree = colsample_bytree,
        subsample = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='logloss',
        early_stopping_rounds=50,
        verbose=False
    )
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

0.5 	0.178006 396
0.6 	0.177543 414
0.7 	0.177879 388
0.8 	0.177606 492
0.9 	0.176629 382


In [94]:
subsample = train_param
print subsample

1


In [95]:
train_param = 0
for x in [0.3, 0.6, 0.9, 1.2, 1.5, 1.8, 2.1, 2.4, 2.7, 3.0]:
    rgr = xgb.XGBClassifier(
        objective='binary:logistic',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = min_child_weight,
        colsample_bytree = colsample_bytree,
        subsample = subsample,
        gamma = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='logloss',
        early_stopping_rounds=50,
        verbose=False
    )

    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

0.3 	0.176517 342
0.6 	0.175433 424
0.9 	0.17573 444
1.2 	0.175961 486
1.5 	0.176763 389
1.8 	0.176244 560
2.1 	0.175417 433
2.4 	0.176278 511
2.7 	0.176951 486
3.0 	0.176515 395


In [96]:
gamma = train_param
print gamma

0


In [97]:
xgtrain = xgb.DMatrix(train_X, label=y_high) 

def xgb_evaluate(min_child_weight, colsample_bytree, max_depth, subsample, gamma):
    params = dict()
    params['objective']='binary:logistic'
    params['eval_metric']='logloss',
    params['silent']=1
    params['eta'] = 0.1
    params['verbose_eval'] = True
    params['min_child_weight'] = int(min_child_weight)
    params['colsample_bytree'] = max(min(colsample_bytree, 1), 0)
    params['max_depth'] = int(max_depth)
    params['subsample'] = max(min(subsample, 1), 0)
    params['gamma'] = max(gamma, 0)
    
    cv_result = xgb.cv(
        params, xgtrain, 
        num_boost_round=100000, nfold=5,
        metrics = 'logloss',
        seed=seed,callbacks=[xgb.callback.early_stop(50)]
    )
    
    return -cv_result['test-logloss-mean'].values[-1]

xgb_BO = BayesianOptimization(
    xgb_evaluate, 
    {
        'max_depth': (4,8),
        'min_child_weight': (1,12),
        'colsample_bytree': (0.2,0.5),
        'subsample': (0.7,1),
        'gamma': (0,3)
    }
)

xgb_BO.maximize(init_points=5, n_iter=30)

[31mInitialization[0m
[94m---------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |     gamma |   max_depth |   min_child_weight |   subsample | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[395]	train-logloss:0.10991+0.000920344	test-logloss:0.171475+0.00697916

    1 | 02m51s | [35m  -0.17148[0m | [32m            0.2550[0m | [32m   2.3294[0m | [32m     5.5204[0m | [32m            4.5555[0m | [32m     0.9268[0m | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[209]	train-logloss:0.090857+0.00129286	test-logloss:0.173007+0.00720012

    2 | 02m12s |   -0.17301 |             0.2518 |    0.7814 |      7.8976 | 

  " state: %s" % convergence_dict)


    6 | 04m20s |   -0.17261 |             0.4121 |    2.9446 |      7.9800 |             1.2759 |      0.8209 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[432]	train-logloss:0.131106+0.001821	test-logloss:0.172601+0.00671771

    7 | 02m50s |   -0.17260 |             0.2096 |    0.3522 |      4.0341 |            11.9913 |      0.9976 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[264]	train-logloss:0.105272+0.00142327	test-logloss:0.172168+0.00669271



  " state: %s" % convergence_dict)


    8 | 03m07s |   -0.17217 |             0.2448 |    2.9061 |      7.9759 |            11.9230 |      0.9290 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[568]	train-logloss:0.11638+0.00129432	test-logloss:0.171954+0.0070856



  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


    9 | 03m34s |   -0.17195 |             0.2224 |    0.4372 |      4.0260 |             1.2505 |      0.9796 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[522]	train-logloss:0.120977+0.00112013	test-logloss:0.172069+0.00699078



  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   10 | 05m03s |   -0.17207 |             0.4547 |    2.9764 |      4.1430 |             8.7463 |      0.9578 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[657]	train-logloss:0.1165+0.00160376	test-logloss:0.171697+0.00704755



  " state: %s" % convergence_dict)


   11 | 06m22s |   -0.17170 |             0.4905 |    2.9756 |      4.6732 |             1.0937 |      0.9960 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[286]	train-logloss:0.103074+0.00130976	test-logloss:0.171855+0.00707092

   12 | 04m35s |   -0.17185 |             0.4980 |    2.9914 |      6.7088 |             6.4224 |      0.9916 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[525]	train-logloss:0.123516+0.00142058	test-logloss:0.172031+0.007291



  " state: %s" % convergence_dict)


   13 | 03m41s |   -0.17203 |             0.2702 |    2.7764 |      4.0515 |             5.3757 |      0.9901 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[334]	train-logloss:0.109252+0.000367714	test-logloss:0.171845+0.00662825



  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   14 | 02m57s |   -0.17184 |             0.2002 |    2.9781 |      6.4376 |             8.6927 |      0.9748 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[581]	train-logloss:0.122602+0.00149505	test-logloss:0.172046+0.00723664

   15 | 05m49s |   -0.17205 |             0.3121 |    2.9644 |      4.2976 |            11.7543 |      0.9802 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[328]	train-logloss:0.102556+0.000794462	test-logloss:0.171938+0.00698952



  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   16 | 05m57s |   -0.17194 |             0.2235 |    2.9580 |      6.4603 |             2.9046 |      0.9967 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[499]	train-logloss:0.119596+0.00130554	test-logloss:0.171646+0.00634939

   17 | 08m39s |   -0.17165 |             0.4331 |    0.1608 |      4.9610 |             5.8846 |      0.9953 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[271]	train-logloss:0.0948834+0.00106594	test-logloss:0.171673+0.00649617



  " state: %s" % convergence_dict)


   18 | 05m05s |   -0.17167 |             0.2203 |    0.0851 |      6.3433 |             1.1436 |      0.9923 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[269]	train-logloss:0.107222+0.00135952	test-logloss:0.171361+0.0067242



  " state: %s" % convergence_dict)


   19 | 02m47s | [35m  -0.17136[0m | [32m            0.2322[0m | [32m   1.9047[0m | [32m     6.0060[0m | [32m            6.4164[0m | [32m     0.9998[0m | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[359]	train-logloss:0.107731+0.000745078	test-logloss:0.171857+0.00724305

   20 | 04m47s |   -0.17186 |             0.4982 |    1.6629 |      5.4155 |             4.9300 |      0.9863 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[575]	train-logloss:0.1201+0.00111413	test-logloss:0.172123+0.00709367

   21 | 03m33s |   -0.17212 |             0.2074 |    0.0822 |      4.2652 |             8.3653 |      0.9938 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test

  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   24 | 05m43s |   -0.17171 |             0.2364 |    2.9264 |      4.0639 |             2.5485 |      0.9999 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[271]	train-logloss:0.103837+0.00170992	test-logloss:0.171359+0.00740406



  " state: %s" % convergence_dict)


   25 | 02m50s | [35m  -0.17136[0m | [32m            0.2320[0m | [32m   0.2116[0m | [32m     6.5459[0m | [32m            6.8521[0m | [32m     0.9980[0m | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[312]	train-logloss:0.107192+0.00144881	test-logloss:0.172138+0.00688724



  " state: %s" % convergence_dict)


   26 | 02m58s |   -0.17214 |             0.2028 |    1.6719 |      6.2454 |            11.2442 |      0.9876 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[362]	train-logloss:0.111562+0.00110633	test-logloss:0.17147+0.00672054

   27 | 03m22s |   -0.17147 |             0.2613 |    0.5547 |      5.5618 |             6.3850 |      0.9888 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[286]	train-logloss:0.0984188+0.000790129	test-logloss:0.171823+0.00691752



  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   28 | 02m50s |   -0.17182 |             0.2071 |    1.9347 |      6.9774 |             1.0441 |      0.9981 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[389]	train-logloss:0.110262+0.00149144	test-logloss:0.17156+0.00677134



  " state: %s" % convergence_dict)


   29 | 03m11s |   -0.17156 |             0.2236 |    0.7930 |      5.8304 |             6.8156 |      0.9996 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[337]	train-logloss:0.116402+0.00127411	test-logloss:0.171859+0.00648323



  " state: %s" % convergence_dict)


   30 | 02m49s |   -0.17186 |             0.2034 |    0.0873 |      5.7365 |             5.2219 |      0.9451 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[160]	train-logloss:0.091836+0.00106874	test-logloss:0.172203+0.00675766



  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   31 | 03m03s |   -0.17220 |             0.3822 |    0.0227 |      7.9737 |             1.1807 |      0.9788 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[259]	train-logloss:0.106361+0.000989804	test-logloss:0.171354+0.0066011



  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   32 | 03m02s | [35m  -0.17135[0m | [32m            0.2549[0m | [32m   1.5280[0m | [32m     6.7523[0m | [32m            6.3547[0m | [32m     0.9987[0m | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[393]	train-logloss:0.10844+0.000894609	test-logloss:0.171926+0.00665133



  " state: %s" % convergence_dict)


   33 | 03m37s |   -0.17193 |             0.2571 |    0.1684 |      5.9816 |             6.3250 |      0.9860 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[225]	train-logloss:0.104176+0.00125071	test-logloss:0.172456+0.00668234

   34 | 02m58s |   -0.17246 |             0.2062 |    2.5126 |      7.7983 |             7.7306 |      0.9928 | 
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[554]	train-logloss:0.118528+0.00124893	test-logloss:0.1722+0.00679825

   35 | 04m01s |   -0.17220 |             0.2422 |    0.2084 |      4.0440 |             4.0712 |      0.9859 | 


In [98]:
xgb_bo_scores = pd.DataFrame([[s[0]['max_depth'],
                               s[0]['min_child_weight'],
                               s[0]['colsample_bytree'],
                               s[0]['subsample'],
                               s[0]['gamma'],
                               s[1]] for s in zip(xgb_BO.res['all']['params'],xgb_BO.res['all']['values'])],
                            columns = ['max_depth',
                                       'min_child_weight',
                                       'colsample_bytree',
                                       'subsample',
                                       'gamma',
                                       'score'])
xgb_bo_scores=xgb_bo_scores.sort_values('score',ascending=False)
xgb_bo_scores.head(10)

Unnamed: 0,max_depth,min_child_weight,colsample_bytree,subsample,gamma,score
26,6.752262,6.354654,0.254931,0.99869,1.52802,-0.171354
19,6.545895,6.852102,0.232032,0.998036,0.211568,-0.171359
13,6.005964,6.41644,0.23223,0.99979,1.904689,-0.171361
21,5.56177,6.384971,0.261301,0.988807,0.554664,-0.17147
23,5.830431,6.815567,0.223637,0.999604,0.792956,-0.17156
11,4.961041,5.884557,0.433066,0.995258,0.160786,-0.171646
12,6.34327,1.143575,0.220345,0.992262,0.085061,-0.171673
5,4.673238,1.093711,0.490491,0.99598,2.975598,-0.171697
18,4.063896,2.548489,0.236434,0.99993,2.926362,-0.171714
16,7.875869,11.841097,0.252646,0.985262,0.00286,-0.17175


In [99]:
estimators = [
            xgb.XGBClassifier(max_depth = 6,
                              min_child_weight = 6,
                              colsample_bytree = 0.254931 ,
                              subsample = 0.998690 ,
                              gamma = 1.528020)          
             ]



(train_blend_x_xgb_high,
 test_blend_x_xgb_mean_high,
 test_blend_x_xgb_gmean_high,
 blend_scores_xgb_high,
 best_rounds_xgb_high) = xgb_blend(estimators,
                              train_X,y_high,
                              test_X,
                              10,
                              300)

Blend 1 estimators for 10 folds
Model 1: XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.254931,
       gamma=1.52802, learning_rate=0.01, max_delta_step=0, max_depth=6,
       min_child_weight=6, missing=None, n_estimators=1000000, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=False, subsample=0.99869)
Model 1 fold 1
best round 3593
('Score: ', 0.16040199197453783)
Model 1 fold 1 fitting finished in 409.761s
Model 1 fold 2
best round 4291
('Score: ', 0.16251477268862224)
Model 1 fold 2 fitting finished in 438.511s
Model 1 fold 3
best round 3230
('Score: ', 0.1726481059179639)
Model 1 fold 3 fitting finished in 338.188s
Model 1 fold 4
best round 4015
('Score: ', 0.15474943988224088)
Model 1 fold 4 fitting finished in 413.777s
Model 1 fold 5
best round 3320
('Score: ', 0.17994719447588786)
Model 1 fold 5 fitting finished in 345.771s
Model 1 fold 6
best round 2799
('Score: ', 0.16513287584688019)

In [100]:
name_train_blend = '../output/train_blend_xgb_high_BM_0322_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_mean = '../output/test_blend_xgb_high_mean_BM_0322_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_gmean = '../output/test_blend_xgb_high_gmean_BM_0322_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'


print (np.mean(blend_scores_xgb_high,axis=0))
print (np.mean(best_rounds_xgb_high,axis=0))
np.savetxt(name_train_blend,train_blend_x_xgb_high, delimiter=",")
np.savetxt(name_test_blend_mean,test_blend_x_xgb_mean_high, delimiter=",")
np.savetxt(name_test_blend_gmean,test_blend_x_xgb_gmean_high, delimiter=",")

[ 0.16811698]
[ 3406.5]


In [110]:
train_blend_x_xgb_low[:10]

array([[ 0.4236182 ,  0.5763818 ],
       [ 0.59415483,  0.40584514],
       [ 0.21109557,  0.78890443],
       [ 0.19131744,  0.80868256],
       [ 0.02921897,  0.97078103],
       [ 0.28675401,  0.71324599],
       [ 0.24010056,  0.75989944],
       [ 0.64165068,  0.35834932],
       [ 0.04677582,  0.95322418],
       [ 0.02284718,  0.97715282]])

In [111]:
train_blend_x_xgb_medium[:10]

array([[ 0.61000806,  0.38999194],
       [ 0.53991276,  0.46008724],
       [ 0.79552573,  0.20447429],
       [ 0.72584319,  0.27415678],
       [ 0.95426494,  0.04573506],
       [ 0.80321014,  0.19678983],
       [ 0.75437468,  0.2456253 ],
       [ 0.69807637,  0.30192366],
       [ 0.87716442,  0.1228356 ],
       [ 0.97949302,  0.02050696]])

In [112]:
train_blend_x_xgb_high[:10]

array([[  9.67689753e-01,   3.23102176e-02],
       [  9.45587397e-01,   5.44126257e-02],
       [  9.81699526e-01,   1.83004793e-02],
       [  9.96058881e-01,   3.94109543e-03],
       [  9.99282181e-01,   7.17798830e-04],
       [  9.63740110e-01,   3.62598673e-02],
       [  9.90686119e-01,   9.31385346e-03],
       [  7.30362296e-01,   2.69637674e-01],
       [  9.84624028e-01,   1.53759578e-02],
       [  9.99281704e-01,   7.18279567e-04]])

In [109]:
train_y[:10]

array([1, 0, 1, 1, 0, 0, 0, 2, 0, 0])

In [125]:
train_blend_x_xgb = np.vstack([train_blend_x_xgb_low[:,1],train_blend_x_xgb_medium[:,1],train_blend_x_xgb_high[:,1]]).T
train_blend_x_xgb.shape

(49352, 3)

In [126]:
train_blend_x_xgb[:10]

array([[  5.76381803e-01,   3.89991939e-01,   3.23102176e-02],
       [  4.05845135e-01,   4.60087240e-01,   5.44126257e-02],
       [  7.88904428e-01,   2.04474285e-01,   1.83004793e-02],
       [  8.08682561e-01,   2.74156779e-01,   3.94109543e-03],
       [  9.70781028e-01,   4.57350612e-02,   7.17798830e-04],
       [  7.13245988e-01,   1.96789831e-01,   3.62598673e-02],
       [  7.59899437e-01,   2.45625302e-01,   9.31385346e-03],
       [  3.58349323e-01,   3.01923662e-01,   2.69637674e-01],
       [  9.53224182e-01,   1.22835599e-01,   1.53759578e-02],
       [  9.77152824e-01,   2.05069594e-02,   7.18279567e-04]])

In [127]:
log_loss(train_y,train_blend_x_xgb)

0.52698075424503898

In [128]:
test_blend_x_xgb_mean = np.vstack([test_blend_x_xgb_mean_low[:,1],test_blend_x_xgb_mean_medium[:,1],test_blend_x_xgb_mean_high[:,1]]).T
test_blend_x_xgb_mean.shape

(74659, 3)

In [129]:
test_blend_x_xgb_gmean = np.vstack([test_blend_x_xgb_gmean_low[:,1],test_blend_x_xgb_gmean_medium[:,1],test_blend_x_xgb_gmean_high[:,1]]).T
test_blend_x_xgb_gmean.shape

(74659, 3)

In [131]:
name_train_blend = '../output/train_blend_xgb_ovr_BM_0322_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_mean = '../output/test_blend_xgb_ovr_mean_BM_0322_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_gmean = '../output/test_blend_xgb_ovr_gmean_BM_0322_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'


# print (np.mean(blend_scores_xgb,axis=0))
# print (np.mean(best_rounds_xgb,axis=0))
np.savetxt(name_train_blend,train_blend_x_xgb, delimiter=",")
np.savetxt(name_test_blend_mean,test_blend_x_xgb_mean, delimiter=",")
np.savetxt(name_test_blend_gmean,test_blend_x_xgb_gmean, delimiter=",")

In [None]:
# [ 0.52385999  0.52420308  0.52429754  0.52366222  0.52450185]
# [ 2866.7  3979.7  3102.9  2783.1  4450.5]

In [132]:
# now = datetime.now()
sub_name = '../output/sub_xgb_ovr_BM_0322_mean_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

out_df = pd.DataFrame(test_blend_x_xgb_mean)
out_df.columns = ["low", "medium", "high"]
out_df["listing_id"] = sub_id
out_df.to_csv(sub_name, index=False)


# ypreds.columns = cols

# df = pd.read_json(open("../input/test.json", "r"))
# ypreds['listing_id'] = df["listing_id"]

# ypreds.to_csv('my_preds.csv', index=None)