In [1]:
import time
import numpy as np
import pandas as pd

from scipy import sparse
from scipy.stats.mstats import gmean
from datetime import datetime
from sklearn import preprocessing
from scipy.stats import skew, boxcox,boxcox_normmax
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from bayes_opt import BayesianOptimization
from sklearn.metrics import log_loss

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import xgboost as xgb

seed = 1234



# Load Data

In [7]:
data_path = "../input/"
train_X = pd.read_pickle(data_path + 'train_X_0319.pkl')
test_X = pd.read_pickle(data_path + 'test_X_0319.pkl')
train_y = np.ravel(pd.read_pickle(data_path + 'train_y_0319.pkl'))
sub_id = test_X.listing_id.astype('int32').values
# all_features = features_to_use + desc_sparse_cols + feat_sparse_cols
print train_X.shape, test_X.shape, train_y.shape

(49352, 464) (74659, 464) (49352,)


In [10]:
X_train, X_val, y_train, y_val = train_test_split(train_X, train_y, train_size=.80, random_state=1234)
print X_train.shape
print X_val.shape
# xgtrain = xgb.DMatrix(X_train, label=y_train)

(39481, 464)
(9871, 464)


In [11]:
# rgr = xgb.XGBClassifier(objective = 'multi:softprob',
#                        learning_rate = 0.1,
#                        n_estimators = 10000,
#                        nthread = -1)

# rgr.fit(X_train,y_train,
#         eval_set=[(X_val,y_val)],
#         eval_metric='mlogloss',
# #         num_class = 3,
#         early_stopping_rounds=50,
#         verbose=25
#        )

[0]	validation_0-mlogloss:1.03718
Will train until validation_0-mlogloss hasn't improved in 50 rounds.
[25]	validation_0-mlogloss:0.635855
[50]	validation_0-mlogloss:0.594012
[75]	validation_0-mlogloss:0.578747
[100]	validation_0-mlogloss:0.569621
[125]	validation_0-mlogloss:0.563314
[150]	validation_0-mlogloss:0.55842
[175]	validation_0-mlogloss:0.554957
[200]	validation_0-mlogloss:0.551957
[225]	validation_0-mlogloss:0.549701
[250]	validation_0-mlogloss:0.548004
[275]	validation_0-mlogloss:0.546493
[300]	validation_0-mlogloss:0.545255
[325]	validation_0-mlogloss:0.544215
[350]	validation_0-mlogloss:0.543186
[375]	validation_0-mlogloss:0.542323
[400]	validation_0-mlogloss:0.541659
[425]	validation_0-mlogloss:0.540934
[450]	validation_0-mlogloss:0.540231
[475]	validation_0-mlogloss:0.539755
[500]	validation_0-mlogloss:0.539421
[525]	validation_0-mlogloss:0.539184
[550]	validation_0-mlogloss:0.538677
[575]	validation_0-mlogloss:0.538061
[600]	validation_0-mlogloss:0.538047
[625]	validat

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=10000, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [12]:
# pred_y = rgr.predict_proba(test_X, ntree_limit = rgr.best_iteration)

In [13]:
# now = datetime.now()
# sub_name = '../output/sub_xgb_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

# out_df = pd.DataFrame(pred_y[:,:3])
# out_df.columns = ["low", "medium", "high"]
# out_df["listing_id"] = sub_id
# out_df.to_csv(sub_name, index=False)

# Tune XGBoost

In [14]:
learning_rate = 0.1
best_score = 1000
train_param = 0
for x in [3,4,5,6,7]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= x,
        nthread = -1,
        silent = False
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )
    
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x

    print x, '\t', rgr.best_score, rgr.best_iteration

3 	0.537874 585
4 	0.533246 581
5 	0.534774 392
6 	0.534358 252
7 	0.535118 189


In [15]:
max_depth = train_param
print max_depth

4


In [16]:
train_param = 1
for x in [2,4,8,12,16,20]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )
    
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

2 	0.534686 579
4 	0.534913 515
8 	0.53319 490
12 	0.533521 568
16 	0.532501 524
20 	0.533333 505


In [17]:
for x in [24,28,32]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )
    
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

24 	0.533516 565
28 	0.533766 482
32 	0.533918 480


In [18]:
min_child_weight = train_param
print min_child_weight

16


In [19]:
train_param = 1
for x in [0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = min_child_weight,
        colsample_bytree = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )

    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

0.05 	0.539257 1092
0.1 	0.535705 640
0.2 	0.532362 648
0.3 	0.532268 671
0.4 	0.532383 622
0.5 	0.532407 560
0.6 	0.533004 511
0.7 	0.531554 605
0.8 	0.532438 604
0.9 	0.533032 621


In [20]:
colsample_bytree = train_param
print colsample_bytree

0.7


In [21]:
train_param = 1
for x in [0.5,0.6,0.7,0.8,0.9]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = min_child_weight,
        colsample_bytree = colsample_bytree,
        subsample = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

0.5 	0.535465 500
0.6 	0.534347 517
0.7 	0.533483 462
0.8 	0.532123 520
0.9 	0.531371 592


In [22]:
subsample = train_param
print subsample

0.9


In [23]:
train_param = 0
for x in [0.3, 0.6, 0.9, 1.2, 1.5, 1.8, 2.1, 2.4, 2.7, 3.0]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = min_child_weight,
        colsample_bytree = colsample_bytree,
        subsample = subsample,
        gamma = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )

    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

0.3 	0.532157 616
0.6 	0.532724 520
0.9 	0.53279 613
1.2 	0.531874 602
1.5 	0.532611 597
1.8 	0.532247 529
2.1 	0.532049 678
2.4 	0.533522 529
2.7 	0.532063 721
3.0 	0.533154 661


In [24]:
gamma = train_param
print gamma

0


In [None]:
# 0.3 	0.528756 371
# 0.6 	0.530068 353
# 0.9 	0.530043 275
# 1.2 	0.530065 388
# 1.5 	0.529657 331
# 1.8 	0.529906 328
# 2.1 	0.528338 393
# 2.4 	0.529364 372
# 2.7 	0.527919 456
# 3.0 	0.528962 417

In [25]:
xgtrain = xgb.DMatrix(train_X, label=train_y) 

def xgb_evaluate(min_child_weight, colsample_bytree, max_depth, subsample, gamma):
    params = dict()
    params['objective']='multi:softprob'
    params['eval_metric']='mlogloss',
    params['num_class']=3
    params['silent']=1
    params['eta'] = 0.1
    params['verbose_eval'] = True
    params['min_child_weight'] = int(min_child_weight)
    params['colsample_bytree'] = max(min(colsample_bytree, 1), 0)
    params['max_depth'] = int(max_depth)
    params['subsample'] = max(min(subsample, 1), 0)
    params['gamma'] = max(gamma, 0)
    
    cv_result = xgb.cv(
        params, xgtrain, 
        num_boost_round=10000, nfold=5,
        metrics = 'mlogloss',
        seed=seed,callbacks=[xgb.callback.early_stop(50)]
    )
    
    return -cv_result['test-mlogloss-mean'].values[-1]


xgb_BO = BayesianOptimization(
    xgb_evaluate, 
    {
        'max_depth': (4,7),
        'min_child_weight': (8,28),
        'colsample_bytree': (0.5,0.8),
        'subsample': (0.7,1),
        'gamma': (0,1.8)
    }
)

xgb_BO.maximize(init_points=10, n_iter=40)

[31mInitialization[0m
[94m---------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |     gamma |   max_depth |   min_child_weight |   subsample | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[402]	train-mlogloss:0.349671+0.00279218	test-mlogloss:0.533837+0.00430309

    1 | 32m04s | [35m  -0.53384[0m | [32m            0.7713[0m | [32m   1.7190[0m | [32m     6.1334[0m | [32m           24.0622[0m | [32m     0.9267[0m | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[376]	train-mlogloss:0.337871+0.00230234	test-mlogloss:0.532842+0.00528728

    2 | 21m22s | [35m  -0.53284[0m | [32m            0.5063[0m | 

KeyboardInterrupt: 

In [20]:
xgb_bo_scores = pd.DataFrame([[s[0]['max_depth'],
                               s[0]['min_child_weight'],
                               s[0]['colsample_bytree'],
                               s[0]['subsample'],
                               s[0]['gamma'],
                               s[1]] for s in zip(xgb_BO.res['all']['params'],xgb_BO.res['all']['values'])],
                            columns = ['max_depth',
                                       'min_child_weight',
                                       'colsample_bytree',
                                       'subsample',
                                       'gamma',
                                       'score'])
xgb_bo_scores=xgb_bo_scores.sort_values('score',ascending=False)
xgb_bo_scores.head(10)

Unnamed: 0,max_depth,min_child_weight,colsample_bytree,subsample,gamma,score
0,5.028481,4.216682,0.768085,0.772111,2.97874,-0.53422
1,7.730213,19.955073,0.310748,0.722331,0.314302,-0.535228


In [32]:
def xgb_blend(estimators, train_x, train_y, test_x, fold, early_stopping_rounds=0):
    N_params = len(estimators)
    print ("Blend %d estimators for %d folds" % (N_params, fold))
    skf = KFold(n_splits=fold,random_state=seed)
    N_class = len(set(train_y))
        
    train_blend_x = np.zeros((train_x.shape[0], N_class*N_params))
    test_blend_x_mean = np.zeros((test_x.shape[0], N_class*N_params))
    test_blend_x_gmean = np.zeros((test_x.shape[0], N_class*N_params))
    scores = np.zeros ((fold,N_params))
    best_rounds = np.zeros ((fold, N_params))
    
    for j, est in enumerate(estimators):
        est.set_params(objective = 'multi:softprob')
        est.set_params(silent = False)
        est.set_params(learning_rate = 0.02)
        est.set_params(n_estimators=100000)
        
        print ("Model %d: %s" %(j+1, est))

        test_blend_x_j = np.zeros((test_x.shape[0], N_class*fold))
    
        for i, (train_index, val_index) in enumerate(skf.split(train_x)):
            print ("Model %d fold %d" %(j+1,i+1))
            fold_start = time.time() 
            train_x_fold = train_x.iloc[train_index]
            train_y_fold = train_y[train_index]
            val_x_fold = train_x.iloc[val_index]
            val_y_fold = train_y[val_index]      

            est.fit(train_x_fold,train_y_fold,
                    eval_set = [(val_x_fold, val_y_fold)],
                    eval_metric = 'mlogloss',
                    early_stopping_rounds=early_stopping_rounds,
                    verbose=False)
            best_round=est.best_iteration
            best_rounds[i,j]=best_round
            print ("best round %d" % (best_round))
            val_y_predict_fold = est.predict_proba(val_x_fold,ntree_limit=best_round)
            score = log_loss(val_y_fold, val_y_predict_fold)
            print ("Score: ", score)
            scores[i,j]=score
            train_blend_x[val_index, (j*N_class):(j+1)*N_class] = val_y_predict_fold
            
            test_blend_x_j[:,(i*N_class):(i+1)*N_class] = est.predict_proba(test_x,ntree_limit=best_round)
            print ("Model %d fold %d fitting finished in %0.3fs" % (j+1,i+1, time.time() - fold_start))
            
        test_blend_x_mean[:,(j*N_class):(j+1)*N_class] = \
                np.stack([test_blend_x_j[:,range(0,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(1,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(2,N_class*fold,N_class)].mean(1)]).T
        
        test_blend_x_gmean[:,(j*N_class):(j+1)*N_class] = \
                np.stack([gmean(test_blend_x_j[:,range(0,N_class*fold,N_class)], axis=1),
                          gmean(test_blend_x_j[:,range(1,N_class*fold,N_class)], axis=1),
                          gmean(test_blend_x_j[:,range(2,N_class*fold,N_class)], axis=1)]).T
            
        print ("Score for model %d is %f" % (j+1,np.mean(scores[:,j])))
    print ("Score for blended models is %f" % (np.mean(scores)))
    return (train_blend_x, test_blend_x_mean, test_blend_x_gmean, scores,best_rounds)


In [46]:
estimators = [
    xgb.XGBClassifier(max_depth = 7,
                              min_child_weight = 24,
                              colsample_bytree = 0.309861 ,
                              subsample = 0.998132 ,
                              gamma = 2.211859),
             xgb.XGBClassifier(max_depth = 6,
                              min_child_weight = 19,
                              colsample_bytree = 0.432358,
                              subsample = 0.949350,
                              gamma = 2.976848),
             xgb.XGBClassifier(max_depth = 7,
                              min_child_weight = 23,
                              colsample_bytree = 0.214791,
                              subsample = 0.997197,
                              gamma = 2.163581),         
             xgb.XGBClassifier(max_depth = 8,
                              min_child_weight = 23,
                              colsample_bytree = 0.5,
                              subsample = 0.988002,
                              gamma = 3.0),  
             xgb.XGBClassifier(max_depth = 6,
                              min_child_weight = 13,
                              colsample_bytree = 0.236769,
                              subsample = 0.947349,
                              gamma = 2.870883)              
             ]

#  		max_depth 	min_child_weight 	colsample_bytree 	subsample 	gamma 		score
# 33 	7.513940 	24.341212 			0.309861 			0.998132 	2.211859 	-0.528512
# 2 	6.865995 	19.894651 			0.432358 			0.949350 	2.976848 	-0.528667
# 9 	7.270910 	23.515202 			0.214791 			0.997197 	2.163581 	-0.528757
# 22 	8.000000 	23.438976 			0.500000 			0.988002 	3.000000 	-0.528779
# 1 	6.622058 	13.857343 			0.236769 			0.947349 	2.870883 	-0.529040
# 11 	5.122812 	11.339506 			0.338006 			0.930783 	2.867388 	-0.529081
# 8 	7.270911 	23.515206 			0.214791 			0.997198 	2.163575 	-0.529257
# 24 	7.917312 	13.816491 			0.490651 			0.889237 	1.880521 	-0.529304
# 39 	7.288872 	29.477693 			0.319768 			0.937097 	2.265661 	-0.529336

(train_blend_x_xgb,
 test_blend_x_xgb_mean,
 test_blend_x_xgb_gmean,
 blend_scores_xgb,
 best_rounds_xgb) = xgb_blend(estimators,
                              train_X,train_y,
                              test_X,
                              10,
                              300)


Blend 5 estimators for 10 folds
Model 1: XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.309861,
       gamma=2.211859, learning_rate=0.02, max_delta_step=0, max_depth=7,
       min_child_weight=24, missing=None, n_estimators=100000, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=False, subsample=0.998132)
Model 1 fold 1
best round 3487
('Score: ', 0.51383791963799097)
Model 1 fold 1 fitting finished in 2071.905s
Model 1 fold 2
best round 3603
('Score: ', 0.49909809177754771)
Model 1 fold 2 fitting finished in 2110.754s
Model 1 fold 3
best round 2662
('Score: ', 0.52464419065607182)
Model 1 fold 3 fitting finished in 1629.569s
Model 1 fold 4
best round 3757
('Score: ', 0.50177507994359405)
Model 1 fold 4 fitting finished in 2193.679s
Model 1 fold 5
best round 2813
('Score: ', 0.53355855754643977)
Model 1 fold 5 fitting finished in 1742.833s
Model 1 fold 6
best round 1575
('Score: ', 0.52200505888

In [47]:
now = datetime.now()

name_train_blend = '../output/train_blend_xgb_BM_MB_add_desc_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_mean = '../output/test_blend_xgb_mean_BM_MB_add_desc_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_gmean = '../output/test_blend_xgb_gmean_BM_MB_add_desc_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'


print (np.mean(blend_scores_xgb,axis=0))
print (np.mean(best_rounds_xgb,axis=0))
np.savetxt(name_train_blend,train_blend_x_xgb, delimiter=",")
np.savetxt(name_test_blend_mean,test_blend_x_xgb_mean, delimiter=",")
np.savetxt(name_test_blend_gmean,test_blend_x_xgb_gmean, delimiter=",")

[ 0.52385999  0.52420308  0.52429754  0.52366222  0.52450185]
[ 2866.7  3979.7  3102.9  2783.1  4450.5]


In [50]:
# now = datetime.now()
sub_name = '../output/sub_XGB_mean_BM_MB_add_desc_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

out_df = pd.DataFrame(test_blend_x_xgb_mean[:,9:12])
out_df.columns = ["low", "medium", "high"]
out_df["listing_id"] = test_X.listing_id.values
out_df.to_csv(sub_name, index=False)


# ypreds.columns = cols

# df = pd.read_json(open("../input/test.json", "r"))
# ypreds['listing_id'] = df["listing_id"]

# ypreds.to_csv('my_preds.csv', index=None)

In [48]:
test_blend_x_xgb_gmean[:,9:12]


array([[  3.20544260e-01,   6.10888687e-01,   6.56750820e-02],
       [  9.66028463e-01,   2.30937453e-02,   1.06505838e-02],
       [  9.53579737e-01,   4.13955017e-02,   3.97691225e-03],
       ..., 
       [  9.78252564e-01,   2.03241753e-02,   1.01263996e-03],
       [  9.71474749e-01,   2.74180665e-02,   5.09567844e-04],
       [  5.87161787e-01,   3.92164954e-01,   1.94305868e-02]])

In [49]:
test_blend_x_xgb_gmean[:,:3]

array([[  3.20994298e-01,   6.04517100e-01,   7.12524217e-02],
       [  9.57150480e-01,   3.01748109e-02,   1.18646473e-02],
       [  9.58178383e-01,   3.74607705e-02,   3.46537444e-03],
       ..., 
       [  9.80787885e-01,   1.80516908e-02,   8.23179767e-04],
       [  9.70638017e-01,   2.80394743e-02,   5.14503672e-04],
       [  5.81934901e-01,   3.95535699e-01,   2.09051621e-02]])