In [2]:
import time
import numpy as np
import pandas as pd

from scipy import sparse
from scipy.stats.mstats import gmean
from datetime import datetime
from sklearn import preprocessing
from scipy.stats import skew, boxcox,boxcox_normmax
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from bayes_opt import BayesianOptimization
from sklearn.metrics import log_loss

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import xgboost as xgb

seed = 1234



# Load Data

In [3]:
data_path = "../input/"
train_X = pd.read_csv(data_path + 'train_BM_0323.csv')
test_X = pd.read_csv(data_path + 'test_BM_0323.csv')
train_y = np.ravel(pd.read_csv(data_path + 'labels_BrandenMurray.csv'))
sub_id = test_X.listing_id.astype('int32').values
# all_features = features_to_use + desc_sparse_cols + feat_sparse_cols
print train_X.shape, test_X.shape, train_y.shape

(49352, 324) (74659, 324) (49352,)


In [4]:
X_train, X_val, y_train, y_val = train_test_split(train_X, train_y, train_size=.80, random_state=1234)
print X_train.shape
print X_val.shape
# xgtrain = xgb.DMatrix(X_train, label=y_train)

(39481, 324)
(9871, 324)


In [5]:
rgr = xgb.XGBClassifier(objective = 'multi:softprob',
                       learning_rate = 0.1,
                       n_estimators = 10000,
                       nthread = -1)

rgr.fit(X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
#         num_class = 3,
        early_stopping_rounds=50,
        verbose=25
       )

[0]	validation_0-mlogloss:1.03729
Will train until validation_0-mlogloss hasn't improved in 50 rounds.
[25]	validation_0-mlogloss:0.639448
[50]	validation_0-mlogloss:0.599782
[75]	validation_0-mlogloss:0.584479
[100]	validation_0-mlogloss:0.575353
[125]	validation_0-mlogloss:0.569133
[150]	validation_0-mlogloss:0.564753
[175]	validation_0-mlogloss:0.561131
[200]	validation_0-mlogloss:0.55795
[225]	validation_0-mlogloss:0.555207
[250]	validation_0-mlogloss:0.553067
[275]	validation_0-mlogloss:0.551302
[300]	validation_0-mlogloss:0.54985
[325]	validation_0-mlogloss:0.548806
[350]	validation_0-mlogloss:0.547753
[375]	validation_0-mlogloss:0.546786
[400]	validation_0-mlogloss:0.545899
[425]	validation_0-mlogloss:0.545081
[450]	validation_0-mlogloss:0.544123
[475]	validation_0-mlogloss:0.543393
[500]	validation_0-mlogloss:0.54302
[525]	validation_0-mlogloss:0.542573
[550]	validation_0-mlogloss:0.542197
[575]	validation_0-mlogloss:0.541644
[600]	validation_0-mlogloss:0.541362
[625]	validatio

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=10000, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [6]:
pred_y = rgr.predict_proba(test_X, ntree_limit = rgr.best_iteration)
# [999]	validation_0-mlogloss:0.538056

In [7]:
now = datetime.now()
sub_name = '../output/sub_xgb_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

out_df = pd.DataFrame(pred_y[:,:3])
out_df.columns = ["low", "medium", "high"]
out_df["listing_id"] = sub_id
out_df.to_csv(sub_name, index=False)

# Tune XGBoost

In [8]:
learning_rate = 0.1
best_score = 1000
train_param = 0
for x in [3,4,5,6,7]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= x,
        nthread = -1,
        silent = False
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )
    
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x

    print x, '\t', rgr.best_score, rgr.best_iteration

3 	0.537709 1119
4 	0.536943 643
5 	0.536582 435
6 	0.537476 289
7 	0.53886 210


In [12]:
for x in [8,9]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= x,
        nthread = -1,
        silent = False
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )
    
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x

    print x, '\t', rgr.best_score, rgr.best_iteration

8 	0.539197 171
9 	0.544561 123


In [9]:
max_depth = train_param
print max_depth

5


In [10]:
train_param = 1
for x in [2,4,8,12,16,20]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )
    
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

2 	0.536208 411
4 	0.536915 486
8 	0.535435 407
12 	0.537371 437
16 	0.537278 436
20 	0.535844 443


In [11]:
for x in [24,28,32]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )
    
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

24 	0.535395 506
28 	0.535698 391
32 	0.535755 379


In [12]:
min_child_weight = train_param
print min_child_weight

24


In [13]:
train_param = 1
for x in [0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = min_child_weight,
        colsample_bytree = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )

    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

0.05 	0.542236 1009
0.1 	0.536086 779
0.2 	0.533585 475
0.3 	0.532998 560
0.4 	0.533129 522
0.5 	0.532483 501
0.6 	0.533895 501
0.7 	0.534892 406
0.8 	0.53459 532
0.9 	0.535261 515


In [14]:
colsample_bytree = train_param
print colsample_bytree

0.5


In [15]:
train_param = 1
for x in [0.5,0.6,0.7,0.8,0.9]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = min_child_weight,
        colsample_bytree = colsample_bytree,
        subsample = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

0.5 	0.538463 409
0.6 	0.535441 458
0.7 	0.535343 445
0.8 	0.534473 438
0.9 	0.532524 464


In [16]:
subsample = train_param
print subsample

1


In [17]:
train_param = 0
for x in [0.3, 0.6, 0.9, 1.2, 1.5, 1.8, 2.1, 2.4, 2.7, 3.0]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = min_child_weight,
        colsample_bytree = colsample_bytree,
        subsample = subsample,
        gamma = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )

    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

0.3 	0.533898 443
0.6 	0.533621 507
0.9 	0.532848 497
1.2 	0.533065 492
1.5 	0.534829 510
1.8 	0.533417 536
2.1 	0.534944 480
2.4 	0.534796 505
2.7 	0.534799 603
3.0 	0.533409 1083


In [18]:
gamma = train_param
print gamma

0


In [None]:
# 1.2 	0.530871 677

In [30]:
xgtrain = xgb.DMatrix(train_X, label=train_y) 

def xgb_evaluate(min_child_weight, colsample_bytree, max_depth, subsample, gamma):
    params = dict()
    params['objective']='multi:softprob'
    params['eval_metric']='mlogloss',
    params['num_class']=3
    params['silent']=1
    params['eta'] = 0.1
    params['verbose_eval'] = True
    params['min_child_weight'] = int(min_child_weight)
    params['colsample_bytree'] = max(min(colsample_bytree, 1), 0)
    params['max_depth'] = int(max_depth)
    params['subsample'] = max(min(subsample, 1), 0)
    params['gamma'] = max(gamma, 0)
    
    cv_result = xgb.cv(
        params, xgtrain, 
        num_boost_round=10000, nfold=5,
        metrics = 'mlogloss',
        seed=seed,callbacks=[xgb.callback.early_stop(50)]
    )
    
    return -cv_result['test-mlogloss-mean'].values[-1]


xgb_BO = BayesianOptimization(
    xgb_evaluate, 
    {
        'max_depth': (3,8),
        'min_child_weight': (12,28),
        'colsample_bytree': (0.2,0.6),
        'subsample': (0.8,1),
        'gamma': (0,3)
    }
)

xgb_BO.maximize(init_points=10, n_iter=40)

[31mInitialization[0m
[94m---------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |     gamma |   max_depth |   min_child_weight |   subsample | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.


KeyboardInterrupt: 

In [24]:
xgb_bo_scores = pd.DataFrame([[s[0]['max_depth'],
                               s[0]['min_child_weight'],
                               s[0]['colsample_bytree'],
                               s[0]['subsample'],
                               s[0]['gamma'],
                               s[1]] for s in zip(xgb_BO.res['all']['params'],xgb_BO.res['all']['values'])],
                            columns = ['max_depth',
                                       'min_child_weight',
                                       'colsample_bytree',
                                       'subsample',
                                       'gamma',
                                       'score'])
xgb_bo_scores=xgb_bo_scores.sort_values('score',ascending=False)
xgb_bo_scores.head(10)

Unnamed: 0,max_depth,min_child_weight,colsample_bytree,subsample,gamma,score
25,5.983142,15.522353,0.365431,0.894089,1.799546,-0.527589
23,5.974873,16.538875,0.485034,0.938367,0.114823,-0.527952
2,5.963303,12.117,0.433634,0.978037,1.715329,-0.528031
36,5.975359,18.867714,0.475715,0.976875,1.762219,-0.528089
27,5.16096,15.607619,0.492978,0.913682,1.7944,-0.528097
26,5.953624,15.884306,0.49016,0.931286,1.172554,-0.528127
8,5.980293,12.26287,0.382213,0.985894,0.013603,-0.528166
18,5.969315,27.978419,0.242325,0.972658,1.66523,-0.528323
5,5.833405,27.333782,0.493605,0.988708,1.759493,-0.528409
37,5.991305,12.978177,0.244283,0.907146,1.750585,-0.52845


In [28]:
def xgb_blend(estimators, train_x, train_y, test_x, fold, early_stopping_rounds=0):
    N_params = len(estimators)
    print ("Blend %d estimators for %d folds" % (N_params, fold))
    skf = KFold(n_splits=fold,random_state=seed)
    N_class = len(set(train_y))
        
    train_blend_x = np.zeros((train_x.shape[0], N_class*N_params))
    test_blend_x_mean = np.zeros((test_x.shape[0], N_class*N_params))
    test_blend_x_gmean = np.zeros((test_x.shape[0], N_class*N_params))
    scores = np.zeros ((fold,N_params))
    best_rounds = np.zeros ((fold, N_params))
    
    for j, est in enumerate(estimators):
        est.set_params(objective = 'multi:softprob')
        est.set_params(silent = False)
        est.set_params(learning_rate = 0.02)
        est.set_params(n_estimators=100000)
        
        print ("Model %d: %s" %(j+1, est))

        test_blend_x_j = np.zeros((test_x.shape[0], N_class*fold))
    
        for i, (train_index, val_index) in enumerate(skf.split(train_x)):
            print ("Model %d fold %d" %(j+1,i+1))
            fold_start = time.time() 
            train_x_fold = train_x.iloc[train_index]
            train_y_fold = train_y[train_index]
            val_x_fold = train_x.iloc[val_index]
            val_y_fold = train_y[val_index]      

            est.fit(train_x_fold,train_y_fold,
                    eval_set = [(val_x_fold, val_y_fold)],
                    eval_metric = 'mlogloss',
                    early_stopping_rounds=early_stopping_rounds,
                    verbose=False)
            best_round=est.best_iteration
            best_rounds[i,j]=best_round
            print ("best round %d" % (best_round))
            val_y_predict_fold = est.predict_proba(val_x_fold,ntree_limit=best_round)
            score = log_loss(val_y_fold, val_y_predict_fold)
            print ("Score: ", score)
            scores[i,j]=score
            train_blend_x[val_index, (j*N_class):(j+1)*N_class] = val_y_predict_fold
            
            test_blend_x_j[:,(i*N_class):(i+1)*N_class] = est.predict_proba(test_x,ntree_limit=best_round)
            print ("Model %d fold %d fitting finished in %0.3fs" % (j+1,i+1, time.time() - fold_start))
            
        test_blend_x_mean[:,(j*N_class):(j+1)*N_class] = \
                np.stack([test_blend_x_j[:,range(0,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(1,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(2,N_class*fold,N_class)].mean(1)]).T
        
        test_blend_x_gmean[:,(j*N_class):(j+1)*N_class] = \
                np.stack([gmean(test_blend_x_j[:,range(0,N_class*fold,N_class)], axis=1),
                          gmean(test_blend_x_j[:,range(1,N_class*fold,N_class)], axis=1),
                          gmean(test_blend_x_j[:,range(2,N_class*fold,N_class)], axis=1)]).T
            
        print ("Score for model %d is %f" % (j+1,np.mean(scores[:,j])))
    print ("Score for blended models is %f" % (np.mean(scores)))
    return (train_blend_x, test_blend_x_mean, test_blend_x_gmean, scores,best_rounds)


In [29]:
estimators = [
            xgb.XGBClassifier(max_depth = 5,
                              min_child_weight = 15,
                              colsample_bytree = 0.365431 ,
                              subsample = 0.894089 ,
                              gamma = 1.799546),
             xgb.XGBClassifier(max_depth = 5,
                              min_child_weight = 16,
                              colsample_bytree = 0.485034,
                              subsample = 0.938367,
                              gamma = 0.114823),
             xgb.XGBClassifier(max_depth = 5,
                              min_child_weight = 12,
                              colsample_bytree = 0.433634,
                              subsample = 0.978037,
                              gamma = 1.715329),         
             xgb.XGBClassifier(max_depth = 5,
                              min_child_weight = 18,
                              colsample_bytree = 0.475715,
                              subsample = 0.976875,
                              gamma = 1.762219),  
             xgb.XGBClassifier(max_depth = 5,
                              min_child_weight = 15,
                              colsample_bytree = 0.492978,
                              subsample = 0.913682,
                              gamma = 1.794400)              
             ]

#  	 	max_depth 	min_child_weight 	colsample_bytree 	subsample 	gamma 	 	score
# 25 	5.983142 	15.522353 	 	 	0.365431 	 	 	0.894089 	1.799546 	-0.527589
# 23 	5.974873 	16.538875 	 	 	0.485034 	 	 	0.938367 	0.114823 	-0.527952
# 2 	5.963303 	12.117000 	 	 	0.433634 	 	 	0.978037 	1.715329 	-0.528031
# 36 	5.975359 	18.867714 	 	 	0.475715 	 	 	0.976875 	1.762219 	-0.528089
# 27 	5.160960 	15.607619 	 	 	0.492978 	 	 	0.913682 	1.794400 	-0.528097
# 26 	5.953624 	15.884306 	 	 	0.490160 	 	 	0.931286 	1.172554 	-0.528127
# 8 	5.980293 	12.262870 	 	 	0.382213 	 	 	0.985894 	0.013603 	-0.528166
# 18 	5.969315 	27.978419 	 	 	0.242325 	 	 	0.972658 	1.665230 	-0.528323
# 5 	5.833405 	27.333782 	 	 	0.493605 	 	 	0.988708 	1.759493 	-0.528409
# 37 	5.991305 	12.978177 	 	 	0.244283 	 	 	0.907146 	1.750585 	-0.528450

(train_blend_x_xgb,
 test_blend_x_xgb_mean,
 test_blend_x_xgb_gmean,
 blend_scores_xgb,
 best_rounds_xgb) = xgb_blend(estimators,
                              train_X,train_y,
                              test_X,
                              10,
                              500)


Blend 5 estimators for 10 folds
Model 1: XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.365431,
       gamma=1.799546, learning_rate=0.02, max_delta_step=0, max_depth=5,
       min_child_weight=15, missing=None, n_estimators=100000, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=False, subsample=0.894089)
Model 1 fold 1
best round 3122
('Score: ', 0.51484276567931386)
Model 1 fold 1 fitting finished in 1661.400s
Model 1 fold 2
best round 4477
('Score: ', 0.50128107622723372)
Model 1 fold 2 fitting finished in 2956.466s
Model 1 fold 3


KeyboardInterrupt: 

In [27]:
now = datetime.now()

name_train_blend = '../output/train_blend_xgb_BM_0322_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_mean = '../output/test_blend_xgb_mean_BM_0322_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_gmean = '../output/test_blend_xgb_gmean_BM_0322_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'


print (np.mean(blend_scores_xgb,axis=0))
print (np.mean(best_rounds_xgb,axis=0))
np.savetxt(name_train_blend,train_blend_x_xgb, delimiter=",")
np.savetxt(name_test_blend_mean,test_blend_x_xgb_mean, delimiter=",")
np.savetxt(name_test_blend_gmean,test_blend_x_xgb_gmean, delimiter=",")

NameError: name 'blend_scores_xgb' is not defined

In [None]:
# [ 0.52385999  0.52420308  0.52429754  0.52366222  0.52450185]
# [ 2866.7  3979.7  3102.9  2783.1  4450.5]

In [50]:
# now = datetime.now()
sub_name = '../output/sub_XGB_mean_BM_MB_add_desc_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

out_df = pd.DataFrame(test_blend_x_xgb_mean[:,9:12])
out_df.columns = ["low", "medium", "high"]
out_df["listing_id"] = test_X.listing_id.values
out_df.to_csv(sub_name, index=False)


# ypreds.columns = cols

# df = pd.read_json(open("../input/test.json", "r"))
# ypreds['listing_id'] = df["listing_id"]

# ypreds.to_csv('my_preds.csv', index=None)

In [48]:
test_blend_x_xgb_gmean[:,9:12]


array([[  3.20544260e-01,   6.10888687e-01,   6.56750820e-02],
       [  9.66028463e-01,   2.30937453e-02,   1.06505838e-02],
       [  9.53579737e-01,   4.13955017e-02,   3.97691225e-03],
       ..., 
       [  9.78252564e-01,   2.03241753e-02,   1.01263996e-03],
       [  9.71474749e-01,   2.74180665e-02,   5.09567844e-04],
       [  5.87161787e-01,   3.92164954e-01,   1.94305868e-02]])

In [49]:
test_blend_x_xgb_gmean[:,:3]

array([[  3.20994298e-01,   6.04517100e-01,   7.12524217e-02],
       [  9.57150480e-01,   3.01748109e-02,   1.18646473e-02],
       [  9.58178383e-01,   3.74607705e-02,   3.46537444e-03],
       ..., 
       [  9.80787885e-01,   1.80516908e-02,   8.23179767e-04],
       [  9.70638017e-01,   2.80394743e-02,   5.14503672e-04],
       [  5.81934901e-01,   3.95535699e-01,   2.09051621e-02]])