In [1]:
import time
import numpy as np
import pandas as pd

from scipy import sparse
from datetime import datetime
from sklearn import preprocessing
from scipy.stats import skew, boxcox,boxcox_normmax
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from bayes_opt import BayesianOptimization
from sklearn.metrics import log_loss

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import xgboost as xgb

seed = 1234



# Load Data

In [2]:
data_path = "../input/"

train_df = pd.read_pickle(data_path + 'train_2017-03-05-22-40.pkl')
train_y = pd.read_pickle(data_path + 'y_2017-03-05-22-40.pkl')
test_df = pd.read_pickle(data_path + 'test_2017-03-05-22-40.pkl')
features_to_use = pd.read_pickle(data_path + 'featurestouse_2017-03-05-22-40.pkl')

tr_desc_sparse = pd.read_pickle(data_path + 'tr_desc_sparse_2017-03-05-22-40.pkl')
tr_feat_sparse = pd.read_pickle(data_path + 'tr_feat_sparse_2017-03-05-22-40.pkl')
te_desc_sparse = pd.read_pickle(data_path + 'te_desc_sparse_2017-03-05-22-40.pkl')
te_feat_sparse = pd.read_pickle(data_path + 'te_feat_sparse_2017-03-05-22-40.pkl')

desc_sparse_cols = pd.read_pickle(data_path + 'desc_sparse_cols_2017-03-05-22-40.pkl')
feat_sparse_cols = pd.read_pickle(data_path + 'feat_sparse_cols_2017-03-05-22-40.pkl')

In [3]:
train_X = pd.read_pickle(data_path + 'train_y_0319.pkl')
test_X = pd.read_pickle(data_path + 'test_X_0319.pkl')


all_features = features_to_use + desc_sparse_cols + feat_sparse_cols
print train_X.shape, test_X.shape

(49352, 457) (74659, 457)


In [28]:
X_train, X_val, y_train, y_val = train_test_split(train_X, train_y, train_size=.80, random_state=1234)
print X_train.shape
print X_val.shape
# xgtrain = xgb.DMatrix(X_train, label=y_train)

(39481, 457)
(9871, 457)


In [10]:
rgr = xgb.XGBClassifier(objective = 'multi:softprob',
                       learning_rate = 0.1,
                       n_estimators = 10000,
                       nthread = -1)

rgr.fit(X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
#         num_class = 3,
        early_stopping_rounds=50,
        verbose=25
       )

[0]	validation_0-mlogloss:1.04205
Will train until validation_0-mlogloss hasn't improved in 50 rounds.
[25]	validation_0-mlogloss:0.677803
[50]	validation_0-mlogloss:0.636772
[75]	validation_0-mlogloss:0.618659
[100]	validation_0-mlogloss:0.607344
[125]	validation_0-mlogloss:0.599361
[150]	validation_0-mlogloss:0.594042
[175]	validation_0-mlogloss:0.589287
[200]	validation_0-mlogloss:0.585338
[225]	validation_0-mlogloss:0.582099
[250]	validation_0-mlogloss:0.579162
[275]	validation_0-mlogloss:0.57698
[300]	validation_0-mlogloss:0.575315
[325]	validation_0-mlogloss:0.573552
[350]	validation_0-mlogloss:0.571865
[375]	validation_0-mlogloss:0.570756
[400]	validation_0-mlogloss:0.569717
[425]	validation_0-mlogloss:0.568638
[450]	validation_0-mlogloss:0.567676
[475]	validation_0-mlogloss:0.567038
[500]	validation_0-mlogloss:0.5663
[525]	validation_0-mlogloss:0.565627
[550]	validation_0-mlogloss:0.564942
[575]	validation_0-mlogloss:0.564367
[600]	validation_0-mlogloss:0.563886
[625]	validatio

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=10000, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [13]:
pred_y = rgr.predict_proba(test_X, ntree_limit = rgr.best_iteration)

In [14]:
now = datetime.now()
sub_name = '../output/sub_xgb_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

out_df = pd.DataFrame(pred_y[:,:3])
out_df.columns = ["high", "medium", "low"]
out_df["listing_id"] = test_df.listing_id.values
out_df.to_csv(sub_name, index=False)

# Tune XGBoost

In [34]:
learning_rate = 0.1
best_score = 1000
train_param = 0
for x in [3,4,5,6,7,8,9,10]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= x,
        nthread = -1,
        silent = False
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )
    
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x

    print x, '\t', rgr.best_score, rgr.best_iteration

3 	0.555392 1038
4 	0.552951 554
5 	0.551107 382
6 	0.552202 317
7 	0.553047 200
8 	0.555228 146
9 	0.560529 123
10 	0.566363 87


In [35]:
max_depth = train_param
print train_param
# 3 	0.561939
# 4 	0.558393
# 5 	0.55522
# 6 	0.558043
# 7 	0.55804
# 8 	0.561611
# 9 	0.56762
# 10 	0.572505

5


In [36]:
best_score = 1000
train_param = 0
for x in [5,10,15,20,25,30,40,50,80,120,180]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )
    
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

5 	0.551747 447
10 	0.548771 495
15 	0.550488 407
20 	0.551501 441
25 	0.549999 420
30 	0.549632 497
40 	0.550637 490
50 	0.547949 555
80 	0.549408 625
120 	0.55105 654
180 	0.554957 494


In [37]:
min_child_weight = train_param
print min_child_weight
# 5 	0.556379
# 10 	0.55509
# 15 	0.555654
# 20 	0.554354
# 25 	0.557894
# 30 	0.558761
# 40 	0.554595
# 50 	0.559861
# 80 	0.556099
# 120 	0.560399
# 180 	0.55925
# 240 	0.566327
# 300 	0.567732


50


In [38]:
best_score = 1000
train_param = 0
for x in [0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = min_child_weight,
        colsample_bytree = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )

    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

0.3 	0.550644
0.4 	0.548466
0.5 	0.549137
0.6 	0.549148
0.7 	0.550557
0.8 	0.549855
0.9 	0.550225
1 	0.547949


In [39]:
colsample_bytree = train_param
print train_param

1


In [40]:
best_score = 1000
train_param = 0
for x in [0.5,0.6,0.7,0.8,0.9,1]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = min_child_weight,
        colsample_bytree = colsample_bytree,
        subsample = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

0.5 	0.553251 518
0.6 	0.554923 507
0.7 	0.551352 447
0.8 	0.548554 544
0.9 	0.549394 464
1 	0.547949 555


In [41]:
subsample = train_param
print train_param

1


In [42]:
best_score = 1000
train_param = 0
for x in [0, 0.3, 0.6, 0.9, 1.2, 1.5, 1.8, 2.1, 2.4, 2.7, 3.0]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = min_child_weight,
        colsample_bytree = colsample_bytree,
        subsample = subsample,
        gamma = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )

    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

0 	0.547949 555
0.3 	0.550121 440
0.6 	0.549192 523
0.9 	0.548671 525
1.2 	0.54916 500
1.5 	0.551122 397
1.8 	0.549421 488
2.1 	0.549759 492
2.4 	0.551915 372
2.7 	0.550643 390
3.0 	0.551964 384


In [43]:
gamma = train_param
print train_param

0


In [None]:
xgtrain = xgb.DMatrix(train_X, label=train_y) 

def xgb_evaluate(min_child_weight, colsample_bytree, max_depth, subsample, gamma):
    params = dict()
    params['objective']='multi:softprob'
    params['eval_metric']='mlogloss',
    params['num_class']=3
    params['silent']=1
    params['eta'] = 0.1
    params['verbose_eval'] = True
    params['min_child_weight'] = int(min_child_weight)
    params['colsample_bytree'] = max(min(colsample_bytree, 1), 0)
    params['max_depth'] = int(max_depth)
    params['subsample'] = max(min(subsample, 1), 0)
    params['gamma'] = max(gamma, 0)
    
    cv_result = xgb.cv(
        params, xgtrain, 
        num_boost_round=10000, nfold=10,
        metrics = 'mlogloss',
        seed=seed,callbacks=[xgb.callback.early_stop(50)]
    )
    
    return -cv_result['test-mlogloss-mean'].values[-1]


xgb_BO = BayesianOptimization(
    xgb_evaluate, 
    {
        'max_depth': (3,6),
        'min_child_weight': (5,80),
        'colsample_bytree': (0.7,1),
        'subsample': (0.7,1),
        'gamma': (0,2.1)
    }
)

xgb_BO.maximize(init_points=10, n_iter=40)

[31mInitialization[0m
[94m---------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |     gamma |   max_depth |   min_child_weight |   subsample | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.


In [43]:
xgb_bo_scores = pd.DataFrame([[s[0]['max_depth'],
                               s[0]['min_child_weight'],
                               s[0]['colsample_bytree'],
                               s[0]['subsample'],
                               s[0]['gamma'],
                               s[1]] for s in zip(xgb_BO.res['all']['params'],xgb_BO.res['all']['values'])],
                            columns = ['max_depth',
                                       'min_child_weight',
                                       'colsample_bytree',
                                       'subsample',
                                       'gamma',
                                       'score'])
xgb_bo_scores=xgb_bo_scores.sort_values('score',ascending=False)
xgb_bo_scores.head(10)

Unnamed: 0,max_depth,min_child_weight,colsample_bytree,subsample,gamma,score
1,4.977517,7.085047,0.755216,0.832299,2.07225,-0.54332
12,5.294358,6.430458,0.77867,0.794628,2.053074,-0.543492
10,4.831209,1.988818,0.976038,0.791368,1.601972,-0.543742
2,5.337825,2.370541,0.747375,0.727643,1.396417,-0.544077
3,3.844964,2.067151,0.82866,0.821156,1.951036,-0.54445
0,4.003396,2.082787,0.999265,0.893346,0.512947,-0.544678
25,4.607936,5.782687,0.904582,0.933866,0.366536,-0.545158
17,5.319597,25.284915,0.932358,0.94935,2.065272,-0.545291
13,4.126909,16.160868,0.857025,0.913964,0.708522,-0.545368
21,3.624415,8.16612,0.709067,0.845712,1.583732,-0.545375


In [12]:
def xgb_blend(estimators, train_x, train_y, test_x, fold, early_stopping_rounds=0):
    N_params = len(estimators)
    print ("Blend %d estimators for %d folds" % (N_params, fold))
    skf = KFold(n_splits=fold,random_state=seed)
    N_class = len(set(train_y))
        
    train_blend_x = np.zeros((train_x.shape[0], N_class*N_params))
    test_blend_x = np.zeros((test_x.shape[0], N_class*N_params))
    scores = np.zeros ((fold,N_params))
    best_rounds = np.zeros ((fold, N_params))
    
    for j, est in enumerate(estimators):
        est.set_params(objective = 'multi:softprob')
        est.set_params(silent = False)
        est.set_params(learning_rate = 0.03)
        est.set_params(n_estimators=100000)
        
        print ("Model %d: %s" %(j+1, est))

        test_blend_x_j = np.zeros((test_x.shape[0], N_class*fold))
    
        for i, (train_index, val_index) in enumerate(skf.split(train_x)):
            print ("Model %d fold %d" %(j+1,i+1))
            fold_start = time.time() 
            train_x_fold = train_x[train_index]
            train_y_fold = train_y[train_index]
            val_x_fold = train_x[val_index]
            val_y_fold = train_y[val_index]   

            est.fit(train_x_fold,train_y_fold,
                    eval_set = [(val_x_fold, val_y_fold)],
                    eval_metric = 'mlogloss',
                    early_stopping_rounds=early_stopping_rounds,
                    verbose=False)
            best_round=est.best_iteration
            best_rounds[i,j]=best_round
            print ("best round %d" % (best_round))
            val_y_predict_fold = est.predict_proba(val_x_fold,ntree_limit=best_round)
            score = log_loss(val_y_fold, val_y_predict_fold)
            print ("Score: ", score)
            scores[i,j]=score
            train_blend_x[val_index, (j*N_class):(j+1)*N_class] = val_y_predict_fold
            
            test_blend_x_j[:,(i*N_class):(i+1)*N_class] = est.predict_proba(test_x,ntree_limit=best_round)
            print ("Model %d fold %d fitting finished in %0.3fs" % (j+1,i+1, time.time() - fold_start))
            
        test_blend_x[:,(j*N_class):(j+1)*N_class] = \
                np.stack([test_blend_x_j[:,range(0,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(1,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(2,N_class*fold,N_class)].mean(1)]).T
        print ("Score for model %d is %f" % (j+1,np.mean(scores[:,j])))
    print ("Score for blended models is %f" % (np.mean(scores)))
    return (train_blend_x, test_blend_x, scores,best_rounds)


In [13]:
estimators = [xgb.XGBClassifier(max_depth = 6,
                              min_child_weight = 5,
                              colsample_bytree = 0.6,
                              subsample = 0.9,
                              gamma = 3),
#              xgb.XGBClassifier(max_depth = 5,
#                               min_child_weight = 6,
#                               colsample_bytree = 0.778670,
#                               subsample = 0.794628,
#                               gamma = 2.053074)
             ]





# xgb_params = [{'max_depth':6,
#                'min_child_weight':5,
#                'colsample_bytree':0.6,
#                'subsample':0.9,
#                'gamma':3},
# #               score -0.543320        
#               {'max_depth':5,
#                'min_child_weight':6,
#                'colsample_bytree':0.778670,
#                'subsample':0.794628,
#                'gamma':2.053074},
#               score -0.543492
#               {'max_depth':4,
#                'min_child_weight':1,
#                'colsample_bytree':0.976038,
#                'subsample':0.791368,
#                'gamma':1.601972},
# #               score -0.543742  
#               {'max_depth':5,
#                'min_child_weight':2,
#                'colsample_bytree':0.747375,
#                'subsample':0.727643,
#                'gamma':1.396417},
# #               score -0.544077      
#               {'max_depth':3,
#                'min_child_weight':2,
#                'colsample_bytree':0.828660,
#                'subsample':0.821156,
#                'gamma':1.951036}
# #               score -0.544450
#              ]

(train_blend_x_xgb,
 test_blend_x_xgb,
 blend_scores_xgb,
 best_rounds_xgb) = xgb_blend(estimators,
                              train_X,train_y,
                              test_X,
                              10,
                              300)

# print (np.mean(blend_scores_xgb_le,axis=0))
# print (np.mean(best_rounds_xgb_le,axis=0))

Blend 1 estimators for 10 folds
Model 1: XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.6,
       gamma=3, learning_rate=0.03, max_delta_step=0, max_depth=6,
       min_child_weight=5, missing=None, n_estimators=100000, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=False, subsample=0.9)
Model 1 fold 1
best round 4153
('Score: ', 0.54316354151854473)
Model 1 fold 1 fitting finished in 563.207s
Model 1 fold 2
best round 4213
('Score: ', 0.53714232689948793)
Model 1 fold 2 fitting finished in 575.166s
Model 1 fold 3
best round 4785
('Score: ', 0.51961801525155782)
Model 1 fold 3 fitting finished in 643.666s
Model 1 fold 4
best round 3268
('Score: ', 0.53183819808578614)
Model 1 fold 4 fitting finished in 452.333s
Model 1 fold 5
best round 3046
('Score: ', 0.52786289571353395)
Model 1 fold 5 fitting finished in 423.868s
Model 1 fold 6
best round 4562
('Score: ', 0.53641104075152768)
Model 1 fold 6 

In [22]:
now = datetime.now()

name_train_blend = '../output/train_blend_xgb_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend = '../output/test_blend_xgb_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'



print (np.mean(blend_scores_xgb,axis=0))
print (np.mean(best_rounds_xgb,axis=0))
np.savetxt(name_train_blend,train_blend_x_xgb, delimiter=",")
np.savetxt(name_test_blend,test_blend_x_xgb, delimiter=",")

[ 0.54393943  0.54340168  0.54486921  0.54463413  0.5455042 ]
4024.2


In [130]:
test_blend_x_xgb[:,:3]


array([[ 0.05068891,  0.47914108,  0.47017003],
       [ 0.00355554,  0.03459964,  0.96184484],
       [ 0.09190651,  0.43008367,  0.47800982],
       ..., 
       [ 0.04040814,  0.38965375,  0.5699381 ],
       [ 0.06617295,  0.5109495 ,  0.42287757],
       [ 0.0445355 ,  0.33600247,  0.61946204]])

In [14]:
now = datetime.now()
sub_name = '../output/sub_XGB_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

out_df = pd.DataFrame(test_blend_x_xgb[:,:3])
out_df.columns = ["high", "medium", "low"]
out_df["listing_id"] = test_df.listing_id.values
out_df.to_csv(sub_name, index=False)


# ypreds.columns = cols

# df = pd.read_json(open("../input/test.json", "r"))
# ypreds['listing_id'] = df["listing_id"]

# ypreds.to_csv('my_preds.csv', index=None)