In [1]:
import time
import numpy as np
import pandas as pd

from scipy import sparse
from scipy.stats.mstats import gmean
from datetime import datetime
from sklearn import preprocessing
from scipy.stats import skew, boxcox,boxcox_normmax
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from bayes_opt import BayesianOptimization
from sklearn.metrics import log_loss

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import xgboost as xgb

seed = 1234



# Load Data

In [2]:
# https://www.kaggle.com/brandenkmurray/two-sigma-connect-rental-listing-inquiries/it-is-lit/comments
# param <- list(booster="gbtree",
#               objective="multi:softprob",
#               eval_metric="mlogloss",
#               nthread=13,
#               num_class=3,
#               eta = .02,
#               gamma = 1,
#               max_depth = 4,
#               min_child_weight = 1,
#               subsample = .7,
#               colsample_bytree = .5
# )
# xgb2 <- xgb.train(data = dtrain,
#                   params = param,
#                   nrounds = 2710
# )

data_path = "../input/"
train_X = pd.read_csv(data_path + 'train_BrandenMurray.csv')
test_X = pd.read_csv(data_path + 'test_BrandenMurray.csv')
train_y = np.ravel(pd.read_csv(data_path + 'labels_BrandenMurray.csv'))

# all_features = features_to_use + desc_sparse_cols + feat_sparse_cols
print train_X.shape, test_X.shape, train_y.shape

(49352, 285) (74659, 285) (49352,)


In [13]:
X_train, X_val, y_train, y_val = train_test_split(train_X, train_y, train_size=.80, random_state=1234)
print X_train.shape
print X_val.shape
# xgtrain = xgb.DMatrix(X_train, label=y_train)

(39481, 285)
(9871, 285)


In [10]:
rgr = xgb.XGBClassifier(objective = 'multi:softprob',
                       learning_rate = 0.1,
                       n_estimators = 10000,
                       nthread = -1)

rgr.fit(X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
#         num_class = 3,
        early_stopping_rounds=50,
        verbose=25
       )

[0]	validation_0-mlogloss:1.04205
Will train until validation_0-mlogloss hasn't improved in 50 rounds.
[25]	validation_0-mlogloss:0.677803
[50]	validation_0-mlogloss:0.636772
[75]	validation_0-mlogloss:0.618659
[100]	validation_0-mlogloss:0.607344
[125]	validation_0-mlogloss:0.599361
[150]	validation_0-mlogloss:0.594042
[175]	validation_0-mlogloss:0.589287
[200]	validation_0-mlogloss:0.585338
[225]	validation_0-mlogloss:0.582099
[250]	validation_0-mlogloss:0.579162
[275]	validation_0-mlogloss:0.57698
[300]	validation_0-mlogloss:0.575315
[325]	validation_0-mlogloss:0.573552
[350]	validation_0-mlogloss:0.571865
[375]	validation_0-mlogloss:0.570756
[400]	validation_0-mlogloss:0.569717
[425]	validation_0-mlogloss:0.568638
[450]	validation_0-mlogloss:0.567676
[475]	validation_0-mlogloss:0.567038
[500]	validation_0-mlogloss:0.5663
[525]	validation_0-mlogloss:0.565627
[550]	validation_0-mlogloss:0.564942
[575]	validation_0-mlogloss:0.564367
[600]	validation_0-mlogloss:0.563886
[625]	validatio

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=10000, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [13]:
pred_y = rgr.predict_proba(test_X, ntree_limit = rgr.best_iteration)

In [14]:
now = datetime.now()
sub_name = '../output/sub_xgb_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

out_df = pd.DataFrame(pred_y[:,:3])
out_df.columns = ["high", "medium", "low"]
out_df["listing_id"] = test_df.listing_id.values
out_df.to_csv(sub_name, index=False)

# Tune XGBoost

In [14]:
learning_rate = 0.1
best_score = 1000
train_param = 0
for x in [3,4,5,6,7,8,9,10]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= x,
        nthread = -1,
        silent = False
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )
    
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x

    print x, '\t', rgr.best_score, rgr.best_iteration

3 	0.542025 1055
4 	0.538678 715
5 	0.540994 424
6 	0.541306 311


KeyboardInterrupt: 

In [15]:
max_depth = train_param
print train_param
# 3 	0.542025 1055
# 4 	0.538678 715
# 5 	0.540994 424
# 6 	0.541306 311

4


In [33]:
train_param = 1
for x in [2,4,8,16]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )
    
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

2 	0.539881 572
4 	0.539478 637
8 	0.54012 688
16 	0.540211 688


In [34]:
min_child_weight = train_param
print min_child_weight

1


In [35]:
train_param = 1
for x in [0.3,0.4,0.5,0.6,0.7,0.8,0.9]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = min_child_weight,
        colsample_bytree = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )

    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

0.3 	0.537705 746
0.4 	0.538701 701
0.5 	0.536984 746
0.6 	0.538665 602
0.7 	0.539456 684
0.8 	0.538669 675
0.9 	0.538632 702


In [36]:
colsample_bytree = train_param
print train_param

0.5


In [37]:
train_param = 1
for x in [0.5,0.6,0.7,0.8,0.9]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = min_child_weight,
        colsample_bytree = colsample_bytree,
        subsample = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )
    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

0.5 	0.539962 533
0.6 	0.538777 705
0.7 	0.539737 575
0.8 	0.536025 767
0.9 	0.536844 753


In [38]:
subsample = train_param
print train_param

0.8


In [39]:
train_param = 0
for x in [0.3, 0.6, 0.9, 1.2, 1.5, 1.8, 2.1, 2.4, 2.7, 3.0]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = min_child_weight,
        colsample_bytree = colsample_bytree,
        subsample = subsample,
        gamma = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )

    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

0.3 	0.536765 690
0.6 	0.536754 690
0.9 	0.535341 721
1.2 	0.537316 604
1.5 	0.535724 689
1.8 	0.53678 689
2.1 	0.536009 759
2.4 	0.536206 786
2.7 	0.536567 801
3.0 	0.53716 1051


In [40]:
gamma = train_param
print gamma

0.9


In [41]:
for x in [0.8, 1, 1.1]:
    rgr = xgb.XGBClassifier(
        objective='multi:softprob',
        seed = 1234, # use a fixed seed during tuning so we can reproduce the results
        learning_rate = learning_rate,
        n_estimators = 10000,
        max_depth= max_depth,
        nthread = -1,
        silent = False,
        min_child_weight = min_child_weight,
        colsample_bytree = colsample_bytree,
        subsample = subsample,
        gamma = x
    )
    rgr.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )

    if rgr.best_score < best_score:
        best_score = rgr.best_score
        train_param = x
        

    print x, '\t', rgr.best_score, rgr.best_iteration

0.8 	0.536286 730
1 	0.536515 572
1.1 	0.536068 692


In [42]:
gamma = train_param
print gamma

0.9


In [3]:
xgtrain = xgb.DMatrix(train_X, label=train_y) 

def xgb_evaluate(min_child_weight, colsample_bytree, max_depth, subsample, gamma):
    params = dict()
    params['objective']='multi:softprob'
    params['eval_metric']='mlogloss',
    params['num_class']=3
    params['silent']=1
    params['eta'] = 0.1
    params['verbose_eval'] = True
    params['min_child_weight'] = int(min_child_weight)
    params['colsample_bytree'] = max(min(colsample_bytree, 1), 0)
    params['max_depth'] = int(max_depth)
    params['subsample'] = max(min(subsample, 1), 0)
    params['gamma'] = max(gamma, 0)
    
    cv_result = xgb.cv(
        params, xgtrain, 
        num_boost_round=10000, nfold=5,
        metrics = 'mlogloss',
        seed=seed,callbacks=[xgb.callback.early_stop(50)]
    )
    
    return -cv_result['test-mlogloss-mean'].values[-1]


xgb_BO = BayesianOptimization(
    xgb_evaluate, 
    {
        'max_depth': (3,6),
        'min_child_weight': (1,32),
        'colsample_bytree': (0.4,0.7),
        'subsample': (0.7,1),
        'gamma': (0.6,1.8)
    }
)

xgb_BO.maximize(init_points=10, n_iter=40)

[31mInitialization[0m
[94m---------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |     gamma |   max_depth |   min_child_weight |   subsample | 
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.


KeyboardInterrupt: 

In [4]:
xgb_bo_scores = pd.DataFrame([[s[0]['max_depth'],
                               s[0]['min_child_weight'],
                               s[0]['colsample_bytree'],
                               s[0]['subsample'],
                               s[0]['gamma'],
                               s[1]] for s in zip(xgb_BO.res['all']['params'],xgb_BO.res['all']['values'])],
                            columns = ['max_depth',
                                       'min_child_weight',
                                       'colsample_bytree',
                                       'subsample',
                                       'gamma',
                                       'score'])
xgb_bo_scores=xgb_bo_scores.sort_values('score',ascending=False)
xgb_bo_scores.head(10)

Unnamed: 0,max_depth,min_child_weight,colsample_bytree,subsample,gamma,score


In [6]:
def xgb_blend(estimators, train_x, train_y, test_x, fold, early_stopping_rounds=0):
    N_params = len(estimators)
    print ("Blend %d estimators for %d folds" % (N_params, fold))
    skf = KFold(n_splits=fold,random_state=seed)
    N_class = len(set(train_y))
        
    train_blend_x = np.zeros((train_x.shape[0], N_class*N_params))
    test_blend_x_mean = np.zeros((test_x.shape[0], N_class*N_params))
    test_blend_x_gmean = np.zeros((test_x.shape[0], N_class*N_params))
    scores = np.zeros ((fold,N_params))
    best_rounds = np.zeros ((fold, N_params))
    
    for j, est in enumerate(estimators):
        est.set_params(objective = 'multi:softprob')
        est.set_params(silent = False)
        est.set_params(learning_rate = 0.02)
        est.set_params(n_estimators=100000)
        
        print ("Model %d: %s" %(j+1, est))

        test_blend_x_j = np.zeros((test_x.shape[0], N_class*fold))
    
        for i, (train_index, val_index) in enumerate(skf.split(train_x)):
            print ("Model %d fold %d" %(j+1,i+1))
            fold_start = time.time() 
            train_x_fold = train_x.iloc[train_index]
            train_y_fold = train_y[train_index]
            val_x_fold = train_x.iloc[val_index]
            val_y_fold = train_y[val_index]      

            est.fit(train_x_fold,train_y_fold,
                    eval_set = [(val_x_fold, val_y_fold)],
                    eval_metric = 'mlogloss',
                    early_stopping_rounds=early_stopping_rounds,
                    verbose=False)
            best_round=est.best_iteration
            best_rounds[i,j]=best_round
            print ("best round %d" % (best_round))
            val_y_predict_fold = est.predict_proba(val_x_fold,ntree_limit=best_round)
            score = log_loss(val_y_fold, val_y_predict_fold)
            print ("Score: ", score)
            scores[i,j]=score
            train_blend_x[val_index, (j*N_class):(j+1)*N_class] = val_y_predict_fold
            
            test_blend_x_j[:,(i*N_class):(i+1)*N_class] = est.predict_proba(test_x,ntree_limit=best_round)
            print ("Model %d fold %d fitting finished in %0.3fs" % (j+1,i+1, time.time() - fold_start))
            
        test_blend_x_mean[:,(j*N_class):(j+1)*N_class] = \
                np.stack([test_blend_x_j[:,range(0,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(1,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(2,N_class*fold,N_class)].mean(1)]).T
        
        test_blend_x_gmean[:,(j*N_class):(j+1)*N_class] = \
                np.stack([gmean(test_blend_x_j[:,range(0,N_class*fold,N_class)], axis=1),
                          gmean(test_blend_x_j[:,range(1,N_class*fold,N_class)], axis=1),
                          gmean(test_blend_x_j[:,range(2,N_class*fold,N_class)], axis=1)]).T
            
        print ("Score for model %d is %f" % (j+1,np.mean(scores[:,j])))
    print ("Score for blended models is %f" % (np.mean(scores)))
    return (train_blend_x, test_blend_x_mean, test_blend_x_gmean, scores,best_rounds)


In [7]:
estimators = [
#     xgb.XGBClassifier(max_depth = 6,
#                               min_child_weight = 25,
#                               colsample_bytree = 0.5087 ,
#                               subsample = 0.9127 ,
#                               gamma = 1.4976),
#              xgb.XGBClassifier(max_depth = 5,
#                               min_child_weight = 22,
#                               colsample_bytree = 0.4111,
#                               subsample = 0.8606,
#                               gamma = 1.3973),
             xgb.XGBClassifier(max_depth = 4,
                              min_child_weight = 1,
                              colsample_bytree = 0.5,
                              subsample = 0.7,
                              gamma = 1),              
             ]



(train_blend_x_xgb,
 test_blend_x_xgb_mean,
 test_blend_x_xgb_gmean,
 blend_scores_xgb,
 best_rounds_xgb) = xgb_blend(estimators,
                              train_X,train_y,
                              test_X,
                              10,
                              300)

# print (np.mean(blend_scores_xgb_le,axis=0))
# print (np.mean(best_rounds_xgb_le,axis=0))

Blend 1 estimators for 10 folds
Model 1: XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.5,
       gamma=1, learning_rate=0.02, max_delta_step=0, max_depth=4,
       min_child_weight=1, missing=None, n_estimators=100000, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=False, subsample=0.7)
Model 1 fold 1
best round 3393
('Score: ', 0.52033469833776624)
Model 1 fold 1 fitting finished in 1163.118s
Model 1 fold 2
best round 5117
('Score: ', 0.50508494291569817)
Model 1 fold 2 fitting finished in 1732.264s
Model 1 fold 3
best round 4925
('Score: ', 0.52640409608140426)
Model 1 fold 3 fitting finished in 1644.224s
Model 1 fold 4
best round 4321
('Score: ', 0.50730211597415953)
Model 1 fold 4 fitting finished in 1442.580s
Model 1 fold 5
best round 4117
('Score: ', 0.53490438336772639)
Model 1 fold 5 fitting finished in 1375.703s
Model 1 fold 6
best round 4101
('Score: ', 0.52610811454384954)
Model 1 fo

In [8]:
now = datetime.now()

name_train_blend = '../output/train_blend_xgb_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_mean = '../output/test_blend_xgb_mean_BM_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_gmean = '../output/test_blend_xgb_gmean_BM_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'


print (np.mean(blend_scores_xgb,axis=0))
print (np.mean(best_rounds_xgb,axis=0))
np.savetxt(name_train_blend,train_blend_x_xgb, delimiter=",")
np.savetxt(name_test_blend_mean,test_blend_x_xgb_mean, delimiter=",")
np.savetxt(name_test_blend_gmean,test_blend_x_xgb_gmean, delimiter=",")

[ 0.52819574]
[ 4418.2]


In [9]:
test_blend_x_xgb_gmean[:,:3]


array([[  5.03001129e-01,   4.29423432e-01,   6.47619821e-02],
       [  9.66029106e-01,   2.41776605e-02,   9.27577514e-03],
       [  9.47215352e-01,   4.78949875e-02,   3.84072897e-03],
       ..., 
       [  9.81110307e-01,   1.81140445e-02,   5.52531050e-04],
       [  9.75277521e-01,   2.41763717e-02,   2.58555912e-04],
       [  6.04880104e-01,   3.60453272e-01,   3.18548085e-02]])

In [10]:
test_blend_x_xgb_mean[:,:3]

array([[  5.03994900e-01,   4.30403882e-01,   6.56012200e-02],
       [  9.66035891e-01,   2.43881196e-02,   9.57598523e-03],
       [  9.47266597e-01,   4.87686567e-02,   3.96475056e-03],
       ..., 
       [  9.81113887e-01,   1.83079859e-02,   5.78121052e-04],
       [  9.75284213e-01,   2.44465699e-02,   2.69220243e-04],
       [  6.05839831e-01,   3.61796886e-01,   3.23632663e-02]])

In [14]:
# now = datetime.now()
sub_name = '../output/sub_XGB_mean_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

out_df = pd.DataFrame(test_blend_x_xgb_mean[:,:3])
out_df.columns = ["low", "medium", "high"]
out_df["listing_id"] = test_X.listing_id.values
out_df.to_csv(sub_name, index=False)


# ypreds.columns = cols

# df = pd.read_json(open("../input/test.json", "r"))
# ypreds['listing_id'] = df["listing_id"]

# ypreds.to_csv('my_preds.csv', index=None)