In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import time
from sklearn.model_selection import train_test_split
import xgboost

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_rows', 1000)

from sklearn.metrics import log_loss
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import StratifiedKFold
from bayes_opt import BayesianOptimization

# the bayesian optimisation library throws a lot of warning message, so for readability we disable warning in this notebook.
# *NOT* encouraged if you want to find out what is going on under the cover :) 
import warnings
warnings.filterwarnings("ignore") 

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
print(check_output(["ls", "../../../mltestdata/07_telstra/"]).decode("utf8"))

# Any results you write to the current directory are saved as output.

event_type.csv
log_feature.csv
resource_type.csv
sample_submission.csv
severity_type.csv
test.csv
train.csv



# The case against gridsearch

We haven't touched on hyper-parameter tuning in this series of notebooks. In many of the examples you find on the internet, you will see people using grid search techniques to identify a set of parameters for a given machine learning algorithm. This makes sense if you are looking at a small set of parameters.

However, when the number of hyper-parameters is larger - for instance, there are at least 10 XGB parameters that people often tune - the time cost associated with an exhaustive grid search quickly becomes prohibitive.  You can refer to this article to see a concise argument on [*why gridsearch is plain stupid*](https://medium.com/rants-on-machine-learning/smarter-parameter-sweeps-or-why-grid-search-is-plain-stupid-c17d97a0e881)

In this notebook, we have a look at an alternative technique for hyper-parameter tuning: Bayesian Optimisation. More technical details about this approach can be found [here](https://github.com/fmfn/BayesianOptimization). Here we focus on illustrating how we can apply this technique to the Telstra dataset.

First of all, let's replicate the whole pipeline from data loading to some simple feature engineering:

# Data loading and reshaping

In [2]:
def str_to_num(string):
    """Return the second space-separated token of ``string`` converted to ``int``.

    Used as a ``pd.read_csv`` converter in this notebook.
    """
    tokens = string.split(" ")
    return int(tokens[1])

def _read_numeric_csv(path, column):
    """Read ``path`` with ``column`` parsed through ``str_to_num``."""
    return pd.read_csv(path, converters={column: str_to_num})


# train / test are keyed by location, the four lookup tables by their own type column
train = _read_numeric_csv('../../../mltestdata/07_telstra/train.csv', 'location')
test = _read_numeric_csv('../../../mltestdata/07_telstra/test.csv', 'location')
event = _read_numeric_csv('../../../mltestdata/07_telstra/event_type.csv', 'event_type')
log_feature = _read_numeric_csv('../../../mltestdata/07_telstra/log_feature.csv', 'log_feature')
severity = _read_numeric_csv('../../../mltestdata/07_telstra/severity_type.csv', 'severity_type')
resource = _read_numeric_csv('../../../mltestdata/07_telstra/resource_type.csv', 'resource_type')

# the sample submission is read as-is, without any converter
sample = pd.read_csv('../../../mltestdata/07_telstra/sample_submission.csv')

In [3]:
# merge train and test set for now
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0. Like append,
# it stacks the rows and keeps each frame's own index labels.
traintest=pd.concat([train, test])

# create resource one-hot data per id:
# one indicator column per resource_type value, summed so that each id ends up on a single row
resource_by_id=pd.get_dummies(resource,columns=['resource_type'])
resource_by_id=resource_by_id.groupby(['id']).sum().reset_index(drop=False)

# create event one-hot data per id (same recipe as for the resource table)
event_by_id=pd.get_dummies(event,columns=['event_type'])
event_by_id=event_by_id.groupby(['id']).sum().reset_index(drop=False)

In [4]:
# Collect the volume of every (id, log_feature) pair as {id: {log_feature: volume}}.
# setdefault keeps the first value seen, so a repeated (id, log_feature) row is ignored.
log_feature_dict={}

for row in log_feature.itertuples():
    log_feature_dict.setdefault(row.id, {}).setdefault(row.log_feature, row.volume)

# column 0 holds the id, columns 1..386 hold log_feature_1 .. log_feature_386,
# so a feature number can be used directly as the column position
colnames=['id']+['log_feature_'+str(i) for i in range(1,387)]

# One row per distinct id. The row count is taken from the data instead of being
# hard-coded, and the array is integer from the start. The builtin int replaces the
# np.int alias, which was removed in NumPy 1.24.
log_feature_by_id_np=np.zeros((len(log_feature_dict), len(colnames)), dtype=int)
for row_pos, (key, feature_dict) in enumerate(log_feature_dict.items()):
    log_feature_by_id_np[row_pos, 0]=int(key)
    for feature, volume in feature_dict.items():
        log_feature_by_id_np[row_pos, feature]=int(volume)
log_feature_by_id=pd.DataFrame(data=log_feature_by_id_np, columns=colnames)

In [5]:
# Merge datasets together for ml input dataframe:
# join each per-id table onto traintest in turn and report the shape after every join
for per_id_table in (severity, resource_by_id, event_by_id, log_feature_by_id):
    traintest=traintest.merge(per_id_table, on='id')
    print(traintest.shape)

(18552, 4)
(18552, 14)
(18552, 67)
(18552, 453)


In [6]:
# Separate the traintest dataframe into train and test input dataframes.
# The split is done by index label, so it relies on traintest carrying a 0..n-1
# index with the train rows first.
n_train_rows=train.shape[0]

train_input=traintest.loc[:n_train_rows-1].copy()
print("train_input shape is", train_input.shape)

test_input=traintest.loc[n_train_rows:].copy()
print("test_input shape is", test_input.shape)

train_input shape is (7381, 453)
test_input shape is (11171, 453)


In [7]:
# take the target column out of the training frame and remove it from the test frame as well
y=train_input.pop('fault_severity')
del test_input['fault_severity']

In [17]:
# NOTE(review): the stored output of this cell lists columns (resource_sum ... severity_frequency)
# that are only created by the feature-engineering cells further down, so the preview was
# captured after those cells had run; a fresh top-to-bottom run will show fewer columns here.
train_input.head()

Unnamed: 0,location,severity_type,resource_type_1,resource_type_2,resource_type_3,resource_type_4,resource_type_5,resource_type_6,resource_type_7,resource_type_8,resource_type_9,resource_type_10,event_type_1,event_type_2,event_type_3,event_type_4,event_type_5,event_type_6,event_type_7,event_type_8,event_type_9,event_type_10,event_type_11,event_type_12,event_type_13,event_type_14,event_type_15,event_type_17,event_type_18,event_type_19,event_type_20,event_type_21,event_type_22,event_type_23,event_type_24,event_type_25,event_type_26,event_type_27,event_type_28,event_type_29,event_type_30,event_type_31,event_type_32,event_type_33,event_type_34,event_type_35,event_type_36,event_type_37,event_type_38,event_type_39,event_type_40,event_type_41,event_type_42,event_type_43,event_type_44,event_type_45,event_type_46,event_type_47,event_type_48,event_type_49,event_type_50,event_type_51,event_type_52,event_type_53,event_type_54,log_feature_1,log_feature_2,log_feature_3,log_feature_4,log_feature_5,log_feature_6,log_feature_7,log_feature_8,log_feature_9,log_feature_10,log_feature_11,log_feature_12,log_feature_13,log_feature_14,log_feature_15,log_feature_16,log_feature_17,log_feature_18,log_feature_19,log_feature_20,log_feature_21,log_feature_22,log_feature_23,log_feature_24,log_feature_25,log_feature_26,log_feature_27,log_feature_28,log_feature_29,log_feature_30,log_feature_31,log_feature_32,log_feature_33,log_feature_34,log_feature_35,log_feature_36,log_feature_37,log_feature_38,log_feature_39,log_feature_40,log_feature_41,log_feature_42,log_feature_43,log_feature_44,log_feature_45,log_feature_46,log_feature_47,log_feature_48,log_feature_49,log_feature_50,log_feature_51,log_feature_52,log_feature_53,log_feature_54,log_feature_55,log_feature_56,log_feature_57,log_feature_58,log_feature_59,log_feature_60,log_feature_61,log_feature_62,log_feature_63,log_feature_64,log_feature_65,log_feature_66,log_feature_67,log_feature_68,log_feature_69,log_feature_70,log_feature_71,log_feature_72,l
og_feature_73,log_feature_74,log_feature_75,log_feature_76,log_feature_77,log_feature_78,log_feature_79,log_feature_80,log_feature_81,log_feature_82,log_feature_83,log_feature_84,log_feature_85,log_feature_86,log_feature_87,log_feature_88,log_feature_89,log_feature_90,log_feature_91,log_feature_92,log_feature_93,log_feature_94,log_feature_95,log_feature_96,log_feature_97,log_feature_98,log_feature_99,log_feature_100,log_feature_101,log_feature_102,log_feature_103,log_feature_104,log_feature_105,log_feature_106,log_feature_107,log_feature_108,log_feature_109,log_feature_110,log_feature_111,log_feature_112,log_feature_113,log_feature_114,log_feature_115,log_feature_116,log_feature_117,log_feature_118,log_feature_119,log_feature_120,log_feature_121,log_feature_122,log_feature_123,log_feature_124,log_feature_125,log_feature_126,log_feature_127,log_feature_128,log_feature_129,log_feature_130,log_feature_131,log_feature_132,log_feature_133,log_feature_134,log_feature_135,log_feature_136,log_feature_137,log_feature_138,log_feature_139,log_feature_140,log_feature_141,log_feature_142,log_feature_143,log_feature_144,log_feature_145,log_feature_146,log_feature_147,log_feature_148,log_feature_149,log_feature_150,log_feature_151,log_feature_152,log_feature_153,log_feature_154,log_feature_155,log_feature_156,log_feature_157,log_feature_158,log_feature_159,log_feature_160,log_feature_161,log_feature_162,log_feature_163,log_feature_164,log_feature_165,log_feature_166,log_feature_167,log_feature_168,log_feature_169,log_feature_170,log_feature_171,log_feature_172,log_feature_173,log_feature_174,log_feature_175,log_feature_176,log_feature_177,log_feature_178,log_feature_179,log_feature_180,log_feature_181,log_feature_182,log_feature_183,log_feature_184,log_feature_185,log_feature_186,log_feature_187,log_feature_188,log_feature_189,log_feature_190,log_feature_191,log_feature_192,log_feature_193,log_feature_194,log_feature_195,log_feature_196,log_feature_197,log_feature_198,log_feature_
199,log_feature_200,log_feature_201,log_feature_202,log_feature_203,log_feature_204,log_feature_205,log_feature_206,log_feature_207,log_feature_208,log_feature_209,log_feature_210,log_feature_211,log_feature_212,log_feature_213,log_feature_214,log_feature_215,log_feature_216,log_feature_217,log_feature_218,log_feature_219,log_feature_220,log_feature_221,log_feature_222,log_feature_223,log_feature_224,log_feature_225,log_feature_226,log_feature_227,log_feature_228,log_feature_229,log_feature_230,log_feature_231,log_feature_232,log_feature_233,log_feature_234,log_feature_235,log_feature_236,log_feature_237,log_feature_238,log_feature_239,log_feature_240,log_feature_241,log_feature_242,log_feature_243,log_feature_244,log_feature_245,log_feature_246,log_feature_247,log_feature_248,log_feature_249,log_feature_250,log_feature_251,log_feature_252,log_feature_253,log_feature_254,log_feature_255,log_feature_256,log_feature_257,log_feature_258,log_feature_259,log_feature_260,log_feature_261,log_feature_262,log_feature_263,log_feature_264,log_feature_265,log_feature_266,log_feature_267,log_feature_268,log_feature_269,log_feature_270,log_feature_271,log_feature_272,log_feature_273,log_feature_274,log_feature_275,log_feature_276,log_feature_277,log_feature_278,log_feature_279,log_feature_280,log_feature_281,log_feature_282,log_feature_283,log_feature_284,log_feature_285,log_feature_286,log_feature_287,log_feature_288,log_feature_289,log_feature_290,log_feature_291,log_feature_292,log_feature_293,log_feature_294,log_feature_295,log_feature_296,log_feature_297,log_feature_298,log_feature_299,log_feature_300,log_feature_301,log_feature_302,log_feature_303,log_feature_304,log_feature_305,log_feature_306,log_feature_307,log_feature_308,log_feature_309,log_feature_310,log_feature_311,log_feature_312,log_feature_313,log_feature_314,log_feature_315,log_feature_316,log_feature_317,log_feature_318,log_feature_319,log_feature_320,log_feature_321,log_feature_322,log_feature_323,log_feature_
324,log_feature_325,log_feature_326,log_feature_327,log_feature_328,log_feature_329,log_feature_330,log_feature_331,log_feature_332,log_feature_333,log_feature_334,log_feature_335,log_feature_336,log_feature_337,log_feature_338,log_feature_339,log_feature_340,log_feature_341,log_feature_342,log_feature_343,log_feature_344,log_feature_345,log_feature_346,log_feature_347,log_feature_348,log_feature_349,log_feature_350,log_feature_351,log_feature_352,log_feature_353,log_feature_354,log_feature_355,log_feature_356,log_feature_357,log_feature_358,log_feature_359,log_feature_360,log_feature_361,log_feature_362,log_feature_363,log_feature_364,log_feature_365,log_feature_366,log_feature_367,log_feature_368,log_feature_369,log_feature_370,log_feature_371,log_feature_372,log_feature_373,log_feature_374,log_feature_375,log_feature_376,log_feature_377,log_feature_378,log_feature_379,log_feature_380,log_feature_381,log_feature_382,log_feature_383,log_feature_384,log_feature_385,log_feature_386,resource_sum,event_sum,location_frequency,log_feature_pattern_id,resource_pattern_id,log_feat_ind_sum,volsum,logfeat_pat_freq,severity_frequency
0,118,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,19,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,19,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,97,0,0,2,38,1737,8737
1,91,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,116,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,200,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,98,1,0,2,316,70,8737
2,152,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,12,2,0,2,2,45,8737
3,931,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,69,3,3,3,22,141,8728
4,120,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,2,0,0,0,0,2,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,4,19,4,4,9,12,1,8728


# Feature Engineering

In [8]:
# sum feature for resource and event
resource_cols=[col for col in train_input.columns if col.startswith('resource')]
event_cols=[col for col in train_input.columns if col.startswith('event')]

# row-wise totals of the one-hot columns, added to both input frames
for frame in (train_input, test_input):
    frame['resource_sum']=frame[resource_cols].sum(axis=1)
    frame['event_sum']=frame[event_cols].sum(axis=1)

In [9]:
# frequency feature for location

# two-column dataframe: each location and how many rows of traintest carry it
location_frequency=(
    traintest.location.value_counts()
    .rename('location_frequency')
    .rename_axis('location')
    .reset_index()
)

# attach the frequency to the training and testing ML input data, matching on location
train_input=train_input.merge(location_frequency, how='left', on='location')
test_input=test_input.merge(location_frequency, how='left', on='location')

In [10]:
# pattern feature for log feature
# Goal: give every row an id that identifies WHICH log features are present (volume > 0),
# ignoring the actual volumes.
log_feature_cols=traintest.columns[traintest.columns.str.find('log_feature')==0].tolist()
traintest_log_feature=traintest[log_feature_cols].copy()

# step 1: every value that is not > 0 becomes 0
mask=(traintest_log_feature>0)
traintest_log_feature.where(mask, other=0, inplace=True)

# step 2: every value that is not < 1 becomes 1, leaving a 0/1 presence indicator per feature
mask=(traintest_log_feature<1)
traintest_log_feature.where(mask, other=1, inplace=True)

# concatenate each row's indicators into a single string that describes its presence pattern
traintest_log_feature['log_feature_pattern_raw']= traintest_log_feature.apply(lambda x: ''.join(x.astype(str)), axis=1)


# keep one row per distinct pattern; reset_index turns the index label of that first
# occurrence into a column, which is then used as the pattern id
log_feature_pattern_df=pd.DataFrame(traintest_log_feature.log_feature_pattern_raw.drop_duplicates())
log_feature_pattern_df.reset_index(inplace=True)
log_feature_pattern_df.rename(columns={'index':'log_feature_pattern_id'}, inplace=True)

# merge log_feature_pattern_df back to traintest_log_feature on log_feature_pattern_raw
traintest_log_feature=traintest_log_feature.merge(right=log_feature_pattern_df, on='log_feature_pattern_raw', how='left')

# finally insert the log_feature_pattern_id column into input dataframes as new feature
# (.values assigns by position: the first train.shape[0] rows go to train_input, the rest to test_input)
train_input['log_feature_pattern_id']=traintest_log_feature.loc[0:train.shape[0]-1, 'log_feature_pattern_id'].values
test_input['log_feature_pattern_id']=traintest_log_feature.loc[train.shape[0]::]['log_feature_pattern_id'].values


In [11]:
# remove id column from both input frames
for frame in (train_input, test_input):
    del frame['id']

# keep a copy of the inputs as they stand after this first round of feature engineering
train_input_fe1, test_input_fe1 = train_input.copy(), test_input.copy()

In [12]:
# pattern feature for resource
n_train_rows=train.shape[0]
resource_cols=[col for col in traintest.columns if col.startswith('resource')]
traintest_resource=traintest[resource_cols].copy()

# concatenate each row's resource columns into one string describing its combination
traintest_resource['resource_pattern_raw']=traintest_resource.apply(lambda row: ''.join(row.astype(str)), axis=1)

# one row per distinct pattern; the index label of its first occurrence becomes the pattern id
resource_pattern_df=(
    traintest_resource[['resource_pattern_raw']]
    .drop_duplicates()
    .reset_index()
    .rename(columns={'index':'resource_pattern_id'})
)

# look the id of every row's pattern up again via resource_pattern_raw
traintest_resource=traintest_resource.merge(resource_pattern_df, how='left', on='resource_pattern_raw')

# finally insert the ids into the input dataframes by position (train rows first, then test rows)
resource_pattern_ids=traintest_resource['resource_pattern_id']
train_input['resource_pattern_id']=resource_pattern_ids.loc[:n_train_rows-1].values
test_input['resource_pattern_id']=resource_pattern_ids.loc[n_train_rows:].values

In [13]:
# log feature indicator sum
# improve private LB, decrease public LB
n_train_rows=train.shape[0]

# row-wise total of the log feature columns held in traintest_log_feature
indicator_sum=traintest_log_feature[log_feature_cols].sum(axis=1)
traintest_log_feature['log_feat_ind_sum']=indicator_sum

# copy the totals into the input frames by position (train rows first, then test rows)
train_input['log_feat_ind_sum']=indicator_sum.loc[:n_train_rows-1].values
test_input['log_feat_ind_sum']=indicator_sum.loc[n_train_rows:].values

In [14]:
# total log feature volume per row, stored on traintest and copied into both input frames
n_train_rows=train.shape[0]

volume_total=traintest[log_feature_cols].sum(axis=1)
traintest['volsum']=volume_total

train_input['volsum']=volume_total.loc[:n_train_rows-1].values
test_input['volsum']=volume_total.loc[n_train_rows:].values

In [15]:
# Stack the train and test inputs so the pattern frequency is counted over both.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
traintest_input=pd.concat([train_input, test_input])

# create a dataframe with the log_feature_pattern_id and how often each id occurs
logfeat_pat_freq=traintest_input.log_feature_pattern_id.value_counts()
logfeat_pat_freq.name='logfeat_pat_freq'
logfeat_pat_freq=pd.DataFrame(logfeat_pat_freq).reset_index()
# depending on the pandas version the id column comes out of reset_index as 'index';
# the rename is a no-op when it is already called 'log_feature_pattern_id'
logfeat_pat_freq.rename(columns={'index':'log_feature_pattern_id'}, inplace=True)

# merge this frequency dataframe with the stacked ML input data on log_feature_pattern_id

traintest_input=traintest_input.merge(right=logfeat_pat_freq, on='log_feature_pattern_id', how='left')

# split back by index label: the first train.shape[0] rows are the train rows, the rest the test rows
train_input=traintest_input.loc[0:train.shape[0]-1].copy()
test_input=traintest_input.loc[train.shape[0]::].copy()


In [16]:
# two-column dataframe: each severity_type and how many rows of traintest carry it
severity_frequency=(
    traintest.severity_type.value_counts()
    .rename('severity_frequency')
    .rename_axis('severity_type')
    .reset_index()
)

# attach the frequency to the training and testing ML input data, matching on severity_type
train_input=train_input.merge(severity_frequency, how='left', on='severity_type')
test_input=test_input.merge(severity_frequency, how='left', on='severity_type')

# Cross validation with XGB

Bayesian optimisation requires cross validation to score each combination of parameters during the exploration, so we will also need to set up a cross validation function for XGB. In this case, we only return the cross validation score as the output of the function.  Essentially, the CV function is treated as the target function to be optimised in our little exercise here.

In [17]:
def cross_validate_xgb(params, x_train, y_train, kf, verbose=True, verbose_eval=50):
    """Cross-validate an XGBoost multi-class model and return the out-of-fold log loss.

    Parameters
    ----------
    params : dict or sequence of (key, value) pairs
        Parameters forwarded unchanged to ``xgboost.train``. ``num_class`` (default 3
        when absent) sets the number of probability columns kept per row.
    x_train : pandas.DataFrame
        Feature matrix. Fold rows are selected by position, so any index works.
    y_train : pandas.Series
        Class labels, aligned row by row with ``x_train``.
    kf : object with a ``split(X, y)`` method
        Splitter yielding ``(train_index, val_index)`` pairs of positional indices,
        e.g. the ``StratifiedKFold`` used in this notebook.
    verbose : bool
        When true, print the score of every fold, the overall score and the run time.
    verbose_eval : bool or int
        Forwarded to ``xgboost.train``.

    Returns
    -------
    float
        ``log_loss`` of the out-of-fold predictions over all rows of ``x_train``.
    """
    start_time = time.time()

    # one probability column per class; read the class count from the parameters
    # instead of assuming 3
    num_class = int(dict(params).get("num_class", 3))
    train_pred = np.zeros((x_train.shape[0], num_class))

    # use the k-fold object to enumerate indexes for each training and validation fold
    for i, (train_index, val_index) in enumerate(kf.split(x_train, y_train)):
        # The fold indices are positions, not index labels, hence iloc. They are also
        # used as positions in train_pred[val_index] below.
        x_train_kf, x_val_kf = x_train.iloc[train_index], x_train.iloc[val_index]
        y_train_kf, y_val_kf = y_train.iloc[train_index], y_train.iloc[val_index]

        d_train = xgboost.DMatrix(x_train_kf, y_train_kf)
        d_val = xgboost.DMatrix(x_val_kf, y_val_kf)

        watchlist = [(d_train, "train"), (d_val, 'val')]
        bst = xgboost.train(params=params, dtrain=d_train, num_boost_round=3000, early_stopping_rounds=100,
                            evals=watchlist, verbose_eval=verbose_eval)

        # NOTE(review): ntree_limit / best_ntree_limit appear to be deprecated in newer
        # xgboost releases in favour of iteration_range - confirm against the installed version.
        y_val_kf_preds = bst.predict(d_val, ntree_limit=bst.best_ntree_limit)

        # each row belongs to exactly one validation fold, so this fills its out-of-fold prediction
        train_pred[val_index] += y_val_kf_preds

        fold_cv = log_loss(y_val_kf.values, y_val_kf_preds)
        if verbose:
            print('fold cv {} log_loss score is {:.6f}'.format(i, fold_cv))

    cv_score = log_loss(y_train, train_pred)

    if verbose:
        print('cv log_loss score is {:.6f}'.format(cv_score))
        end_time = time.time()
        print("it takes %.3f seconds to perform cross validation" % (end_time - start_time))
    return cv_score # for the purpose of bayesian optimisation, we only need to return the CV score

Let's test the above cross validation function with some simple XGB parameters:

In [18]:
# a simple hand-picked parameter set to check that the cross validation function runs
xgb_params = dict(
    objective="multi:softprob",
    num_class=3,
    tree_method="hist",
    eval_metric="mlogloss",
    nthread=4,
    seed=0,
    silent=1,

    eta=0.05,  # default 0.3
    max_depth=5,  # default 6
    subsample=0.8,  # default 1
    colsample_bytree=0.6,  # default 1
    gamma=0.5,
)

# only do 3 fold CV here so that we save some running time on Kaggle Kernel
kf = StratifiedKFold(shuffle=True, n_splits=3, random_state=2017)

print('Start training...')


# the function's own summary printing is off; xgboost still logs every 50 rounds
cv_score = cross_validate_xgb(params=xgb_params, x_train=train_input, y_train=y, kf=kf,
                              verbose=False, verbose_eval=50)

print('cv score is {:.6f}'.format(cv_score))

Start training...
[0]	train-mlogloss:1.06346	val-mlogloss:1.06533
Multiple eval metrics have been passed: 'val-mlogloss' will be used for early stopping.

Will train until val-mlogloss hasn't improved in 100 rounds.
[50]	train-mlogloss:0.545517	val-mlogloss:0.595939
[100]	train-mlogloss:0.46139	val-mlogloss:0.534824
[150]	train-mlogloss:0.427132	val-mlogloss:0.516735
[200]	train-mlogloss:0.402704	val-mlogloss:0.507676
[250]	train-mlogloss:0.381913	val-mlogloss:0.501956
[300]	train-mlogloss:0.363002	val-mlogloss:0.498393
[350]	train-mlogloss:0.346867	val-mlogloss:0.497183
[400]	train-mlogloss:0.332531	val-mlogloss:0.496637
[450]	train-mlogloss:0.320011	val-mlogloss:0.49666
[500]	train-mlogloss:0.307604	val-mlogloss:0.496459
[550]	train-mlogloss:0.296493	val-mlogloss:0.496676
[600]	train-mlogloss:0.287124	val-mlogloss:0.498415
Stopping. Best iteration:
[507]	train-mlogloss:0.306031	val-mlogloss:0.496159

[0]	train-mlogloss:1.06459	val-mlogloss:1.06517
Multiple eval metrics have been pass

# Bayesian Optimisation - Setup

The next step is to set up the search space for Bayesian optimisation to explore for the optimum cross validation score. 
This search space is defined by sets of interval values for the hyper-parameters of interest. In this example, we use the four parameters that are most 

In [21]:
# search space: (lower bound, upper bound) for each hyper-parameter to explore
params = dict(
    max_depth=(4, 10),
    learning_rate=(0.05, 0.3),
    subsample=(0.4, 1),
    colsample_bytree=(0.4, 1),
)

In [22]:
def xgbcv_func(max_depth, learning_rate, subsample, colsample_bytree, nthread=4, seed=0):
    """Target function for the Bayesian optimisation: ``1 - CV log loss`` of an XGB model.

    The optimiser maximises its target, so the log loss returned by
    ``cross_validate_xgb`` is subtracted from 1; a higher value means a lower log loss.

    Parameters
    ----------
    max_depth : float
        Tree depth proposed by the optimiser; truncated to ``int`` before use.
    learning_rate : float
        Passed to XGBoost as ``eta``.
    subsample : float
        Passed to XGBoost as ``subsample``.
    colsample_bytree : float
        Passed to XGBoost as ``colsample_bytree``.
    nthread : int
        Passed to XGBoost as ``nthread``.
    seed : int
        Passed to XGBoost as ``seed``. Previously this argument was accepted but
        ignored, with 0 hard-coded; the default of 0 keeps existing calls unchanged.

    Returns
    -------
    float
        ``1 - cross_validate_xgb(...)`` computed on the notebook-level ``train_input`` and ``y``.
    """
    params = {
        "objective" : "multi:softprob",
        "num_class" : 3,
        "tree_method" : "hist",
        "eval_metric" : "mlogloss",
        "nthread": nthread,
        "seed" : seed,
        'silent': 1,

        "eta":learning_rate,  # default 0.3
        "max_depth" : int(max_depth), # default 6; the optimiser proposes floats
        "subsample" : subsample, # default 1
        "colsample_bytree" : colsample_bytree, # default 1
    }

    # for a more ideal out-of-fold model prediction for this dataset, we use 10-fold CV
    kf=StratifiedKFold(n_splits=10, shuffle=True, random_state=2017)

    # we will disable all the verbose setting in this functional call, so that we don't have too much information
    # to read during the bayesian optimisation process.
    return 1-cross_validate_xgb(params, train_input, y, kf, verbose=False, verbose_eval=False)

Now we can set up the Bayesian optimisation object:

In [23]:
# optimiser over the search space in `params`, with xgbcv_func as the function to maximise
xgb_bo = BayesianOptimization(f=xgbcv_func, pbounds=params)

And... action time! (In this example we use small values for *init_points* and *n_iter* so that the run fits within the 60-minute run time limit of Kaggle kernels.)

In [24]:
# Evaluate 5 initial points, then run 20 further optimisation steps; the stored log
# lists steps 1-5 under "Initialization" and the following steps under "Bayesian Optimization".
xgb_bo.maximize(init_points=5, n_iter=20)

[31mInitialization[0m
[94m------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |   learning_rate |   max_depth |   subsample | 
    1 | 01m00s | [35m   0.50264[0m | [32m            0.9745[0m | [32m         0.2634[0m | [32m     5.4654[0m | [32m     0.9019[0m | 
    2 | 00m56s |    0.49491 |             0.9140 |          0.2311 |      5.6636 |      0.4037 | 
    3 | 01m14s | [35m   0.50482[0m | [32m            0.6768[0m | [32m         0.2154[0m | [32m     7.3889[0m | [32m     0.9879[0m | 
    4 | 01m20s |    0.50125 |             0.9249 |          0.1788 |      6.8017 |      0.6467 | 
    5 | 01m12s |    0.50104 |             0.8798 |          0.1801 |      4.1285 |      0.5372 | 
[31mBayesian Optimization[0m
[94m------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |   le

In [None]:
# FIXME(review): `xgb_custom_` is not defined in any visible cell and this cell has no
# execution count - it looks like an unfinished leftover; complete it or remove the cell.
xgb_custom_

In [None]:
'''
Initialization
------------------------------------------------------------------------------------------------
 Step |   Time |      Value |   colsample_bytree |   learning_rate |   max_depth |   subsample | 
    1 | 01m00s |    0.50264 |             0.9745 |          0.2634 |      5.4654 |      0.9019 | 
    2 | 00m56s |    0.49491 |             0.9140 |          0.2311 |      5.6636 |      0.4037 | 
    3 | 01m14s |    0.50482 |             0.6768 |          0.2154 |      7.3889 |      0.9879 | 
    4 | 01m20s |    0.50125 |             0.9249 |          0.1788 |      6.8017 |      0.6467 | 
    5 | 01m12s |    0.50104 |             0.8798 |          0.1801 |      4.1285 |      0.5372 | 
Bayesian Optimization
------------------------------------------------------------------------------------------------
 Step |   Time |      Value |   colsample_bytree |   learning_rate |   max_depth |   subsample | 
    6 | 03m32s |    0.49228 |             1.0000 |          0.0500 |     10.0000 |      1.0000 | 
    7 | 04m49s |    0.50288 |             0.4000 |          0.0500 |      4.7812 |      1.0000 | 
    8 | 03m27s |    0.50144 |             1.0000 |          0.0500 |      6.6491 |      1.0000 | 
    9 | 01m34s |    0.48695 |             0.4000 |          0.3000 |     10.0000 |      0.4000 | 
   10 | 01m22s |    0.50166 |             0.4000 |          0.3000 |      4.0000 |      1.0000 | 
   11 | 01m27s |    0.48771 |             0.9891 |          0.2951 |      8.6588 |      0.4699 | 
   12 | 03m59s |    0.50014 |             0.9541 |          0.0500 |      4.0000 |      1.0000 | 
   13 | 01m28s |    0.50098 |             0.4000 |          0.3000 |     10.0000 |      1.0000 | 
   14 | 01m05s |    0.50265 |             0.4000 |          0.3000 |      6.3330 |      1.0000 | 
   15 | 03m22s |    0.50993 |             0.4000 |          0.0500 |      8.9846 |      1.0000 | 
   16 | 02m51s |    0.50470 |             0.4000 |          0.0500 |      7.7482 |      0.4000 | 
   17 | 03m49s |    0.50855 |             0.4000 |          0.0500 |     10.0000 |      1.0000 | 
   18 | 03m42s |    0.51152 |             0.4049 |          0.0538 |      8.0589 |      0.9605 | 
   19 | 03m21s |    0.51046 |             0.4010 |          0.0502 |      7.6252 |      0.9561 | 
   20 | 03m11s |    0.50991 |             0.4147 |          0.0560 |      8.3311 |      0.9966 | 
   21 | 02m16s |    0.50146 |             0.4026 |          0.0793 |      4.0103 |      0.4119 | 
   22 | 01m23s |    0.50619 |             0.4022 |          0.2983 |      7.8264 |      0.7933 | 
   23 | 03m22s |    0.51128 |             0.4033 |          0.0534 |      7.9585 |      0.8098 | 
   24 | 03m40s |    0.50970 |             0.4274 |          0.0531 |      9.5585 |      0.9931 | 
   25 | 03m47s |    0.50999 |             0.4629 |          0.0503 |      7.9026 |      0.9567 |

'''