## 4.3.4　xgboost

In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw training and test tables from the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# The 'Survived' column is the prediction target; everything else is a candidate feature.
train_y = train['Survived']

# Columns excluded from the feature set in both tables.
drop_cols = ['PassengerId', 'Name', 'Ticket', 'Cabin']
train_x = train.drop(columns=['Survived'] + drop_cols)
test_x = test.drop(columns=drop_cols)

# Classify columns by "is numeric" rather than "is not object": string columns
# held under pandas' dedicated string dtype are not `object`, and would
# otherwise be mistaken for numeric features.
num_cols = [col for col in train_x.columns if pd.api.types.is_numeric_dtype(train_x[col])]
cat_cols = [col for col in train_x.columns if col not in num_cols]

In [3]:
# Restrict both feature tables to the numeric columns.
train_x = train_x.loc[:, num_cols]
test_x = test_x.loc[:, num_cols]

# Hold out 20% of the training rows for validation; the fixed seed makes the
# split repeatable.
tr_x, va_x, tr_y, va_y = train_test_split(
    train_x,
    train_y,
    test_size=0.2,
    random_state=22,
)

In [4]:
# Wrap each split in an xgboost DMatrix; the test matrix carries no labels.
dtrain = xgb.DMatrix(tr_x, label=tr_y)
dvalid = xgb.DMatrix(va_x, label=va_y)
dtest = xgb.DMatrix(test_x)

# Booster settings and the number of boosting rounds to run.
params = dict(objective='binary:logistic', random_state=71)
num_round = 50

# Datasets scored after each round, paired with the label shown in the log.
watchlist = [
    (dtrain, 'train'),
    (dvalid, 'eval'),
]
model = xgb.train(params, dtrain, num_boost_round=num_round, evals=watchlist)

# Log loss of the trained model on the held-out validation split.
va_pred = model.predict(dvalid)
score = log_loss(va_y, va_pred)
print(f'logloss: {score:.4f}')

# Predictions for the unlabeled test set.
pred = model.predict(dtest)

[0]	train-logloss:0.61618	eval-logloss:0.64308
[1]	train-logloss:0.56375	eval-logloss:0.61172
[2]	train-logloss:0.52687	eval-logloss:0.58735
[3]	train-logloss:0.50174	eval-logloss:0.57084
[4]	train-logloss:0.48548	eval-logloss:0.56140
[5]	train-logloss:0.47282	eval-logloss:0.55536
[6]	train-logloss:0.46002	eval-logloss:0.55274
[7]	train-logloss:0.45090	eval-logloss:0.55239
[8]	train-logloss:0.44157	eval-logloss:0.55004
[9]	train-logloss:0.43714	eval-logloss:0.55244
[10]	train-logloss:0.43345	eval-logloss:0.55536
[11]	train-logloss:0.42653	eval-logloss:0.55583
[12]	train-logloss:0.42467	eval-logloss:0.55634
[13]	train-logloss:0.41760	eval-logloss:0.55307
[14]	train-logloss:0.41167	eval-logloss:0.55139
[15]	train-logloss:0.40819	eval-logloss:0.55285
[16]	train-logloss:0.40288	eval-logloss:0.55024
[17]	train-logloss:0.39759	eval-logloss:0.54581
[18]	train-logloss:0.39637	eval-logloss:0.54624
[19]	train-logloss:0.38857	eval-logloss:0.55511
[20]	train-logloss:0.38511	eval-logloss:0.55962
[2

`objective` には、最小化したい目的関数を指定する。<br>
`evals` に学習データとバリデーションデータを渡すことで、決定木を追加するごとにスコアを出力できる。<br>
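`eval_metric` には、モニタリングしたい評価指標を指定する。<br>
`early_stopping_rounds` を指定すると、`evals` の最後に渡したデータ（ここではバリデーションデータ）のスコアが指定したラウンド数のあいだ改善しなかった時点で学習を打ち切る。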

In [5]:
# Request up to 500 rounds and let early stopping decide how many are actually run.
params = {'objective': 'binary:logistic', 'random_state': 71, 'eval_metric': 'logloss'}
num_round = 500

# Early stopping watches the last entry of `evals` (the validation set here) and
# halts once its metric has not improved for 20 consecutive rounds.
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
model = xgb.train(params, dtrain, num_round, evals=watchlist, early_stopping_rounds=20)

# Predict using only the trees up to and including the best round.
# `ntree_limit` / `best_ntree_limit` were deprecated in xgboost 1.4 and removed
# in 2.0; `iteration_range` is the replacement. Its end is exclusive, hence +1.
pred = model.predict(dtest, iteration_range=(0, model.best_iteration + 1))

[0]	train-logloss:0.61618	eval-logloss:0.64308
[1]	train-logloss:0.56375	eval-logloss:0.61172
[2]	train-logloss:0.52687	eval-logloss:0.58735
[3]	train-logloss:0.50174	eval-logloss:0.57084
[4]	train-logloss:0.48548	eval-logloss:0.56140
[5]	train-logloss:0.47282	eval-logloss:0.55536
[6]	train-logloss:0.46002	eval-logloss:0.55274
[7]	train-logloss:0.45090	eval-logloss:0.55239
[8]	train-logloss:0.44157	eval-logloss:0.55004
[9]	train-logloss:0.43714	eval-logloss:0.55244
[10]	train-logloss:0.43345	eval-logloss:0.55536
[11]	train-logloss:0.42653	eval-logloss:0.55583
[12]	train-logloss:0.42467	eval-logloss:0.55634
[13]	train-logloss:0.41760	eval-logloss:0.55307
[14]	train-logloss:0.41167	eval-logloss:0.55139
[15]	train-logloss:0.40819	eval-logloss:0.55285
[16]	train-logloss:0.40288	eval-logloss:0.55024
[17]	train-logloss:0.39759	eval-logloss:0.54581
[18]	train-logloss:0.39637	eval-logloss:0.54624
[19]	train-logloss:0.38857	eval-logloss:0.55511
[20]	train-logloss:0.38511	eval-logloss:0.55962
[2