# XGBoostのデモ

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

### データの取り込み・前処理

In [3]:
# Load the daily records; row 0 of the CSV is used as the header.
df_date_data = pd.read_csv(
    filepath_or_buffer='../data/day.csv',
    header=0,
)

# Preview the first rows of the raw frame.
df_date_data.head()

FileNotFoundError: File b'../data/day.csv' does not exist

In [None]:
# Columns kept for modelling. 'cnt' is listed first because the later
# feature/target split takes the first column as the target.
used_columns = [
    'cnt',
    'season', 'yr', 'mnth', 'holiday',
    'weathersit', 'temp', 'hum', 'windspeed',
]
df = df_date_data[used_columns]
df.head()

In [None]:
# Categorical columns to expand into one indicator column per category.
ohe_columns = [
    'season',
    'mnth',
    'weathersit',
]
df_ohe = pd.get_dummies(data=df, columns=ohe_columns)

In [None]:
# Preview the one-hot encoded frame (first five rows).
df_ohe.head(n=5)

In [None]:
# Build the feature matrix and target from the one-hot encoded frame.
# The original cell sliced `df`, which silently discarded the encoding of
# 'season', 'mnth' and 'weathersit' prepared in the preceding cells.
# Selecting by name (instead of by position) keeps the split correct
# regardless of where get_dummies places the generated columns.
X = df_ohe.drop(columns='cnt')
y = df_ohe['cnt']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

### 学習の実行

In [None]:
# Build the XGBoost datasets from the train/test splits.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Booster parameters.
# NOTE(review): 'objective' is intentionally left unset so the library default
# is used; the original cell had 'reg:squarederror' commented out, presumably
# because the installed XGBoost did not accept that name -- confirm and set it
# explicitly once the minimum supported XGBoost version is known.
param = {
    'gamma': 0.2,
    'max_depth': 5,
    'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 1,
    'booster': 'gbtree',
}

# Datasets whose metric is reported after every boosting round, and the
# number of boosting rounds.
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 30

# Train. The round count and the evaluation list are passed by keyword:
# recent XGBoost releases no longer accept `evals` positionally.
bst = xgb.train(param, dtrain, num_boost_round=num_round, evals=watchlist)

# Predict on the held-out set.
preds = bst.predict(dtest)
labels = dtest.get_label()

# The expression below is the gamma deviance, which takes logarithms of both
# labels and predictions. This model is not trained with a gamma objective, so
# nothing guarantees positive predictions; guard the computation instead of
# printing NaN together with NumPy runtime warnings.
if np.all(labels > 0) and np.all(preds > 0):
    print('test deviance=%f' % (2 * np.sum((labels - preds) / preds - np.log(labels) + np.log(preds))))
else:
    print('test deviance is undefined: labels and predictions must all be > 0')

In [None]:
# Evaluation: compute the mean squared error once, then derive the RMSE from it.
mse = mean_squared_error(y_test, preds)
rmse = np.sqrt(mse)
print('MSE :{}'.format(mse))
print('RMSE :{}'.format(rmse))

### 結果の可視化

In [None]:
# Helper that draws an observed-vs-predicted ("yy") plot.
def yyplot(y_obs, y_pred):
    """Scatter observed values against predictions and return the figure.

    Both axes share the same limits: the combined min/max of the two inputs,
    widened by 1% of their range on each side. A diagonal line spanning those
    limits marks where prediction equals observation.

    Args:
        y_obs: Observed values (1-D array-like), plotted on the x axis.
        y_pred: Predicted values (1-D array-like), plotted on the y axis.

    Returns:
        The matplotlib Figure that was drawn (it is also shown via plt.show()).
    """
    combined = np.concatenate([y_obs, y_pred])
    margin = np.ptp(combined) * 0.01
    lower = np.amin(combined) - margin
    upper = np.amax(combined) + margin

    fig, ax = plt.subplots(figsize=(8, 8))
    ax.scatter(y_obs, y_pred)
    # Diagonal reference line (y == x) across the full axis range.
    ax.plot([lower, upper], [lower, upper])
    ax.set_xlim(lower, upper)
    ax.set_ylim(lower, upper)
    ax.set_xlabel('y_test', fontsize=12)
    ax.set_ylabel('y_pred', fontsize=12)
    ax.tick_params(labelsize=12)
    plt.show()

    return fig

In [None]:
# Example: compare the held-out targets with the model's predictions.
fig = yyplot(y_obs=y_test, y_pred=preds)