In [83]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
import datetime
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [84]:
now = datetime.datetime.now()

# Load the data
df = pd.read_excel("measurements2.xlsx")

#除外
df = df.drop(['specials','refill liters','refill gas'], axis=1)
#欠損値を平均値で埋める
df = df.fillna(df.mean())
# gas_typeのダミー変数に置き換え
df_dummy = pd.get_dummies(df['gas_type'])
df = pd.concat([df, df_dummy] , axis=1)
df = df.drop(['gas_type','SP98'], axis=1)

y = df['consume'].values
X = df.drop(['consume'], axis=1)
y = np.log(y)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

y_train_full = y_train
x_train_full = x_train


# Various hyper-parameters to tune
xgb1 = XGBRegressor()
parameters = {'nthread':[2], #when use hyperthread, xgboost may become slower 
              'learning_rate': [0.039,0.040], #so called `eta` value
              'max_depth': [6],
              'min_child_weight': [5],
              'silent': [0],
              'subsample': [0.44, 0.45, 0.46],
              'colsample_bytree': [0.74, 0.75, 0.76],
              'n_estimators': [204,205,206]}

xgb_grid = GridSearchCV(xgb1,
                        parameters,
                        cv = 2,
                        n_jobs = 5,
                        verbose=True)

xgb_grid.fit(x_train_full,
         y_train_full)

print(xgb_grid.best_score_)
print(xgb_grid.best_params_)

Fitting 2 folds for each of 54 candidates, totalling 108 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:    3.3s


0.4860799301707269
{'colsample_bytree': 0.75, 'learning_rate': 0.04, 'max_depth': 6, 'min_child_weight': 5, 'n_estimators': 204, 'nthread': 2, 'silent': 0, 'subsample': 0.45}


[Parallel(n_jobs=5)]: Done 108 out of 108 | elapsed:    5.1s finished


In [85]:
xgb2 = XGBRegressor(colsample_bytree=0.75, learning_rate=0.039, max_depth=6, min_child_weight=5, n_estimators=204, 
                                       nthread=2, silent=0, subsample=0.45)


In [86]:
xgb2.fit(x_train, y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.75, gamma=0,
             importance_type='gain', learning_rate=0.039, max_delta_step=0,
             max_depth=6, min_child_weight=5, missing=None, n_estimators=204,
             n_jobs=1, nthread=2, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None, silent=0,
             subsample=0.45, verbosity=1)

In [87]:
# 予測値を計算
y_train_pred = xgb2.predict(x_train)
y_test_pred = xgb2.predict(x_test)

# MSEの計算
from sklearn.metrics import mean_squared_error
print('MSE train : %.3f, test : %.3f' % (mean_squared_error(y_train, y_train_pred), mean_squared_error(y_test, y_test_pred)) )
# R^2の計算
from sklearn.metrics import r2_score
print('R2 train : %.3f, test : %.3f' % (r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)) )


MSE train : 0.187, test : 0.487
R2 train : 0.802, test : 0.634
