In [1]:
#Xgboost（2018年12月〜2019年4月までの糖度を予測）

In [2]:
%matplotlib inline
import sys
sys.path.append('/home/yoshida/.pyenv/versions/3.6.6/lib/python3.6/site-packages')

import xgboost as xgb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [3]:
# データの読み込み
data = pd.read_csv("../src/Aiko_normalization_ver3_aaa.csv")
df = pd.DataFrame(data)

In [4]:
df.drop(['No','width','height','seed_date','house','url', 'COL_21'], axis=1, inplace=True) #対象の列を削除

In [5]:
# AveDiffTemp_7、AveSatu_7、monthを対象としたい時
df.drop(['AveCO2_7', 'AveSatu_7','AveHum_7', 'AveDiffTemp_7', 'AveTemp_7','SumMaxTemp_7','SumMinTemp_7','SumDiffTemp_7'], axis=1, inplace=True)

In [6]:
df.head(3)

Unnamed: 0,current_date,sweet,sweet_category,size,AveMaxTemp_7,AveMinTemp_7
0,11月28日,4.3,2,7.54,26.24,18.83
1,11月28日,4.5,2,6.48,26.24,18.83
2,11月28日,4.0,1,8.68,26.24,18.83


In [7]:
df['month'] = df['current_date'].str[:2] # "11月28日"とか"4月3日"の月日の先頭2文字だけ抽出
df['month'] = df['month'].str.strip('月') # このままだと"4月"のように"月"が入っているので、"月"を削除する
# 参考：https://deepage.net/features/pandas-str-extract.html
# 参考：https://deepage.net/features/pandas-str-replace.html
df['month'].isnull().sum() # 欠損値がないかを確認

df.drop(['current_date'], axis=1, inplace=True) # current_dateはもう使わない
print(df.shape)

(2176, 6)


In [8]:
# 四半期を求める(4~6月:1st、7~9月:2nd、10~12月:3rd、1~3月:4th)(＊＊＊＊使うか使わないか＊＊＊＊)
df['quarter'] = df['month'].replace({'4': '1st', '5': '1st', '6': '1st', 
                                     '7': '2nd', '8': '2nd', '9': '2nd', 
                                     '10': '3rd', '11': '3rd', '12': '3rd', 
                                     '1': '4th', '2': '4th', '3': '4th'
                                    })

In [9]:
# ダミー変数の作成
df = pd.get_dummies(df, columns=['quarter'], drop_first=True)

In [10]:
y = df["sweet_category"] #目的変数
df.drop(['sweet','sweet_category'], axis=1, inplace=True) #目的変数の列を削除
print(df.shape)

(2176, 6)


In [11]:
df.drop(['month'], axis=1, inplace=True)

In [12]:
df.head(3)

Unnamed: 0,size,AveMaxTemp_7,AveMinTemp_7,quarter_3rd,quarter_4th
0,7.54,26.24,18.83,1,0
1,6.48,26.24,18.83,1,0
2,8.68,26.24,18.83,1,0


In [13]:
# 標準化
from sklearn import preprocessing
df = preprocessing.scale(df)

  This is separate from the ipykernel package so we can avoid doing imports until


In [14]:
# 訓練セットとテストセットに分割
X_trainval, X_test, y_trainval, y_test = train_test_split(df, y, test_size=0.3, stratify=y, random_state=0)
#stratify は母集団のカテゴリの割合を保って分割するもので、目的変数を指定するのが一般的

In [15]:
# xgboostモデルの作成
clf = xgb.XGBClassifier(objective='multi:softmax')

In [16]:
# ハイパーパラメータ探索
grid_search = GridSearchCV(clf, {'max_depth': [2,4,6,8], 'n_estimators': [50,100,200]}, verbose=1)
grid_search.fit(X_trainval, y_trainval)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:    7.8s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None, objective='multi:softmax',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=None, subsample=1, verbosity=1),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': [2, 4, 6, 8], 'n_estimators': [50, 100, 200]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [17]:
# グリッドサーチの結果
print('Best cross-validation: {}'.format(round(grid_search.best_score_, 3)))
print('Best parameters : {}'.format(grid_search.best_params_))
print('Test set score : {}'.format(round(grid_search.score(X_test, y_test), 3)))

Best cross-validation: 0.758
Best parameters : {'max_depth': 6, 'n_estimators': 100}
Test set score : 0.779


In [18]:
#多クラスの混合行列を作成
from sklearn.metrics import confusion_matrix
clf = grid_search.best_estimator_
pred = clf.predict(X_test)
print(confusion_matrix(y_test, pred))

[[ 56  21   0   0]
 [ 13 122  35   0]
 [  1  22 174  31]
 [  1   0  20 157]]


In [67]:
print(clf_cv.best_params_, clf_cv.best_score_)

{'max_depth': 2, 'n_estimators': 200} 0.7597701149425288


In [68]:
# 改めて最適パラメータで学習
clf = xgb.XGBClassifier(**clf_cv.best_params_)
clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=2, min_child_weight=1, missing=None,
       n_estimators=200, n_jobs=1, nthread=None,
       objective='multi:softprob', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [69]:
#clf.score(X_train, y_train)

In [70]:
# 学習モデルの評価
pred = clf.predict(X_test)
print(confusion_matrix(y_test, pred))

[[ 41  11   0   0]
 [  7  78  28   0]
 [  1  16 120  15]
 [  0   1  17 101]]


In [71]:
print(classification_report(y_test, pred))
clf.score(X_test, y_test)

              precision    recall  f1-score   support

           1       0.84      0.79      0.81        52
           2       0.74      0.69      0.71       113
           3       0.73      0.79      0.76       152
           4       0.87      0.85      0.86       119

   micro avg       0.78      0.78      0.78       436
   macro avg       0.79      0.78      0.79       436
weighted avg       0.78      0.78      0.78       436



0.7798165137614679

In [177]:
params = {
    #'objective': 'binary:logistic', # どのような分析を行うか？'multi:softmax'
    'objective': 'multi:softmax', # どのような分析を行うか？
    'eval_metric': 'logloss', # 正解率でモデルを評価
    'eta': 0.1, # 学習率
    'max_depth': 6, 
    'subsample': 1,
    'colsample_bytree': 1,
    'silent': 1,
    'min_child_weight': 1,
    'tree_method': 'exact',
    'predictor': 'cpu_predictor'
}

In [86]:
xgb_params = {
    # 多値分類問題
    'objective': 'multi:softmax',
    # クラス数
    'num_class': 5,
    # 学習用の指標 (Multiclass logloss)
    'eval_metric': 'mlogloss',
}


evals = [(dtrain, 'train'), (dtest, 'eval')]
evals_result = {}
bst = xgb.train(xgb_params,
                dtrain,
                num_boost_round=1000,
                early_stopping_rounds=10,
                evals=evals,
                evals_result=evals_result,
)

XGBoostError: [13:00:59] /workspace/src/objective/multiclass_obj.cu:110: SoftmaxMultiClassObj: label must be in [0, num_class).
Stack trace:
  [bt] (0) /home/yoshida/.pyenv/versions/3.6.6/lib/python3.6/site-packages/xgboost/./lib/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x24) [0x7f471c576cb4]
  [bt] (1) /home/yoshida/.pyenv/versions/3.6.6/lib/python3.6/site-packages/xgboost/./lib/libxgboost.so(xgboost::obj::SoftmaxMultiClassObj::GetGradient(xgboost::HostDeviceVector<float> const&, xgboost::MetaInfo const&, int, xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*)+0xa26) [0x7f471c770846]
  [bt] (2) /home/yoshida/.pyenv/versions/3.6.6/lib/python3.6/site-packages/xgboost/./lib/libxgboost.so(xgboost::LearnerImpl::UpdateOneIter(int, xgboost::DMatrix*)+0x345) [0x7f471c610505]
  [bt] (3) /home/yoshida/.pyenv/versions/3.6.6/lib/python3.6/site-packages/xgboost/./lib/libxgboost.so(XGBoosterUpdateOneIter+0x35) [0x7f471c573aa5]
  [bt] (4) /home/yoshida/anaconda3/lib/python3.6/lib-dynload/../../libffi.so.6(ffi_call_unix64+0x4c) [0x7f4758e99ec0]
  [bt] (5) /home/yoshida/anaconda3/lib/python3.6/lib-dynload/../../libffi.so.6(ffi_call+0x22d) [0x7f4758e9987d]
  [bt] (6) /home/yoshida/anaconda3/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(_ctypes_callproc+0x2ce) [0x7f47590afe5e]
  [bt] (7) /home/yoshida/anaconda3/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(+0x13895) [0x7f47590b0895]
  [bt] (8) /home/yoshida/anaconda3/bin/python(_PyObject_FastCallDict+0x8b) [0x55617baf438b]



In [72]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

model = xgb.train(params=params,
                  dtrain=dtrain,
                  num_boost_round=1000, #何個決定木を作るか
                  early_stopping_rounds=5, #予測精度が向上されない場合に、いち早く学習をストップする
                  evals=[(dtest, 'test')] # 評価方法は、'dtest'を用いる
                 )

  if getattr(data, 'base', None) is not None and \


XGBoostError: [11:32:29] /workspace/src/objective/multiclass_obj.cu:110: SoftmaxMultiClassObj: label must be in [0, num_class).
Stack trace:
  [bt] (0) /home/yoshida/.pyenv/versions/3.6.6/lib/python3.6/site-packages/xgboost/./lib/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x24) [0x7f471c576cb4]
  [bt] (1) /home/yoshida/.pyenv/versions/3.6.6/lib/python3.6/site-packages/xgboost/./lib/libxgboost.so(xgboost::obj::SoftmaxMultiClassObj::GetGradient(xgboost::HostDeviceVector<float> const&, xgboost::MetaInfo const&, int, xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*)+0xa26) [0x7f471c770846]
  [bt] (2) /home/yoshida/.pyenv/versions/3.6.6/lib/python3.6/site-packages/xgboost/./lib/libxgboost.so(xgboost::LearnerImpl::UpdateOneIter(int, xgboost::DMatrix*)+0x345) [0x7f471c610505]
  [bt] (3) /home/yoshida/.pyenv/versions/3.6.6/lib/python3.6/site-packages/xgboost/./lib/libxgboost.so(XGBoosterUpdateOneIter+0x35) [0x7f471c573aa5]
  [bt] (4) /home/yoshida/anaconda3/lib/python3.6/lib-dynload/../../libffi.so.6(ffi_call_unix64+0x4c) [0x7f4758e99ec0]
  [bt] (5) /home/yoshida/anaconda3/lib/python3.6/lib-dynload/../../libffi.so.6(ffi_call+0x22d) [0x7f4758e9987d]
  [bt] (6) /home/yoshida/anaconda3/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(_ctypes_callproc+0x2ce) [0x7f47590afe5e]
  [bt] (7) /home/yoshida/anaconda3/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(+0x13895) [0x7f47590b0895]
  [bt] (8) /home/yoshida/anaconda3/bin/python(_PyObject_FastCallDict+0x8b) [0x55617baf438b]

