In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import xgboost as xgb
from matplotlib import pyplot as plt
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials
import warnings
warnings.simplefilter('ignore')

In [2]:
data_dir = "F:/TARGET frontier/TXT33/"
data_name = "train"
tail = "fe5"
train_data = pd.read_csv(data_dir+data_name+tail+".csv",encoding="shift-jis",dtype={'レースID(旧)':str})

In [3]:
pd.set_option('display.max_columns', 100)
train_data.head()

Unnamed: 0,場所,芝・ダ,距離,クラス名,馬場状態,騎手コード,枠番,性別,年齢,キャリア,斤量,間隔,休み明け〜戦目,種牡馬,父タイプ名,調教師コード,前走場所,前芝・ダ,前距離,前クラス名,前走馬場状態,前走騎手コード,前走枠番,前走脚質,前走斤量,前走Ave-3F,前走上り3F,前走PCI3,前走RPCI,前PCI,前好走,前走平均1Fタイム,前走着差タイム,前走単勝オッズ,着差,レースID(旧),複勝オッズ下限,複勝オッズ上限,複勝配当,単勝オッズ
0,2,2,1200,2,3,5212,6,2,3,13,56.0,6.0,5.0,364,8,1105,13,2,1400.0,2,2,660.0,2.0,3,54.0,37.2,33.9,59.53,58.2,59.7,0,11.93,0.2,14.6,0.0,615581112,1.9,2.7,190.0,5.4
1,2,2,1200,2,3,5339,6,1,3,10,54.0,3.0,3.0,281,8,1002,1,2,1200.0,4,3,5339.0,7.0,3,54.0,34.5,34.6,48.8,46.9,49.7,0,11.52,0.0,2.2,0.0,615581111,1.3,1.6,130.0,2.8
2,2,2,1200,2,3,5203,3,2,3,9,56.0,5.0,2.0,387,5,1089,3,2,1200.0,2,3,1018.0,8.0,2,56.0,34.5,34.4,50.13,48.0,50.3,0,11.48,0.4,28.6,0.2,615581106,1.8,2.5,180.0,6.2
3,2,2,1200,2,3,1150,4,1,5,17,55.0,2.0,4.0,56,1,1134,2,2,1200.0,2,2,1150.0,6.0,2,53.0,34.1,34.4,48.17,46.3,49.1,0,11.42,0.0,19.1,0.3,615581108,1.7,2.5,0.0,6.6
4,2,2,1200,2,3,1004,8,1,3,15,54.0,2.0,4.0,193,4,1030,2,2,1200.0,2,2,1134.0,2.0,5,53.0,33.6,36.4,48.17,46.3,42.3,1,11.67,1.5,8.6,0.3,615581116,6.3,9.7,0.0,33.9


In [4]:
#test_data
test_data = pd.read_csv(data_dir+"predict2019"+tail+".csv",encoding="shift-jis",dtype={'レースID(旧)':str})
test_x = test_data.drop(["着差","レースID(旧)","複勝オッズ下限","複勝オッズ上限","複勝配当"],axis=1)
test_y = test_data["着差"]

In [5]:
train_x = train_data.drop(["着差","レースID(旧)","複勝オッズ下限","複勝オッズ上限","複勝配当"],axis=1)
train_y = train_data["着差"]

In [6]:
#ベースライン
params = {
    'booster':'gbtree',
    'objective':'reg:squarederror',
    'eta':0.1,
    'gamma':0.0,
    'alpha':0.0,
    'lambda':1.0,
    'min_child_weight':1,
    'max_depth':6,
    'subsample':0.8,
    'colsample_bytree':0.8,
    'random_state':71,
}

In [7]:
#正解ラベルの欠損値の穴埋め
#これがないとエラー履くので注意
train_y.fillna(1,inplace=True)
test_y.fillna(1,inplace=True)

In [8]:
def score(param_space):
    scores = []
    histories = []
    models = []
    
    for i,j in params.items():
        if i not in param_space:
            param_space[i] = j
    param_space['max_depth'] = int(param_space['max_depth'])
    print(param_space)    
    
    kf = KFold(n_splits=4,shuffle=False)
    for tr_idx,va_idx in kf.split(train_x):
        tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
        tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
        dtrain = xgb.DMatrix(tr_x,label=tr_y)
        dvalid = xgb.DMatrix(va_x,label=va_y)
        watchlist = [(dtrain,"train"),(dvalid,"eval")]
        evals_result = {}
        model = xgb.train(
            param_space,
            dtrain,
            num_boost_round=10000,
            early_stopping_rounds=100,
            evals=watchlist,
            evals_result=evals_result,
            verbose_eval=100
        )
        models.append(model)
        histories.append(evals_result)
        va_pred = model.predict(dvalid)
        va_y.fillna(0.5,inplace=True)
        #print(va_pred[:10])
        #print(va_y[:10])
        score = np.sqrt(mean_squared_error(va_y,va_pred)) #rmse
        scores.append(score)
    rmse_score = np.mean(scores)
    history_opt.append((param_space,rmse_score))
    
    return {"loss": score, "status": STATUS_OK}

In [9]:
#パラメータチューニング
param_space = {

    'min_child_weight': hp.loguniform('min_child_weight', np.log(0.1), np.log(10)),

    'max_depth': hp.quniform('max_depth', 3, 9, 1),

    'subsample': hp.quniform('subsample', 0.6, 0.95, 0.05),

    'colsample_bytree': hp.quniform('colsample_bytree', 0.6, 0.95, 0.05),

    'gamma': hp.loguniform('gamma', np.log(1e-8), np.log(1.0)),

    # 余裕があればalpha, lambdaも調整する

    # 'alpha' : hp.loguniform('alpha', np.log(1e-8), np.log(1.0)),

    # 'lambda' : hp.loguniform('lambda', np.log(1e-6), np.log(10.0)),

}

In [10]:
max_evals = 10
trials = Trials()
history_opt = []
fmin(score,param_space,algo=tpe.suggest,trials=trials,max_evals=max_evals) #パラメータチューニングの実行

{'colsample_bytree': 0.65, 'gamma': 0.013439667332810353, 'max_depth': 5, 'min_child_weight': 2.663317188920419, 'subsample': 0.9, 'booster': 'gbtree', 'objective': 'reg:squarederror', 'eta': 0.1, 'alpha': 0.0, 'lambda': 1.0, 'random_state': 71}
[0]	train-rmse:1.61655	eval-rmse:1.61827                                                                               

Multiple eval metrics have been passed: 'eval-rmse' will be used for early stopping.                                   


Will train until eval-rmse hasn't improved in 50 rounds.                                                               

[100]	train-rmse:1.10196	eval-rmse:1.22578                                                                             

[200]	train-rmse:1.06734	eval-rmse:1.22561                                                                             

Stopping. Best iteration:                                                                                              
[155]	train-rmse:1.08224	eva

{'colsample_bytree': 0.8,
 'gamma': 9.752681049064432e-08,
 'max_depth': 4.0,
 'min_child_weight': 0.6351847375492726,
 'subsample': 0.7000000000000001}

In [11]:
#記録した情報からパラメータとスコアを取得
history_opt = sorted(history_opt,key=lambda tpl: tpl[1])
best = history_opt[0]
print(f"best params:{best[0]}, score:{best[1]:.4f}")

best params:{'colsample_bytree': 0.65, 'gamma': 0.013439667332810353, 'max_depth': 5, 'min_child_weight': 2.663317188920419, 'subsample': 0.9, 'booster': 'gbtree', 'objective': 'reg:squarederror', 'eta': 0.1, 'alpha': 0.0, 'lambda': 1.0, 'random_state': 71}, score:1.1834


In [12]:
history_opt

[({'colsample_bytree': 0.65,
   'gamma': 0.013439667332810353,
   'max_depth': 5,
   'min_child_weight': 2.663317188920419,
   'subsample': 0.9,
   'booster': 'gbtree',
   'objective': 'reg:squarederror',
   'eta': 0.1,
   'alpha': 0.0,
   'lambda': 1.0,
   'random_state': 71},
  1.1834036403467327),
 ({'colsample_bytree': 0.8,
   'gamma': 9.752681049064432e-08,
   'max_depth': 4,
   'min_child_weight': 0.6351847375492726,
   'subsample': 0.7000000000000001,
   'booster': 'gbtree',
   'objective': 'reg:squarederror',
   'eta': 0.1,
   'alpha': 0.0,
   'lambda': 1.0,
   'random_state': 71},
  1.1837693429954388),
 ({'colsample_bytree': 0.8500000000000001,
   'gamma': 0.02053814743838391,
   'max_depth': 5,
   'min_child_weight': 4.7264262229514875,
   'subsample': 0.9,
   'booster': 'gbtree',
   'objective': 'reg:squarederror',
   'eta': 0.1,
   'alpha': 0.0,
   'lambda': 1.0,
   'random_state': 71},
  1.184767132215382),
 ({'colsample_bytree': 0.9,
   'gamma': 2.0734739734271457e-08,
 

In [13]:
#学習結果のプロット
%matplotlib inline
def plot_loss(evals_result, title):
    train_metric = evals_result['train']['rmse']
    plt.plot(train_metric, label='train rmse')
    eval_metric = evals_result['eval']['rmse']
    plt.plot(eval_metric, label='eval rmse')
    plt.grid()
    plt.legend()
    plt.title(title)
    plt.xlabel('rounds')
    plt.ylabel('rmse')
    plt.show()

In [14]:
for i in range(4):
    plot_loss(histories[i],f'Training Dataset: {i}')

NameError: name 'histories' is not defined

In [None]:
test_length = len(test_data)
predict = np.array([0.0]*test_length)
dtest = xgb.DMatrix(test_x,label=test_y) #本来は学習に使用していないデータにする
for i in range(4):
    y_pred_proba = models[i].predict(dtest)
    predict += y_pred_proba

predict /= 4

In [None]:
y = pd.DataFrame(predict,columns=["y'"])
output = test_data #ここも書き換えること

output = pd.concat([output,y],axis=1)

output["複勝配当"] = output["複勝配当"].astype("int64")
output.to_csv("C:/programs/HRP/neweval.csv",index=False,encoding="shift-jis")

In [None]:
len(output)