In [1]:
# 選手情報・過去レース情報から3連単舟券120種をクラス分類する
# こちらではパラメタのベイズ最適化を試みる。
# スタートタイムや過去の連対率。逆に、オッズは消してみる。

# 汎用ライブラリのimport
import sys
import os
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import statsmodels.api as sm
import math
import tensorflow as tf
import collections

import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from bayes_opt import BayesianOptimization
from sklearn.model_selection import StratifiedKFold
from scipy.stats import rankdata
from sklearn import metrics
import warnings

In [2]:
# Make the project-local "boatrace" package importable.
# BR_HOME must be set in the environment; a missing variable raises KeyError.
# The membership check keeps sys.path free of duplicates when the cell is re-run.
_boatrace_dir = os.environ['BR_HOME'] + "/boatrace"
if _boatrace_dir not in sys.path:
    sys.path.append(_boatrace_dir)

from setup.myUtil import dbHandler


In [3]:
# Analysis period, kept in one place: YYYYMMDD strings bounding the training rows.
# No separate test period is defined; the test data is split off from the training data.
trainStartDate, trainEndDate = "20170901", "20181231"

In [4]:
# Obtain a database handle from the project-local dbHandler helper.
# The following cell opens a cursor on this handle to load the training rows.
dbh=dbHandler.getDBHandle()
# NOTE(review): the handle is never closed in the visible cells — presumably
# the call below should run once loading is finished; confirm.
#dbHandler.closeDBHandle(dbh)

In [5]:
# Fetch the raw training data: every column of raceabst_forml_rentai_v for races
# whose raceDate lies between trainStartDate and trainEndDate, ordered by raceId.
# NOTE(review): the dates are spliced into the SQL text with %-formatting. That is
# harmless for the notebook-level constants used here, but switch to driver-side
# parameter binding if these values ever come from outside the notebook.
with dbh.cursor() as cursor:
    sel_sql = "select * from raceabst_forml_rentai_v \
               where raceDate between '%s' and '%s' \
               order by raceId "\
               % (trainStartDate,trainEndDate)
    cursor.execute(sel_sql)
    loadList=cursor.fetchall()
# Report how many rows were loaded.
print("traindata:",len(loadList))

traindata: 71699


In [6]:
# Build a DataFrame from the fetched rows.
# pandas 1.0 promoted json_normalize to the top-level namespace and deprecated the
# pandas.io.json spelling, so prefer pd.json_normalize and fall back to the old
# location only on pandas versions that predate it.
if hasattr(pd, "json_normalize"):
    df = pd.json_normalize(loadList)
else:
    df = pd.io.json.json_normalize(loadList)
df.head()

Unnamed: 0,funaken,l1Fcnt,l1boat2r,l1boat3r,l1motor2r,l1motor3r,l1oldavgstdev,l1oldavgsttime,l1oldrank1,l1oldrank2,...,l6oldrank4,l6oldrank5,l6oldrank6,l6rank,l6score,odds,raceDate,raceId,raceWaveHeight,raceWindSpeed
0,1-2-5,0,0.2632,0.3684,0.3158,0.4737,-0.02765,0.129,0.26,0.25,...,0.1,0.06,0.09,A1,2.973712,22.8,2017-09-01,20170901-05-01,3,6.0
1,2-3-6,0,0.4737,0.7368,0.2963,0.4074,-0.06915,0.1699,0.41,0.15,...,0.14,0.13,0.08,A1,4.878168,92.7,2017-09-01,20170901-05-02,3,4.0
2,3-1-6,0,0.35,0.65,0.5556,0.5556,-0.02338,0.1289,0.42,0.27,...,0.11,0.11,0.02,A1,0.877197,54.5,2017-09-01,20170901-05-03,2,4.0
3,1-5-2,0,0.4444,0.5556,0.2667,0.4,-0.01867,0.1476,0.38,0.27,...,0.16,0.15,0.09,A1,1.284942,13.1,2017-09-01,20170901-05-04,3,5.0
4,1-2-4,0,0.3,0.55,0.3077,0.5,0.00234,0.15212,0.25,0.28,...,0.17,0.1,0.05,A1,2.240696,12.0,2017-09-01,20170901-05-05,2,3.0


In [7]:
# Shape the model inputs.
# Per-lane column names for lanes 1..6.
rankCols = ['l%drank' % lane for lane in range(1, 7)]
scoreCols = ['l%dscore' % lane for lane in range(1, 7)]

# Drop the label (funaken), the odds and the race identifiers from the features.
xdf = df.drop(['funaken', 'odds', 'raceId', 'raceDate'], axis=1)
# The scores are derived from the odds and dominate the model, so drop them as well.
xdf = xdf.drop(scoreCols, axis=1)
# Alternatives that were tried and left disabled: also dropping the l*Fcnt or
# l*oldavgstdev columns, or one-hot encoding the rank columns with pd.get_dummies.

# Encode the rank strings (e.g. 'A1') as integers with one encoder shared by all lanes.
# Fit on the values of all six lanes: an encoder fitted on lane 1 alone raises
# ValueError in transform() as soon as another lane holds a rank that never occurs
# in lane 1. When lane 1 already contains every rank, the resulting codes are unchanged.
rankLabel = LabelEncoder()
rankLabel = rankLabel.fit(xdf[rankCols].values.ravel())
for rankCol in rankCols:
    xdf[rankCol] = rankLabel.transform(xdf[rankCol])
xdf.head()


Unnamed: 0,l1Fcnt,l1boat2r,l1boat3r,l1motor2r,l1motor3r,l1oldavgstdev,l1oldavgsttime,l1oldrank1,l1oldrank2,l1oldrank3,...,l6oldavgsttime,l6oldrank1,l6oldrank2,l6oldrank3,l6oldrank4,l6oldrank5,l6oldrank6,l6rank,raceWaveHeight,raceWindSpeed
0,0,0.2632,0.3684,0.3158,0.4737,-0.02765,0.129,0.26,0.25,0.15,...,0.1587,0.28,0.21,0.26,0.1,0.06,0.09,0,3,6.0
1,0,0.4737,0.7368,0.2963,0.4074,-0.06915,0.1699,0.41,0.15,0.21,...,0.1307,0.23,0.22,0.2,0.14,0.13,0.08,0,3,4.0
2,0,0.35,0.65,0.5556,0.5556,-0.02338,0.1289,0.42,0.27,0.11,...,0.1754,0.33,0.23,0.2,0.11,0.11,0.02,0,2,4.0
3,0,0.4444,0.5556,0.2667,0.4,-0.01867,0.1476,0.38,0.27,0.12,...,0.1642,0.23,0.22,0.15,0.16,0.15,0.09,0,3,5.0
4,0,0.3,0.55,0.3077,0.5,0.00234,0.15212,0.25,0.28,0.18,...,0.1508,0.34,0.19,0.15,0.17,0.1,0.05,0,2,3.0


In [8]:
# Build the target. LightGBM takes a numeric label array rather than a one-hot
# matrix, so each funaken string (e.g. "1-2-5") is mapped to an integer class id.
# yLabel is kept so the ids can be mapped back to ticket strings later.
yLabel = LabelEncoder().fit(df['funaken'])
ydf = pd.DataFrame(yLabel.transform(df['funaken']))
ydf.head()

Unnamed: 0,0
0,2
1,27
2,43
3,12
4,1


In [9]:
# Keep the odds as a one-column DataFrame, intended for use as weights.
_odds_column = df['odds']
odf = pd.DataFrame(_odds_column)
print(type(odf))

<class 'pandas.core.frame.DataFrame'>


In [10]:
# Fixed train/validation index split used by every optimisation step:
# the first fold of a seeded, shuffled, stratified 2-fold split.
_bayesian_splitter = StratifiedKFold(n_splits=2, shuffle=True, random_state=1)
bayesian_tr_index, bayesian_val_index = next(_bayesian_splitter.split(xdf, ydf))


In [11]:
def LGB_bayesian(
    num_leaves, #int
    min_data_in_leaf, #int
    reg_alpha,
    reg_lambda,
    max_depth #int
):
    """Objective function for the Bayesian optimisation of LightGBM hyper-parameters.

    Trains a 120-class multiclass LightGBM model on the rows selected by
    ``bayesian_tr_index`` and evaluates it on the rows selected by
    ``bayesian_val_index`` (both index into the notebook-level ``xdf`` / ``ydf``).

    Parameters
    ----------
    num_leaves, min_data_in_leaf, max_depth : float
        Proposed by the optimiser as floats; truncated to int before use.
    reg_alpha, reg_lambda : float
        Regularisation strengths, passed through unchanged.

    Returns
    -------
    float
        ``1 / best validation multi_logloss``. The optimiser maximises its
        target, so a lower log-loss yields a higher score.
    """
    # These parameters must be integers; the optimiser samples continuous values.
    num_leaves = int(num_leaves)
    min_data_in_leaf = int(min_data_in_leaf)
    max_depth = int(max_depth)

    params = {
        # Multiclass classification over the 120 possible tickets.
        'objective': 'multiclass',
        'num_class': 120,
        # Hyper-parameters under optimisation.
        'max_depth': max_depth,
        'num_leaves': num_leaves,
        'min_data_in_leaf': min_data_in_leaf,
        # Regularisation.
        'reg_alpha': reg_alpha,
        'reg_lambda': reg_lambda,
    }

    xg_train = lgb.Dataset(xdf.iloc[bayesian_tr_index], ydf.iloc[bayesian_tr_index])
    xg_valid = lgb.Dataset(xdf.iloc[bayesian_val_index], ydf.iloc[bayesian_val_index])

    # Upper bound on boosting rounds. This used to be specified twice: 5000 as the
    # train() argument and 500 as 'num_boost_round' inside params. LightGBM gives the
    # params entry precedence, so 500 was the effective cap; state it once here.
    num_round = 500
    evals_result = {}
    # NOTE(review): the verbose_eval / early_stopping_rounds / evals_result keyword
    # arguments were replaced by callbacks in newer LightGBM releases — confirm the
    # installed version before upgrading.
    lgb.train(
        params,
        xg_train,
        num_round,
        valid_sets=[xg_valid],
        verbose_eval=250,
        early_stopping_rounds=50,
        evals_result=evals_result,
    )

    # Best (lowest) validation log-loss seen during training. The validation-set
    # prediction that was previously computed here was never used, so it is gone.
    best_logloss = min(evals_result['valid_0']['multi_logloss'])
    print(best_logloss)

    return 1 / best_logloss

In [12]:
# Search space for the Bayesian optimisation: (lower, upper) bounds per parameter.
# max_depth, min_data_in_leaf and num_leaves are truncated to int inside LGB_bayesian.
bounds_LGB = dict(
    max_depth=(2, 15),
    min_data_in_leaf=(0, 300),
    num_leaves=(3, 20),
    reg_alpha=(0, 10.0),
    reg_lambda=(0, 10.0),
)

In [13]:
# Seed the optimiser so that the random initial probes, and therefore the whole
# search, are reproducible when the notebook is re-run from a fresh kernel.
LGB_BO = BayesianOptimization(LGB_bayesian, bounds_LGB, random_state=13)
print(LGB_BO.space.keys)

['max_depth', 'min_data_in_leaf', 'num_leaves', 'reg_alpha', 'reg_lambda']


In [None]:
# Optimisation budget: random initial probes, then guided iterations.
init_points, n_iter = 5, 30

# Silence all warnings for the duration of the search only.
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    LGB_BO.maximize(
        init_points=init_points,
        n_iter=n_iter,
        acq='ucb',
        xi=0.0,
        alpha=1e-6,
    )

|   iter    |  target   | max_depth | min_da... | num_le... | reg_alpha | reg_la... |
-------------------------------------------------------------------------------------
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[192]	valid_0's multi_logloss: 3.98067
3.98067150302112
| [0m 1       [0m | [0m 0.2512  [0m | [0m 2.516   [0m | [0m 161.0   [0m | [0m 19.82   [0m | [0m 4.715   [0m | [0m 6.368   [0m |
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[114]	valid_0's multi_logloss: 3.98686
3.98686063714877
| [0m 2       [0m | [0m 0.2508  [0m | [0m 3.117   [0m | [0m 267.2   [0m | [0m 16.9    [0m | [0m 5.463   [0m | [0m 8.28    [0m |
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[79]	valid_0's multi_logloss: 4.00373
4.003733487789074
| [0m 3       [0m | [0m 0.2498  [0m | [0m 13.6    [0m | [0m 169.6   [0m | [0