In [1]:
# 選手情報・過去レース情報から3連単舟券120種をクラス分類する
# LGBMでやってみる。
# todo:bayseの方と、インプットの与え方をあわせる。今はOne-Hotが一致していない。

# 汎用ライブラリのimport
import sys
import os
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import statsmodels.api as sm
import math
import tensorflow as tf
import collections

import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# 自作ライブラリのimport
if os.environ['BR_HOME']+"/boatrace" not in sys.path:
    sys.path.append(os.environ['BR_HOME']+"/boatrace")
#print(sys.path)

from setup.myUtil import dbHandler


In [3]:
# 分析期間の指定は一旦ここでまとめてみる。
trainStartDate="20180101"
trainEndDate="20180731"
# test はtrainからsplitする

In [4]:
dbh=dbHandler.getDBHandle()
#dbHandler.closeDBHandle(dbh)

In [5]:
# trainの元データを取得
with dbh.cursor() as cursor:
    sel_sql = "select * from raceabst_forml_v \
               where raceDate between '%s' and '%s' \
               order by raceId "\
               % (trainStartDate,trainEndDate)
    cursor.execute(sel_sql)
    loadList=cursor.fetchall()
print("traindata:",len(loadList))

traindata: 31824


In [6]:
df = pd.io.json.json_normalize(loadList)
df.head()

Unnamed: 0,funaken,l1Fcnt,l1boat2r,l1boat3r,l1motor2r,l1motor3r,l1rank,l1score,l2Fcnt,l2boat2r,...,l6boat3r,l6motor2r,l6motor3r,l6rank,l6score,odds,raceDate,raceId,raceWaveHeight,raceWindSpeed
0,2-1-3,1,0.3636,0.5152,0.3558,0.5092,A1,13.871549,0,0.4746,...,0.4545,0.5421,0.6789,B2,0.851669,30.9,2018-01-01,20180101-06-01,3,5.0
1,2-1-6,0,0.2462,0.4462,0.3466,0.483,B1,3.409304,0,0.3636,...,0.4925,0.3392,0.5088,B1,0.886552,26.2,2018-01-01,20180101-06-02,3,5.0
2,3-5-4,0,0.3692,0.5231,0.3135,0.4108,B2,2.836839,0,0.2714,...,0.4091,0.3472,0.487,B1,0.580764,99.2,2018-01-01,20180101-06-03,3,5.0
3,1-5-2,0,0.4308,0.6308,0.3676,0.5514,A2,7.488113,0,0.3636,...,0.3939,0.3176,0.4706,B1,0.583266,16.8,2018-01-01,20180101-06-04,3,5.0
4,4-1-3,0,0.3636,0.5152,0.4469,0.5754,B1,6.454111,0,0.3231,...,0.4925,0.3916,0.5663,B2,0.470489,51.5,2018-01-01,20180101-06-05,3,5.0


In [7]:
# 入力のデータ整形
xdf=df.drop(['funaken','odds','raceId','raceDate'],axis=1)
xdf=pd.get_dummies(xdf,columns=['l1rank','l2rank','l3rank','l4rank','l5rank','l6rank'])
xdf.head()


Unnamed: 0,l1Fcnt,l1boat2r,l1boat3r,l1motor2r,l1motor3r,l1score,l2Fcnt,l2boat2r,l2boat3r,l2motor2r,...,l4rank_B1,l4rank_B2,l5rank_A1,l5rank_A2,l5rank_B1,l5rank_B2,l6rank_A1,l6rank_A2,l6rank_B1,l6rank_B2
0,1,0.3636,0.5152,0.3558,0.5092,13.871549,0,0.4746,0.661,0.3141,...,1,0,0,0,1,0,0,0,0,1
1,0,0.2462,0.4462,0.3466,0.483,3.409304,0,0.3636,0.5758,0.3132,...,0,0,0,0,0,1,0,0,1,0
2,0,0.3692,0.5231,0.3135,0.4108,2.836839,0,0.2714,0.5,0.3333,...,0,0,1,0,0,0,0,0,1,0
3,0,0.4308,0.6308,0.3676,0.5514,7.488113,0,0.3636,0.5303,0.2809,...,0,1,0,0,1,0,0,0,1,0
4,0,0.3636,0.5152,0.4469,0.5754,6.454111,0,0.3231,0.6154,0.329,...,0,0,1,0,0,0,0,0,0,1


In [8]:
# 結果のOne-Hot表現を作る
ydf=df['funaken']
yLabel = LabelEncoder()
yLabel = yLabel.fit(ydf)
#ydf = pd.DataFrame(yLabel.transform(ydf))
ydf = yLabel.transform(ydf)
#ydf=pd.get_dummies(ydf,columns=['funaken'])
#ydf.head()
#ydf.describe()

In [9]:
# 重み付けのため、オッズのリストを作る
odf=df['odds'].values
#odf=pd.DataFrame(df['odds'])
#odf.describe()
print(type(odf))

<class 'numpy.ndarray'>


In [10]:
X_train, X_test, y_train, y_test,o_train,o_test = train_test_split(xdf, ydf,odf)

In [11]:
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

In [12]:
lgbm_params = {
    # 多値分類問題
    'objective': 'multiclass',
    # クラス数は 120
    'num_class': 120,
    #'class_weight':'balanced',
    'random_state':999,
    # 以下、ハイパーパラメタ
    'max_depth':3,
    'num_leaves':19,
    # 正則化
    'reg_alpha':9.591,
    'reg_lambda':9.928 ,
}


In [13]:
lgb.LGBMClassifier()

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [14]:
model = lgb.train(lgbm_params, lgb_train, valid_sets=lgb_eval)

[1]	valid_0's multi_logloss: 4.67141
[2]	valid_0's multi_logloss: 4.5855
[3]	valid_0's multi_logloss: 4.51718
[4]	valid_0's multi_logloss: 4.46016
[5]	valid_0's multi_logloss: 4.411
[6]	valid_0's multi_logloss: 4.36869
[7]	valid_0's multi_logloss: 4.33168
[8]	valid_0's multi_logloss: 4.29864
[9]	valid_0's multi_logloss: 4.26918
[10]	valid_0's multi_logloss: 4.2432
[11]	valid_0's multi_logloss: 4.21928
[12]	valid_0's multi_logloss: 4.19781
[13]	valid_0's multi_logloss: 4.17814
[14]	valid_0's multi_logloss: 4.15991
[15]	valid_0's multi_logloss: 4.14304
[16]	valid_0's multi_logloss: 4.12778
[17]	valid_0's multi_logloss: 4.11392
[18]	valid_0's multi_logloss: 4.10109
[19]	valid_0's multi_logloss: 4.08927
[20]	valid_0's multi_logloss: 4.07777
[21]	valid_0's multi_logloss: 4.06755
[22]	valid_0's multi_logloss: 4.05798
[23]	valid_0's multi_logloss: 4.049
[24]	valid_0's multi_logloss: 4.0408
[25]	valid_0's multi_logloss: 4.03295
[26]	valid_0's multi_logloss: 4.02572
[27]	valid_0's multi_logloss

In [15]:
y_pred = model.predict(X_test, num_iteration=model.best_iteration)
y_pred_max = np.argmax(y_pred, axis=1)  # 最尤と判断したクラスの値にする

# 精度 (Accuracy) を計算する
accuracy = sum(y_test == y_pred_max) / len(y_test)
print("accuracy:",accuracy)

# 回収率を計算
res=0
for i in range(len(y_test)):

    if y_test[i]==y_pred_max[i]:
        #print("i:",i,"result:",y_test[i],"forecast:",y_pred_max[i],"forecastProb:",y_pred[i][y_pred_max[i]],"return:",o_test[i],"expect:",y_pred[i][y_pred_max[i]]*o_test[i])
        res += o_test[i] -1
    else:
        #print("i:",i,"result:",y_test[i],"forecast:",y_pred_max[i],"forecastProb:",y_pred[i][y_pred_max[i]],"return:",o_test[i],"expect:",y_pred[i][y_pred_max[i]]*o_test[i])
        pass
print("resultReturn:",res/len(y_test))


accuracy: 0.08597285067873303
resultReturn: 0.6967068878833582


In [16]:
# trainの回収率を計算
y_pred = model.predict(X_train, num_iteration=model.best_iteration)
y_pred_max = np.argmax(y_pred, axis=1)  # 最尤と判断したクラスの値にする

#print(X_train.head())
#print(y_train[0:5])
#print(y_pred_max[0:5])
#c = collections.Counter(y_pred_max)
#print(len(c) )

# 精度 (Accuracy) を計算する
accuracy = sum(y_train == y_pred_max) / len(y_train)
print(accuracy)

# 回収率を計算
res=0
for i in range(len(y_train)):
    if y_train[i]==y_pred_max[i]:
        res += o_train[i] -1
    else:
        pass
print(res/len(y_train))


0.15418133065191889
1.7268686106921416


In [17]:
print(model.feature_importance())

[  90  835 1040 1027  960 5471  127 1223  915  853 1128 5577   68  889
  999 1056 1204 5295   74 1052  744 1054  964 6648   70 1188 1212 1143
 1173 7272   97  935 1041  952 1019 6913  578  493   56   49  165   32
  117   79  193   43  168  125  276   31  138   94   91   92   79   58
   61  243  172  114  108  144]
