In [1]:
# 選手情報・過去レース情報から3連単舟券120種をクラス分類する
# こちらではパラメタのベイズ最適化を試みる。

# 汎用ライブラリのimport
import sys
import os
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import statsmodels.api as sm
import math
import tensorflow as tf
import collections

import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# 自作ライブラリのimport
if os.environ['BR_HOME']+"/boatrace" not in sys.path:
    sys.path.append(os.environ['BR_HOME']+"/boatrace")
#print(sys.path)

from setup.myUtil import dbHandler


In [3]:
# 分析期間の指定は一旦ここでまとめてみる。
trainStartDate="20180101"
trainEndDate="20180731"
# test はtrainからsplitする

In [4]:
dbh=dbHandler.getDBHandle()
#dbHandler.closeDBHandle(dbh)

In [5]:
# trainの元データを取得
with dbh.cursor() as cursor:
    sel_sql = "select * from raceabst_forml_v \
               where raceDate between '%s' and '%s' \
               order by raceId "\
               % (trainStartDate,trainEndDate)
    cursor.execute(sel_sql)
    loadList=cursor.fetchall()
print("traindata:",len(loadList))

traindata: 31824


In [6]:
df = pd.io.json.json_normalize(loadList)
df.head()

Unnamed: 0,funaken,l1Fcnt,l1boat2r,l1boat3r,l1motor2r,l1motor3r,l1rank,l1score,l2Fcnt,l2boat2r,...,l6boat3r,l6motor2r,l6motor3r,l6rank,l6score,odds,raceDate,raceId,raceWaveHeight,raceWindSpeed
0,2-1-3,1,0.3636,0.5152,0.3558,0.5092,A1,13.871549,0,0.4746,...,0.4545,0.5421,0.6789,B2,0.851669,30.9,2018-01-01,20180101-06-01,3,5.0
1,2-1-6,0,0.2462,0.4462,0.3466,0.483,B1,3.409304,0,0.3636,...,0.4925,0.3392,0.5088,B1,0.886552,26.2,2018-01-01,20180101-06-02,3,5.0
2,3-5-4,0,0.3692,0.5231,0.3135,0.4108,B2,2.836839,0,0.2714,...,0.4091,0.3472,0.487,B1,0.580764,99.2,2018-01-01,20180101-06-03,3,5.0
3,1-5-2,0,0.4308,0.6308,0.3676,0.5514,A2,7.488113,0,0.3636,...,0.3939,0.3176,0.4706,B1,0.583266,16.8,2018-01-01,20180101-06-04,3,5.0
4,4-1-3,0,0.3636,0.5152,0.4469,0.5754,B1,6.454111,0,0.3231,...,0.4925,0.3916,0.5663,B2,0.470489,51.5,2018-01-01,20180101-06-05,3,5.0


In [37]:
# 入力のデータ整形
xdf=df.drop(['funaken','odds','raceId','raceDate'],axis=1)
#xdf=pd.get_dummies(xdf,columns=['l1rank','l2rank','l3rank','l4rank','l5rank','l6rank'])
rankLabel=LabelEncoder()
rankLabel=rankLabel.fit(xdf['l1rank'])
xdf['l1rank']=rankLabel.transform(xdf['l1rank'])
xdf['l2rank']=rankLabel.transform(xdf['l2rank'])
xdf['l3rank']=rankLabel.transform(xdf['l3rank'])
xdf['l4rank']=rankLabel.transform(xdf['l4rank'])
xdf['l5rank']=rankLabel.transform(xdf['l5rank'])
xdf['l6rank']=rankLabel.transform(xdf['l6rank'])
xdf.head()


Unnamed: 0,l1Fcnt,l1boat2r,l1boat3r,l1motor2r,l1motor3r,l1rank,l1score,l2Fcnt,l2boat2r,l2boat3r,...,l5score,l6Fcnt,l6boat2r,l6boat3r,l6motor2r,l6motor3r,l6rank,l6score,raceWaveHeight,raceWindSpeed
0,1,0.3636,0.5152,0.3558,0.5092,0,13.871549,0,0.4746,0.661,...,0.827756,0,0.3182,0.4545,0.5421,0.6789,3,0.851669,3,5.0
1,0,0.2462,0.4462,0.3466,0.483,2,3.409304,0,0.3636,0.5758,...,0.671987,0,0.3284,0.4925,0.3392,0.5088,2,0.886552,3,5.0
2,0,0.3692,0.5231,0.3135,0.4108,3,2.836839,0,0.2714,0.5,...,4.669515,0,0.303,0.4091,0.3472,0.487,2,0.580764,3,5.0
3,0,0.4308,0.6308,0.3676,0.5514,1,7.488113,0,0.3636,0.5303,...,1.435647,0,0.2121,0.3939,0.3176,0.4706,2,0.583266,3,5.0
4,0,0.3636,0.5152,0.4469,0.5754,2,6.454111,0,0.3231,0.6154,...,1.682992,0,0.3582,0.4925,0.3916,0.5663,3,0.470489,3,5.0


In [38]:
# 結果のOne-Hot表現を作る⇒LGBMは数値配列なので数字にする。
ydf=df['funaken']
yLabel = LabelEncoder()
yLabel = yLabel.fit(ydf)
#ydf = pd.DataFrame(yLabel.transform(ydf))
ydf = yLabel.transform(ydf)
#ydf=pd.get_dummies(ydf,columns=['funaken'])
#ydf.head()
#ydf.describe()

In [39]:
# 重み付けのため、オッズのリストを作る
odf=df['odds'].values
#odf=pd.DataFrame(df['odds'])
#odf.describe()
print(type(odf))

<class 'numpy.ndarray'>


In [40]:
X_train, X_test, y_train, y_test,o_train,o_test = train_test_split(xdf, ydf,odf)

In [41]:
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

In [60]:
lgbm_params = {
    # 多値分類問題
    'objective': 'multiclass',
    # クラス数は 120
    'num_class': 120,
    #'class_weight':'balanced',
    'random_state':999,
    # 以下、ハイパーパラメタ
    'max_depth':7,
    'num_leaves':31,
    # 正則化
    'reg_alpha':10,
    'reg_lambda':10,
}


In [61]:
# デフォルトを見ているだけ。
lgb.LGBMClassifier()

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [62]:
model = lgb.train(lgbm_params, lgb_train, valid_sets=lgb_eval,num_boost_round=200,early_stopping_rounds=10)


[1]	valid_0's multi_logloss: 4.6636
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's multi_logloss: 4.57453
[3]	valid_0's multi_logloss: 4.50309
[4]	valid_0's multi_logloss: 4.44487
[5]	valid_0's multi_logloss: 4.39447
[6]	valid_0's multi_logloss: 4.35071
[7]	valid_0's multi_logloss: 4.31269
[8]	valid_0's multi_logloss: 4.27893
[9]	valid_0's multi_logloss: 4.24893
[10]	valid_0's multi_logloss: 4.22228
[11]	valid_0's multi_logloss: 4.19859
[12]	valid_0's multi_logloss: 4.17685
[13]	valid_0's multi_logloss: 4.1571
[14]	valid_0's multi_logloss: 4.13867
[15]	valid_0's multi_logloss: 4.12262
[16]	valid_0's multi_logloss: 4.10692
[17]	valid_0's multi_logloss: 4.09308
[18]	valid_0's multi_logloss: 4.08055
[19]	valid_0's multi_logloss: 4.06839
[20]	valid_0's multi_logloss: 4.05734
[21]	valid_0's multi_logloss: 4.04742
[22]	valid_0's multi_logloss: 4.03812
[23]	valid_0's multi_logloss: 4.02896
[24]	valid_0's multi_logloss: 4.02098
[25]	valid_0's multi_logloss: 4.01356

In [67]:
y_pred = model.predict(X_test, num_iteration=model.best_iteration)
y_pred_max = np.argmax(y_pred, axis=1)  # 最尤と判断したクラスの値にする

# 精度 (Accuracy) を計算する
accuracy = sum(y_test == y_pred_max) / len(y_test)
print("accuracy:",accuracy)

# 回収率を計算
res=0
resTrueList =[]
resFalseList =[]
for i in range(len(y_test)):

    if y_test[i]==y_pred_max[i]:
        print("i:",i,"result:",y_test[i],"forecast:",y_pred_max[i],"forecastProb:",y_pred[i][y_pred_max[i]],"return:",o_test[i],"expect:",y_pred[i][y_pred_max[i]]*o_test[i])
        res += o_test[i] -1
        resTrueList.append(y_pred_max[i])
    else:
        #print("i:",i,"result:",y_test[i],"forecast:",y_pred_max[i],"forecastProb:",y_pred[i][y_pred_max[i]],"return:",o_test[i],"expect:",y_pred[i][y_pred_max[i]]*o_test[i])
        resFalseList.append(y_pred_max[i])
        pass
print("resultReturn:",res/len(y_test))


accuracy: 0.08484162895927602
i: 0 result: 0 forecast: 0 forecastProb: 0.16578988965413277 return: 3.6 expect: 0.596843602754878
i: 3 result: 1 forecast: 1 forecastProb: 0.1319685608233191 return: 19.2 expect: 2.5337963678077267
i: 24 result: 0 forecast: 0 forecastProb: 0.2238963304863558 return: 3.7 expect: 0.8284164227995164
i: 38 result: 2 forecast: 2 forecastProb: 0.15785835177838473 return: 5.4 expect: 0.8524350996032776
i: 47 result: 0 forecast: 0 forecastProb: 0.12646068334159252 return: 5.8 expect: 0.7334719633812367
i: 56 result: 0 forecast: 0 forecastProb: 0.13001073156494458 return: 10.5 expect: 1.365112681431918
i: 70 result: 0 forecast: 0 forecastProb: 0.16760652449324312 return: 4.9 expect: 0.8212719700168913
i: 80 result: 21 forecast: 21 forecastProb: 0.12186758134320927 return: 12.8 expect: 1.5599050411930788
i: 91 result: 0 forecast: 0 forecastProb: 0.08520357904319913 return: 7.6 expect: 0.6475472007283133
i: 123 result: 0 forecast: 0 forecastProb: 0.12315814986765522

i: 2672 result: 0 forecast: 0 forecastProb: 0.17204810014587016 return: 11.7 expect: 2.0129627717066807
i: 2689 result: 0 forecast: 0 forecastProb: 0.158736278554746 return: 5.8 expect: 0.9206704156175268
i: 2693 result: 7 forecast: 7 forecastProb: 0.07473535035365182 return: 12.2 expect: 0.9117712743145522
i: 2698 result: 0 forecast: 0 forecastProb: 0.10815630202578747 return: 7.7 expect: 0.8328035255985635
i: 2725 result: 1 forecast: 1 forecastProb: 0.09585231160246373 return: 7.4 expect: 0.7093071058582316
i: 2727 result: 0 forecast: 0 forecastProb: 0.145752239299542 return: 4.7 expect: 0.6850355247078475
i: 2756 result: 1 forecast: 1 forecastProb: 0.12415739017932145 return: 4.5 expect: 0.5587082558069465
i: 2764 result: 43 forecast: 43 forecastProb: 0.04328904697739534 return: 21.4 expect: 0.9263856053162602
i: 2790 result: 4 forecast: 4 forecastProb: 0.1521285095857371 return: 6.5 expect: 0.988835312307291
i: 2791 result: 8 forecast: 8 forecastProb: 0.0728595937231062 return: 12.

i: 5839 result: 0 forecast: 0 forecastProb: 0.13128498996040056 return: 6.6 expect: 0.8664809337386437
i: 5840 result: 9 forecast: 9 forecastProb: 0.11498029077336067 return: 10.3 expect: 1.184296994965615
i: 5857 result: 1 forecast: 1 forecastProb: 0.13575117038946363 return: 6.9 expect: 0.9366830756872991
i: 5870 result: 0 forecast: 0 forecastProb: 0.1202271376851844 return: 16.6 expect: 1.995770485574061
i: 5882 result: 6 forecast: 6 forecastProb: 0.10359070094558814 return: 7.9 expect: 0.8183665374701463
i: 5905 result: 0 forecast: 0 forecastProb: 0.10428834833489782 return: 6.7 expect: 0.6987319338438154
i: 5920 result: 3 forecast: 3 forecastProb: 0.0735846300343698 return: 15.8 expect: 1.162637154543043
i: 5935 result: 4 forecast: 4 forecastProb: 0.0705970381473503 return: 11.7 expect: 0.8259853463239984
i: 5941 result: 5 forecast: 5 forecastProb: 0.11189486646452537 return: 8.3 expect: 0.9287273916555606
i: 5942 result: 1 forecast: 1 forecastProb: 0.1043809192148785 return: 10.7

In [75]:
ct=collections.Counter(resTrueList)
cf=collections.Counter(resFalseList)
print(ct)
print(len(resTrueList))
print(cf)
print(len(resFalseList))

Counter({0: 231, 1: 109, 5: 42, 2: 40, 8: 38, 4: 36, 3: 30, 7: 20, 6: 19, 9: 16, 11: 10, 21: 9, 10: 7, 12: 7, 22: 6, 20: 6, 23: 5, 42: 5, 41: 5, 13: 4, 50: 3, 46: 3, 14: 3, 18: 2, 17: 2, 15: 2, 29: 2, 51: 1, 26: 1, 43: 1, 45: 1, 55: 1, 81: 1, 101: 1, 77: 1, 70: 1, 47: 1, 100: 1, 72: 1, 62: 1})
675
Counter({0: 1629, 1: 823, 5: 508, 4: 451, 8: 393, 2: 350, 3: 348, 7: 321, 6: 270, 9: 237, 10: 156, 20: 141, 12: 136, 11: 110, 21: 107, 22: 97, 23: 95, 15: 91, 42: 83, 25: 66, 41: 64, 26: 59, 29: 47, 16: 47, 45: 44, 30: 43, 13: 41, 62: 38, 46: 30, 47: 27, 72: 24, 18: 23, 32: 22, 43: 21, 50: 21, 14: 20, 36: 17, 17: 16, 55: 15, 66: 15, 75: 15, 24: 15, 51: 12, 85: 12, 73: 12, 49: 11, 77: 11, 67: 10, 33: 10, 37: 10, 70: 9, 28: 8, 81: 8, 40: 7, 19: 7, 31: 6, 27: 6, 61: 6, 101: 6, 53: 5, 52: 5, 65: 5, 63: 5, 38: 4, 60: 4, 100: 4, 64: 3, 56: 2, 57: 2, 44: 2, 59: 2, 92: 1, 48: 1, 69: 1, 54: 1, 83: 1, 68: 1, 96: 1, 104: 1, 105: 1, 80: 1, 71: 1})
7281


In [64]:
# trainの回収率を計算
y_pred = model.predict(X_train, num_iteration=model.best_iteration)
y_pred_max = np.argmax(y_pred, axis=1)  # 最尤と判断したクラスの値にする

#print(X_train.head())
#print(y_train[0:5])
#print(y_pred_max[0:5])
#c = collections.Counter(y_pred_max)
#print(len(c) )

# 精度 (Accuracy) を計算する
accuracy = sum(y_train == y_pred_max) / len(y_train)
print(accuracy)

# 回収率を計算
res=0
for i in range(len(y_train)):
    if y_train[i]==y_pred_max[i]:
        res += o_train[i] -1
    else:
        pass
print(res/len(y_train))


0.27124183006535946
3.9283517680576496


In [65]:
# Feature Importance
fti = model.feature_importance()

print('Feature Importances:')
print(fti )

Feature Importances:
[  203  2895  2361  2537  2664   743 10072   259  2792  2532  2702  2854
   855  9952   283  2433  2694  3050  2748  1102  9745   324  2583  2744
  3111  2652   842 11187   351  3004  3174  3065  2740  1048 11452   418
  2858  2825  2738  2578   930 10752  1824  1618]
