# 競馬予測

## 0. colabの環境を整える

### 0-1. git clone

In [None]:
# Clone the keiba_prediction repository from GitHub (shell escape).
!git clone https://github.com/yuugo0724/keiba_prediction.git

### 0-2. 作業ブランチの作成

In [None]:
# Move into the cloned directory.
%cd ./keiba_prediction

# Check that the current branch is main.
!git branch
# Create the working branch.
# NOTE(review): [作業ブランチ名] looks like a placeholder for the working-branch
# name — presumably it must be replaced before running; confirm.
!git branch [作業ブランチ名]
# Check out the working branch.
!git checkout [作業ブランチ名]
# Confirm that the working branch is now checked out.
!git branch

### 0-2. ソースコードのディレクトリに移動

In [None]:
# Change the working directory to src/ (relative to the current directory).
%cd src/

### 0-3. pythonのライブラリをインストール

In [None]:
# Install the Python packages listed in ../dockerfile/requirements.txt.
# NOTE(review): written without a leading % or !, so this presumably relies on
# IPython automagic to run as a command — confirm in the target environment.
pip install -r ../dockerfile/requirements.txt

## 1. モジュールやライブラリのインポート

### 1-1. インポート

In [25]:
"""
ライブラリ
"""
import os
import re
import subprocess
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
#import lightgbm as lgb
#from optuna.integration import lightgbm as lgb
import optuna.integration.lightgbm as lgb
import lightgbm as lgb_orig

"""
モジュール(定数)
"""
# ローカルパス
from modules.constants import LocalPaths
# データフレームの列名
#from modules.constants import ResultsCols
# レース名のマスター
from modules.constants import RaceInfo
# スクレイピングのパス
#from modules.constants import ScrapyPath

"""
モジュール(前処理)
"""
# 前処理
#from modules.preprocess import shaping
from modules.preprocess import _scrapy_data


### 1-2. ローカルパスの定義

In [26]:
# Instantiate the constant holders: lp for local paths, ri for race information.
lp, ri = LocalPaths(), RaceInfo()

# Base locations.
base_dir, scrapy_dir, scrapy_keiba_dir = (
    lp.BASE_DIR,          # project base directory
    lp.SCRAPY_DIR,        # scrapy base directory
    lp.SCRAPY_KEIBA_DIR,  # path of the scrapy "keiba" project
)

# Data directories.
data_dir, data_url_dir, data_master_dir, data_tmp_dir = (
    lp.DATA_DIR,         # data directory
    lp.DATA_URL_DIR,     # URLs used for scraping race results
    lp.DATA_MASTER_DIR,  # master data
    lp.DATA_TMP_DIR,     # (not described in the original notebook)
)
data_grades_dir, data_horse_grades_dir, data_pedigree_dir = (
    lp.DATA_GRADES_DIR,        # grades data
    lp.DATA_HORSE_GRADES_DIR,  # horse_grades data
    lp.DATA_PEDIGREE_DIR,      # pedigree data
)

# Master files.
data_grades_master, data_horse_id_master = (
    lp.DATA_GRADES_MASTER,    # grades master
    lp.DATA_HORSE_ID_MASTER,  # horse_id master
)

# Racecourse ID lookup table.
place_dict = ri.PLACE_DICT
# urlの正常性チェックプログラムのパス
#sy_unc_path = base_path + 'scrapy/url_normality_check.py'


## 2. 学習データ作成


### 2-1. 成績マスターの作成

In [27]:
# Load the grades (race results) master table from its pickle file.
grades_master = pd.read_pickle(data_grades_master)
# Drop every row that contains at least one missing value.
grades_master = grades_master.dropna(how='any')
# Convert the body-weight change to int, stripping a leading "+" sign.
# Series.replace("+", "") without regex=True only matches cells that are
# exactly "+", so it never touched values such as "+4"; the anchored regex
# removes the sign and leaves non-string cells untouched.
grades_master['馬体重増減'] = (
    grades_master['馬体重増減'].replace(r'^\+', '', regex=True).astype('int')
)
# Split the combined sex/age column: removing the digits leaves the sex,
# removing the non-digits leaves the age.
sexual_age = grades_master['性齢']
sex = sexual_age.replace(r'[0-9]+', '', regex=True)
# Raw string: "\D" in a normal literal is an invalid escape sequence.
age = sexual_age.replace(r'\D', '', regex=True)
grades_master['性'] = sex
grades_master['齢'] = age.astype(int)
# Look up the racecourse for each race from characters [4:6] of the race ID.
# An ID whose code is missing from place_dict still raises KeyError.
race_id_list = grades_master['レースID']
place_id_list = [place_dict[race_id[4:6]] for race_id in race_id_list]
grades_master['競馬場'] = place_id_list
# Restrict the data to the racecourse '中山'.
grades_master = grades_master[grades_master['競馬場'] == '中山']



#### 2-2-5. カテゴリ変数をダミー変数化

In [28]:
# One-hot encode the categorical columns in a single pass. The dummy columns
# are appended in the order listed here, which matches encoding them one
# column at a time. 'レース名' is deliberately not encoded.
categorical_columns = [
    '回り',
    '天候',
    'タイプ',
    '馬場状態',
    '馬名',
    '騎手',
    '調教師',
    '性',
]
grades_master = pd.get_dummies(grades_master, columns=categorical_columns)

#### 2-2-6. 不要な列を削除

In [29]:
# Remove the columns that are not used as features, in one drop call.
unused_columns = [
    'レースID',
    'レース名',
    '競馬場',
    '馬番',
    '性齢',
    'タイム',
    '単勝',
    '人気',
    '馬ID',
    '調教師ID',
]
grades_master = grades_master.drop(unused_columns, axis=1)

#### 2-2-7. データフレームの型をintに変換
変換対象列  
- 距離  
- 枠番  
- 斥量(floatに変換)  
- 馬体重  
- 着順(数値に変換できない行は削除)

スクレイピングのバグで距離列に空白が含まれていたのでそちらを削除  
※現時点では修正済み

In [30]:
# Cast the numeric feature columns in one call.
grades_master = grades_master.astype(
    {'距離': int, '枠番': int, '斥量': float, '馬体重': int}
)
# Finishing positions that cannot be parsed as numbers become NaN ...
finish_rank = pd.to_numeric(grades_master['着順'], errors='coerce')
grades_master['着順'] = finish_rank
# ... and every row containing a NaN is dropped before the cast to int.
grades_master = grades_master.dropna(axis=0, how='any')
grades_master = grades_master.astype({'着順': 'int'})

#### 2-2-8. 3着以内とそれ以外でデータを2分類化する

In [None]:
# Binary target: 1 for a finish in the top three, 0 otherwise.
in_top_three = grades_master['着順'] <= 3
grades_master['着順'] = in_top_three.astype('int')

#### １着、２着、３着、それ以外で多分類化する

In [31]:
# Multi-class target: ranks 1, 2 and 3 keep their value; every rank above 3
# is collapsed into class 0.
finish_rank = grades_master['着順']
grades_master['着順'] = finish_rank.mask(finish_rank > 3, 0)

### F値の調整

In [32]:
# Correct the class imbalance by random under-sampling, then split the
# balanced data into training and test sets.
from imblearn.under_sampling import RandomUnderSampler

# Fixed seed so that the under-sampling and the split give the same result
# on every run (the original used no seed at all).
SEED = 42

x_train = grades_master.drop(['着順'], axis=1)
y_train = grades_master['着順']

# Number of rows per class, printed in the order 1, 2, 3, 0.
# A class that does not occur is counted as 0.
class_counts = y_train.value_counts().reindex([1, 2, 3, 0], fill_value=0)
for class_size in class_counts:
    print(class_size)
y_train_min = int(class_counts.min())

# Under-sample every class down to the size of the smallest class.
count_y = y_train_min
rus = RandomUnderSampler(
    sampling_strategy={0: count_y, 1: count_y, 2: count_y, 3: count_y},
    random_state=SEED,
)
x_train_rus, y_train_rus = rus.fit_resample(x_train, y_train)
print(y_train_rus.value_counts())

# 70/30 split; stratify keeps the class balance equal in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    x_train_rus,
    y_train_rus,
    test_size=0.3,
    random_state=SEED,
    stratify=y_train_rus,
)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)


580
581
581
6713
0    580
1    580
2    580
3    580
Name: 着順, dtype: int64
(1624, 5162)
(696, 5162)
(1624,)
(696,)


#### 1-5-4. 学習データと検証データに分ける

### 1-6. 学習モデルの作成・学習

In [33]:
# Wrap the training and validation splits as LightGBM datasets.
lgb_train = lgb.Dataset(x_train, y_train)
# A validation Dataset should reference the training Dataset so that both
# share the same feature binning.
lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)

#### 1-6-1. ハイパーパラメータの設定

In [None]:
# Binary-classification variant: objective 'binary', metric 'auc'.
params = dict(objective='binary', metric='auc')
# "histroy" (sic) is kept as-is because it is a notebook-level name.
best_params, histroy = dict(), list()
# lgb is optuna.integration.lightgbm (see the import cell).
model = lgb.train(
    params,
    lgb_train,
    num_boost_round=10,
    early_stopping_rounds=10,
    valid_sets=[lgb_train, lgb_eval],
)
# Keep the parameters attached to the returned model for later cells.
best_params_ = model.params

In [None]:
# Multi-class variant: four classes, evaluated with multi-class log loss.
params = dict(
    task='train',               # training task
    boosting_type='gbdt',       # gradient boosting decision tree
    objective='multiclass',     # multi-class classification
    num_class=4,                # number of classes
    metric='multi_logloss',     # multi-class log loss
)
# lgb is optuna.integration.lightgbm (see the import cell).
model = lgb.train(
    params,
    lgb_train,
    num_boost_round=100,
    early_stopping_rounds=10,
    valid_sets=[lgb_train, lgb_eval],
)
# Keep the parameters attached to the returned model for later cells.
best_params_ = model.params

In [43]:
# Show best_params, then the name of every entry in best_params_
# (names only, one per line).
print(best_params)
for param_name in best_params_.keys():
    print(param_name)

task
boosting_type
objective
num_class
metric
feature_pre_filter
lambda_l1
lambda_l2
num_leaves
feature_fraction
bagging_fraction
bagging_freq
min_child_samples
num_iterations
early_stopping_round


#### 1-6-2. 学習モデル作成

In [None]:
# Reset the (currently unused) tuning containers.
best_params = {}
history = []
# Train with plain LightGBM (lgb_orig) using the parameters taken from the
# previously trained model, validating on lgb_eval only.
model = lgb_orig.train(
    best_params_,
    lgb_train,
    num_boost_round=100,
    early_stopping_rounds=10,
    valid_sets=lgb_eval,
)

In [41]:
# Print the tuning containers, one per line: best_params, then history.
for tuning_state in (best_params, history):
    print(tuning_state)

{}
[]


### モデルの評価

In [None]:
# Evaluate with four metrics: accuracy, recall, precision and F1.
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Predict with the best iteration, then take the index of the largest
# score in each prediction row as the predicted class.
preds = model.predict(x_test, num_iteration=model.best_iteration)
y_pred = [np.argmax(class_scores) for class_scores in preds]

# Accuracy.
print('正解率(accuracy_score):{}'.format(accuracy_score(y_test, y_pred)))
# Recall, precision and F1 are macro-averaged over the classes.
macro_metrics = (
    ('再現率(recall_score):{}', recall_score),
    ('適合率(precision_score):{}', precision_score),
    ('F1値(f1_score):{}', f1_score),
)
for template, scorer in macro_metrics:
    print(template.format(scorer(y_test, y_pred, average='macro')))

In [None]:
# Per-class precision / recall / F1 report, printed with five decimal digits.
from sklearn.metrics import classification_report

report_text = classification_report(y_test, y_pred, digits=5)
print(report_text)

In [None]:
# Confusion matrix with rows/columns in the label order 0, 1, 2, 3.
from sklearn.metrics import confusion_matrix

class_labels = [0, 1, 2, 3]
print(confusion_matrix(y_test, y_pred, labels=class_labels))

In [None]:
# Serialize the trained model to disk with pickle.
import pickle

model_file = '/home/keiba/src/models/f_preprocess.pkl'
# Use a context manager so the file is flushed and closed even if
# pickling fails (the original left the handle open).
with open(model_file, 'wb') as model_fp:
    pickle.dump(model, model_fp)

In [None]:
# Load the pickled model back from disk.
# Imported here so the cell also runs when the save cell was skipped.
import pickle

# NOTE: pickle.load executes arbitrary code from the file; only load model
# files that were produced by this project.
with open('/home/keiba/src/models/f_preprocess.pkl', 'rb') as model_fp:
    model = pickle.load(model_fp)

In [None]:
# Plot the multi_logloss curves for the training and validation sets.
import matplotlib.pyplot as plt
# NOTE(review): best_params_ is assigned from model.params in the training
# cells, and the key listing printed earlier in this notebook shows no
# "training" or "valid_1" entry, so these lookups will presumably raise
# KeyError. A per-iteration evaluation history recorded during training is
# probably what was intended here — confirm and rewire.
plt.plot(best_params_["training"]["multi_logloss"], color = "red", label = "train")
plt.plot(best_params_["valid_1"]["multi_logloss"], color = "blue", label = "valid")
plt.legend()
plt.show()

In [None]:
# Draw the first tree (tree_index=0) of the trained model.
# plot_tree needs the trained model; the original passed the lgb module
# itself as the first argument.
lgb.plot_tree(model, tree_index=0)