# インポート

In [None]:
import preprocessing
from feature_engineering import FeatureCreator, PredictionFeatureCreator
from train import Trainer
import prediction
%load_ext autoreload

In [None]:
# Reload the already-imported modules now so that recent edits to the
# project's .py files take effect in this kernel.
%autoreload

# データ加工

In [None]:
# Preprocess the race results table.
results_preprocessed = preprocessing.process_results()

In [None]:
# Duplicate check: count the rows flagged as duplicates on the
# (race_id, horse_id) key.
results_preprocessed.duplicated(subset=["race_id", "horse_id"]).sum()

0

In [None]:
# Missing-value check: number of nulls in each column.
results_preprocessed.isnull().sum()

race_id        0
horse_id       0
jockey_id      0
trainer_id     0
owner_id       0
rank           0
umaban         0
wakuban        0
tansho_odds    0
popularity     0
impost         0
sex            0
age            0
weight         0
weight_diff    0
dtype: int64

In [None]:
# Process the table of each horse's past race results.
horse_results_preprocessed = preprocessing.process_horse_results()

In [None]:
# Missing-value check: number of nulls in each column.
horse_results_preprocessed.isnull().sum()

horse_id            0
date                0
rank                0
prize               0
rank_diff         234
weather           195
race_type           0
course_len          0
ground_state        0
race_class      48143
n_horses            0
dtype: int64

In [None]:
# Duplicate check: count the rows flagged as duplicates on the
# (horse_id, date) key.
horse_results_preprocessed.duplicated(subset=["horse_id", "date"]).sum()

0

In [None]:
# Preprocess the race information table.
race_info_preprocessed = preprocessing.process_race_info()

In [None]:
# Display the preprocessed race information table for a visual check.
race_info_preprocessed

Unnamed: 0,race_id,date,race_type,around,course_len,weather,ground_state,race_class,place
0,202306010307,2023-01-08,0,0.0,1800,1,0,2,6
1,202306010308,2023-01-08,0,0.0,1200,1,0,2,6
2,202306010309,2023-01-08,1,0.0,1200,1,0,2,6
3,202306010310,2023-01-08,1,0.0,1600,1,0,4,6
4,202306010311,2023-01-08,0,0.0,1800,1,0,5,6
...,...,...,...,...,...,...,...,...,...
5053,202402010408,2024-06-16,1,0.0,1200,2,0,2,2
5054,202402010409,2024-06-16,1,0.0,1800,3,0,5,2
5055,202402010410,2024-06-16,0,0.0,1700,3,0,5,2
5056,202402010411,2024-06-16,1,0.0,1200,3,0,4,2


# 特徴量作成

In [None]:
# Build the feature table with FeatureCreator and keep it in `features`.
fc = FeatureCreator()
features = fc.create_features()

In [None]:
# Duplicate check: count the rows flagged as duplicates on the
# (race_id, horse_id) key.
features.duplicated(subset=["race_id", "horse_id"]).sum()

0

# 学習

In [None]:
# Run training and keep the returned result in `evaluation_df`.
# NOTE(review): test_start_date presumably marks the first date of the
# held-out test period — confirm against Trainer.run in train.py.
trainer = Trainer()
evaluation_df = trainer.run(test_start_date="2023-10-01")

[100]	training's binary_logloss: 0.151195	valid_1's binary_logloss: 0.206174


# 予測

## 事前準備
**当日出走馬が確定した時点**で実行できる

In [None]:
# Preprocess the past-results table for the horses running on the day of
# the prediction and save it as "horse_results_prediction.csv".
# The result is bound to its own name so that the training-time
# `horse_results_preprocessed` table checked earlier in this notebook is
# not silently replaced in the kernel namespace.
horse_results_prediction_preprocessed = preprocessing.process_horse_results(
    save_filename="horse_results_prediction.csv"
)

In [None]:
pfc = PredictionFeatureCreator()
# The aggregation of the horses' past results can be run ahead of time.
pfc.agg_horse_n_races()

In [None]:
# Reload the already-imported modules now so that recent edits to the
# project's .py files take effect before the race-day prediction cells.
%autoreload

## 当日の予測処理
レースの出走直前に実行する

In [None]:
# Update the features for the race to be predicted.
# NOTE(review): this rebinds `features`, which earlier held the output of
# FeatureCreator.create_features(); re-running earlier cells that read
# `features` after this point will see the prediction-time table instead.
features = pfc.create_features(
    race_id="202408040811",  # ID of the race to predict
    skip_agg_horse=True  # can be skipped when the aggregation was run in advance
)

  df = pd.read_html(html)[0]


In [None]:
# Run the prediction on the features created for the target race.
prediction.predict(features)

Unnamed: 0,race_id,umaban,tansho_odds,popularity,pred
1,202408040811,2,3.7,2,0.597261
3,202408040811,4,2.3,1,0.40609
11,202408040811,12,7.5,3,0.08531
2,202408040811,3,11.6,5,0.05069
8,202408040811,9,16.9,7,0.048155
9,202408040811,10,10.5,4,0.044886
12,202408040811,13,33.8,10,0.036524
6,202408040811,7,15.1,6,0.029658
0,202408040811,1,25.4,8,0.017493
4,202408040811,5,25.4,9,0.012412
