# インポート

In [1]:
import pandas as pd
from pathlib import Path

# Relative location of the shared data directory and of its "rawdf" subdirectory.
COMMON_DATA_DIR = Path("..") / ".." / "common" / "data"
RAWDF_DIR = COMMON_DATA_DIR.joinpath("rawdf")

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# 予測時の処理

## レース前日準備

In [4]:
import preprocessing

# Run the horse-results preprocessing and keep the returned frame for the checks below.
# NOTE(review): save_filename presumably names the file the function writes — confirm in preprocessing.py.
horse_results_preprocessed = preprocessing.process_horse_results(
    save_filename="horse_results_prediction.csv"
)

In [9]:
# Inspect dtypes and non-null counts of the preprocessed horse results.
horse_results_preprocessed.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3084 entries, 0 to 3101
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   horse_id      3084 non-null   int64         
 1   date          3084 non-null   datetime64[ns]
 2   rank          3084 non-null   float64       
 3   prize         3084 non-null   float64       
 4   rank_diff     3082 non-null   float64       
 5   weather       3082 non-null   float64       
 6   race_type     3084 non-null   int64         
 7   course_len    3084 non-null   int64         
 8   ground_state  3084 non-null   int64         
 9   race_class    2869 non-null   float64       
 10  n_horses      3084 non-null   int64         
dtypes: datetime64[ns](1), float64(5), int64(5)
memory usage: 289.1 KB


In [10]:
# Show the rows whose rank_diff is missing.
horse_results_preprocessed[horse_results_preprocessed["rank_diff"].isnull()]

Unnamed: 0,horse_id,date,rank,prize,rank_diff,weather,race_type,course_len,ground_state,race_class,n_horses
2816,2018105192,2023-04-15,7.0,0.0,,,1,1400,1,8.0,11
2968,2020110136,2023-02-25,12.0,0.0,,,0,1600,0,6.0,13


In [1]:
from feature_engineering import PredictionFeatureCreator
%load_ext autoreload

In [2]:
# Build the feature creator with its default arguments.
pfc = PredictionFeatureCreator()

In [5]:
# Run the per-horse aggregation over past races.
# NOTE(review): presumably this populates pfc.agg_horse_n_races_df — confirm in feature_engineering.py.
pfc.agg_horse_n_races()

In [6]:
# Display the aggregated table (rank/prize columns for the last 3, 5, 10 and 1000 races).
pfc.agg_horse_n_races_df

Unnamed: 0,date,race_id,horse_id,rank_3races,prize_3races,rank_5races,prize_5races,rank_10races,prize_10races,rank_1000races,prize_1000races
0,2024-06-01,202405030101,2019100108,6.666667,66.666667,7.6,40.0,5.900000,208.000000,4.538462,218.500000
1,2024-06-01,202405030101,2019104899,8.666667,0.000000,10.2,0.0,9.300000,134.710000,6.764706,396.276471
2,2024-06-01,202405030101,2016103092,8.666667,0.000000,8.8,0.0,7.700000,12.000000,6.945946,8.500000
3,2024-06-01,202405030101,2020102800,6.666667,80.000000,8.0,48.0,6.900000,184.900000,6.461538,186.230769
4,2024-06-01,202405030101,2019105143,13.000000,0.000000,13.4,0.0,11.600000,158.640000,8.687500,193.525000
...,...,...,...,...,...,...,...,...,...,...,...
296,2024-06-01,202408040112,2019104288,2.000000,470.000000,3.2,306.0,3.000000,277.000000,5.681818,176.409091
297,2024-06-01,202408040112,2018104208,9.000000,76.666667,7.4,104.0,7.800000,63.400000,5.648649,145.200000
298,2024-06-01,202408040112,2020104845,8.666667,0.000000,7.8,0.0,6.500000,140.500000,6.615385,114.076923
299,2024-06-01,202408040112,2021100161,4.333333,443.333333,4.4,310.0,5.666667,258.333333,5.666667,258.333333


### htmlの取得

In [7]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service

options = Options()
# Launch in headless mode (in the background)
options.add_argument("--headless")
# Additional flags to guard against crashes
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
# Path of the chromedriver binary, handed to the Service below
driver_path = ChromeDriverManager().install()

# Entry-table (shutuba) page for one hard-coded race_id
url = "https://race.netkeiba.com/race/shutuba.html?race_id=202405030101"
# The context manager closes the browser when the block exits
with webdriver.Chrome(service=Service(driver_path), options=options) as driver:
    driver.implicitly_wait(10)
    driver.get(url)
    # Keep the page source for the parsing cells that follow
    html = driver.page_source

In [8]:
# Eyeball the fetched page source.
html



In [12]:
# Manually reload edited modules before re-instantiating the feature creator.
%autoreload

In [13]:
# Fresh instance, then fetch the entry-table (shutuba) page for one race through the class method.
pfc = PredictionFeatureCreator()
pfc.fetch_shutuba_table_html(race_id="202405030101")

In [15]:
# Use the page source held on the instance for the parsing experiments below.
# NOTE(review): presumably set by fetch_shutuba_table_html — confirm in feature_engineering.py.
html = pfc.html

### レース結果テーブルの作成
PredictionFeatureCreator.fetch_results()で、
- rawデータの作成
- 前処理

をどちらもやるイメージ

In [16]:
from io import StringIO

import pandas as pd

# Parse the first table found in the page.
# The HTML text is wrapped in StringIO because passing a literal HTML string
# to read_html is deprecated in recent pandas and emits a FutureWarning.
df = pd.read_html(StringIO(html))[0]
df

  df = pd.read_html(html)[0]


Unnamed: 0_level_0,枠,馬 番,印,馬名,性齢,斤量,騎手,厩舎,馬体重 (増減),オッズ,人気,お気に入り馬,お気に入り馬
Unnamed: 0_level_1,枠,馬 番,印,馬名,性齢,斤量,騎手,厩舎,馬体重 (増減),オッズ,人気,登録,メモ
0,1,1,--,プリモスペランツァ,牡5,60.0,田村,栗東中竹,500(-6),3.1,1,,
1,2,2,--,インプレス,牡5,60.0,小牧加,栗東佐々木,526(-12),5.1,3,,
2,3,3,--,コルドゥーン,牡8,60.0,伴,美浦石栗,474(+12),121.9,8,,
3,4,4,--,オメガリッチマン,牡4,60.0,高田,栗東安田,446(-2),3.5,2,,
4,5,5,--,インディゴブラック,セ5,60.0,小坂,栗東奥村豊,512(+2),10.0,6,,
5,6,6,--,スペキュレーター,牡5,60.0,江田勇,美浦伊藤伸,506(+22),146.9,9,,
6,7,7,--,キタノブレイド,牡5,60.0,石神深,美浦萱野,476(+4),6.4,4,,
7,7,8,--,フジフォンテ,牡5,60.0,大江圭,美浦粕谷,464(-4),163.3,10,,
8,8,9,--,ダノンジャッカル,牡5,60.0,西谷誠,栗東中内田,462(+12),7.5,5,,
9,8,10,--,ヴラディア,牡5,60.0,上野,美浦小西,496(+2),19.9,7,,


In [20]:
# Flatten the two-row header by keeping only the second header row (level 1).
# Guarded so that re-running the cell is harmless: once flattened, the columns
# have a single level and get_level_values(1) would raise.
if df.columns.nlevels > 1:
    df.columns = df.columns.get_level_values(1)
df

Unnamed: 0,枠,馬 番,印,馬名,性齢,斤量,騎手,厩舎,馬体重 (増減),オッズ,人気,登録,メモ
0,1,1,--,プリモスペランツァ,牡5,60.0,田村,栗東中竹,500(-6),3.1,1,,
1,2,2,--,インプレス,牡5,60.0,小牧加,栗東佐々木,526(-12),5.1,3,,
2,3,3,--,コルドゥーン,牡8,60.0,伴,美浦石栗,474(+12),121.9,8,,
3,4,4,--,オメガリッチマン,牡4,60.0,高田,栗東安田,446(-2),3.5,2,,
4,5,5,--,インディゴブラック,セ5,60.0,小坂,栗東奥村豊,512(+2),10.0,6,,
5,6,6,--,スペキュレーター,牡5,60.0,江田勇,美浦伊藤伸,506(+22),146.9,9,,
6,7,7,--,キタノブレイド,牡5,60.0,石神深,美浦萱野,476(+4),6.4,4,,
7,7,8,--,フジフォンテ,牡5,60.0,大江圭,美浦粕谷,464(-4),163.3,10,,
8,8,9,--,ダノンジャッカル,牡5,60.0,西谷誠,栗東中内田,462(+12),7.5,5,,
9,8,10,--,ヴラディア,牡5,60.0,上野,美浦小西,496(+2),19.9,7,,


In [21]:
from bs4 import BeautifulSoup

# Parse the page with the lxml parser and keep only the first table carrying the Shutuba_Table class.
soup = BeautifulSoup(html, "lxml").find("table", class_="Shutuba_Table")
soup

<table class="Shutuba_Table RaceTable01 ShutubaTable tablesorter tablesorter-default" role="grid">
<thead>
<tr class="Header tablesorter-headerRow" role="row">
<th aria-disabled="true" aria-label="枠: No sort applied, sorting is disabled" aria-sort="none" class="Waku tablesorter-header sorter-false tablesorter-headerUnSorted" data-column="0" role="columnheader" rowspan="2" scope="col" style="user-select: none;" unselectable="on"><div class="tablesorter-header-inner">枠</div></th>
<th aria-disabled="false" aria-label="馬番: No sort applied, activate to apply an ascending sort" aria-sort="none" class="Umaban sort_common tablesorter-header tablesorter-headerUnSorted" data-column="1" role="columnheader" rowspan="2" scope="col" style="user-select: none;" tabindex="0" unselectable="on"><div class="tablesorter-header-inner"><div class="Inner_Shutuba">馬<br/>番<span class="sort_icon" style="display: block;"><i class="fas fa-sort"></i></span></div></div></th>
<th aria-disabled="true" aria-label="印: N

In [23]:
import re

# Collect the anchor tags whose href contains "/horse/"; in the output the horse ID is the last path segment of each URL.
soup.find_all("a", href=re.compile(r"/horse/"))

[<a href="https://db.netkeiba.com/horse/2019100108" target="_blank" title="プリモスペランツァ">プリモスペランツァ<img alt="" class="disp_none Favorite" id="myhorse_2019100108" src="https://cdn.netkeiba.com/img.race/common/img/icon/icon_horse.png?2019073001" width="18"/></a>,
 <a href="https://db.netkeiba.com/horse/2019104899" target="_blank" title="インプレス">インプレス<img alt="" class="disp_none Favorite" id="myhorse_2019104899" src="https://cdn.netkeiba.com/img.race/common/img/icon/icon_horse.png?2019073001" width="18"/></a>,
 <a href="https://db.netkeiba.com/horse/2016103092" target="_blank" title="コルドゥーン">コルドゥーン<img alt="" class="disp_none Favorite" id="myhorse_2016103092" src="https://cdn.netkeiba.com/img.race/common/img/icon/icon_horse.png?2019073001" width="18"/></a>,
 <a href="https://db.netkeiba.com/horse/2020102800" target="_blank" title="オメガリッチマン">オメガリッチマン<img alt="" class="disp_none Favorite" id="myhorse_2020102800" src="https://cdn.netkeiba.com/img.race/common/img/icon/icon_horse.png?2019073001" wi

In [37]:
# Manually reload edited modules before re-instantiating the feature creator.
%autoreload

In [34]:
# Fresh instance after the reload, then fetch the entry-table page for one race again.
pfc = PredictionFeatureCreator()
pfc.fetch_shutuba_table_html(race_id="202405030101")

In [36]:
# Look up the fetched page by race_id in pfc.htmls.
pfc.htmls["202405030101"]



In [None]:
# Run feature creation for one race; the return value is not kept here (a later cell assigns it to `features`).
pfc.create_features(race_id="202405030101")

In [40]:
# Fresh instance, then build the entry table from the HTML already held in `html`.
# NOTE(review): presumably the result is stored on pfc.results — confirm in feature_engineering.py.
pfc = PredictionFeatureCreator()
pfc.fetch_results(race_id="202405030101", html=html)

  


In [41]:
# Display the per-horse table (IDs, odds, popularity, sex, age, weight) for the race.
pfc.results

Unnamed: 0,race_id,horse_id,jockey_id,trainer_id,umaban,wakuban,tansho_odds,popularity,impost,sex,age,weight,weight_diff
0,202405030101,2019100108,1105,1039,1,1,3.1,1,60.0,0,5,500,-6
1,202405030101,2019104899,1196,429,2,2,5.1,3,60.0,0,5,526,-12
2,202405030101,2016103092,1149,1043,3,3,121.9,8,60.0,0,8,474,12
3,202405030101,2020102800,1046,1164,4,4,3.5,2,60.0,0,4,446,-2
4,202405030101,2019105143,1063,1146,5,5,10.0,6,60.0,2,5,512,2
5,202405030101,2019102792,1023,1031,6,6,146.9,9,60.0,0,5,506,22
6,202405030101,2019100510,1059,1024,7,7,6.4,4,60.0,0,5,476,4
7,202405030101,2019105759,1120,1096,8,7,163.3,10,60.0,0,5,464,-4
8,202405030101,2019103005,1005,1137,9,8,7.5,5,60.0,0,5,462,12
9,202405030101,2019105578,1087,405,10,8,19.9,7,60.0,0,5,496,2


### レース情報テーブルの取得

カラム: `race_id`, `date`, `race_type`, `around`, `course_len`, `weather`, `ground_state`, `race_class`, `place`

In [50]:
# Narrow the page to the RaceList_Item02 block, which holds the race name and the RaceData01/RaceData02 lines.
# NOTE: this rebinds `soup`, which an earlier cell bound to the Shutuba_Table element;
# re-running the horse-link cell after this one would search the wrong element.
soup = BeautifulSoup(html, "lxml").find("div", class_="RaceList_Item02")
soup

<div class="RaceList_Item02">
<h1 class="RaceName">3歳以上障害未勝利

















</h1>
<div class="RaceData01">
10:05発走 /<!-- <span class="Turf"> --><span> 障3000m</span> (芝 ダート)
/ 天候:晴<span class="Icon_Weather Weather01"></span>
<span class="Item03">/ 馬場:稍</span>
<span class="Item04">/ 馬場:重</span>
</div>
<div class="RaceData02">
<span>3回</span>
<span>東京</span>
<span>1日目</span>
<span>障害３歳以上</span>
<span>未勝利</span>
     
<span>(混)</span>
<span>定量</span>
<span>10頭</span>
<br/>
<span>本賞金:790,320,200,120,79万円</span>
</div>
</div>

In [51]:
# Race title: text of the h1 element with surrounding whitespace removed
soup.find("h1").text.strip()

'3歳以上障害未勝利'

In [52]:
# First nested div (RaceData01 in the markup above): start time, course, weather, ground state.
div0 = soup.find_all("div")[0]
div0.text

'\n10:05発走 / 障3000m (芝 ダート)\n/ 天候:晴\n/ 馬場:稍\n/ 馬場:重\n'

In [55]:
# Split the RaceData01 text into tokens made of word characters and colons.
info1 = re.findall(r"[\w:]+", div0.text)
info1

['10:05発走', '障3000m', '芝', 'ダート', '天候:晴', '馬場:稍', '馬場:重']

In [56]:
# First character of the second token.
# NOTE(review): presumably the race-type marker — confirm against other race pages.
info1[1][0]

'障'

In [59]:
# Course length: first run of digits in the second token, converted to int.
int(re.findall(r"\d+", info1[1])[0])

3000

In [60]:
# Weather: the word characters following "天候:" (returned as a list of matches).
re.findall(r"天候:(\w+)", div0.text)

['晴']

In [62]:
# Ground state: first "馬場:" entry only; the sample text contains two entries and the second is ignored here.
re.findall(r"馬場:(\w+)", div0.text)[0]

'稍'

In [64]:
# Text of the second nested div (RaceData02 in the markup above), which includes the race class.
soup.find_all("div")[1].text

'\n3回\n東京\n1日目\n障害３歳以上\n未勝利\n\xa0\xa0\xa0\xa0\xa0\n(混)\n定量\n10頭\n\n本賞金:790,320,200,120,79万円\n'

In [66]:
import json
from pathlib import Path


# Paths under the shared "common" directory
COMMON_DATA_DIR = Path("..", "..", "common", "data")
POPULATION_DIR = COMMON_DATA_DIR / "prediction_population"
MAPPING_DIR = COMMON_DATA_DIR / "mapping"
# Paths under the v3_0_0 directory
DATA_DIR = Path("..", "data")
INPUT_DIR = DATA_DIR / "01_preprocessed"
OUTPUT_DIR = DATA_DIR / "02_features"
# Create the output directory (and any missing parents) if it does not exist yet
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

# Mapping used to convert a categorical variable (race class) to numbers.
# The encoding is given explicitly: the keys are Japanese text, and without it
# open() falls back to the platform locale encoding, which is not UTF-8 everywhere.
with open(MAPPING_DIR / "race_class.json", "r", encoding="utf-8") as f:
    race_class_mapping = json.load(f)

In [69]:
# Alternation pattern matching any race-class name from the mapping keys.
# Keys are escaped so that a name containing a regex metacharacter is still
# matched literally; key order (and thus alternation priority) is unchanged.
regex_race_class = "|".join(re.escape(name) for name in race_class_mapping)

In [73]:
# Display the race-class name -> number mapping loaded from race_class.json.
race_class_mapping

{'新馬': 0,
 '未勝利': 1,
 '1勝クラス': 2,
 '2勝クラス': 3,
 '3勝クラス': 4,
 'オープン': 5,
 'G3': 6,
 'G2': 7,
 'G1': 8,
 '特別': 5,
 '500万下': 2,
 '1000万下': 3,
 '1600万下': 4,
 'OP': 5}

In [71]:
# Every race-class name occurring in the RaceData02 text.
re.findall(regex_race_class, soup.find_all("div")[1].text)

['未勝利']

In [92]:
# Manually reload edited modules before re-instantiating the feature creator.
%autoreload

In [77]:
# Fresh instance, then build the race-info table from the HTML already held in `html`.
# NOTE(review): presumably the result is stored on pfc.race_info — confirm in feature_engineering.py.
pfc = PredictionFeatureCreator()
pfc.fetch_race_info(race_id="202405030101", html=html)

In [79]:
# Display the one-row race-info table; `around` is empty for this race.
pfc.race_info

Unnamed: 0,race_id,race_type,around,course_len,weather,ground_state,race_class,place
0,202405030101,2,,3000,1,2,1,5


In [82]:
# Inspect the population table (date, race_id, horse_id); note that date has object dtype here.
pfc.population.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 328 entries, 0 to 327
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   date      328 non-null    object
 1   race_id   328 non-null    int64 
 2   horse_id  328 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 7.8+ KB


In [93]:
# Fresh instance, then run feature creation for one race and keep the resulting frame.
pfc = PredictionFeatureCreator()
features = pfc.create_features(race_id="202405030101")

  df.columns = df.columns.get_level_values(1)


## 予測

In [96]:
# Inspect dtypes and non-null counts of the feature frame before prediction.
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 29 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   date             10 non-null     object 
 1   race_id          10 non-null     int64  
 2   horse_id         10 non-null     int64  
 3   jockey_id        10 non-null     int64  
 4   trainer_id       10 non-null     int64  
 5   umaban           10 non-null     int64  
 6   wakuban          10 non-null     int64  
 7   tansho_odds      10 non-null     float64
 8   popularity       10 non-null     int64  
 9   impost           10 non-null     float64
 10  sex              10 non-null     int64  
 11  age              10 non-null     int64  
 12  weight           10 non-null     int64  
 13  weight_diff      10 non-null     int64  
 14  race_type        10 non-null     int64  
 15  around           0 non-null      float64
 16  course_len       10 non-null     int64  
 17  weather          10

In [97]:
import prediction

# Score the feature frame using the given config file; the displayed rows are in descending `pred` order.
prediction.predict(features, config_filepath="config.yaml")

Unnamed: 0,race_id,umaban,tansho_odds,popularity,pred
3,202405030101,4,3.5,2,0.325319
0,202405030101,1,3.1,1,0.239033
8,202405030101,9,7.5,5,0.138848
1,202405030101,2,5.1,3,0.131458
6,202405030101,7,6.4,4,0.086023
4,202405030101,5,10.0,6,0.084488
9,202405030101,10,19.9,7,0.054749
5,202405030101,6,146.9,9,0.001212
2,202405030101,3,121.9,8,0.000887
7,202405030101,8,163.3,10,0.000821
