# ライブラリのインポート

In [220]:
from tqdm.notebook import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from bs4 import BeautifulSoup
from urllib.request import urlopen
from itertools import combinations, permutations
import pandas as pd
import numpy as np
import datetime
import lightgbm as lgb
import requests
import time
import re
import os
import optuna.integration.lightgbm as lgb_o
import matplotlib.pyplot as plt
import pickle

# モデルのインポート

In [266]:
model = pickle.load(open('./pickle/model.pkl', 'rb'))
model

# ラベルの辞書のインポート

In [267]:
jockey_dict = pickle.load(open('./pickle/jockey_dict.pkl', 'rb'))
horse_dict = pickle.load(open('./pickle/horse_dict.pkl', 'rb'))

# 指定日の開催レースのIDを取得

In [453]:
YEAR = '2023'
MONTH = '2'
DATE = '0211'

url = f'https://www.jra.go.jp/keiba/calendar{YEAR}/{YEAR}/{MONTH}/{DATE}.html'
r = requests.get(url)
content_type_encoding = r.encoding if r.encoding != 'ISO-8859-1' else None
soup = BeautifulSoup(r.content, 'html.parser', from_encoding=content_type_encoding)

In [407]:
spot_dict = {'札幌': '01', '函館': '02', '福島': '03', '新潟': '04', '東京': '05', '中山': '06', '中京': '07', '京都': '08', '阪神': '09', '小倉': '10'}
num_dict = {'1': '01', '2': '02', '3': '03', '4': '04', '5': '05', '6': '06', '7': '07'}
date_dict = {'1': '01', '2': '02', '3': '03', '4': '04', '5': '05', '6': '06', '7': '07', '8': '08', '9': '09', '10': '10', '11': '11', '12': '12', '13': '13', '14': '14'}

race_list = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']

In [408]:
seireki_list = [str(s) for s in list(range(10, 24))]
# 01：札幌、02：函館、03：福島、04：新潟、05：東京、06：中山、07：中京、08：京都、09：阪神、10：小倉
spot_list = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10']
num_list = ['01', '02', '03', '04', '05', '06', '07']
date_list = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', '13', '14']
race_list = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']

In [409]:
table_list = soup.find_all("table", attrs={'class': 'basic'})
text_list = list()
for td in table_list:
    text_list.append(td.find('div', class_='main').text)

pattern = '(\d+)回(\D+)(\d+)日'

detail_list = list()
for text in text_list:
    result = re.match(pattern, text)
    num = result.group(1)
    spot = result.group(2)
    date = result.group(3)
    detail_list.append([spot, num, date])

race_id_list = list()

for detail in detail_list:
    for i in race_list:
        race_id = '2023' + spot_dict[detail[0]] + num_dict[detail[1]] + date_dict[detail[2]] + i
        race_id_list.append(race_id)

In [410]:
race_id_list[:5]

['202305010501',
 '202305010502',
 '202305010503',
 '202305010504',
 '202305010505']

# 出馬票のスクレイピング

In [424]:
df_list = list()

for ID in tqdm(race_id_list):
    url = f'https://race.netkeiba.com/race/result.html?race_id={ID}&rf=race_list'
    df = pd.read_html(url)[0]
    html = requests.get(url)
    html.encoding = "EUC-JP"
    race_id = ID

    soup = BeautifulSoup(html.text, "html.parser")
    #天候、レースの種類、コースの長さ、馬場の状態、日付をスクレイピング
    texts = soup.find('div', attrs={'class': 'RaceData01'}).text
    texts = re.findall(r'\w+', texts)
    for text in texts:
        if 'm' in text:
            df['course_len'] = [int(re.findall(r'\d+', text)[-1])] * len(df) #20211212：[0]→[-1]に修正
        if text in ["曇", "晴", "雨", "小雨", "小雪", "雪"]:
            df["weather"] = [text] * len(df)
        if text in ["良", "稍重", "重"]:
            df["ground_state"] = [text] * len(df)
        if '不' in text:
            df["ground_state"] = ['不良'] * len(df)
        # 2020/12/13追加
        if '稍' in text:
            df["ground_state"] = ['稍重'] * len(df)
        if '芝' in text:
            df['race_type'] = ['芝'] * len(df)
        if '障' in text:
            df['race_type'] = ['障害'] * len(df)
        if 'ダ' in text:
            df['race_type'] = ['ダート'] * len(df)


    # horse_id
    horse_id_list = []
    horselist_tr_list = soup.find_all("tr", attrs={'class': 'HorseList'})
    for tr in horselist_tr_list:
        horse_td = tr.find("span", attrs={'class': 'Horse_Name'})
        horse_id = re.findall(r'\d+', horse_td.find('a')['href'])[0]
        horse_id_list.append(horse_id)
    # jockey_id
    jockey_id_list = []
    for tr in horselist_tr_list:
        jockey_td = tr.find("td", attrs={'class': 'Jockey'})
        jockey_id = re.findall(r'\d+', jockey_td.find('a')['href'])[0]
        jockey_id_list.append(jockey_id)

    df['horse_id'] = horse_id_list
    df['jockey_id'] = jockey_id_list

    df['race_id'] = [race_id] * len(df)
    
    df_list.append(df)

  0%|          | 0/36 [00:00<?, ?it/s]

In [425]:
all_data = pd.concat(df_list)

In [392]:
# ID = '202309010211'
# url = f'https://race.netkeiba.com/race/result.html?race_id={ID}&rf=race_list'

In [393]:
# df = pd.read_html(url)[0]
# html = requests.get(url)
# html.encoding = "EUC-JP"

In [404]:
# race_id = ID

# soup = BeautifulSoup(html.text, "html.parser")
# #天候、レースの種類、コースの長さ、馬場の状態、日付をスクレイピング
# texts = soup.find('div', attrs={'class': 'RaceData01'}).text
# texts = re.findall(r'\w+', texts)
# for text in texts:
#     if 'm' in text:
#         df['course_len'] = [int(re.findall(r'\d+', text)[-1])] * len(df) #20211212：[0]→[-1]に修正
#     if text in ["曇", "晴", "雨", "小雨", "小雪", "雪"]:
#         df["weather"] = [text] * len(df)
#     if text in ["良", "稍重", "重"]:
#         df["ground_state"] = [text] * len(df)
#     if '不' in text:
#         df["ground_state"] = ['不良'] * len(df)
#     # 2020/12/13追加
#     if '稍' in text:
#         df["ground_state"] = ['稍重'] * len(df)
#     if '芝' in text:
#         df['race_type'] = ['芝'] * len(df)
#     if '障' in text:
#         df['race_type'] = ['障害'] * len(df)
#     if 'ダ' in text:
#         df['race_type'] = ['ダート'] * len(df)


# # horse_id
# horse_id_list = []
# horselist_tr_list = soup.find_all("tr", attrs={'class': 'HorseList'})
# for tr in horselist_tr_list:
#     horse_td = tr.find("span", attrs={'class': 'Horse_Name'})
#     horse_id = re.findall(r'\d+', horse_td.find('a')['href'])[0]
#     horse_id_list.append(horse_id)
# # jockey_id
# jockey_id_list = []
# for tr in horselist_tr_list:
#     jockey_td = tr.find("td", attrs={'class': 'Jockey'})
#     jockey_id = re.findall(r'\d+', jockey_td.find('a')['href'])[0]
#     jockey_id_list.append(jockey_id)
# # for td in jockey_td_list:
# #     jockey_id = re.findall(r'\d+', td.find('a')['href'])[0]
# #     jockey_id_list.append(jockey_id)
# df['horse_id'] = horse_id_list
# df['jockey_id'] = jockey_id_list

# df.index = [race_id] * len(df)

# 前処理

In [433]:
# まとめ

def preprocessing(df):
    results_df = df.copy()
    results_df['着 順'].unique()
    # 文字列が入っているものをNoneに処理
    results_df['着順'] = pd.to_numeric(results_df['着 順'], errors='coerce')
    # 着順に欠損値が存在する行を削除
    results_df.dropna(subset=['着順'], inplace=True)
    results_df['着順'] = results_df['着順'].astype(int)
    # 性齢を性と齢に分ける
    results_df['性'] = results_df['性齢'].map(lambda x: str(x)[0])
    results_df['齢'] = results_df['性齢'].map(lambda x: str(x)[1:]).astype(int)
    # 馬の体重を体重と体重変化に分ける
    results_df['体重'] = results_df['馬体重 (増減)'].str.split('(', expand=True)[0]
    results_df['体重変化'] = results_df['馬体重 (増減)'].str.split("(", expand=True)[1].str[:-1]
    # 数値型にする
    results_df['体重'] = pd.to_numeric(results_df['体重'], errors='coerce')
    results_df['体重変化'] = pd.to_numeric(results_df['体重変化'], errors='coerce')
    # float型に変換
    results_df['単勝'] = results_df['単勝 オッズ'].astype(float)
    # 桁数を少なくする
    results_df['course_len'] = results_df['course_len'].astype(float) // 100
    # dateを日付型に
#     results_df["date"] = pd.to_datetime(results_df["date"], format="%Y年%m月%d日")
    #開催場所（race_idの5番目と6番目の文字）
    results_df['開催'] = results_df['race_id'].map(lambda x:str(x)[4:6])
    # 走る馬の数をレース毎に追加
    results_df['n_horses'] = results_df.index.map(results_df.index.value_counts())
    
    results_df['jockey_label'] = results_df['jockey_id'].apply(lambda x: jockey_dict[x] if x in jockey_dict.keys() else -1)
    results_df['horse_label'] = results_df['horse_id'].apply(lambda x: horse_dict[x] if x in horse_dict.keys() else -1)
    # カテゴリ変数
    weathers = results_df['weather'].unique()
    race_types = results_df['race_type'].unique()
    ground_states = results_df['ground_state'].unique()
    sexes = results_df['性'].unique()
    
    weather = results_df['weather'].iloc[0]
    results_df[['weather_晴', 'weather_曇', 'weather_小雨', 'weather_雨', 'weather_小雪', 'weather_雪']] = 0
    if weather == '晴':
        results_df['weather_晴'] = 1
    elif weather == '曇':
        results_df['weather_曇'] = 1
    elif weather == '小雨':
        results_df['weather_小雨'] = 1
    elif weather == '小雪':
        results_df['weather_小雪'] = 1
    elif weather == '雪':
        results_df['weather_雪'] = 1
    
    results_df[['race_type_芝', 'race_type_ダート', 'race_type_障害']] = 0
    race_type = results_df['race_type'].iloc[0]
    if race_type == '芝':
        results_df['race_typ_芝'] = 1
    elif race_type == 'ダート':
        results_df['race_typ_ダート'] = 1
    elif race_type == '障害':
        results_df['race_typ_障害'] = 1

    results_df[['ground_state_良', 'ground_state_稍重', 'ground_state_不良', 'ground_state_重']] = 0
    ground_state = results_df['ground_state'].iloc[0]
    if ground_state == '良':
        results_df['ground_state_良'] = 1
    elif ground_state == '稍重':
        results_df['ground_state_稍重'] = 1
    elif ground_state == '不良':
        results_df['ground_state_不良'] = 1
    elif ground_state == '重':
        results_df['ground_state_重'] = 1
    
    results_df['性'] = pd.Categorical(results_df['性'], sexes)
    
    results_df[['性_牡', '性_牝']] = 0
    results_df['性_牡' ] = results_df['性_牡' ].apply(lambda x: 1 if x == '牡' else 0)
    results_df['性_牡' ] = results_df['性_牝' ].apply(lambda x: 1 if x == '牝' else 0)
    
    # 予測する際にリークとなるカラム or 不要なカラムを削除
    results_df.drop(["タイム", '後3F', '単勝 オッズ', 'コーナー 通過順', "着差", "厩舎", "性齢", "馬体重 (増減)", '馬名', '騎手', '人 気', '着順', '着 順', 'race_type', 'weather', 'ground_state', '性'], axis=1, inplace=True)
    
    return results_df

In [434]:
preprocessed_df = preprocessing(all_data)

In [437]:
preprocessed_df = preprocessed_df.rename(columns={'枠': '枠番', '馬 番': '馬番', '単勝 オッズ': '単勝'})
preprocessed_df[['horse_id', 'jockey_id', '体重', '体重変化']] = preprocessed_df[['horse_id', 'jockey_id', '体重', '体重変化']].astype(float)
preprocessed_df[['開催']] = preprocessed_df[['開催']].astype(int)

In [436]:
preprocessed_df.head()

Unnamed: 0,枠,馬 番,斤量,course_len,horse_id,jockey_id,race_id,齢,体重,体重変化,...,race_type_芝,race_type_ダート,race_type_障害,race_typ_ダート,ground_state_良,ground_state_稍重,ground_state_不良,ground_state_重,性_牡,性_牝
0,1,2,55.0,13.0,2020104132,1188,202305010501,3,450,0,...,0,0,0,1,0,0,1,0,0,0
1,5,10,54.0,13.0,2020107129,1184,202305010501,3,460,12,...,0,0,0,1,0,0,1,0,0,0
2,2,4,56.0,13.0,2020105021,1126,202305010501,3,470,2,...,0,0,0,1,0,0,1,0,0,0
3,6,12,54.0,13.0,2020105136,5386,202305010501,3,508,0,...,0,0,0,1,0,0,1,0,0,0
4,1,1,54.0,13.0,2020105948,5339,202305010501,3,458,-4,...,0,0,0,1,0,0,1,0,0,0


# AI予測

In [438]:
results_df = preprocessed_df.copy()
results_df['predict_proba'] = model.predict_proba(preprocessed_df.drop(['race_id'], axis=1).values)[:,1]

# 予測結果の保存

In [448]:
spot_num_dict = {'01': '札幌', '02': '函館', '03': '福島', '04': '新潟', '05': '東京', '06': '中山', '07': '中京', '08': '京都', '09': '阪神', '10': '小倉'}
num_num_dict = {'01': '1', '02': '2', '03': '3', '04': '4', '05': '5', '06': '6', '07': '7'}
date_num_dict = {'01': '1', '02': '2', '03': '3', '04': '4', '05': '5', '06': '6', '07': '7', '08': '8', '09': '9', '10': '10', '11': '11', '12': '12', '13': '13', '14': '14'}

# レースIDからタイトルを取得（例： 2023年1回 東京 1日目 01R）
def race_id_to_title(race_id):
    year = race_id[:4]
    spot = spot_num_dict[race_id[4:6]]
    num = num_num_dict[race_id[6:8]]
    date = date_num_dict[race_id[8:10]]
    race = race_id[10:]
    
    return year + '年' + num + '回' + ' ' + spot + ' ' + date + '日目' + ' ' + race + 'R'

In [454]:
if not os.path.exists(f'./predict_csv'):
    os.mkdir(f'./predict_csv')

if not os.path.exists(f'./predict_csv/{YEAR}{DATE}'):
    os.mkdir(f'./predict_csv/{YEAR}{DATE}')

for race_id in race_id_list:
    title = race_id_to_title(race_id)
    results_df[results_df['race_id'] == race_id][['race_id', '馬番', '枠番', '単勝', 'predict_proba']].sort_values(['race_id', 'predict_proba'], ascending=[True, False]).to_csv(f'./predict_csv/{YEAR}{DATE}/{title}.csv')
    

In [422]:
# results_df.sort_values('predict_proba', ascending=False)[['馬番', '枠番', '単勝', 'predict_proba']]

# results_df[['race_id', '馬番', '枠番', '単勝', 'predict_proba']].to_csv('predict.csv')