# インポート

In [None]:
# import
import re
from tqdm import tqdm
import time
import random

import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen

import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
%matplotlib inline

import lightgbm as lgb 
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn import preprocessing
ss = preprocessing.StandardScaler()

import warnings
warnings.simplefilter('ignore')

In [None]:
# url = "https://db.netkeiba.com/race/"
# keibajou = {'01':'札幌', '02':'函館', '03':'福島', '04':'新潟', '05':'東京', '06':'中山', '07':'中京', '08':'京都', '09':'阪神', '10':'小倉'}

# 関数の定義

In [None]:
'''スクレイピング'''
# メインテーブルのスクレイピング
def scrape_netkeiba_results(year, course='全部', pre_race_results=None):
    """Scrape race result tables from db.netkeiba.com into one DataFrame.

    Every candidate 12-digit race ID (year + venue + meeting + day + race
    number, the last four zero-padded to two digits) is requested in turn.
    A candidate that raises ``IndexError`` is skipped; any other error is
    printed and stops the crawl, and whatever was collected so far is still
    returned.  Interrupting the crawl (``KeyboardInterrupt``) also stops it
    and returns the partial result.

    Args:
        year: Iterable of years, e.g. ``[2020, 2021]``.
        course: Venue name (a key of ``race_course`` below).  '全部' crawls
            all ten venues.
        pre_race_results: Optional mapping ``race_id -> DataFrame`` of
            results scraped earlier.  Those IDs are not requested again.
            The mapping is shallow-copied; the frames it holds get their
            index set to the race ID, like newly scraped ones.

    Returns:
        DataFrame of all collected results indexed by race ID.  It is also
        written to a time-stamped pickle file in the working directory.
        When nothing was collected an empty DataFrame is returned and no
        file is written.

    Raises:
        KeyError: If ``course`` is not a known venue name.
    """
    # Venue name -> venue code used in the race ID (11 means "all venues").
    race_course = {'札幌': 1, '函館': 2, '福島': 3, '新潟': 4, '東京': 5, '中山': 6, '中京': 7, '京都': 8, '阪神': 9, '小倉': 10, '全部': 11}
    venue_code = race_course[course]
    venue_codes = range(1, 11) if venue_code == 11 else [venue_code]

    # Enumerate every candidate ID; most combinations do not exist and are
    # skipped while scraping.
    race_id_list = [
        f'{y}{code:02d}{kai:02d}{day:02d}{r:02d}'
        for code in venue_codes
        for kai in range(1, 7)
        for y in year
        for day in range(1, 13)
        for r in range(1, 13)
    ]

    race_results = pre_race_results.copy() if pre_race_results is not None else {}
    digits = re.compile(r'\d+')
    for race_id in tqdm(race_id_list):
        if race_id in race_results:
            continue
        try:
            url = 'https://db.netkeiba.com/race/' + race_id
            # NOTE: the page is downloaded twice, once by pandas for the
            # table and once by requests for the links.
            df = pd.read_html(url)[0]

            # The horse and jockey IDs only appear in the link targets.
            html = requests.get(url)
            html.encoding = 'EUC-JP'
            soup = BeautifulSoup(html.text, 'html.parser')
            result_table = soup.find('table', attrs={'summary': 'レース結果'})
            horse_links = result_table.find_all('a', attrs={'href': re.compile('^/horse')})
            jockey_links = result_table.find_all('a', attrs={'href': re.compile('^/jockey')})

            df['horse_id'] = [digits.findall(a['href'])[0] for a in horse_links]
            df['jockey_id'] = [digits.findall(a['href'])[0] for a in jockey_links]
            race_results[race_id] = df
            time.sleep(1)
        except IndexError:
            continue
        except Exception as e:
            # Report the cause before giving up; partial results are kept.
            print(e)
            break
        except KeyboardInterrupt:
            # Manual interruption: stop crawling but keep what we have.
            break

    if not race_results:
        # pd.concat() refuses an empty list; report "nothing scraped" as an
        # empty frame instead of an unrelated ValueError.
        return pd.DataFrame()

    # Label every row with its race ID and stack the per-race tables.
    for key in race_results:
        race_results[key].index = [key] * len(race_results[key])
    results = pd.concat([race_results[key] for key in race_results], sort=False)

    file_name = '{0:%Y%m%d_%H%M%S}.pickle'.format(datetime.now())
    results.to_pickle(file_name)
    return results
# 天気や馬場状態や日付を取得する
def scrape_race_info(race_id_list):
    """Scrape per-race metadata (surface, distance, going, weather, date).

    The text of the 4th and 5th ``<p>`` elements of each race page is split
    into word tokens and each token is classified by simple membership or
    substring tests.

    Args:
        race_id_list: Iterable of race IDs; each is converted with ``str``
            when building the URL.

    Returns:
        dict mapping race ID to a dict that may contain the keys
        ``race_type``, ``course_len``, ``ground_state``, ``weather``,
        ``date`` and ``race_name``.  A race that raises ``IndexError`` is
        skipped.  Any other error is printed and stops the crawl; the
        entries collected so far are returned.
    """
    race_infos = {}
    for race_id in tqdm(race_id_list):
        try:
            url = 'https://db.netkeiba.com/race/' + str(race_id)
            html = requests.get(url)
            html.encoding = 'EUC-JP'
            soup = BeautifulSoup(html.text, 'html.parser')
            paragraphs = soup.find_all('p')
            texts = paragraphs[3].text + paragraphs[4].text
            info = re.findall(r'\w+', texts)

            info_dict = {}
            # The tests are independent on purpose: a later match may
            # overwrite an earlier one (e.g. '障' overrides the surface).
            for text in info:
                if text in ['芝', 'ダート']:
                    info_dict['race_type'] = text
                if '障' in text:
                    info_dict['race_type'] = '障害'
                if 'm' in text:
                    info_dict['course_len'] = int(re.findall(r"\d+", text)[0])
                if text in ['良', '稍重', '重', '不良']:
                    info_dict['ground_state'] = text
                if text in ['曇', '晴', '雨', '小雨', '小雪', '雪']:
                    info_dict['weather'] = text
                if '年' in text:
                    info_dict['date'] = text
                if '歳' in text:
                    info_dict['race_name'] = text

            race_infos[race_id] = info_dict
            time.sleep(1)
        except IndexError:
            continue
        except Exception as e:
            print(e)
            break
        except KeyboardInterrupt:
            # Manual interruption: stop crawling but keep what we have.
            break
    return race_infos


# 払い戻し情報をスクレイピング
def scrape_return_tables(race_id_list, pre_return_tables=None):
    """Scrape the payout tables of each race from db.netkeiba.com.

    Args:
        race_id_list: Iterable of race IDs.
        pre_return_tables: Optional dict ``race_id -> DataFrame`` to extend.
            When given, it is updated in place and returned.  When omitted,
            a fresh dict is created for this call.

    Returns:
        dict mapping race ID to a DataFrame built from the 2nd and 3rd
        tables of the page, indexed by the race ID.  A race that raises
        ``IndexError`` is skipped.  Any other error is printed and stops the
        crawl; the entries collected so far are returned.
    """
    # A ``{}`` default would be one dict shared by every call.
    return_tables = {} if pre_return_tables is None else pre_return_tables
    for race_id in tqdm(race_id_list):
        try:
            url = 'https://db.netkeiba.com/race/' + str(race_id)

            # Multi-value cells (e.g. place, wide) are separated by <br />
            # and would be glued together by read_html.  Replace the tag
            # with the literal text 'br' so the values can be split later.
            with urlopen(url) as f:
                html = f.read()
            html = html.replace(b'<br />', b'br')
            dfs = pd.read_html(html)

            # dfs[1] and dfs[2] hold the two halves of the payout table.
            df = pd.concat([dfs[1], dfs[2]])

            df.index = [race_id] * len(df)
            return_tables[race_id] = df
            time.sleep(1)
        except IndexError:
            continue
        except Exception as e:
            # Report the cause before giving up; partial results are kept.
            print(e)
            break
        except KeyboardInterrupt:
            # Manual interruption: stop crawling but keep what we have.
            break
    return return_tables
# 血統データをスクレイピング
def scrape_peds(horse_id_list, pre_peds=None):
    """Scrape the pedigree table of each horse from db.netkeiba.com.

    Args:
        horse_id_list: Iterable of horse ID strings.
        pre_peds: Optional dict ``horse_id -> Series`` to extend.  When
            given, it is updated in place and returned.  When omitted, a
            fresh dict is created for this call.

    Returns:
        dict mapping horse ID to a Series (named after the horse ID, index
        reset) that concatenates columns 0..4 of the pedigree table.  A
        horse that raises ``IndexError`` is skipped.  Any other error is
        printed and stops the crawl; the entries collected so far are
        returned.
    """
    # A ``{}`` default would be one dict shared by every call.
    peds = {} if pre_peds is None else pre_peds
    for horse_id in tqdm(horse_id_list):
        try:
            url = 'https://db.netkeiba.com/horse/ped/' + horse_id
            df = pd.read_html(url)[0]

            # Peel the columns off from the last to the first, removing
            # duplicated rows of the remaining columns after each step.
            generations = {}
            for i in reversed(range(5)):
                generations[i] = df[i]
                df.drop([i], axis=1, inplace=True)
                df = df.drop_duplicates()
            ped = pd.concat([generations[i] for i in range(5)]).rename(horse_id)
            peds[horse_id] = ped.reset_index(drop=True)
            time.sleep(1)
        except IndexError:
            continue
        except Exception as e:
            print(e)
            break
        except KeyboardInterrupt:
            # Manual interruption: stop crawling but keep what we have.
            break
    return peds

In [None]:
'''データクレンジング'''
def pre_main(results):
    """Clean the scraped result table and derive the model columns.

    Args:
        results: DataFrame of race results indexed by race ID, with at
            least the columns '着順', '性齢', '馬体重' and the columns
            dropped at the end of this function.

    Returns:
        A new DataFrame (``results`` is not modified) that:
        * carries the race ID in a ``race_id`` column,
        * keeps only rows whose '着順' consists solely of digits,
        * has dummy columns for the sex (first category dropped), and the
          integer columns '年齢', '体重' and '増減',
        * has ``rank`` = 1 for a finishing position of 3 or better, else 0,
        * no longer has the raw text columns listed in the final ``drop``.
    """
    df = results.copy()

    # Move the race ID from the index into a column.  drop=True also works
    # when the index carries a name.
    df['race_id'] = df.index
    df = df.reset_index(drop=True)

    # Keep only positions that are pure digits.  copy() so the assignments
    # below act on an independent frame rather than on a slice.
    finished = ~df['着順'].astype(str).str.contains(r'\D')
    df = df[finished].copy()
    df['着順'] = df['着順'].astype(int)

    # '性齢' is one sex character followed by the age.
    df['性'] = df['性齢'].map(lambda x: str(x)[0])
    df = pd.get_dummies(data=df, columns=['性'], drop_first=True)
    df['年齢'] = df['性齢'].map(lambda x: str(x)[1:]).astype(int)

    # '馬体重' looks like '480(+2)': weight, then the change in brackets.
    body_weight = df['馬体重'].str.split('(', expand=True)
    df['体重'] = body_weight[0].astype(int)
    df['増減'] = body_weight[1].str[:-1].astype(int)

    df['rank'] = df['着順'].map(lambda x: 1 if x < 4 else 0)
    df.drop(['タイム', '着差', '性齢', '馬体重', '着順', '馬名', '騎手', '調教師', '人気', '単勝'], axis=1, inplace=True)
    return df

def pre_info(infos):
    """Turn the scraped race-info table into model-ready columns.

    Works on a copy of ``infos``: casts ``course_len`` to int, moves the
    index into a ``race_id`` column, parses ``date`` (format
    '%Y年%m月%d日') and one-hot encodes weather, race type, ground state
    and race name with the first category of each dropped.
    """
    frame = infos.copy()
    frame['course_len'] = frame['course_len'].astype(int)

    # Keep the race ID as a regular column and discard the old index.
    frame['race_id'] = frame.index
    frame = frame.reset_index().drop(columns='index')

    frame['date'] = pd.to_datetime(frame['date'], format='%Y年%m月%d日')

    categorical = ['weather', 'race_type', 'ground_state', 'race_name']
    return pd.get_dummies(data=frame, columns=categorical, drop_first=True)

In [None]:
'''モデル評価、収益計算'''
class Return:
    """Accessor for payout data stored in a scraped return table.

    ``return_tables`` is a DataFrame whose column 0 holds the bet type and
    whose columns 1 and 2 are read as the winning number(s) and payout(s).
    """

    def __init__(self, return_tables):
        self.return_tables = return_tables

    @property
    def fukusho(self):
        """Place-bet ('複勝') rows as integer win_0..2 / return_0..2 columns.

        Multi-value cells are split on the literal 'br'; only the first
        three parts are kept.  Thousands separators are removed and missing
        parts become 0.
        """
        table = self.return_tables
        place_rows = table.loc[table[0] == '複勝', [1, 2]]

        pieces = []
        for source_col, prefix in ((1, 'win'), (2, 'return')):
            parts = place_rows[source_col].str.split('br', expand=True)[[0, 1, 2]]
            parts.columns = [f'{prefix}_{slot}' for slot in range(3)]
            pieces.append(parts)
        combined = pd.concat(pieces, axis=1)

        for name in combined.columns:
            combined[name] = combined[name].str.replace(',', '')
        return combined.fillna(0).astype(int)

    @property
    def tansho(self):
        """Win-bet ('単勝') rows as numeric 'win' / 'return' columns.

        Values that cannot be parsed as numbers become NaN.
        """
        table = self.return_tables
        win_rows = table.loc[table[0] == '単勝', [1, 2]].rename(
            columns={1: 'win', 2: 'return'}
        )
        for name in ('win', 'return'):
            win_rows[name] = pd.to_numeric(win_rows[name], errors='coerce')
        return win_rows
# return class内に入れるの検討↓
def calculate_fukusho_return(df_y, fukusho, threshold=0.5):
    """Simulate 100-yen place bets on every horse predicted above ``threshold``.

    ``df_y`` needs the columns 'race_id', '馬番' and 'pred'.  ``fukusho`` is
    a table indexed by race ID with the columns win_0..2 and return_0..2.

    Returns a tuple ``(recovery rate in percent, profit in yen)``.
    """
    # One 100-yen ticket per horse whose prediction exceeds the threshold.
    bets = df_y.loc[df_y['pred'] > threshold, ['race_id', '馬番', 'pred']]
    stake = 100 * len(bets)

    payouts = fukusho.reset_index()
    payouts.columns = ['race_id', 'win_0', 'win_1', 'win_2', 'return_0', 'return_1', 'return_2']
    joined = bets.merge(payouts, on='race_id', how='left')

    # A ticket pays when the horse number matches any of the three slots.
    winnings = 0
    for slot in range(3):
        hit = joined['win_{}'.format(slot)] == joined['馬番']
        winnings += joined.loc[hit, 'return_{}'.format(slot)].sum()

    return round(winnings/stake, 3)*100, winnings-stake


'''運用フェーズ'''
def update_data(old, new):
    """Return ``old`` with every row whose index label appears in ``new``
    removed, followed by all rows of ``new``."""
    superseded = old.index.isin(new.index)
    kept = old[~superseded]
    return pd.concat([kept, new])



In [None]:
'''運用フェーズ'''

# 出馬データを作成
class Shutuba:
    """Entry (shutuba) table builder for races on race.netkeiba.com.

    Attributes:
        race_id_list: Race IDs to scrape.
        data: DataFrame filled by ``scrape()``; empty until then.
    """

    def __init__(self, race_id_list):
        self.race_id_list = race_id_list
        # preprocessing() reads this, so it must exist before scrape() runs.
        self.data = pd.DataFrame()

    def scrape(self):
        """Scrape the entry table of every race in ``race_id_list``.

        For each race the first table of the page is read, the race
        conditions found in the 'RaceData01' block are added as constant
        columns, and horse/jockey IDs are taken from the link targets.  The
        rows are indexed by race ID.

        The accumulated table is stored in ``self.data`` after every race,
        so it survives an error on a later page.  It is written once to a
        time-stamped pickle file when the crawl finishes.

        Returns:
            The accumulated DataFrame (also available as ``self.data``).
        """
        frames = []
        for race_id in tqdm(self.race_id_list):
            url = 'https://race.netkeiba.com/race/shutuba.html?race_id=' + str(race_id)
            df = pd.read_html(url)[0]
            # Drop the first level of the two-level column header.
            df = df.T.reset_index(level=0, drop=True).T

            html = requests.get(url)
            html.encoding = 'EUC-JP'
            soup = BeautifulSoup(html.text, 'html.parser')

            texts = soup.find('div', attrs={'class': 'RaceData01'}).text
            # The tests are independent on purpose: a later match may
            # overwrite an earlier one.
            for text in re.findall(r'\w+', texts):
                if 'm' in text:
                    digits = re.findall(r'\d+', text)
                    # A token may contain 'm' without being the distance.
                    if digits:
                        df['course_len'] = [int(digits[0])] * len(df)
                if text in ['曇', '晴', '雨', '小雨', '小雪', '雪']:
                    df["weather"] = [text] * len(df)
                if text in ['良', '稍重', '重']:
                    df['ground_state'] = [text] * len(df)
                if '不' in text:
                    df['ground_state'] = ['不良'] * len(df)
                if '芝' in text:
                    df['race_type'] = ['芝'] * len(df)
                if '障' in text:
                    df['race_type'] = ['障害'] * len(df)
                if 'ダ' in text:
                    df['race_type'] = ['ダート'] * len(df)

            horse_td_list = soup.find_all('td', attrs={'class': 'HorseInfo'})
            df['horse_id'] = [
                re.findall(r'\d+', td.find('a')['href'])[0] for td in horse_td_list
            ]
            jockey_td_list = soup.find_all('td', attrs={'class': 'Jockey'})
            df['jockey_id'] = [
                re.findall(r'\d+', td.find('a')['href'])[0] for td in jockey_td_list
            ]

            df.index = [race_id] * len(df)
            frames.append(df)
            # DataFrame.append no longer exists in pandas 2; rebuild the
            # accumulated table with concat instead.
            self.data = pd.concat(frames)
            time.sleep(1)

        if frames:
            file_name = 'shutuba_{0:%Y%m%d_%H%M%S}.pickle'.format(datetime.now())
            self.data.to_pickle(file_name)
        return self.data

    def preprocessing(self):
        """Return a model-ready copy of ``self.data``.

        Adds the integer columns '年齢', '体重' and '増減', one-hot encodes
        sex, race type, weather and ground state (first category of each
        dropped) and removes the display-only columns.  Display-only columns
        that are absent are ignored.  ``self.data`` is left untouched.
        """
        df = self.data.copy()
        # '性齢' is one sex character followed by the age.
        df['性'] = df['性齢'].map(lambda x: str(x)[0])
        df['年齢'] = df['性齢'].map(lambda x: str(x)[1:]).astype(int)

        # '馬体重' looks like '480(+2)': weight, then the change in brackets.
        body_weight = df['馬体重'].str.split('(', expand=True)
        df['体重'] = body_weight[0].astype(int)
        df['増減'] = body_weight[1].str[:-1].astype(int)

        df = pd.get_dummies(data=df, columns=['性', 'race_type','weather', 'ground_state'], drop_first=True)
        return df.drop(
            columns=['印', '馬名', '騎手', '厩舎', 'Unnamed: 9_level_1', '人気', '登録', 'メモ'],
            errors='ignore',
        )


# スクレイピングによるデータ収集

In [None]:
# 阪神競馬場10年分のmainデータをスクレイピング
# year = [y for y in range(2012, 2022)]
# results = scrape_netkeiba_results(year=year, course='阪神')

# df = pd.read_pickle('2012-2021hanshin.pickle')
# df['race_id'] = df.index
# df.reset_index(inplace=True)
# df.drop(columns='index', inplace=True)

# race_id_list = list(df.race_id.unique())
# horse_id_list = list(df.horse_id.unique())
# print(len(horse_id_list))
# print(len(race_id_list))

In [None]:
# 天気などの情報をスクレイピング
# dict_info = scrape_race_info(race_id_list)

# df_info = pd.DataFrame(dict_info).T
# df_info.to_pickle('df_info.pickle')

In [None]:
# 競走馬情報をスクレイピング
# スクレイピングの都合上、取得年を分ける

# hr_id_list2014 = make_horse_id(2014, 2014)
# dict_horse_2014 = scrape_horse_results(hr_id_list2014)
# df_horse_2014 = pre_horse(dict_horse_2014)
# df_horse_2014.to_pickle('df_horse_2014.pickle')
# df_horse_2014 = pd.read_pickle('df_horse_2014')

# hr_id_list2015 = make_horse_id(2015, 2015)
# dict_horse_2015 = scrape_horse_results(hr_id_list2015)
# df_horse_2015 = pre_horse(dict_horse_2015)
# df_horse_2015.to_pickle('df_horse_2015.pickle')
# df_horse_2015 = pd.read_pickle('df_horse_2015')

# hr_id_list2016 = make_horse_id(2016, 2016)
# dict_horse_2016 = scrape_horse_results(hr_id_list2016)
# df_horse_2016 = pre_horse(dict_horse_2016)
# df_horse_2016.to_pickle('df_horse_2016.pickle')
# df_horse_2016 = pd.read_pickle('df_horse_2016')

# hr_id_list2017 = make_horse_id(2017, 2017)
# dict_horse_2017 = scrape_horse_results(hr_id_list2017)
# df_horse_2017 = pre_horse(dict_horse_2017)
# df_horse_2017.to_pickle('df_horse_2017.pickle')
# df_horse_2017 = pd.read_pickle('df_horse_2017')

# hr_id_list2018 = make_horse_id(2018, 2018)
# dict_horse_2018 = scrape_horse_results(hr_id_list2018)
# df_horse_2018 = pre_horse(dict_horse_2018)
# df_horse_2018.to_pickle('df_horse_2018.pickle')
# df_horse_2018 = pd.read_pickle('df_horse_2018')

# hr_id_list2019 = make_horse_id(2019, 2019)
# dict_horse_2019 = scrape_horse_results(hr_id_list2019)
# df_horse_2019 = pre_horse(dict_horse_2019)
# df_horse_2019.to_pickle('df_horse_2019.pickle')
# df_horse_2019 = pd.read_pickle('df_horse_2019')

# df_horse_alldata = pd.concat([df_horse_2014, df_horse_2015, df_horse_2016, df_horse_2017, df_horse_2018, df_horse_2019])
# df_horse_alldata.to_pickle('df_horse_alldata.pickle')

In [None]:
# 払い戻し情報をスクレイピング
# dict_return = scrape_return_tables(race_id_list)

# dict_returnをデータフレーム型に変換しpickleファイルで保存
# df_return = pd.DataFrame()
# for i in tqdm(dict_return.keys()):
#     dict_return[i]['race_id'] = i
#     df_return = pd.concat([df_return, dict_return[i]])

# df_return.to_pickle('df_return.pickle')

In [None]:
# 出馬表のスクレイピング
# shutuba_table = scrape(df.race_id.unique())

# データクレンジング

In [None]:
# Load the previously scraped tables from their pickle files.
df = pd.read_pickle('2012-2021hanshin.pickle')
df_info = pd.read_pickle('df_info.pickle')
df_return = pd.read_pickle('df_return.pickle')
df_horse_all = pd.read_pickle('df_horse_alldata.pickle')

# Place-bet payouts are used for the return simulation.
rt = Return(df_return)
df_fukusho = rt.fukusho

# Clean both tables, then attach the race info (by race_id) and the
# per-horse data (by horse_id) to the result rows.
df_info = pre_info(df_info)
df = (
    pre_main(df)
    .merge(df_info, on='race_id', how='left')
    .merge(df_horse_all, on='horse_id', how='left')
)
df['jockey_id'] = df['jockey_id'].astype('int')


In [None]:
# Display the column names of the merged frame.
df.columns

In [None]:
# Columns of the merged frame that are fed to the models as features.
select_columns = ['枠番', '馬番', '斤量', 'jockey_id', '性_牝', '性_牡', '年齢', '体重', '増減', 
       'course_len', 'weather_小雪', 'weather_晴', 'weather_曇', 'weather_雨', 'race_type_芝', 'race_type_障害',
       'ground_state_稍重', 'ground_state_良', 'ground_state_重',
       'race_name_2歳500万下', 'race_name_2歳オープン', 'race_name_2歳新馬',
       'race_name_2歳未勝利', 'race_name_3歳1勝クラス', 'race_name_3歳500万下',
       'race_name_3歳オープン', 'race_name_3歳以上1000万下', 'race_name_3歳以上1600万下',
       'race_name_3歳以上1勝クラス', 'race_name_3歳以上2勝クラス', 'race_name_3歳以上3勝クラス',
       'race_name_3歳以上500万下', 'race_name_3歳以上オープン', 'race_name_3歳新馬',
       'race_name_3歳未勝利', 'race_name_4歳以上1000万下', 'race_name_4歳以上1600万下',
       'race_name_4歳以上1勝クラス', 'race_name_4歳以上2勝クラス', 'race_name_4歳以上3勝クラス',
       'race_name_4歳以上500万下', 'race_name_4歳以上オープン', 'race_name_障害3歳以上オープン',
       'race_name_障害3歳以上未勝利', 'race_name_障害4歳以上オープン', 'race_name_障害4歳以上未勝利',
       'ALLprize', 'ALLcounts', 'ALLrank', 'HANSHINprize', 'HANSHINcounts',
       'HANSHINrank', 'SPLINTprize', 'SPLINTcounts', 'SPLINTrank', 'MILEprize',
       'MILEcounts', 'MILErank', 'CLASSICprize', 'CLASSICcounts',
       'CLASSICrank', 'STAYERprize', 'STAYERcounts', 'STAYERrank', 'TURFprize',
       'TURFcounts', 'TURFrank', 'DIRTprize', 'DIRTcounts', 'DIRTrank']

In [None]:
# Split the data: races dated 2021 form the test set, the rest is training.
is_2021 = df.date.dt.strftime('%Y') == '2021'
train = df[~is_2021].reset_index(drop=True)
test = df[is_2021].reset_index(drop=True)

# Features and target ('rank') for each part.
train_x, train_t = train[select_columns], train['rank']
test_x, test_t = test[select_columns], test['rank']

# モデル構築 

In [None]:
# Predicted probability above which a place bet is simulated.
threshold = 0.5

# Each model below is fitted on the training part, scored by ROC AUC on the
# test part, and its predictions are used for the place-bet simulation.

# LightGBM
model_lgbm = lgb.LGBMClassifier(random_state=58)
model_lgbm.fit(train_x, train_t)
lgbm_pred = model_lgbm.predict_proba(test_x)[:, 1]
print(f'LightGBM score: {roc_auc_score(test_t, lgbm_pred)}')
test['pred'] = lgbm_pred
money1, money2 = calculate_fukusho_return(test, df_fukusho, threshold=threshold)
print(f'回収率:{money1}%, 収支:{money2}円')
print()

# RandomForest
model_rfc = RandomForestClassifier(n_jobs=-1, random_state=58)
model_rfc.fit(train_x, train_t)
rfc_pred = model_rfc.predict_proba(test_x)[:, 1]
print(f'RandomForest score: {roc_auc_score(test_t, rfc_pred)}')
test['pred'] = rfc_pred
money1, money2 = calculate_fukusho_return(test, df_fukusho, threshold=threshold)
print(f'回収率:{money1}%, 収支:{money2}円')
print()

# CatBoost
model_cat = CatBoostClassifier(verbose=False, random_state=58)
model_cat.fit(train_x, train_t)
cat_pred = model_cat.predict_proba(test_x)[:, 1]
print(f'CatBoost score: {roc_auc_score(test_t, cat_pred)}')
test['pred'] = cat_pred
money1, money2 = calculate_fukusho_return(test, df_fukusho, threshold=threshold)
print(f'回収率:{money1}%, 収支:{money2}円')

# 出馬データの作成

# 最新情報への更新