# import

In [1]:
import pandas as pd
from tqdm.notebook import tqdm as tqdm
import requests
from bs4 import BeautifulSoup
import re
import time
import urllib.request


# class ShutubaTable

In [2]:
class ShutubaTable():
    
    
    def __init__(self):
        self.shutuba_table = pd.DataFrame()
        self.shutuba_table_p = pd.DataFrame()
        self.shutuba_table_h = pd.DataFrame()
        self.shutuba_table_pe = pd.DataFrame()
        self.no_peds = pd.DataFrame()
        
    def scrape(self, race_id_list, date):
        for race_id in race_id_list:
        
            url = 'https://race.netkeiba.com/race/shutuba.html?race_id=' + race_id
            df = pd.read_html(url)[0]
            df = df.T.reset_index(level=0, drop=True).T

            html = requests.get(url)
            html.encoding = "EUC-JP"
            soup = BeautifulSoup(html.text, "html.parser")

            texts = soup.find('div', attrs={'class':'RaceData01'}).text
            texts = re.findall(r'\w+',texts)
            for text in texts:
                if 'm' in text:
                    df['course_len'] = [int(re.findall(r'\d+',text)[0])] * len(df)
                if text in ["曇", "晴", "雨", "小雨", "小雪", "雪"]:
                    df['weather'] = [text] * len(df)
                if text in ["良","稍重","重","不良"]:
                    df['ground_state'] = [text] * len(df)
                if '不' in text:
                    df['ground_state'] = ['不良'] * len(df)
                if '芝' in text:
                    df['race_type'] = ['芝'] * len(df)
                if '障' in text:
                    df['race_type'] = ['障害'] * len(df)
                if 'ダ' in text:
                    df['race_type'] = ['ダート'] * len(df)
            df['date'] = [date] * len(df)

            horse_id_list = []
            horse_td_list = soup.find_all("td", attrs={'class':'HorseInfo'})
            for td in horse_td_list:
                horse_id = re.findall(r'\d+', td.find('a')['href'])[0]
                horse_id_list.append(horse_id)

            jockey_id_list = []
            jockey_td_list = soup.find_all("td", attrs={'class':'Jockey'})
            for td in jockey_td_list:
                jockey_id = re.findall(r'\d+', td.find('a')['href'])[0]
                jockey_id_list.append(horse_id)

            df['horse_id'] = horse_id_list
            df['jockey_id'] = jockey_id_list

            df.index = [race_id] * len(df)
            self.shutuba_table = self.shutuba_table.append(df)
                
                       
    def preprocessing(self):
        df = self.shutuba_table.copy()
        
        df['性'] = df['性齢'].map(lambda x:str(x)[0])
        df['年齢'] = df['性齢'].map(lambda x:str(x)[1:]).astype(int)
        
        df['体重'] = df['馬体重(増減)'].str.split("(",expand=True)[0].astype(int)
        df['体重変化'] = df['馬体重(増減)'].str.split("(",expand=True)[1].str[:-1].astype(int)
        df['date'] = pd.to_datetime(df['date'])
        
        df['枠'] = df['枠'].astype(int)
        df['馬番'] = df['馬番'].astype(int)
        df['斤量'] = df['斤量'].astype(int)
        
        df = df[['枠','馬番','斤量','course_len','weather','race_type',
                'ground_state','date','horse_id','jockey_id','性','年齢','体重','体重変化']]
        
        self.shutuba_table_p = df.rename(columns={'枠':'枠番'})
            
    def merge_horse_results(self, hr, n_samples_list=[5,9, 'all']):
        self.shutuba_table_h = self.shutuba_table_p.copy()
        for n_samples in n_sample_list:
            self.shutuba_table_h = hr.merge_all(self.shutuba_table_h,n_samples=n_samples)
            
    def merge_peds(self,peds):
        self.shutuba_tables_pe = self.shutuba_table_h.merge(peds, left_on='horse_id',right_index=True,how='left')
        self.no_peds = self.shutuba_table_pe[self.shutuba_table_pe['peds0'].isnull()]['horse_id'].unique()
        if len(self.no_peds) > 0:
            print('scrape peds at horse_id_list "no_peds" ')

In [20]:
# url = 'https://race.netkeiba.com/race/shutuba.html?race_id=202106040801'
# pd.read_html(url)
st = ShutubaTable()
st.scrape(['202004020601'],'2020/08/09')


In [22]:
st = ShutubaTable()
st.scrape(['202004020601'],'2020/08/09')
st.preprocessing()

Unnamed: 0,枠番,馬番,斤量,course_len,weather,race_type,ground_state,date,horse_id,jockey_id,性,年齢,体重,体重変化
202004020601,1,1,51,1800,雨,芝,不良,2020-08-09,2018102805,2018102130,牝,2,422,-2
202004020601,1,2,54,1800,雨,芝,不良,2020-08-09,2018102165,2018102130,牝,2,452,-2
202004020601,2,3,54,1800,雨,芝,不良,2020-08-09,2018105670,2018102130,牡,2,502,-6
202004020601,2,4,54,1800,雨,芝,不良,2020-08-09,2018102090,2018102130,牝,2,462,4
202004020601,3,5,54,1800,雨,芝,不良,2020-08-09,2018102131,2018102130,牝,2,422,0
202004020601,3,6,53,1800,雨,芝,不良,2020-08-09,2018101443,2018102130,牝,2,468,-8
202004020601,4,7,54,1800,雨,芝,不良,2020-08-09,2018101804,2018102130,牝,2,504,0
202004020601,4,8,54,1800,雨,芝,不良,2020-08-09,2018104106,2018102130,牡,2,488,-4
202004020601,5,9,54,1800,雨,芝,不良,2020-08-09,2018103769,2018102130,牝,2,484,4
202004020601,5,10,54,1800,雨,芝,不良,2020-08-09,2018103175,2018102130,牡,2,490,2


In [12]:
st.shutuba_table

Unnamed: 0,枠,馬番,印,馬名,性齢,斤量,騎手,厩舎,馬体重(増減),Unnamed: 9_level_1,人気,登録,メモ,course_len,race_type,weather,ground_state,date,horse_id,jockey_id
202004020601,1,1,,アルバトリア,牝2,51.0,小林脩,美浦小桧山,422(-2),---.-,**,,,1800,芝,雨,不良,2020/08/09,2018102805,2018102130
202004020601,1,2,,レースアルカーナ,牝2,54.0,国分優,栗東梅田,452(-2),---.-,**,,,1800,芝,雨,不良,2020/08/09,2018102165,2018102130
202004020601,2,3,,トーセンマーク,牡2,54.0,武藤,美浦古賀慎,502(-6),---.-,**,,,1800,芝,雨,不良,2020/08/09,2018105670,2018102130
202004020601,2,4,,アイリッシュムーン,牝2,54.0,福永,美浦武井,462(+4),---.-,**,,,1800,芝,雨,不良,2020/08/09,2018102090,2018102130
202004020601,3,5,,スウィートブルーム,牝2,54.0,柴田大,美浦和田雄,422(0),---.-,**,,,1800,芝,雨,不良,2020/08/09,2018102131,2018102130
202004020601,3,6,,サンキューベイリー,牝2,53.0,岩田望,美浦相沢,468(-8),---.-,**,,,1800,芝,雨,不良,2020/08/09,2018101443,2018102130
202004020601,4,7,,グローリアスカペラ,牝2,54.0,Ｍデムーロ,美浦土田,504(0),---.-,**,,,1800,芝,雨,不良,2020/08/09,2018101804,2018102130
202004020601,4,8,,ミエノピュア,牡2,54.0,鮫島駿,栗東高橋亮,488(-4),---.-,**,,,1800,芝,雨,不良,2020/08/09,2018104106,2018102130
202004020601,5,9,,ミヤジオシャラク,牝2,54.0,和田竜,栗東川村,484(+4),---.-,**,,,1800,芝,雨,不良,2020/08/09,2018103769,2018102130
202004020601,5,10,,タカノアメージング,牡2,54.0,荻野極,栗東北出,490(+2),---.-,**,,,1800,芝,雨,不良,2020/08/09,2018103175,2018102130


In [4]:
df.T.reset_index(level=0, drop=True).T

Unnamed: 0,枠,馬番,印,馬名,性齢,斤量,騎手,厩舎,馬体重(増減),Unnamed: 9_level_1,人気,登録,メモ
0,1,1,,アポロファントム,牡4,60.0,金子,美浦鈴木伸,,---.-,**,,
1,2,2,,ブラックジャッカル,セ3,58.0,熊沢,栗東須貝,,---.-,**,,
2,3,3,,アノ,牡5,60.0,難波,栗東鈴木孝,,---.-,**,,
3,3,4,,アドラメレク,牡7,60.0,西谷誠,栗東長谷川,,---.-,**,,
4,4,5,,ピュアヒカリ,セ5,60.0,上野,美浦竹内,,---.-,**,,
5,4,6,,グリニッチヴィレジ,牝8,58.0,伴,美浦天間,,---.-,**,,
6,5,7,,アルトリウス,セ7,60.0,蓑島,美浦高橋文,,---.-,**,,
7,5,8,,ドリームソルジャー,牡7,60.0,植野,栗東渡辺,,---.-,**,,
8,6,9,,ブールバール,牝5,58.0,大江原,栗東大久保,,---.-,**,,
9,6,10,,ワンダーサーイター,牡4,60.0,草野,美浦畠山,,---.-,**,,
