In [35]:
import pandas as pd
import numpy as np
import requests
from tqdm import tqdm
from lxml import html
from bs4 import BeautifulSoup

In [36]:
#https://db.netkeiba.com/race/202005020106/
#レース情報のスクレイピング

ID = '202005020102'
URL = f'https://db.netkeiba.com/race/{ID}/'
HTML = requests.get(URL).content
COURSE_INFO_X_PATH = '/html/body/div[1]/div[2]/div[1]/div/div/div/diary_snap/div/div/dl/dd/p/diary_snap_cut/span'

def get_elements_around_tabel(soup):
    table_html = soup.find('table')
    elements = table_html.find_all(nowrap="nowrap")
    return elements

def get_texts_from_elements(elements):
    texts = [element.text.replace('\n', '') for element in elements]
    return texts

def reshape_texts_to_df(texts):
#上手い書き方求む
    array = np.array(texts).reshape(int(len(texts)/21), 21)
    df = pd.DataFrame(array[1:], columns = array[0])
    return df

def get_course_info(soup):
    lxml = html.fromstring(str(soup))
    course_info_element = lxml.xpath(COURSE_INFO_X_PATH)
    course_info = course_info_element[0].text.replace('\xa0', '')
    return course_info

def separate_course_info(course_info):
#ハードコーディングになっちゃった
    road_type = course_info[0]
    curve_info = course_info[1]
    course_length = course_info[2:course_info.find('m')]
    weather = course_info[course_info.find('天候')+5]
    road_situation = course_info[course_info.rfind(road_type)+6]
    time = course_info[course_info.find('発走')+5:]
    return road_type, curve_info, weather, road_situation, time

soup = BeautifulSoup(HTML, 'html.parser')
elements = get_elements_around_tabel(soup)
texts = get_texts_from_elements(elements)
df = reshape_texts_to_df(texts)

course_info = get_course_info(soup)
road_type, curve_info, weather, road_situation, time = separate_course_info(course_info)

df['road_type'] = road_type
df['curve_info'] = curve_info
df['weather'] = weather
df['road_situation'] = road_situation
df['time'] = time
df['race_id'] = ID

In [37]:
df.head()

Unnamed: 0,着順,枠番,馬番,馬名,性齢,斤量,騎手,タイム,着差,ﾀｲﾑ指数,...,備考,調教師,馬主,賞金(万円),road_type,curve_info,weather,road_situation,time,race_id
0,1,4,7,グローサーベア,セ3,56,ルメール,1:26.1,,**,...,,[東]矢野英一,社台レースホース,510.0,ダ,左,晴,良,10:40,202005020102
1,2,2,4,カッチョカバロ,牡3,56,津村明秀,1:26.3,1.1/4,**,...,,[東]牧光二,平田修,200.0,ダ,左,晴,良,10:40,202005020102
2,3,2,3,ペガサスターボ,牡3,56,横山和生,1:26.3,ハナ,**,...,,[東]勢司和浩,千明牧場,130.0,ダ,左,晴,良,10:40,202005020102
3,4,1,1,ノーブルウルフ,牡3,56,横山武史,1:26.9,3.1/2,**,...,,[東]久保田貴,吉木伸彦,77.0,ダ,左,晴,良,10:40,202005020102
4,5,6,11,シャークスコーブ,牡3,56,レーン,1:27.2,1.3/4,**,...,,[東]上原博之,シルクレーシング,51.0,ダ,左,晴,良,10:40,202005020102


In [38]:
df.to_csv('race_result.csv')

In [39]:
# 2010 ~ 2023年
seireki_list = [str(s) for s in list(range(2010, 2024))]
# 01：札幌、02：函館、03：福島、04：新潟、05：東京、06：中山、07：中京、08：京都、09：阪神、10：小倉
spot_list = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10']
num_list = ['01', '02', '03', '04', '05', '06', '07']
date_list = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', '13', '14']
race_list = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']

In [40]:
concat_df = pd.DataFrame()
concat_race_df = pd.DataFrame()

for seireki in tqdm(seireki_list):
    for spot in spot_list:
        for num in num_list:
            for date in date_list:
                for race in race_list:
                    try:
                        ID = seireki + spot + num + date + race
                        URL = f'https://db.netkeiba.com/race/{ID}/'
                        HTML = requests.get(URL).content
                        COURSE_INFO_X_PATH = '/html/body/div[1]/div[2]/div[1]/div/div/div/diary_snap/div/div/dl/dd/p/diary_snap_cut/span'
                        soup = BeautifulSoup(HTML, 'html.parser')
                        elements = get_elements_around_tabel(soup)
                        texts = get_texts_from_elements(elements)
                        df = reshape_texts_to_df(texts)

                        course_info = get_course_info(soup)
                        road_type, curve_info, weather, road_situation, time = separate_course_info(course_info)

                        df['road_type'] = road_type
                        df['curve_info'] = curve_info
                        df['weather'] = weather
                        df['road_situation'] = road_situation
                        df['time'] = time
                        df['race_id'] = ID
                        concat_df = pd.concat([concat_df, df], ignore_index=True)
                    except:
                        continue

100%|███████████████████████████████████████████████████████| 14/14 [15:08:29<00:00, 3893.53s/it]


In [41]:
concat_df.to_csv('race.csv')

In [42]:
len(concat_df)

643764

In [43]:
concat_df.tail()

Unnamed: 0,着順,枠番,馬番,馬名,性齢,斤量,騎手,タイム,着差,ﾀｲﾑ指数,...,備考,調教師,馬主,賞金(万円),road_type,curve_info,weather,road_situation,time,race_id
643759,12,6,12,タイニープライド,牝4,55,川又賢治,1:48.2,クビ,,...,,[西]杉山佳明,富永剛,,芝,右,晴,発,16:01,202310010812
643760,13,7,13,フィルムスコア,牝5,52,永島まな,1:48.4,1.1/4,,...,,[西]平田修,水上行雄,,芝,右,晴,発,16:01,202310010812
643761,14,2,4,ヨンウォニ,牝6,53,水沼元輝,1:48.5,クビ,,...,,[東]伊藤伸一,ミルファーム,,芝,右,晴,発,16:01,202310010812
643762,15,3,5,クオンタム,牝5,56,黛弘人,1:48.7,1.1/4,,...,,[東]村田一誠,ミルファーム,,芝,右,晴,発,16:01,202310010812
643763,16,8,16,マイネルパリオート,牡5,58,丸山元気,1:49.2,3.1/2,,...,,[東]高橋裕,サラブレッドクラブ・ラフィアン,,芝,右,晴,発,16:01,202310010812
