In [39]:
from bs4 import BeautifulSoup
import re
import os
import platform
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from time import sleep
from tqdm import tqdm
from typing import BinaryIO
from __future__ import annotations

In [40]:
# Path to the ChromeDriver executable; it differs between Windows and other hosts.
DRIVER_SITE: str = (
    'chromedriver.exe'
    if platform.system() == 'Windows'
    else '/Applications/chromedriver'
)

# Directory that receives the generated CSV files.
CURRENT_DIR: str = 'retired_player_data'

# Player search-result listing that the scrape starts from.
ORIGINAL_URL: str = (
    'https://keirin.netkeiba.com/db/search_result/player.html'
    '?word=&grad=&pref=&age=&sex=&class=&act%5B%5D=2&sort=1&submit='
)

# Columns describing the player; both CSV layouts begin with them.
_PLAYER_COLUMNS: str = '選手名,年齢,性別,身長,体重,級班,脚質,登録番号,'

# Per-page CSV of results by year: file-name stem and header line.
YEARLY_RESULTS_FILENAME: str = 'yearly_results'
YEARLY_RESULTS_HEADER: str = (
    _PLAYER_COLUMNS
    + '年度,1着,2着,3着,4着〜,出走回数,優勝回数,勝率,2連対率,3連対率\n'
)

# Per-page CSV of recent race results: file-name stem and header line.
PLAYER_RESULTS_FILENAME: str = 'player_results'
PLAYER_RESULTS_HEADER: str = (
    _PLAYER_COLUMNS
    + 'レース名,着順1,着順2,着順3,着順4,着順5\n'
)

# Number of placing columns per race row; shorter rows are padded to this width.
MAX_RESULTS: int = 5

In [41]:
# Start a headless Chrome session through the driver at DRIVER_SITE and load
# the player search-result page that the scrape starts from.
options: Options = Options()
options.add_argument('--headless')
driver: webdriver.Chrome = webdriver.Chrome(DRIVER_SITE, options=options)
# Alternative without the headless option (browser window visible), kept for debugging:
# driver: webdriver.chrome = webdriver.Chrome(DRIVER_SITE)
driver.get(ORIGINAL_URL)

In [42]:
def end_page_correction(driver: webdriver.Chrome, end_page_num: int) -> int:
    """Limit the requested last page to the number of pages that exist.

    Follows the '最後' link, reads the number shown on the active pager
    entry, then navigates back to ``ORIGINAL_URL``.

    Args:
        driver: Browser session currently showing the search results.
        end_page_num: Last page the caller would like to scrape.

    Returns:
        ``end_page_num`` unless it is larger than the total page count, in
        which case the total page count is returned.
    """
    # Read the total page count from the pager on the last page.
    driver.find_element_by_link_text('最後').click()
    sleep(1)
    active_pager_entry = driver.find_element_by_xpath('//a[@class="Page_Active"]')
    total_page_num: int = int(active_pager_entry.text)

    # Return to the page the search started from.
    driver.get(ORIGINAL_URL)
    sleep(1)

    # Clamp the requested end page.
    return total_page_num if end_page_num > total_page_num else end_page_num

In [43]:
def start_page_transition(driver: webdriver.Chrome, start_page_num: int) -> None:
    """Advance the browser to the page where scraping should begin.

    Clicks the '次へ' link ``start_page_num - 1`` times, pausing one second
    after each click. Nothing happens when ``start_page_num`` is 1 or less.

    Args:
        driver: Browser session currently showing the search results.
        start_page_num: 1-based number of the page to start scraping from.
    """
    # An empty range covers start_page_num <= 1, so no separate guard is needed.
    for _ in range(start_page_num - 1):
        driver.find_element_by_link_text('次へ').click()
        sleep(1)

In [44]:
def get_player_basic_data(soup: BeautifulSoup) -> list[str]:
    """Collect the player columns that prefix every CSV row.

    The list is built in this order: the player name with '引退' removed,
    then values taken by position from the 'ProfileHeader_box_dataText'
    elements, then values taken by position from the 'profile_list_data'
    elements. Going by the CSV header, the '不明' placeholders stand in for
    age, height and weight.

    Args:
        soup: Parsed player page.

    Returns:
        The collected strings. The list is shorter than eight entries when
        the page has fewer matching elements than expected.
    """
    placeholder = '不明'

    # Player name, without the '引退' marker.
    name_tag = soup.find(class_='ProfileHeader_box_name')
    fields: list[str] = [name_tag.get_text(strip=True).replace('引退', '')]

    # Age (placeholder) and sex: the first element triggers the placeholder,
    # the fourth one supplies the text.
    for position, tag in enumerate(soup.find_all(class_='ProfileHeader_box_dataText')):
        if position == 0:
            fields.append(placeholder)
        elif position == 3:
            fields.append(tag.get_text(strip=True))

    # Height and weight (placeholders), then class, leg type and registration
    # number from the fourth to sixth elements.
    for position, tag in enumerate(soup.find_all(class_='profile_list_data')):
        if position == 0:
            fields.extend([placeholder, placeholder])
        elif position in (3, 4, 5):
            fields.append(tag.get_text(strip=True))

    return fields

In [45]:
def get_yearly_resluts(soup: BeautifulSoup) -> dict[str, list[str]]:
    """Extract the results-by-year table from a player's results page.

    The function name is misspelled but is kept as-is because callers use it.

    Args:
        soup: Parsed results page.

    Returns:
        Mapping from year label to the stripped texts of that row's ``td``
        cells, in table order. Empty when the page contains a 'DB_DataNone'
        element.

    Raises:
        AttributeError: If the results table or its body is missing.
        IndexError: If the table has more rows than year labels were found.
    """
    # A 'DB_DataNone' element means there is nothing to read.
    if soup.find(class_='DB_DataNone') is not None:
        return {}

    # Year labels, in page order.
    year_labels: list[str] = [
        cell.get_text(strip=True)
        for cell in soup.find_all(class_='detail_table_tbodyInner')
    ]

    # One table row per year; pair each row with the label at the same index.
    table = soup.find(class_='mode_detail_table table_slide_body ResultsByYears')
    rows = table.find(class_='detail_table_tbody').find_all('tr')

    per_year: dict[str, list[str]] = {}
    for row_index, row in enumerate(rows):
        per_year[year_labels[row_index]] = [
            cell.get_text(strip=True) for cell in row.find_all('td')
        ]

    return per_year

In [46]:
def get_race_results(soup: BeautifulSoup) -> dict[str,list[str]]:
    """Extract recent race results from a player's recent-results page.

    Race names come from elements whose class list is exactly
    ``['detail_table_tbodyInner']``. Each ``tr`` under the element with id
    'view-result' is paired with the name at the same index.

    Args:
        soup: Parsed recent-results page.

    Returns:
        Mapping from race name to the row's ``td`` texts with full-width
        spaces removed, padded with '' up to ``MAX_RESULTS`` entries. Longer
        rows are kept as they are. A later row with a repeated race name
        replaces the earlier entry.

    Raises:
        AttributeError: If the 'view-result' element is missing.
        IndexError: If there are more rows than race names.
    """
    def has_only_inner_class(tag) -> bool:
        # Exact match on the class list, so multi-class elements are excluded.
        return tag.get('class') == ['detail_table_tbodyInner']

    # Race names, in page order.
    names: list[str] = [
        tag.get_text(strip=True) for tag in soup.find_all(has_only_inner_class)
    ]

    by_race: dict[str, list[str]] = {}
    for row_index, row in enumerate(soup.find(id='view-result').find_all('tr')):
        placings: list[str] = [
            cell.get_text(strip=True).replace('\u3000', '')
            for cell in row.find_all('td')
        ]
        # Pad short rows; a non-positive difference yields no padding.
        placings.extend([''] * (MAX_RESULTS - len(placings)))
        by_race[names[row_index]] = placings

    return by_race

In [47]:
# Walk the search-result pages from start_page_num to end_page_num. For every
# player listed on a page, append that player's yearly results and recent race
# results to the two CSV files belonging to that page.
#
# Changes from the previous version of this cell:
#   * Each page's CSV files are opened in a `with` block, so they are closed
#     as soon as the page is finished (previously only the last pair was closed).
#   * There is no `finally` that touches the file variables, so a failure
#     before the first file is opened no longer raises a NameError on top of
#     the real error.
#   * The output directory is created if it does not exist yet.
try:
    # Make sure the output directory exists before any file is opened.
    os.makedirs(CURRENT_DIR, exist_ok=True)

    # First and last search-result page to scrape.
    start_page_num: int = 1
    end_page_num: int = 135

    # Never go past the last page that actually exists.
    end_page_num = end_page_correction(driver, end_page_num)

    # Move the browser to the first page to scrape.
    start_page_transition(driver, start_page_num)

    # URL of the search-result page being processed; used to come back to it
    # after visiting a player's pages.
    current_url: str = driver.current_url

    # Links to the individual player pages on a search-result page.
    player_link_xpath: str = '//ul[@class="playerList CommonList_01"]/li/a'

    for page_num in tqdm(range(start_page_num, end_page_num + 1)):
        yearly_results_path: str = os.path.join(
            CURRENT_DIR, YEARLY_RESULTS_FILENAME + str(page_num) + '.csv'
        )
        race_results_path: str = os.path.join(
            CURRENT_DIR, PLAYER_RESULTS_FILENAME + str(page_num) + '.csv'
        )

        # NOTE(review): the files are written with the platform's default text
        # encoding, and fields are joined with ',' without quoting. Pass
        # encoding= explicitly or switch to the csv module if the output must
        # be portable or a field can contain a comma.
        with open(yearly_results_path, 'w') as yearly_results_file, \
                open(race_results_path, 'w') as race_results_file:
            yearly_results_file.write(YEARLY_RESULTS_HEADER)
            race_results_file.write(PLAYER_RESULTS_HEADER)

            player_total: int = len(driver.find_elements_by_xpath(player_link_xpath))

            for player_count in range(player_total):
                # Open the player's page. The links are looked up again on
                # every iteration because the browser has left and reloaded
                # the listing since the previous lookup.
                driver.find_elements_by_xpath(player_link_xpath)[player_count].click()
                sleep(1)
                soup = BeautifulSoup(driver.page_source, 'html.parser')

                # Columns shared by every row written for this player.
                player_basic_data: list[str] = get_player_basic_data(soup)
                player_prefix: str = ','.join(player_basic_data)

                # Players whose page shows a DB_DataNone block get no rows.
                if not driver.find_elements_by_xpath('//div[@class="DB_DataNone"]'):
                    # Results-by-year page.
                    driver.find_element_by_link_text('成績').click()
                    sleep(1)
                    soup = BeautifulSoup(driver.page_source, 'html.parser')
                    yearly_results: dict[str, list[str]] = get_yearly_resluts(soup)

                    # Recent-results page.
                    driver.find_element_by_link_text('近況成績').click()
                    sleep(1)
                    soup = BeautifulSoup(driver.page_source, 'html.parser')
                    race_results: dict[str, list[str]] = get_race_results(soup)

                    for year, result in yearly_results.items():
                        yearly_results_file.write(
                            player_prefix + ',' + year + ',' + ','.join(result) + '\n'
                        )

                    for race_name, result in race_results.items():
                        race_results_file.write(
                            player_prefix + ',' + race_name + ',' + ','.join(result) + '\n'
                        )

                # Back to the search-result page for the next player.
                driver.get(current_url)
                sleep(1)

        # Move on to the next search-result page, unless this was the last one.
        if page_num < end_page_num:
            driver.find_element_by_link_text('次へ').click()
            sleep(1)
            current_url = driver.current_url

except Exception:
    # Best effort: report where the scrape stopped and keep the kernel usable.
    import traceback
    print('エラー発生!')
    print(driver.current_url)
    traceback.print_exc()

100%|██████████| 135/135 [2:21:05<00:00, 62.70s/it]


In [48]:
# End the browser session held by `driver` once scraping is finished.
driver.quit()