In [43]:
import csv
import os
import re
from time import sleep

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options

In [44]:
# Path to the ChromeDriver executable handed to Selenium.
DRIVER_SITE = '/Applications/chromedriver'
# Player search-result page where the scrape starts.
ORIGINAL_URL = 'https://keirin.netkeiba.com/db/search_result/player.html?word=&grad=&pref=&age=&sex=&class=&act%5B%5D=1&sort=1&submit='
# Directory the CSV files are written to.
# NOTE(review): absolute, machine-specific path -- adjust before running on another machine.
CURRENT_DIR = '/Users/yamazakiyuuta/Library/Mobile Documents/com~apple~CloudDocs/Product/keirin'

# Header lines use a bare ',' between fields, the same separator the data rows
# are written with, so that parsed column names carry no leading spaces.
YEARLY_RESULTS_FILENAME = 'yearly_results.csv'
YEARLY_RESULTS_HEADER = (
    '選手名,年齢,性別,身長,体重,級班,脚質,登録番号,'
    '年度,1着,2着,3着,4着〜,出走回数,優勝回数,勝率,2連対率,3連対率\n'
)
PLAYER_RESULTS_FILENAME = 'player_results.csv'
PLAYER_RESULTS_HEADER = (
    '選手名,年齢,性別,身長,体重,級班,脚質,登録番号,'
    'レース名,着順1,着順2,着順3,着順4\n'
)

In [45]:
# Start Chrome with the --headless flag and open the player search page.
options = Options()
options.add_argument('--headless')
# Use the configured driver path rather than repeating the literal here.
driver = webdriver.Chrome(DRIVER_SITE, options=options)
driver.get(ORIGINAL_URL)

In [46]:
def get_player_basic_data(soup):
    """Extract a player's basic profile fields from a parsed player page.

    Args:
        soup: Parsed player page; must support ``find(class_=...)`` and
            ``find_all(class_=...)`` (e.g. a BeautifulSoup document).

    Returns:
        list[str]: name, age, sex, height, weight, class, leg type and
        registration number, in that order. Weight is ``''`` when the
        height/weight cell contains no ``/`` separator.

    Raises:
        AttributeError: if the page has no ``ProfileHeader_box_name`` element.
    """
    # Player name, with the parenthesised part removed.
    # The raw string keeps ``\(`` from being read as an invalid string escape.
    name_text = soup.find(class_='ProfileHeader_box_name').get_text(strip=True)
    player_data = [re.sub(r'\(.*\)', '', name_text)]

    # Age and sex are the 1st and 5th header data cells.
    for i, data in enumerate(soup.find_all(class_='ProfileHeader_box_dataText')):
        if i in (0, 4):
            player_data.append(data.get_text(strip=True))

    # Height, weight, class, leg type and registration number.
    for i, data in enumerate(soup.find_all(class_='profile_list_data')):
        if i == 0:
            # The first cell holds "height/weight"; tolerate a missing
            # separator so the row keeps its column count.
            height_weight = data.get_text(strip=True).split('/')
            player_data.append(height_weight[0])
            player_data.append(height_weight[1] if len(height_weight) > 1 else '')
        elif i in (3, 5, 6):
            player_data.append(data.get_text(strip=True))

    return player_data

In [47]:
def get_yearly_resluts(soup):
    """Collect the per-year result rows from a parsed results page.

    Args:
        soup: Parsed page; must support ``find`` / ``find_all`` lookups by
            class (e.g. a BeautifulSoup document).

    Returns:
        dict[str, list[str]]: maps each year label to the stripped cell texts
        of the table row at the same position.

    Raises:
        IndexError: if the table has more rows than there are year labels.
    """
    # Year labels, in document order.
    year_labels = [
        label.get_text(strip=True)
        for label in soup.find_all(class_ = 'detail_table_tbodyInner')
    ]

    # Rows of the by-year results table; row N belongs to year label N.
    table = soup.find(class_ = 'mode_detail_table table_slide_body ResultsByYears')
    rows = table.find(class_ = 'detail_table_tbody').find_all('tr')

    results_by_year = {}
    for position, row in enumerate(rows):
        cells = [cell.get_text(strip=True) for cell in row.find_all('td')]
        results_by_year[year_labels[position]] = cells

    return results_by_year

In [48]:
def get_race_results(soup):
    """Collect the recent race results from a parsed recent-results page.

    Args:
        soup: Parsed page; must support ``find_all(callable)`` and
            ``find(id=...)`` (e.g. a BeautifulSoup document).

    Returns:
        dict[str, list[str]]: maps each race name to the cell texts of the
        table row at the same position, with full-width spaces removed.

    Raises:
        IndexError: if the table has more rows than there are race names.
    """
    def _is_race_name_cell(tag):
        # Only tags whose class list is exactly this single class qualify.
        return tag.get('class') == ['detail_table_tbodyInner']

    # Race names, in document order.
    race_names = [
        cell.get_text(strip=True) for cell in soup.find_all(_is_race_name_cell)
    ]

    # Pair every row of the result table with the race name at the same index.
    rows = soup.find(id = 'view-result').find_all('tr')
    results_by_race = {}
    for position, row in enumerate(rows):
        placings = [
            cell.get_text(strip=True).replace('\u3000', '')
            for cell in row.find_all('td')
        ]
        results_by_race[race_names[position]] = placings

    return results_by_race

In [49]:
# Seconds to pause after each click so the next page can finish loading.
PAGE_LOAD_WAIT_SECONDS = 3
# Number of search-result pages to scrape. The previous loop condition,
# `(n := n + 1) < 2`, processed exactly one page; raise this to scrape more.
MAX_PAGES = 1
# Links to the individual player pages on a search-result page.
PLAYER_LINK_XPATH = '//ul[@class="playerList CommonList_01"]/li/a'


def _click_and_parse(element):
    """Click ``element``, wait for the page to load, and return the parsed page."""
    element.click()
    sleep(PAGE_LOAD_WAIT_SECONDS)
    return BeautifulSoup(driver.page_source, 'html.parser')


def _write_player_rows(writer, player_basic_data, results):
    """Write one CSV row per ``results`` entry, prefixed with the player's basic data."""
    for key, values in results.items():
        writer.writerow([*player_basic_data, key, *values])


yearly_results_path = os.path.join(CURRENT_DIR, YEARLY_RESULTS_FILENAME)
race_results_path = os.path.join(CURRENT_DIR, PLAYER_RESULTS_FILENAME)

# Reload the first search page so that re-running this cell starts from the
# same place regardless of where the browser was left.
current_url = ORIGINAL_URL
driver.get(current_url)

# `with` closes both files even if the second open() or the scrape fails.
# newline='' is what the csv module expects; rows still end with '\n'.
with open(yearly_results_path, 'w', encoding='utf-8', newline='') as yearly_results_file, \
        open(race_results_path, 'w', encoding='utf-8', newline='') as race_results_file:
    yearly_results_file.write(YEARLY_RESULTS_HEADER)
    race_results_file.write(PLAYER_RESULTS_HEADER)

    # csv.writer quotes fields that contain commas or quotes, which a plain
    # ','.join would silently turn into extra columns.
    yearly_writer = csv.writer(yearly_results_file, lineterminator='\n')
    race_writer = csv.writer(race_results_file, lineterminator='\n')

    for page_number in range(1, MAX_PAGES + 1):
        player_total = len(driver.find_elements_by_xpath(PLAYER_LINK_XPATH))

        for player_count in range(player_total):
            try:
                # Re-locate the links on every iteration, as before; presumably
                # handles obtained before navigating away are no longer usable.
                player_list = driver.find_elements_by_xpath(PLAYER_LINK_XPATH)

                # Player page: basic data.
                soup = _click_and_parse(player_list[player_count])
                player_basic_data = get_player_basic_data(soup)

                # Detail page: results by year.
                soup = _click_and_parse(driver.find_element_by_link_text('もっと見る'))
                yearly_results = get_yearly_resluts(soup)

                # Recent-results page.
                soup = _click_and_parse(driver.find_element_by_link_text('近況成績'))
                race_results = get_race_results(soup)
            except NoSuchElementException as error:
                # A player page without the expected links is skipped instead of
                # being reported as the end of the whole scrape.
                print(f'Skipped player {player_count + 1} on page {page_number}: {error.msg}')
            else:
                # Rows are written only once all three pages were scraped, so a
                # skipped player leaves no partial data behind.
                _write_player_rows(yearly_writer, player_basic_data, yearly_results)
                _write_player_rows(race_writer, player_basic_data, race_results)

            # Back to the current search-result page.
            driver.get(current_url)

        # Move on to the next search-result page; no "next" link means the
        # last page has been processed.
        try:
            next_link = driver.find_element_by_link_text('次へ')
        except NoSuchElementException:
            print('Finish!')
            break
        next_link.click()
        sleep(PAGE_LOAD_WAIT_SECONDS)
        current_url = driver.current_url

In [50]:
# Shut down the browser session created by webdriver.Chrome earlier in the notebook.
driver.quit()