In [43]:
from bs4 import BeautifulSoup
import re
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from time import sleep

In [44]:
# Path to the chromedriver executable handed to webdriver.Chrome.
DRIVER_SITE = '/Applications/chromedriver'

# Search-result page the browser opens first.
ORIGINAL_URL = 'https://keirin.netkeiba.com/db/search_result/race.html?word=&start_year=none&start_mon=none&end_year=none&end_mon=none&jyo=&sort=1&submit='

# Directory the CSV output files are created in.
# NOTE(review): machine-specific absolute path; the notebook will not run
# unchanged on another machine.
CURRENT_DIR = '/Users/yamazakiyuuta/Library/Mobile Documents/com~apple~CloudDocs/Product/keirin'

RACE_RESULTS_FILENAME = 'race_results.csv'

# Column names of the race-results CSV, in output order.
RACE_RESULTS_COLUMNS = (
    '日付', '開催場所', 'ラウンド', 'グレード', 'レースグループ', 'レース名', '距離', '周回',
    '選手名', '着順', '枠番', '車番', '着差', '上り', '決', 'SB',
)

# Header line for the race-results CSV.  The columns are joined with a bare
# comma: the previous literal mixed ", " and "," separators, which left a
# leading space inside most column names while data rows elsewhere in this
# notebook are joined with ',' only.
RACE_RESULTS_HEADER = ','.join(RACE_RESULTS_COLUMNS) + '\n'

In [45]:
# Start Chrome through chromedriver and open the search-result page.
#
# The options object used to be built (with --headless) but never handed to
# the driver, so the flag had no effect.  The switch below makes that choice
# explicit; False keeps the previous behaviour of a visible browser window.
HEADLESS = False

options = Options()
if HEADLESS:
    options.add_argument('--headless')

driver = webdriver.Chrome(DRIVER_SITE, options=options)
driver.get(ORIGINAL_URL)

In [1]:
def get_date_and_venue(soup):
    """Collect (date, venue) pairs from every 'DataBox_01' element.

    For each element returned by ``soup.find_all(class_='DataBox_01')`` the
    stripped text of its ``p`` child is split on whitespace and must yield
    exactly two parts; any other count raises ``ValueError`` from the
    unpacking.

    Returns:
        list of ``(date, venue)`` string tuples, in document order.
    """

    def _split_pair(box):
        # Two-name unpacking enforces the "exactly two tokens" contract.
        date, venue = box.p.get_text(strip=True).split()
        return date, venue

    return [_split_pair(box) for box in soup.find_all(class_='DataBox_01')]

In [47]:
def get_yearly_resluts(soup):
    """Map each year label to the cell texts of its results row.

    Year labels are the stripped texts of all 'detail_table_tbodyInner'
    elements.  Rows are the ``tr`` elements inside the 'detail_table_tbody'
    element of the 'mode_detail_table table_slide_body ResultsByYears' table.
    Labels and rows are paired by position, so a row without a matching label
    raises ``IndexError``.

    Per the original comments the row cells hold the 1st/2nd/3rd/4th+ place
    counts, number of starts, number of wins and win/place rates.

    Returns:
        dict of ``{year_label: [cell_text, ...]}``.
    """
    year_labels = [
        cell.get_text(strip=True)
        for cell in soup.find_all(class_='detail_table_tbodyInner')
    ]

    table = soup.find(class_='mode_detail_table table_slide_body ResultsByYears')
    rows = table.find(class_='detail_table_tbody').find_all('tr')

    results_by_year = {}
    for position, row in enumerate(rows):
        cells = [td.get_text(strip=True) for td in row.find_all('td')]
        results_by_year[year_labels[position]] = cells

    return results_by_year

In [48]:
def get_race_results(soup):
    """Map each race name to the cell texts of its result row.

    Race names are the stripped texts of tags whose ``class`` attribute is
    exactly ``['detail_table_tbodyInner']`` (tags carrying additional classes
    are excluded).  Result rows are the ``tr`` elements under the element
    with id 'view-result'; each cell text is stripped and has ideographic
    spaces removed.  Names and rows are paired by position, so a row without
    a matching name raises ``IndexError``.

    Because the result is keyed by race name, a later row with a repeated
    name replaces the earlier one.

    Returns:
        dict of ``{race_name: [cell_text, ...]}``.
    """

    def _is_name_cell(tag):
        # Exact list comparison: rejects tags that have extra classes.
        return tag.get('class') == ['detail_table_tbodyInner']

    names = [cell.get_text(strip=True) for cell in soup.find_all(_is_name_cell)]

    rows = soup.find(id='view-result').find_all('tr')

    results_by_name = {}
    for position, row in enumerate(rows):
        cells = [
            td.get_text(strip=True).replace('\u3000', '')
            for td in row.find_all('td')
        ]
        results_by_name[names[position]] = cells

    return results_by_name

In [49]:
# Main scrape loop: walks the search-result pages with the shared `driver`
# and writes one yearly-results CSV and one race-results CSV into CURRENT_DIR.
#
# NOTE(review): this cell is mid-refactor.  YEARLY_RESULTS_FILENAME,
# YEARLY_RESULTS_HEADER, PLAYER_RESULTS_FILENAME, PLAYER_RESULTS_HEADER and
# get_player_basic_data() are not defined in any cell visible here and must
# exist in the kernel before this cell can run.  The date/race loops only
# navigate so far; nothing they collect (including `date_and_venue`) is
# written out yet.

# Number of search-result pages to process.  The previous loop condition,
# `(n := n + 1) < 2`, allowed exactly one pass.
MAX_PAGES = 1

# Link locators that have to be evaluated again after every navigation.
RACE_GROUP_LINKS_XPATH = '//ul[@class="CommonList_01"]/li/div/a'
RACE_DAY_LINKS_XPATH = '//div[@class="Tab_RaceDaySelect p00"]/ul/li/a'

current_url = ORIGINAL_URL

# Both files live in one `with` so they are closed on every exit path,
# including a failure while opening the second file.
with open(os.path.join(CURRENT_DIR, YEARLY_RESULTS_FILENAME), 'w') as yearly_results_file, \
        open(os.path.join(CURRENT_DIR, PLAYER_RESULTS_FILENAME), 'w') as race_results_file:
    yearly_results_file.write(YEARLY_RESULTS_HEADER)
    race_results_file.write(PLAYER_RESULTS_HEADER)

    try:
        for _ in range(MAX_PAGES):
            # Date and venue of each entry on the current search-result page.
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            date_and_venue = get_date_and_venue(soup)

            # Count the race-group links once; the elements themselves are
            # looked up again inside the loop.
            race_group_total = len(driver.find_elements_by_xpath(RACE_GROUP_LINKS_XPATH))

            # Process one race group per iteration.
            for race_group_count in range(race_group_total):
                # Elements located before a page load are stale afterwards
                # (each iteration ends with driver.get), so fetch them again.
                race_group_list = driver.find_elements_by_xpath(RACE_GROUP_LINKS_XPATH)

                # Go to the race-list page of this group.
                race_group_url = driver.current_url  # not used yet
                race_group_list[race_group_count].click()
                sleep(3)
                soup = BeautifulSoup(driver.page_source, 'html.parser')

                # Race-day tab links.
                date_total = len(driver.find_elements_by_xpath(RACE_DAY_LINKS_XPATH))

                # Process one race day per iteration.
                for date_count in range(date_total):
                    # The click below may reload the page, so re-locate the
                    # tabs instead of reusing earlier element references.
                    date_list = driver.find_elements_by_xpath(RACE_DAY_LINKS_XPATH)

                    # Go to the page of this race day.
                    date_and_race_url = driver.current_url  # not used yet
                    date_list[date_count].click()
                    sleep(3)

                    # Links to the race-detail pages.
                    race_list = driver.find_elements_by_xpath('//div[@class="RaceList_Main_Box"]/a')

                    # Process one race per iteration.
                    for race_count in range(len(race_list)):
                        # TODO: navigate to the race-detail page and collect
                        # its results.  The loop body was empty, which made
                        # the whole cell fail with IndentationError.
                        pass

                # Basic data.
                player_basic_data = get_player_basic_data(soup)

                # Go to the detail page.
                driver.find_element_by_link_text('もっと見る').click()
                sleep(3)
                soup = BeautifulSoup(driver.page_source, 'html.parser')

                # Yearly results.
                yearly_results = get_yearly_resluts(soup)

                # Go to the recent-results page.
                driver.find_element_by_link_text('近況成績').click()
                sleep(3)
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                # Recent results.
                race_results = get_race_results(soup)

                for year, result in yearly_results.items():
                    yearly_results_file.write(
                        ','.join(player_basic_data) +
                        ',' + year +
                        ',' + ','.join(result) + '\n'
                    )

                for race_name, result in race_results.items():
                    race_results_file.write(
                        ','.join(player_basic_data) +
                        ',' + race_name +
                        ',' + ','.join(result) + '\n'
                    )

                # Return to the search-result page.
                driver.get(current_url)

            # Move on to the next search-result page.
            driver.find_element_by_link_text('次へ').click()
            sleep(3)
            current_url = driver.current_url

    except NoSuchElementException:
        # Any missing link ends the run, not only a missing "next" link.
        print('Finish!')

In [50]:
# End the WebDriver session that was created in the driver-setup cell.
driver.quit()