In [20]:
import concurrent.futures
import datetime
import re
import time

import numpy as np
import pandas as pd
import tqdm
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

# Example of how a race ID maps to a page URL (the first two characters are dropped):
# ID = '202205050307'
# ID_for_URL = ID[2:]
# URL = f'http://jiro8.sakura.ne.jp/index.php?code={ID_for_URL}'

# XPath template for one <tr> of the index table; {NUM} is the 1-based row number.
INDEX_X_PATH = '/html/body/table[3]/tbody/tr/td[3]/table[3]/tbody/tr[{NUM}]'

# Rows to read: five groups of four consecutive rows, the groups starting 16 rows
# apart (33-36, 49-52, 65-68, 81-84, 97-100).
INDEX_X_PATHS = [
    INDEX_X_PATH.format(NUM = group_start + offset)
    for group_start in range(33, 98, 16)
    for offset in range(4)
]

# One column label per row in INDEX_X_PATHS, in the same order:
# four index kinds for each of the five groups.
RETURN_DF_COLS = [
    f'{kind}_index_{group}'
    for group in range(1, 6)
    for kind in ('lead', 'pase', 'up', 'speed')
]

def launch_chrome_driver(is_headless = False):
    """Start a Chrome WebDriver session.

    Args:
        is_headless: When truthy, Chrome is started with ``--headless``.

    Returns:
        The ``webdriver.Chrome`` instance. The caller is responsible for
        shutting it down.

    NOTE(review): the driver path is passed positionally, which looks like the
    Selenium 3 calling convention; confirm against the installed Selenium version.
    """
    chrome_options = webdriver.ChromeOptions()
    # '--no-sandbox' is always applied; '--headless' precedes it when requested.
    flags = ['--headless', '--no-sandbox'] if is_headless else ['--no-sandbox']
    for flag in flags:
        chrome_options.add_argument(flag)
    return webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options)

def clean_data(data:str) -> list:
    """Split ``data`` on single space characters and return the pieces.

    Consecutive spaces produce empty strings in the result, because the
    separator is passed explicitly.
    """
    return data.split(' ')

def delete_extra_str(hourse_number):
    """Return ``hourse_number`` without its final element.

    Works on any sliceable sequence; an empty input yields an empty result.
    """
    without_last = hourse_number[:-1]
    return without_last

def get_horse_number(driver):
    """Read the horse-number row from the currently loaded page.

    Args:
        driver: A Selenium WebDriver that has already loaded the race page.

    Returns:
        list[str]: The space-separated tokens of the row's text, with the last
        token removed (presumably a trailing label rather than a horse number;
        verify against the page).

    Raises:
        NoSuchElementException: If the row is not present on the page.
    """
    # find_element(By.XPATH, ...) replaces find_element_by_xpath, which is
    # deprecated in Selenium 4 and removed from 4.3 onwards.
    raw_data = driver.find_element(
        By.XPATH, '/html/body/table[3]/tbody/tr/td[3]/table[3]/tbody/tr[2]'
    ).text
    return delete_extra_str(clean_data(raw_data))

def get_index_list(driver, horse_number):
    """Collect the text of every index cell for every horse.

    For each row path in ``INDEX_X_PATHS`` the cells ``td[1]`` to
    ``td[len(horse_number)]`` are read, so the result is row-major:
    all horses for the first row, then all horses for the second row, and so on.

    Args:
        driver: A Selenium WebDriver that has already loaded the race page.
        horse_number: Sequence whose length is the number of cells to read per row.

    Returns:
        list[str]: ``len(INDEX_X_PATHS) * len(horse_number)`` cell texts.

    Raises:
        NoSuchElementException: If any expected cell is missing.
    """
    cell_count = len(horse_number)
    index_list = []
    for row_path in INDEX_X_PATHS:
        for col in range(1, cell_count + 1):
            # Keep the located cell in its own variable: the original rebound
            # `driver` to the WebElement, so later lookups ran on a cell element.
            cell = driver.find_element(By.XPATH, f'{row_path}/td[{col}]')
            index_list.append(cell.text)
    return index_list

def convert_list_to_df(index_list, horse_number):
    """Reshape the flat, row-major ``index_list`` into a DataFrame.

    The frame has ``len(horse_number)`` columns and as many rows as are needed
    to hold every value in ``index_list``.
    """
    n_cols = len(horse_number)
    n_rows = int(len(index_list) / n_cols)
    grid = np.array(index_list).reshape(n_rows, n_cols)
    return pd.DataFrame(grid)
    
def add_columns(df):
    """Label the columns of ``df`` with ``RETURN_DF_COLS`` and return it.

    The assignment relabels ``df`` in place, so the frame passed in by the
    caller is modified and the same object is returned.
    """
    df.columns = RETURN_DF_COLS
    return df

def get_index_df(id:str):
    """Scrape one race page and return its index values, one row per horse.

    Args:
        id: Race ID string. The first two characters are dropped to form the
            ``code`` query parameter of the page URL.

    Returns:
        pandas.DataFrame: One row per entry of the scraped horse-number list,
        with the ``RETURN_DF_COLS`` columns plus ``horse_number`` and ``race_id``.

    Raises:
        Exception: Whatever Selenium or the reshaping step raises (for example
        when the page has no such table). The browser is shut down first.
    """
    id_for_url = id[2:]
    url = f'http://jiro8.sakura.ne.jp/index.php?code={id_for_url}'
    driver = launch_chrome_driver(True)
    try:
        driver.get(url)
        horse_number = get_horse_number(driver)
        indexs = get_index_list(driver, horse_number)
    finally:
        # quit() ends the whole session and its driver process; close() only
        # closes the window, and was skipped entirely when scraping raised.
        driver.quit()
    # convert_list_to_df yields one row per scraped table row; transpose so that
    # each horse becomes a row and each index kind a column.
    df = convert_list_to_df(indexs, horse_number).T
    df = add_columns(df)
    df['horse_number'] = horse_number
    df['race_id'] = id
    return df

In [21]:
# Each candidate race ID is the concatenation seireki + spot + num + date + race.
seireki_list = [str(year) for year in range(2010, 2024)]
# Venue codes: 01: Sapporo, 02: Hakodate, 03: Fukushima, 04: Niigata, 05: Tokyo,
# 06: Nakayama, 07: Chukyo, 08: Kyoto, 09: Hanshin, 10: Kokura
spot_list = [f'{code:02d}' for code in range(1, 11)]
num_list = [f'{code:02d}' for code in range(1, 8)]
date_list = [f'{code:02d}' for code in range(1, 15)]
race_list = [f'{code:02d}' for code in range(1, 13)]

# Full cartesian product, with the rightmost field varying fastest.
id_list = [
    seireki + spot + num + date + race
    for seireki in seireki_list
    for spot in spot_list
    for num in num_list
    for date in date_list
    for race in race_list
]

In [22]:
# Number of candidate race IDs built by the nested loops over year/venue/num/date/race.
len(id_list)

164640

In [23]:
# Scrape a sample of race IDs in parallel and stack the per-race frames.
target_ids = id_list[:10]  # only the first 10 candidate IDs are fetched here
race_frames = []
faild_list = []  # IDs whose scrape raised an exception
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(get_index_df, race_id) for race_id in target_ids]
    # Pair every future with the ID it was submitted for, using the same slice
    # on both sides so the pairing cannot drift if the slice changes.
    for race_id, future in zip(target_ids, futures):
        try:
            race_frames.append(future.result())
        except Exception as e:
            print(e)
            faild_list.append(race_id)

# Concatenate once at the end instead of growing the frame inside the loop.
if race_frames:
    all_df = pd.concat(race_frames)
else:
    # Nothing succeeded: keep an empty frame with the columns later cells expect.
    all_df = pd.DataFrame(columns=RETURN_DF_COLS + ['horse_number', 'race_id'])

馬の番号：['11', '10', '9', '8', '7', '6', '5', '4', '3', '2', '1']ID: 1001010101
馬の番号：['12', '11', '10', '9', '8', '7', '6', '5', '4', '3', '2', '1']ID: 1001010108
馬の番号：['11', '10', '9', '8', '7', '6', '5', '4', '3', '2', '1']ID: 1001010107
馬の番号：['12', '11', '10', '9', '8', '7', '6', '5', '4', '3', '2', '1']ID: 1001010102
馬の番号：['10', '9', '8', '7', '6', '5', '4', '3', '2', '1']ID: 1001010110
馬の番号：['16', '15', '14', '13', '12', '11', '10', '9', '8', '7', '6', '5', '4', '3', '2', '1']ID: 1001010106
馬の番号：['12', '11', '10', '9', '8', '7', '6', '5', '4', '3', '2', '1']ID: 1001010104
馬の番号：['13', '12', '11', '10', '9', '8', '7', '6', '5', '4', '3', '2', '1']ID: 1001010105
馬の番号：['13', '12', '11', '10', '9', '8', '7', '6', '5', '4', '3', '2', '1']ID: 1001010103
馬の番号：['16', '15', '14', '13', '12', '11', '10', '9', '8', '7', '6', '5', '4', '3', '2', '1']ID: 1001010109


In [24]:
# Distinct race IDs present in the combined frame.
all_df['race_id'].unique()

array(['201001010101', '201001010102', '201001010103', '201001010104',
       '201001010105', '201001010106', '201001010107', '201001010108',
       '201001010109', '201001010110'], dtype=object)

In [25]:
# IDs appended in the except branch of the scraping loop, i.e. scrapes that raised.
faild_list

[]

In [26]:
# Column labels of the combined frame.
all_df.columns

Index(['lead_index_1', 'pase_index_1', 'up_index_1', 'speed_index_1',
       'lead_index_2', 'pase_index_2', 'up_index_2', 'speed_index_2',
       'lead_index_3', 'pase_index_3', 'up_index_3', 'speed_index_3',
       'lead_index_4', 'pase_index_4', 'up_index_4', 'speed_index_4',
       'lead_index_5', 'pase_index_5', 'up_index_5', 'speed_index_5',
       'horse_number', 'race_id'],
      dtype='object')

In [31]:
# Put the identifying columns first, make horse_number numeric so it sorts
# numerically rather than as text, then order rows by race and horse number.
columns = ['race_id', 'horse_number', *RETURN_DF_COLS]
all_df = (
    all_df[columns]
    # assign() works on the new frame, avoiding the SettingWithCopyWarning that
    # assigning a column on the result of all_df[columns] triggers.
    .assign(horse_number=lambda frame: frame['horse_number'].astype(int))
    .sort_values(['race_id', 'horse_number'], ascending=[True, True])
)
all_df.head(20)

Unnamed: 0,race_id,horse_number,lead_index_1,pase_index_1,up_index_1,speed_index_1,lead_index_2,pase_index_2,up_index_2,speed_index_2,...,up_index_3,speed_index_3,lead_index_4,pase_index_4,up_index_4,speed_index_4,lead_index_5,pase_index_5,up_index_5,speed_index_5
10,201001010101,1,-22.58,-18.58,-12.4,49.01,,,,,...,,,,,,,,,,
9,201001010101,2,-47.75,-51.75,-2.92,25.32,,,,,...,,,,,,,,,,
8,201001010101,3,-19.53,-12.53,-15.06,52.39,-25.22,-18.22,-9.37,52.39,...,,,,,,,,,,
7,201001010101,4,-54.09,-58.09,1.41,23.32,,,,,...,,,,,,,,,,
6,201001010101,5,-38.99,-34.99,-12.86,32.14,,,,,...,,,,,,,,,,
5,201001010101,6,-6.73,0.26,-17.91,62.35,-29.49,-22.49,-27.86,29.63,...,,,,,,,,,,
4,201001010101,7,-25.44,-18.44,-34.79,26.75,-22.38,-15.38,-10.79,53.81,...,,,,,,,,,,
3,201001010101,8,-41.97,-45.97,-7.93,26.09,,,,,...,,,,,,,,,,
2,201001010101,9,-15.26,-8.26,-2.26,69.46,,,,,...,,,,,,,,,,
1,201001010101,10,-27.41,-23.41,2.08,58.67,,,,,...,,,,,,,,,,


In [None]:
# import multiprocessing
# import pandas as pd


# seireki_list = [str(s) for s in list(range(2010, 2024))]
# # 01: Sapporo, 02: Hakodate, 03: Fukushima, 04: Niigata, 05: Tokyo, 06: Nakayama, 07: Chukyo, 08: Kyoto, 09: Hanshin, 10: Kokura
# spot_list = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10']
# num_list = ['01', '02', '03', '04', '05', '06', '07']
# date_list = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', '13', '14']
# race_list = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
# id_list = []
# for seireki in seireki_list:
#     for spot in spot_list:
#         for num in num_list:
#             for date in date_list:
#                 for race in race_list:
#                     id_list.append(seireki + spot + num + date + race)


# def scrape(page_num):
#     """Scrape a page and return the result as a DataFrame."""
#     # Perform the scraping
#     result = ...
#     # Convert the scraping result to a DataFrame
#     df = pd.DataFrame(result)
#     return df

# if __name__ == '__main__':
#     # Create a pool for scraping in parallel
#     with multiprocessing.Pool(processes=4) as pool:
#         # Build the list of page numbers to scrape
#         page_nums = list(range(1, 101))
#         # Create a list to hold the scraping result from each page
#         df_list = []
#         # Run the scraping in parallel
#         for df in pool.imap_unordered(scrape, page_nums):
#             # Append the scraping result to the list
#             df_list.append(df)
