In [9]:
import datetime
import time
import re
import numpy as np
import pandas as pd
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import WebDriverException
import tqdm
import concurrent.futures

# Example of how a race ID maps to a page: the ID '202205050307' is requested
# as http://jiro8.sakura.ne.jp/index.php?code=2205050307 (the first two
# characters of the ID are dropped; see get_index_df).

# XPath template for one row of the table that is scraped.
INDEX_X_PATH = '/html/body/table[3]/tbody/tr/td[3]/table[3]/tbody/tr[{NUM}]'

# The scraped rows come in five groups of four consecutive rows; the first
# group starts at row 33 and each following group starts 16 rows later.
_FIRST_ROW = 33
_GROUP_STRIDE = 16
_GROUP_COUNT = 5
_INDEX_KINDS = ('lead', 'pase', 'up', 'speed')

INDEX_X_PATHS = [
    INDEX_X_PATH.format(NUM = _FIRST_ROW + _GROUP_STRIDE * group + offset)
    for group in range(_GROUP_COUNT)
    for offset in range(len(_INDEX_KINDS))
]

# One column name per scraped row, in the same order as INDEX_X_PATHS:
# '<kind>_index_<group>' with group running from 1 to 5.
RETURN_DF_COLS = [
    f'{kind}_index_{group}'
    for group in range(1, _GROUP_COUNT + 1)
    for kind in _INDEX_KINDS
]

def launch_chrome_driver(is_headless = False):
    """Start a Chrome WebDriver session and return it.

    Args:
        is_headless: When truthy, Chrome is started with ``--headless``.
            ``--no-sandbox`` is always passed.

    Returns:
        The ``webdriver.Chrome`` instance. The caller is responsible for
        shutting it down.
    """
    chrome_flags = ['--headless'] if is_headless else []
    chrome_flags.append("--no-sandbox")

    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    # The value returned by ChromeDriverManager().install() is handed to
    # webdriver.Chrome as its first positional argument.
    driver_location = ChromeDriverManager().install()
    return webdriver.Chrome(driver_location, options=chrome_options)

def clean_data(data:str) -> list:
    """Split ``data`` on single space characters and return the pieces.

    Consecutive spaces are not collapsed, so they produce empty strings in
    the result, and an empty input yields ``['']``.
    """
    separator = ' '
    return data.split(separator)

def delete_extra_str(hourse_number):
    """Return ``hourse_number`` without its final item.

    Works on anything that supports slicing (a list of tokens as well as a
    plain string); an empty input is returned empty.
    """
    last_position = -1
    return hourse_number[:last_position]

def get_horse_number(driver):
    """Read the tokens of table row ``tr[2]`` from the loaded page.

    The row's text is split on single spaces by ``clean_data`` and the last
    token is then dropped by ``delete_extra_str``.

    Args:
        driver: A WebDriver with the target page already loaded.

    Returns:
        List of the remaining text tokens (presumably one per horse, going
        by the function name -- confirm against the page layout).
    """
    header_row = driver.find_element_by_xpath('/html/body/table[3]/tbody/tr/td[3]/table[3]/tbody/tr[2]')
    return delete_extra_str(clean_data(header_row.text))

def get_index_list(driver, horse_number, xpaths=None):
    """Collect the text of every cell in the index rows of the loaded page.

    For each row XPath, the cells ``td[1]`` .. ``td[len(horse_number)]`` are
    looked up and their text appended, so the result is row-major: all cells
    of the first row, then all cells of the second row, and so on.

    Args:
        driver: A WebDriver with the target page already loaded.
        horse_number: Sequence whose length decides how many cells are read
            from each row.
        xpaths: Row XPaths to read. Defaults to the module-level
            ``INDEX_X_PATHS``.

    Returns:
        Flat list of cell texts with ``len(xpaths) * len(horse_number)``
        entries.

    Raises:
        NoSuchElementException: Propagated from the driver when a cell is
            missing.
    """
    if xpaths is None:
        xpaths = INDEX_X_PATHS

    column_count = len(horse_number)
    index_list = []
    for row_xpath in xpaths:
        for column in range(1, column_count + 1):
            # Keep the lookup result in its own name: rebinding `driver` to
            # the found element would make every later lookup run on a
            # WebElement instead of the driver.
            cell = driver.find_element_by_xpath(f'{row_xpath}/td[{column}]')
            index_list.append(cell.text)
    return index_list

def convert_list_to_df(index_list, horse_number):
    """Reshape the flat ``index_list`` into a DataFrame.

    The frame has ``len(horse_number)`` columns and
    ``len(index_list) / len(horse_number)`` rows, filled row by row.
    """
    column_count = len(horse_number)
    row_count = len(index_list) // column_count
    table = np.array(index_list).reshape(row_count, column_count)
    return pd.DataFrame(table)
    
def add_columns(df):
    """Label the columns of ``df`` with ``RETURN_DF_COLS`` and return it.

    The frame is modified in place; the returned object is the same ``df``
    that was passed in.
    """
    df.columns = list(RETURN_DF_COLS)
    return df

def get_index_df(id:str):
    """Scrape the index table for one race and return it as a DataFrame.

    The page ``index.php?code=<id without its first two characters>`` is
    opened in a headless Chrome session, the index rows are read, and the
    result is transposed so that each scraped row becomes one column named
    by ``RETURN_DF_COLS``. A ``race_id`` column holding ``id`` is added.

    Args:
        id: Race ID string; its first two characters are dropped to build
            the URL.

    Returns:
        DataFrame with the ``RETURN_DF_COLS`` columns plus ``race_id``.

    Raises:
        NoSuchElementException: Propagated when an expected element is not
            on the page.
    """
    url = f'http://jiro8.sakura.ne.jp/index.php?code={id[2:]}'
    chrome = launch_chrome_driver(True)
    try:
        chrome.get(url)
        horse_number = get_horse_number(chrome)
        indexs = get_index_list(chrome, horse_number)
    finally:
        # Always end the session, including when scraping raises; quit()
        # shuts down the whole driver session rather than just the window.
        chrome.quit()

    df = add_columns(convert_list_to_df(indexs, horse_number).T)
    df['race_id'] = id
    return df

In [10]:
# Years (Western calendar) covered by the generated race IDs.
seireki_list = [str(year) for year in range(2010, 2024)]
# Racecourse codes -- 01: Sapporo, 02: Hakodate, 03: Fukushima, 04: Niigata,
# 05: Tokyo, 06: Nakayama, 07: Chukyo, 08: Kyoto, 09: Hanshin, 10: Kokura
spot_list = [f'{code:02d}' for code in range(1, 11)]
# NOTE(review): presumably the meeting number and the day within the meeting
# -- confirm against the site's ID scheme.
num_list = [f'{code:02d}' for code in range(1, 8)]
date_list = [f'{code:02d}' for code in range(1, 15)]
# Race number within the day.
race_list = [f'{code:02d}' for code in range(1, 13)]

# Every combination, concatenated as year + spot + num + date + race, with
# the year varying slowest and the race number fastest.
id_list = [
    seireki + spot + num + date + race
    for seireki in seireki_list
    for spot in spot_list
    for num in num_list
    for date in date_list
    for race in race_list
]

In [19]:
# Smoke-test limit: only the first five IDs are scraped. This truncates
# id_list itself, so re-run the ID-generation cell to get the full list back.
id_list = id_list[:5]

faild_list = []   # race IDs whose page lacked an expected element
scraped_frames = []
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(get_index_df, race_id) for race_id in id_list]
    # Results are collected in submission order so each future can be paired
    # with the ID it was submitted for.
    for future, race_id in zip(futures, id_list):
        try:
            scraped_frames.append(future.result())
        except NoSuchElementException:
            faild_list.append(race_id)

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0 and
# re-copies the whole frame on every call. With nothing scraped, fall back to
# an empty frame that still carries the expected columns.
if scraped_frames:
    all_df = pd.concat(scraped_frames)
else:
    all_df = pd.DataFrame(columns = RETURN_DF_COLS)








In [17]:
# Show the combined index table collected by the scraping cell.
all_df

Unnamed: 0,lead_index_1,pase_index_1,up_index_1,speed_index_1,lead_index_2,pase_index_2,up_index_2,speed_index_2,lead_index_3,pase_index_3,...,speed_index_3,lead_index_4,pase_index_4,up_index_4,speed_index_4,lead_index_5,pase_index_5,up_index_5,speed_index_5,race_id
0,6.57,6.57,-5.42,81.15,21.92,21.92,2.03,103.96,-8.34,-8.34,...,88.43,-9.69,-5.69,3.69,78.0,-19.83,-19.83,18.87,79.04,0
1,16.73,16.73,-7.31,89.42,-14.12,-10.12,23.82,93.7,-15.72,-11.72,...,81.64,,,,,,,,,0
2,1.32,1.32,2.97,84.3,-27.49,-20.49,1.3,60.8,-25.06,-21.06,...,50.34,,,,,,,,,0
3,-6.96,-10.96,19.94,88.98,-59.41,-59.41,13.89,34.47,,,...,,,,,,,,,,0
4,-9.21,-13.21,-0.66,66.12,-31.6,-35.6,3.79,48.19,,,...,,,,,,,,,,0
5,-12.92,-8.92,-0.32,70.75,-9.59,-2.59,-8.13,69.26,-3.77,3.22,...,92.55,-8.35,-1.35,-7.19,71.45,-11.26,-4.26,-4.28,71.45,0
6,15.8,9.8,0.43,90.23,-17.74,-17.74,10.52,72.78,,,...,,,,,,,,,,0
7,-8.18,-8.18,16.56,88.38,-12.87,-18.87,11.96,73.09,,,...,,,,,,,,,,0
8,1.9,10.9,-23.46,67.43,10.23,19.23,-30.6,68.63,-18.15,-9.15,...,71.5,-28.9,-24.9,-7.31,47.77,,,,,0
9,19.85,19.85,7.22,107.07,-7.23,-11.23,4.66,73.43,-7.23,-11.23,...,71.61,-30.95,-34.95,8.31,53.36,,,,,0


In [18]:
# Show the race IDs recorded after a NoSuchElementException during scraping.
faild_list

['5956656469']