# 1.レース結果のスクレイピング


## パッケージ


In [25]:
import pandas as pd
import requests
import io, re
from bs4 import BeautifulSoup as bs
from lxml import html
from tqdm import tqdm
import time
import os

## クラス


In [28]:
class RaceResult:
    """Scrape a single race-result page and hold it as a DataFrame.

    The page is fetched and parsed during construction; the outcome is stored
    in ``race_result`` and can be written out with the ``save_race_results_*``
    methods.
    """

    def __init__(self, url: str) -> None:
        self.url = url
        self.race_result = self.fetch_race_results(self.url)

    def fetch_race_results(self, url: str) -> pd.DataFrame:
        """Download a race-result page and return it as one DataFrame.

        The first HTML table on the page becomes the frame. Horse and jockey
        IDs are taken from the links inside the table whose ``summary`` is
        "レース結果", and race-level details (course type, direction,
        distance, weather, going, class, date, place, round) are written to
        every row.

        Args:
            url (str): URL of the race-result page.

        Raises:
            ValueError: If no race class could be determined from the page.
            IndexError: If one of the ``[0]`` lookups (first table, XPath
                results, ID digits) finds nothing.

        Returns:
            pd.DataFrame: Race result sorted by horse number ("馬番").
        """
        # Fetch the page
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
        }
        response = requests.get(url, headers=headers)
        response.encoding = "EUC-JP"
        # Keep the decoded page as a string and give each parser its own view.
        # A StringIO is a one-pass stream: sharing one object between
        # pd.read_html and BeautifulSoup leaves the second reader at
        # end-of-stream unless it is rewound.
        page_text = response.text
        results = pd.read_html(io.StringIO(page_text))[0]
        soup = bs(page_text, "html.parser")
        result_table = soup.find("table", attrs={"summary": "レース結果"})
        # Horse IDs: first run of digits of every link starting with /horse
        horse_id_list = [
            re.findall(r"[0-9]+", a["href"])[0]
            for a in result_table.find_all("a", attrs={"href": re.compile("^/horse")})
        ]
        # Jockey IDs: first run of digits of every link starting with /jockey
        jockey_id_list = [
            re.findall(r"[0-9]+", a["href"])[0]
            for a in result_table.find_all("a", attrs={"href": re.compile("^/jockey")})
        ]

        results["horse_id"] = horse_id_list
        results["jockey_id"] = jockey_id_list
        # Tidy up: strip spaces from the column names and order by horse number
        results.columns = results.columns.str.replace(" ", "")
        results = results.sort_values(by="馬番")
        results = results.reset_index(drop=True)

        # Race information (slash-separated line under the race title)
        lxml_data = html.fromstring(str(soup))
        raw_info1 = lxml_data.xpath(
            "//*[@id='main']/div/div/div/diary_snap/div/div/dl/dd/p/diary_snap_cut/span"
        )[0]
        raw_info_text1 = re.sub(r"\s", "", raw_info1.text)
        for item in raw_info_text1.split("/"):
            # Steeplechase: distance only, no direction
            if re.match(r"(障.*)[0-9]{,4}m", item):
                results["course_type"] = "障"
                results["course_way"] = "無"
                results["course_length"] = re.search(r"[0-9]{0,4}m", item).group()[:-1]
            # Turf or dirt: surface, direction and distance
            elif re.match(r"(芝|ダ)(右|左|直線).*[0-9]{,4}m", item):
                item_replace = item.replace(" ", "").replace("直線", "直")
                results["course_type"] = item_replace[0]
                results["course_way"] = item_replace[1]
                results["course_length"] = re.search(
                    r"[0-9]{0,4}m", item_replace
                ).group()[:-1]
            # Weather: last character of the item
            elif "天候:" in item:
                results["weather"] = item[-1]
            # Going; relies on course_type having been set by an earlier item
            elif "芝:" in item or "ダート:" in item:
                if any(results["course_type"] == "障"):
                    results["state_grass"] = item[2]
                    results["state_dirt"] = item[-1]
                elif any(results["course_type"] == "芝"):
                    results["state_grass"] = item[-1]
                    results["state_dirt"] = "無"
                elif any(results["course_type"] == "ダ"):
                    results["state_grass"] = "無"
                    results["state_dirt"] = item[-1]
        raw_info2 = lxml_data.xpath("//*[@id='main']/div/div/div/diary_snap/div/div/p")[
            0
        ]
        raw_info_text2 = raw_info2.text
        for item in raw_info_text2.split(" "):
            # Map the older prize-money class names onto the win-count names
            item = (
                item.replace("500万下", "1勝")
                .replace("1000万下", "2勝")
                .replace("1600万下", "3勝")
            )
            # Race class
            match_race_class = re.search(r"新馬|未勝利|1勝|2勝|3勝|オープン", item)
            if match_race_class:
                results["class"] = match_race_class.group()
            # Race date
            elif re.match(r"[0-9]{4}年[0-9]{,2}月[0-9]{,2}日", item):
                results["date"] = item
            # Venue: what remains after removing the "N回" and "N日目" parts
            elif re.match(r"[0-9]*回.*[0-9]*日目", item):
                text = re.sub(r"[0-9]*回", "", item)
                text = re.sub(r"[0-9]*日目", "", text)
                results["place"] = text
        race_name = lxml_data.xpath(
            '//*[@id="main"]/div/div/div/diary_snap/div/div/dl/dd/h1/text()'
        )[0]
        # A grade found in the race name overrides the class found above.
        # NOTE(review): the bare "L" alternative matches any race name that
        # contains the letter L - confirm this is intended.
        match_race_class = re.search(r"G1|G2|G3|L", race_name)
        if match_race_class:
            results["class"] = match_race_class.group()
        if "class" not in results.columns:
            # Dump the raw inputs so the unrecognised page can be inspected
            print(raw_info_text1.split("/"))
            print(raw_info_text2.split(" "))
            print(race_name)
            raise ValueError("row of class is not exist")
        # Round: the <dt> text with "R" and whitespace removed
        race_round = lxml_data.xpath(
            '//*[@id="main"]/div/div/div/diary_snap/div/div/dl/dt'
        )[0].text
        race_round = re.sub(r"[R\s\n]", "", race_round)
        results["round"] = race_round
        return results

    def save_race_results_pkl(self, path: str) -> None:
        """Write the race result to ``path`` as a pickle.

        Args:
            path (str): Destination file path.
        """
        self.race_result.to_pickle(path)

    def save_race_results_csv(self, path: str) -> None:
        """Write the race result to ``path`` as CSV.

        Args:
            path (str): Destination file path.
        """
        self.race_result.to_csv(path)


# url = "https://db.netkeiba.com/race/202209050411/"  # 芝
# url = "https://db.netkeiba.com/race/202207050803/"  # ダート
# url = "https://db.netkeiba.com/race/202206040801/"  # 障害
# fetch_race_results(url)

In [32]:
# Scrape a single race page and pickle the parsed result.
url = "https://db.netkeiba.com/race/201604020410/"
scraping = RaceResult(url)
scraping.save_race_results_pkl("../Raw-Data/Race-Results/2016/04020410.pkl")

## 実行


In [4]:
def get_all_files_in_directory(path):
    """
    Recursively collect every file found under ``path``.

    :param path: directory to search
    :return: list of file paths, each built as "<containing directory>/<file name>"
    """
    return [
        f"{folder}/{entry}"
        for folder, _subdirs, entries in os.walk(path)
        for entry in entries
    ]


def write_path(path: str, write_list: list[str]) -> None:
    """Append each entry of ``write_list`` to a text file, one entry per line.

    Args:
        path (str): File to append to.
        write_list (list[str]): Entries to write.
    """
    with open(path, "a") as f:
        f.writelines(f"{entry}\n" for entry in write_list)


def read_file(path: str) -> list[str]:
    """Read a text file and return its lines with newline characters removed.

    Args:
        path (str): File to read.

    Returns:
        list[str]: One entry per line of the file.
    """
    with open(path, "r") as f:
        raw_lines = f.readlines()
    return [line.replace("\n", "") for line in raw_lines]


# Count the files currently stored under the 2017 race-results directory.
len(get_all_files_in_directory("../Raw-Data/Race-Results/2017/"))

3455

In [5]:
# Race ID layout: year (4 digits) + racecourse ID + meeting number + day of meeting + round.
# Only the four two-digit fields after the year are generated here.
race_id_list = [
    f"{place:02d}{kai:02d}{day:02d}{r:02d}"
    for place in range(1, 11)
    for kai in range(1, 7)
    for day in range(1, 13)
    for r in range(1, 13)
]
len(race_id_list)

8640

In [6]:
year = os.listdir("../Raw-Data/Race-Results/")
# Sets give constant-time membership tests; each candidate path is looked up
# in both collections for every year / race ID combination.
exits_file = set(get_all_files_in_directory("../Raw-Data/Race-Results/"))
skip_files = set(read_file("../tmp/skip_file_list.txt"))
skip_files_add = []


def _flush_skip_files(force: bool = False) -> None:
    """Append the pending paths to the skip list on disk.

    Writes once more than 10 paths are pending, or whenever ``force`` is set
    and anything is pending. The in-memory set is updated with the same paths
    instead of re-reading the whole file.
    """
    if skip_files_add and (force or len(skip_files_add) > 10):
        write_path("../tmp/skip_file_list.txt", skip_files_add)
        skip_files.update(skip_files_add)
        skip_files_add.clear()


for i in year:
    for j in tqdm(race_id_list, desc=f"{i}"):
        pkl_path = f"../Raw-Data/Race-Results/{i}/{j}.pkl"
        # Already recorded as handled
        if pkl_path in skip_files:
            continue
        if pkl_path not in exits_file:
            try:
                time.sleep(1)
                scraping = RaceResult(f"https://db.netkeiba.com/race/{i}{j}/")
                scraping.save_race_results_pkl(pkl_path)
            except IndexError:
                # Treated as "nothing to scrape for this ID"; the path is
                # still recorded below so it is not requested again.
                pass
            except Exception:
                print(f"{i}/{j}")
                raise
        # Reached for existing files, fresh saves and IndexError skips alike
        skip_files_add.append(pkl_path)
        _flush_skip_files()

# Record the final partial batch, which the threshold alone would drop
_flush_skip_files(force=True)

2015: 100%|██████████| 8640/8640 [00:00<00:00, 28063.93it/s]
2016: 100%|██████████| 8640/8640 [00:00<00:00, 10090.36it/s]
2017: 100%|██████████| 8640/8640 [00:01<00:00, 6221.92it/s]
2018: 100%|██████████| 8640/8640 [00:01<00:00, 4506.42it/s]
2019: 100%|██████████| 8640/8640 [00:02<00:00, 3476.78it/s]
2020: 100%|██████████| 8640/8640 [1:40:48<00:00,  1.43it/s]
2021: 100%|██████████| 8640/8640 [3:29:33<00:00,  1.46s/it]  
2022: 100%|██████████| 8640/8640 [1:59:54<00:00,  1.20it/s]  
2023: 100%|██████████| 8640/8640 [3:28:54<00:00,  1.45s/it]  


In [5]:
skip_race = []
for i in tqdm(race_id_list):
    # Check and save must use the same path, otherwise races that were
    # already saved are never skipped.
    result_path = f"../Raw-Data/Race-Results/2022/{i}.pkl"
    if os.path.isfile(result_path):
        continue
    try:
        scraping = RaceResult(f"https://db.netkeiba.com/race/2022{i}/")
        scraping.save_race_results_pkl(result_path)
        time.sleep(1)
    except IndexError:
        # Treated as "nothing to scrape for this ID": remember it and move on
        time.sleep(1)
        skip_race.append(f"{i}")
        continue
    except Exception:
        print(i)
        raise
# Write the skipped race IDs, one per line (overwrites any previous file)
with open("../skip-race.txt", "w") as f:
    f.writelines(f"{race_id}\n" for race_id in skip_race)

100%|██████████| 8640/8640 [3:31:08<00:00,  1.47s/it]  


# 2.競走馬の過去成績


## パッケージ


In [1]:
import os
import time
import pandas as pd
import io

import requests
from tqdm import tqdm

## クラス


In [2]:
class HorseResult:
    def __init__(self, horse_path: str, save_path: str) -> None:
        self.race_result = pd.read_pickle(horse_path)
        self.horse_ids = self.race_result["horse_id"]
        self.save_path = save_path

    def scraping(self, horse_id: str) -> pd.DataFrame | None:
        """レース結果を取得する

        Args:
            horse_id (str): 騎手のID

        Raises:
            e: レース結果の取得できなかった場合はエラーを返す

        Returns:
            pd.DataFrame | None: レース結果
        """
        try:
            url = f"https://db.netkeiba.com/horse/{horse_id}"
            response = requests.get(url)
            response.encoding = "EUC-JP"
            html_string = io.StringIO(response.text)
            df = pd.read_html(html_string)[3]
            if df.columns[0] == "受賞歴":
                df = pd.read_html(html_string)[4]
            time.sleep(1)
            return df
        except Exception as e:
            print(horse_id)
            raise e

    def save(self, data: pd.DataFrame, name: str) -> None:
        """レース結果を保存する

        Args:
            data (pd.DataFrame): レース結果
            name (str): 保存するファイル名
        """
        if data is None:
            return
        df: pd.DataFrame = data
        df.to_pickle(f"{self.save_path}/{name}.pkl")

    def scrape_save(self) -> None:
        """馬の過去成績を取得して保存する"""
        for horse_id in self.horse_ids:
            if os.path.isfile(f"{self.save_path}/{horse_id}.pkl"):
                continue
            df = self.scraping(horse_id)
            self.save(df, horse_id)

## 実行


In [3]:
def get_all_files_in_directory(path):
    """
    Recursively collect every file found under ``path``.

    :param path: directory to search
    :return: list of file paths, each built as "<containing directory>/<file name>"
    """
    return [
        f"{folder}/{entry}"
        for folder, _subdirs, entries in os.walk(path)
        for entry in entries
    ]


def write_path(path: str, write_list: list[str]) -> None:
    """Append each entry of ``write_list`` to a text file, one entry per line.

    Args:
        path (str): File to append to.
        write_list (list[str]): Entries to write.
    """
    with open(path, "a") as f:
        f.writelines(f"{entry}\n" for entry in write_list)


def read_file(path: str) -> list[str]:
    """Read a text file and return its lines with newline characters removed.

    Args:
        path (str): File to read.

    Returns:
        list[str]: One entry per line of the file.
    """
    with open(path, "r") as f:
        raw_lines = f.readlines()
    return [line.replace("\n", "") for line in raw_lines]


file_list = get_all_files_in_directory("../Raw-Data/Race-Results/")
# A set keeps the per-file "already done?" test constant-time
did_race_list = set(read_file("../tmp/did_race_list.txt"))
race_list = []
save_path = "../Raw-Data/Horse-Results"
for filename in tqdm(file_list):
    if filename in did_race_list:
        continue
    horse_results = HorseResult(f"{filename}", save_path=save_path)
    horse_results.scrape_save()
    race_list.append(f"{filename}")
    # Record progress in batches of 10 race files
    if len(race_list) >= 10:
        write_path("../tmp/did_race_list.txt", race_list)
        did_race_list.update(race_list)
        race_list = []

# Record the final partial batch so those race files are not revisited
if race_list:
    write_path("../tmp/did_race_list.txt", race_list)
    did_race_list.update(race_list)
    race_list = []

100%|██████████| 31093/31093 [3:01:26<00:00,  2.86it/s]  


## 確認用


In [None]:
# Check: fetch the horse results for a single race file.
# The directory name was misspelled as "Horse-Resutls"; the read-back check
# elsewhere in this notebook loads from "../data/Horse-Results".
save_path = "../data/Horse-Results"
race_result_path = "../data/Race-Results/2022/01020305.pkl"
test = HorseResult(race_result_path, save_path=save_path)
test.scrape_save()

In [None]:
# Check: pick the past-results table for one horse.
url = "https://db.netkeiba.com/horse/2020101533"
response = requests.get(url)
response.encoding = "EUC-JP"
html_string = io.StringIO(response.text)
# Parse once; a second read_html call on the same StringIO would start at
# end-of-stream.
tables = pd.read_html(html_string)
df = tables[3]
# When the fourth table starts with an "受賞歴" column, take the fifth instead
if df.columns[0] == "受賞歴":
    df = tables[4]
df

In [18]:
# Load one saved horse-result pickle and display it.
test = pd.read_pickle("../data/Horse-Results/2020100678.pkl")
test

Unnamed: 0,日付,開催,天 気,R,レース名,映 像,頭 数,枠 番,馬 番,オ ッ ズ,...,着差,ﾀｲﾑ 指数,通過,ペース,上り,馬体重,厩舎 ｺﾒﾝﾄ,備考,勝ち馬 (2着馬),賞金
0,2023/09/16,4阪神3,晴,4,3歳以上1勝クラス,,18,8,18,18.4,...,0.6,**,14-14,34.1-34.9,34.3,442(-6),,,マイネルラッシュ,
1,2023/08/19,2札幌3,小雨,9,千歳特別(1勝クラス),,14,3,3,6.2,...,0.8,**,6-5-5,29.5-35.9,36.1,448(+4),,,ビジュノワール,
2,2023/07/23,1札幌2,晴,8,3歳以上1勝クラス,,11,6,6,3.5,...,0.8,**,5-5-5-4,35.3-34.8,34.9,444(+4),,,クレバーテースト,80.0
3,2023/05/13,1京都7,曇,9,あずさ賞(1勝クラス),,12,5,6,25.2,...,0.1,**,1-1-1-1,36.0-33.7,33.8,440(+4),,,リミットバスター,437.6
4,2023/04/22,1京都1,晴,9,あやめ賞(1勝クラス),,8,8,8,43.4,...,0.6,**,5-4,33.1-35.5,35.6,436(0),,,プッシュオン,272.6
5,2022/12/28,6阪神9,晴,6,2歳1勝クラス,,11,8,10,57.0,...,3.5,**,7-7-7-7,37.3-37.6,40.4,436(-4),,,ユティタム,
6,2022/12/04,6中京2,晴,9,こうやまき賞(1勝クラス),,12,2,2,10.3,...,0.9,**,4-5-4,35.2-35.3,35.8,440(+4),,,コレペティトール,
7,2022/09/03,2札幌7,晴,11,札幌2歳S(G3),,14,5,8,42.7,...,1.6,**,8-8-7-7,36.7-36.3,37.2,436(-4),,,ドゥーラ,
8,2022/08/13,2札幌1,曇,9,コスモス賞(OP),,9,5,5,7.0,...,1.0,**,8-8-7-6,36.8-36.2,36.2,440(+4),,,モリアーナ,240.0
9,2022/07/23,1札幌1,曇,1,2歳未勝利,,5,1,1,1.2,...,-0.1,**,2-2-2-2,38.3-36.1,35.9,436(0),,,(トーセンウォルト),520.0


# 3.血統情報


## パッケージ


In [1]:
import os
import time
import pandas as pd
import io
from bs4 import BeautifulSoup as bs
import pickle

import requests
from tqdm import tqdm

## クラス


In [5]:
class PedigreeInfo:
    def __init__(self, horse_path: str, save_path: str) -> None:
        self.race_result = pd.read_pickle(horse_path)
        self.horse_ids = self.race_result["horse_id"]
        self.save_path = save_path

    def scrape(self, horse_id: str) -> list[str] | None:
        """血統情報を取得する

        Args:
            horse_id (str): 馬のID

        Raises:
            e: 血統情報の取得ができなかった場合はエラーをそのまま返す

        Returns:
            list[str] | None: 血統情報
        """
        try:
            url = f"https://db.netkeiba.com/horse/{horse_id}"
            response = requests.get(url)
            response.encoding = "EUC-JP"
            html_string = io.StringIO(response.text)
            soup = bs(html_string, "html.parser")
            peds = []
            id_list = soup.find("table", attrs={"class": "blood_table"}).find_all("a")
            for i in id_list[0], id_list[4]:
                peds.append(i.get("href").replace("/horse/ped/", "").replace("/", ""))
            peds
            time.sleep(1)
            return peds
        except Exception as e:
            print(horse_id)
            raise e

    def save(self, data: list[str] | None, name: str) -> None:
        """血統情報を保存する

        Args:
            data (list[str]): 血統情報
            name (str): 保存するファイル名
        """
        if data is None:
            return
        with open(f"{self.save_path}/{name}.pickle", "wb") as f:
            pickle.dump(data, f)

    def scrape_save(self) -> None:
        for horse_id in self.horse_ids:
            # 既にファイルが存在しているなら飛ばす
            if os.path.isfile(f"{self.save_path}/{horse_id}.pickle"):
                continue
            peds_list = self.scrape(horse_id)
            self.save(peds_list, horse_id)

## 実行


In [10]:
def get_all_files_in_directory(path):
    """
    Recursively collect every file found under ``path``.

    :param path: directory to search
    :return: list of file paths, each built as "<containing directory>/<file name>"
    """
    return [
        f"{folder}/{entry}"
        for folder, _subdirs, entries in os.walk(path)
        for entry in entries
    ]


def write_path(path: str, write_list: list[str]) -> None:
    """Append each entry of ``write_list`` to a text file, one entry per line.

    Args:
        path (str): File to append to.
        write_list (list[str]): Entries to write.
    """
    with open(path, "a") as f:
        f.writelines(f"{entry}\n" for entry in write_list)


def read_file(path: str) -> list[str]:
    """Read a text file and return its lines with newline characters removed.

    Args:
        path (str): File to read.

    Returns:
        list[str]: One entry per line of the file.
    """
    with open(path, "r") as f:
        raw_lines = f.readlines()
    return [line.replace("\n", "") for line in raw_lines]


file_list = get_all_files_in_directory("../Raw-Data/Race-Results/")
# A set keeps the per-file "already done?" test constant-time
did_peds_list = set(read_file("../tmp/did_peds_list.txt"))
ped_list = []
save_path = "../Raw-Data/Pedigree"
for filename in tqdm(file_list):
    # Skip race files that were already processed
    if filename in did_peds_list:
        continue
    try:
        pedigree = PedigreeInfo(f"{filename}", save_path=save_path)
        pedigree.scrape_save()
    except Exception:
        print(filename)
        raise
    # Record progress in batches of 10 race files. Without this the
    # did_peds_list file is only ever read, so every run re-opens every file.
    ped_list.append(f"{filename}")
    if len(ped_list) >= 10:
        write_path("../tmp/did_peds_list.txt", ped_list)
        did_peds_list.update(ped_list)
        ped_list = []

# Record the final partial batch
if ped_list:
    write_path("../tmp/did_peds_list.txt", ped_list)
    did_peds_list.update(ped_list)
    ped_list = []

100%|██████████| 31093/31093 [00:03<00:00, 8614.80it/s] 


## 確認


In [39]:
# Load one saved pedigree pickle and display its contents.
test_path = "../data/Pedigree/2020100678.pickle"
with open(test_path, "rb") as f:
    test = pickle.load(f)

test

['2011100655', '2002100816']

# 4.父・母父の過去成績


## パッケージ


In [20]:
import os
import time
import pandas as pd
import io
import pickle
from bs4 import BeautifulSoup as bs
import requests
from tqdm import tqdm

## クラス

In [2]:
class PedigreeResults:
    columns = [
        "日付",
        "開催",
        "天 気",
        "R",
        "レース名",
        "映 像",
        "頭 数",
        "枠 番",
        "馬 番",
        "オ ッ ズ",
        "人 気",
        "着 順",
        "騎手",
        "斤 量",
        "距離",
        "馬 場",
        "馬場 指数",
        "タイム",
        "着差",
        "ﾀｲﾑ 指数",
        "通過",
        "ペース",
        "上り",
        "馬体重",
        "厩舎 ｺﾒﾝﾄ",
        "備考",
        "勝ち馬 (2着馬)",
        "賞金",
    ]

    def __init__(self, horse_path: str, save_path: str) -> None:
        with open(horse_path, "rb") as f:
            self.horse_ids = pickle.load(f)
        self.save_path = save_path

    def df_process_sire_results(self, df: pd.DataFrame) -> pd.DataFrame:
        try:
            df.columns = ["_".join(col).strip() for col in df.columns.values]
            df = df[df["年度_年度"] != "累計"]
        except:
            df = None
        return df

    def scraping_result(self, horse_id: str) -> pd.DataFrame | None:
        """親馬の過去成績を取得する

        Args:
            horse_id (str): 親馬のID

        Raises:
            e: 親馬の過去成績の取得ができなかった場合はエラーをそのまま返す

        Returns:
            pd.DataFrame | None: 親馬の過去成績
        """
        try:
            url = f"https://db.netkeiba.com/horse/{horse_id}"
            response = requests.get(url)
            response.encoding = "EUC-JP"
            html_string = io.StringIO(response.text)
            df = pd.read_html(html_string)[3]
            # 年度代表馬のテーブルだったら取り直す
            if df.columns[0] != "日付":
                time.sleep(1)
                df = pd.read_html(html_string)[4]
                # それでも過去成績が取れなければ0埋めのデータとする
                if df.columns[0] != "日付":
                    df = pd.DataFrame(0, index=range(1), columns=self.columns)
            return df
        except Exception as e:
            print(horse_id)
            raise e

    def scraping_sire_results(self, horse_id: str) -> pd.DataFrame | None:
        try:
            url = f"https://db.netkeiba.com/?pid=horse_sire&id={horse_id}&course=1&mode=1&type=0"
            response = requests.get(url)

            response.encoding = "EUC-JP"
            soup = bs(response.text, "html.parser")
            target_element1 = soup.select_one('table[summary="産駒成績"]')
            target_element2 = soup.select_one('table[summary="成績"]')
            df_tmp1 = (
                pd.read_html(io.StringIO(str(target_element1)))[0]
                if target_element1 != None
                else None
            )
            df_tmp2 = (
                pd.read_html(io.StringIO(str(target_element2)))[0]
                if target_element2 != None
                else None
            )
            df = {
                "sire_results": self.df_process_sire_results(df_tmp1),
                "sire_results_BMS": self.df_process_sire_results(df_tmp2),
            }
        except Exception as e:
            print(horse_id)
            raise e
        return df

    def scraping_sire_course(self, horse_id: str) -> pd.DataFrame | None:
        try:
            url = f"https://db.netkeiba.com/?pid=horse_sire&id={horse_id}&course=1&mode=1&type=1"
            response = requests.get(url)
            response.encoding = "EUC-JP"
            html_string = io.StringIO(response.text)
            try:
                tables = pd.read_html(html_string)
            except:
                return [None, None]
            dfs = []
            first_loop = True
            for i in tables:
                tmp = i.copy()
                if tmp.iloc[0, 0] in ["輸入年", "供用開始年"]:
                    continue
                # マルチインデックスを1つにする
                tmp.columns = ["_".join(col).strip() for col in tmp.columns.values]
                # 累計の行は削除
                tmp = tmp[tmp["年度_年度"] != "累計"]
                # 初回ループはデータを格納する
                if first_loop:
                    first_loop = False
                    dfs.append(tmp)
                    continue
                # 次のブロックに移る
                elif tmp.columns[1] == "芝・良_1着":
                    dfs.append(tmp)
                    continue
                else:
                    dfs[-1] = pd.merge(dfs[-1], tmp, on="年度_年度", how="inner")
            # BeautifulSoupでパース
            soup = bs(response.text, "html.parser")
            # テーブルを含む要素がなければNoneを入れる
            target_element1 = soup.select_one('table[summary="産駒成績"]')
            target_element2 = soup.select_one('table[summary="成績"]')
            if target_element1 == None:
                dfs.insert(0, None)
            if target_element2 == None:
                dfs.append(None)
        except Exception as e:
            print(horse_id)
            raise e
        return dfs

    def scraping_sire_distance(self, horse_id: str) -> pd.DataFrame | None:
        try:
            url = f"https://db.netkeiba.com/?pid=horse_sire&id={horse_id}&course=1&mode=1&type=2"
            response = requests.get(url)
            response.encoding = "EUC-JP"
            html_string = io.StringIO(response.text)
            try:
                tables = pd.read_html(html_string)
            except:
                return [None, None]
            dfs = []
            first_loop = True
            for i in tables:
                tmp = i.copy()
                if tmp.iloc[0, 0] in ["輸入年", "供用開始年"]:
                    continue
                # マルチインデックスを1つにする
                tmp.columns = ["_".join(col).strip() for col in tmp.columns.values]
                # 累計の行は削除
                tmp = tmp[tmp["年度_年度"] != "累計"]
                # 全データ型をintに変換
                tmp = tmp.astype(int)
                # 初回ループはデータを格納する
                if first_loop:
                    first_loop = False
                    dfs.append(tmp)
                    continue
                # 次のブロックに移る
                elif tmp.columns[1] == "-1400(芝)_1着":
                    dfs.append(tmp)
                    continue
                else:
                    dfs[-1] = pd.merge(dfs[-1], tmp, on="年度_年度", how="inner")
            # BeautifulSoupでパース
            soup = bs(response.text, "html.parser")
            # テーブルを含む要素がなければNoneを入れる
            target_element1 = soup.select_one('table[summary="産駒成績"]')
            target_element2 = soup.select_one('table[summary="成績"]')
            if target_element1 == None:
                dfs.insert(0, None)
            if target_element2 == None:
                dfs.append(None)
        except Exception as e:
            print(horse_id)
            raise e
        return dfs

    def save(self, data: dict | None, name: str) -> None:
        """レース結果を保存する

        Args:
            data (dict): レース結果の辞書
            name (str): 保存するファイル名
        """
        if data is None:
            return
        with open(f"{self.save_path}/{name}.pickle", "wb") as f:
            pickle.dump(data, f)

    def scrape_save(self) -> None:
        for horse_id in self.horse_ids:
            # 既にファイルが作成されていればスルー
            if os.path.isfile(f"{self.save_path}/{horse_id}.pickle"):
                continue
            df_result = self.scraping_result(horse_id)
            time.sleep(1)
            df_sire_results = self.scraping_sire_results(horse_id)
            time.sleep(1)
            df_sire_course = self.scraping_sire_course(horse_id)
            time.sleep(1)
            df_sire_distance = self.scraping_sire_distance(horse_id)
            df = {
                "result": df_result,
                "sire_results": df_sire_results,
                "sire_course": df_sire_course,
                "sire_distance": df_sire_distance,
            }
            self.save(df, horse_id)

## メモ
保存ファイルの中身は次の構造の dict。  
`sire_results` は dict、`sire_course` と `sire_distance` は [産駒成績, 産駒成績(BMS)] の順に並んだ list で、該当するテーブルが無い場合は None が入る。  
```text
{  
    result：過去成績(従来と変わらず)  
    sire_results：{  
        sire_results：競走成績別の産駒成績,  
        sire_results_BMS：競走成績別の産駒成績(BMS)  
    },  
    sire_course：[  
        コース・馬場別の産駒成績,  
        コース・馬場別の産駒成績(BMS)  
    ],  
    sire_distance：[  
        距離別の産駒成績,  
        距離別の産駒成績(BMS)  
    ]  
}
```



## 確認


In [33]:
# Check: scrape and store the pedigree results for a single pedigree file.
dir_path = "../Raw-Data/Pedigree"
save_path = "../Raw-Data/Pedigree-Results"
# NOTE(review): dir_list is not used in this cell
dir_list = os.listdir("../Raw-Data/Pedigree/")
pedigree_results = PedigreeResults(f"{dir_path}/2013110035.pickle", save_path=save_path)
pedigree_results.scrape_save()

In [135]:
# Check: load one saved pedigree-results pickle and list its top-level keys.
# NOTE(review): test_path1 is defined but only test_path2 is opened
test_path1 = "../Raw-Data/Pedigree-Results/1994109686.pickle"
test_path2 = "../Raw-Data/Pedigree-Results/000a002040.pickle"
with open(test_path2, "rb") as f:
    test = pickle.load(f)
test.keys()

dict_keys(['result', 'sire_results', 'sire_course', 'sire_distance'])

### 競走成績別

In [22]:
def df_process(df: pd.DataFrame) -> pd.DataFrame:
    try:
        df.columns = ["_".join(col).strip() for col in df.columns.values]
        df = df[df["年度_年度"] != "累計"]
    except:
        df = None
    return df


# Check: progeny results page (type=0) for one horse.
url = "https://db.netkeiba.com/?pid=horse_sire&id=000a01372e&course=1&mode=1&type=0"
response = requests.get(url)

response.encoding = "EUC-JP"
# The earlier whole-page read_html attempt was removed: its result was unused
# and its fallback value was overwritten below.
soup = bs(response.text, "html.parser")
target_element1 = soup.select_one('table[summary="産駒成績"]')
target_element2 = soup.select_one('table[summary="成績"]')
df_tmp1 = (
    pd.read_html(io.StringIO(str(target_element1)))[0]
    if target_element1 is not None
    else None
)
# The BMS table comes from the second element; it was previously built from
# target_element1, which duplicated the first table.
df_tmp2 = (
    pd.read_html(io.StringIO(str(target_element2)))[0]
    if target_element2 is not None
    else None
)
df = {
    "sire_results": df_process(df_tmp1),
    "sire_results_BMS": df_process(df_tmp2),
}
df

<Response [400]>


{'sire_results': None, 'sire_results_BMS': None}

### コース・馬場別

In [15]:
# Check: progeny results by course and going (type=1) for one horse.
url = "https://db.netkeiba.com/?pid=horse_sire&id=000a01372e&course=1&mode=1&type=1"
response = requests.get(url)
response.encoding = "EUC-JP"
html_string = io.StringIO(response.text)
tables = pd.read_html(html_string)
dfs = []
for table in tables:
    block = table.copy()
    # Skip tables whose first cell is one of these labels
    if block.iloc[0, 0] in ["輸入年", "供用開始年"]:
        continue
    # Flatten the two-level column labels
    block.columns = ["_".join(col).strip() for col in block.columns.values]
    # Drop the "累計" (total) row
    block = block[block["年度_年度"] != "累計"]
    # The first usable table, or one starting a new block, is stored as is;
    # anything else is merged into the current block
    if not dfs or block.columns[1] == "芝・良_1着":
        dfs.append(block)
    else:
        dfs[-1] = pd.merge(dfs[-1], block, on="年度_年度", how="inner")

# Parse with BeautifulSoup and insert None for whichever summary table is absent
soup = bs(response.text, "html.parser")
target_element1 = soup.select_one('table[summary="産駒成績"]')
target_element2 = soup.select_one('table[summary="成績"]')
if target_element1 is None:
    dfs.insert(0, None)
if target_element2 is None:
    dfs.append(None)
dfs

[  年度_年度  芝・良_1着  芝・良_2着  芝・良_3着  芝・良_着外  芝・稍重_1着  芝・稍重_2着  芝・稍重_3着  芝・稍重_着外  \
 0  2022       0       0       0       0        0        0        0        0   
 1  2021       0       0       0       0        0        0        0        0   
 2  2020       0       0       0       0        0        0        0        0   
 3  2019       0       0       0       0        0        0        0        0   
 
    芝・重_1着  ...  障害・稍重_3着  障害・稍重_着外  障害・重_1着  障害・重_2着  障害・重_3着  障害・重_着外  \
 0       0  ...         0         0        0        0        0        0   
 1       0  ...         0         0        0        0        0        0   
 2       0  ...         0         0        0        0        0        0   
 3       0  ...         0         0        0        0        0        0   
 
    障害・不良_1着  障害・不良_2着  障害・不良_3着  障害・不良_着外  
 0         0         0         0         0  
 1         0         0         0         0  
 2         0         0         0         0  
 3         0         0         0         

### 距離別

In [17]:
# Check: progeny results by distance (type=2) for one horse.
url = "https://db.netkeiba.com/?pid=horse_sire&id=000a01372e&course=1&mode=1&type=2"
response = requests.get(url)
response.encoding = "EUC-JP"
html_string = io.StringIO(response.text)
tables = pd.read_html(html_string)
dfs = []
for table in tables:
    block = table.copy()
    # Skip tables whose first cell is one of these labels
    if block.iloc[0, 0] in ["輸入年", "供用開始年"]:
        continue
    # Flatten the two-level column labels
    block.columns = ["_".join(col).strip() for col in block.columns.values]
    # Drop the "累計" (total) row, then cast every column to int
    block = block[block["年度_年度"] != "累計"]
    block = block.astype(int)
    # The first usable table, or one starting a new block, is stored as is;
    # anything else is merged into the current block
    if not dfs or block.columns[1] == "-1400(芝)_1着":
        dfs.append(block)
    else:
        dfs[-1] = pd.merge(dfs[-1], block, on="年度_年度", how="inner")

# Parse with BeautifulSoup and insert None for whichever summary table is absent
soup = bs(response.text, "html.parser")
target_element1 = soup.select_one('table[summary="産駒成績"]')
target_element2 = soup.select_one('table[summary="成績"]')
if target_element1 is None:
    dfs.insert(0, None)
if target_element2 is None:
    dfs.append(None)
dfs

[   年度_年度  -1400(芝)_1着  -1400(芝)_2着  -1400(芝)_3着  -1400(芝)_着外  -1800(芝)_1着  \
 0   2022            0            0            0            0            0   
 1   2021            0            0            0            0            0   
 2   2020            0            0            0            0            0   
 3   2019            0            0            0            0            0   
 
    -1800(芝)_2着  -1800(芝)_3着  -1800(芝)_着外  -2200(芝)_1着  ...  -2600(ダート)_3着  \
 0            0            0            0            0  ...              0   
 1            0            0            0            0  ...              0   
 2            0            0            0            0  ...              0   
 3            0            0            0            0  ...              0   
 
    -2600(ダート)_着外  2600-(ダート)_1着  2600-(ダート)_2着  2600-(ダート)_3着  2600-(ダート)_着外  \
 0              0              0              0              0              0   
 1              0              0              0       

## 実行


In [3]:
# Scrape and store pedigree results for every pedigree file on disk.
dir_path = "../Raw-Data/Pedigree"
save_path = "../Raw-Data/Pedigree-Results"
dir_list = os.listdir("../Raw-Data/Pedigree/")
for filename in tqdm(dir_list):
    pedigree_file = f"{dir_path}/{filename}"
    try:
        pedigree_results = PedigreeResults(pedigree_file, save_path=save_path)
        pedigree_results.scrape_save()
    except Exception as e:
        # Show which file failed before re-raising
        print(filename)
        raise e

100%|██████████| 49160/49160 [53:31<00:00, 15.31it/s]   


# 5.騎手情報

## パッケージ

In [1]:
import os
import time
import pandas as pd
import io
import pickle
from bs4 import BeautifulSoup as bs
import requests
import re
from tqdm import tqdm

## クラス

In [13]:
class JockeyResults:
    """Scrape per-jockey result tables from db.netkeiba.com and cache them.

    Jockey IDs are read from the ``jockey_id`` column of a pickled DataFrame.
    Each jockey's result page is fetched and stored as one pickle file.
    """

    def __init__(self, read_path: str) -> None:
        """Load the jockey IDs and prepare the request headers.

        Args:
            read_path (str): Path to a pickle that ``pd.read_pickle`` loads
                into a DataFrame containing a ``jockey_id`` column.
        """
        self.read_path = read_path
        self.jockey_ids = self.get_jockey_ids(self.read_path)
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
        }

    def get_jockey_ids(self, path: str) -> list[str]:
        """Return the ``jockey_id`` column of the pickled DataFrame as a list.

        Args:
            path (str): Path to the pickled DataFrame.

        Returns:
            list[str]: Values of the ``jockey_id`` column, in row order.
        """
        df = pd.read_pickle(path)
        return df["jockey_id"].to_list()

    @staticmethod
    def _flatten_columns(table):
        """Join two-level column headers into single ``upper_lower`` names.

        Only MultiIndex headers are rewritten. A flat header is left as it
        is, because joining a plain string would insert ``_`` between its
        characters.

        Args:
            table (pd.DataFrame): Table whose columns are modified in place.

        Returns:
            pd.DataFrame: The same ``table`` object.
        """
        if isinstance(table.columns, pd.MultiIndex):
            table.columns = ["_".join(col).strip() for col in table.columns.values]
        return table

    def scrape_results(self, jockey_id: str) -> dict:
        """Fetch one jockey's result page and extract the name and table.

        Args:
            jockey_id (str): Jockey ID used in the netkeiba URL.

        Raises:
            AttributeError: If the page has no ``div.db_head_name.fc > h1``
                heading, since ``select_one`` then returns ``None``.

        Returns:
            dict: ``{"id", "name", "results"}`` where ``results`` is the first
                table on the page, or ``None`` when no table could be read.
        """
        url = f"https://db.netkeiba.com/jockey/result/{jockey_id}"
        response = requests.get(url, headers=self.headers)
        response.encoding = "EUC-JP"
        page_text = response.text
        soup = bs(page_text, "html.parser")
        name = soup.select_one("div.db_head_name.fc > h1").text.strip()
        # Drop a trailing parenthesised part from the heading text.
        clean_name = re.sub(r"\s*\(.*\)", "", name)
        try:
            tables = pd.read_html(io.StringIO(page_text))[0]
        except Exception:
            # Best effort: a page without a readable table is stored with
            # results=None. KeyboardInterrupt and SystemExit are not caught,
            # so an aborted run is never cached as "no results".
            tables = None
        else:
            tables = self._flatten_columns(tables)
        return {"id": jockey_id, "name": clean_name, "results": tables}

    def save(self, path: str) -> None:
        """Scrape every jockey that has no pickle in ``path`` yet and save it.

        Jockeys whose ``{path}/{jockey_id}.pickle`` already exists are skipped
        without a request. The method sleeps 1.5 seconds after each download.

        Args:
            path (str): Directory that receives the ``.pickle`` files.
        """
        for jockey_id in self.jockey_ids:
            file_name = f"{path}/{jockey_id}.pickle"
            if os.path.exists(file_name):
                continue
            result = self.scrape_results(jockey_id)
            with open(file_name, "wb") as f:
                pickle.dump(result, f)
            time.sleep(1.5)

## 確認

In [30]:
# Load one saved race-result pickle and display it as the cell output.
race_pickle_path = "../Raw-Data/Race-Results/2016/04020409.pkl"
with open(race_pickle_path, "rb") as pickle_file:
    data = pickle.load(pickle_file)
data

Unnamed: 0,着順,枠番,馬番,馬名,性齢,斤量,騎手,タイム,着差,単勝,...,course_type,course_way,course_length,weather,state_grass,state_dirt,date,place,class,round
0,7,1,1,アルスフェルト,牝3,52,戸崎圭太,1:47.5,クビ,3.4,...,芝,左,1800,晴,良,無,2016年8月7日,新潟,1勝,9
1,15,2,2,ラトゥール,牝4,55,柴田善臣,1:48.9,5,221.1,...,芝,左,1800,晴,良,無,2016年8月7日,新潟,1勝,9
2,14,2,3,プレシャスメイト,牝4,55,田辺裕信,1:48.1,ハナ,10.1,...,芝,左,1800,晴,良,無,2016年8月7日,新潟,1勝,9
3,4,3,4,ムーンドロップ,牡5,57,横山典弘,1:47.3,1.1/2,25.3,...,芝,左,1800,晴,良,無,2016年8月7日,新潟,1勝,9
4,8,3,5,クラウンディバイダ,牡3,54,武豊,1:47.6,クビ,5.6,...,芝,左,1800,晴,良,無,2016年8月7日,新潟,1勝,9
5,1,4,6,ロッカフェスタ,牡4,57,Ｍ．デム,1:46.8,,3.7,...,芝,左,1800,晴,良,無,2016年8月7日,新潟,1勝,9
6,3,4,7,コメットシーカー,牡6,57,石橋脩,1:47.1,1/2,12.3,...,芝,左,1800,晴,良,無,2016年8月7日,新潟,1勝,9
7,11,5,8,タニセンビクトリー,牡3,54,蛯名正義,1:47.8,1,15.3,...,芝,左,1800,晴,良,無,2016年8月7日,新潟,1勝,9
8,9,5,9,ストライクショット,牝3,52,田中勝春,1:47.6,クビ,42.0,...,芝,左,1800,晴,良,無,2016年8月7日,新潟,1勝,9
9,13,6,10,ブレイクエース,牡4,57,吉田豊,1:48.1,クビ,39.5,...,芝,左,1800,晴,良,無,2016年8月7日,新潟,1勝,9


In [15]:
# Ad-hoc check of the scraping steps against a single jockey page.
# (Swap in https://db.netkeiba.com/jockey/result/00666 to check another page.)
url = "https://db.netkeiba.com/jockey/result/01109"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
}
response = requests.get(url, headers=headers)
response.encoding = "EUC-JP"
html_string = io.StringIO(response.text)
soup = bs(response.text, "html.parser")
name = soup.select_one("div.db_head_name.fc > h1").text.strip()
# Drop a trailing parenthesised part from the heading text.
clean_name = re.sub(r"\s*\(.*\)", "", name)
try:
    tables = pd.read_html(html_string)[0]
except Exception:
    # Best effort: keep going with no table, but let KeyboardInterrupt and
    # SystemExit through instead of swallowing them.
    tables = None
if tables is not None:
    # Flatten the two-level header into "upper_lower" names.
    tables.columns = ["_".join(col).strip() for col in tables.columns.values]
jockey = {"name": clean_name, "results": tables}
jockey

{'name': '黛弘人',
 'results':    年度_年度  順位_順位  1着_1着  2着_2着  3着_3着  着外_着外  重賞_出走  重賞_勝利  特別_出走  特別_勝利  ...  \
 0     累計    NaN    200    271    289   5575     50      2   1383     60  ...   
 1   2024   77.0     10     13     12    281      4      0     77      3  ...   
 2   2023   83.0      9     10     15    279      4      0     76      6  ...   
 3   2022   86.0      8     13     21    285      0      0     81      2  ...   
 4   2021   68.0     13     16     24    337      4      0    112      7  ...   
 5   2020   87.0      8     14     10    326      1      0     79      1  ...   
 6   2019   88.0      8     19     14    282      4      0     70      1  ...   
 7   2018   63.0     12     22     16    329      3      0     87      2  ...   
 8   2017   95.0      5     23     19    332      5      0     97      2  ...   
 9   2016   48.0     21     12     17    345      6      1    111     11  ...   
 10  2015   62.0     15     16     21    381      7      1    103      6  ...   
 

## 実行

In [33]:
# Collect every race-result .pkl file, including those in subdirectories.
race_results_dir = "../Raw-Data/Race-Results"
race_results_files = [
    os.path.join(parent, entry)
    for parent, _subdirs, entries in os.walk(race_results_dir)
    for entry in entries
    if entry.endswith(".pkl")
]

# For each race file, scrape and save the results of the jockeys it lists.
for file_name in tqdm(race_results_files):
    try:
        JockeyResults(file_name).save("../Raw-Data/Jockey-Results/")
    except Exception as err:
        # Show which race file failed before propagating the error.
        print(file_name)
        raise err

100%|██████████| 31093/31093 [05:48<00:00, 89.26it/s] 


In [12]:
# Load the saved results for jockey 01014 and display them as the cell output.
jockey_pickle_path = "../Raw-Data/Jockey-Results/01014.pickle"
with open(jockey_pickle_path, "rb") as pickle_file:
    jockey_data = pickle.load(pickle_file)

jockey_data

{'id': '01014',
 'name': '福永祐一',
 'results':    年度_年度  順位_順位  1着_1着  2着_2着  3着_3着  着外_着外  重賞_出走  重賞_勝利  特別_出走  特別_勝利  ...  \
 0     累計    NaN   2636   2211   2064  12586   1502    160   5056    617  ...   
 1   2023   58.0     18     14     10     63     10      0     30      7  ...   
 2   2022    7.0    101     72     82    355     60      3    165     26  ...   
 3   2021    4.0    123     92     92    380     68     10    179     33  ...   
 4   2020    3.0    134     91     85    388     62     11    176     29  ...   
 5   2019    4.0    107    110     84    412     70      6    186     26  ...   
 6   2018    4.0    103     83     84    419     70      5    173     24  ...   
 7   2017    4.0    116     85     81    458     53      3    195     25  ...   
 8   2016    5.0    106    101     70    426     59     11    191     27  ...   
 9   2015    2.0    121    112     75    430     49      5    198     32  ...   
 10  2014    4.0    118    111     92    430     50      3    192