# 1.レース結果のスクレイピング


## パッケージ


In [1]:
import pandas as pd
import requests
import io, re
from bs4 import BeautifulSoup as bs
from lxml import html
from tqdm import tqdm
import time
import os

## クラス


In [2]:
class RaceResult:
    """Download and parse a single netkeiba race-result page.

    The page is fetched as soon as the object is constructed; the parsed table
    is then available as ``self.race_result``.
    """

    def __init__(self, url: str) -> None:
        """Fetch ``url`` immediately and keep the parsed result table."""
        self.url = url
        self.race_result = self.fetch_race_results(self.url)

    @staticmethod
    def _condition_values(item: str) -> list[str]:
        """Return the values that follow each ``芝:`` / ``ダート:`` label, in order.

        The whole value is returned (e.g. ``稍重`` or ``不良``), not just one
        character. An empty value yields an empty string rather than no entry.
        """
        return re.findall(r"(?:芝|ダート):(.*?)(?=芝:|ダート:|$)", item)

    @staticmethod
    def _linked_ids(table, href_prefix: str) -> list[str]:
        """Return the first digit run of every link in ``table`` whose href matches."""
        anchors = table.find_all("a", attrs={"href": re.compile(href_prefix)})
        return [re.findall(r"[0-9]+", a["href"])[0] for a in anchors]

    def fetch_race_results(self, url: str) -> pd.DataFrame:
        """Scrape the race at ``url`` and return one row per runner.

        On top of the columns of the first HTML table, the following columns
        are added: ``horse_id``, ``jockey_id``, ``course_type``, ``course_way``,
        ``course_length``, ``weather``, ``state_grass``, ``state_dirt``,
        ``class``, ``date``, ``place`` and ``round``. Rows are sorted by 馬番.

        Raises:
            IndexError: when one of the header XPath lookups finds nothing
                (each lookup is indexed with ``[0]``).
            ValueError: when no race class could be determined.
        """
        # Download the page. The text is kept as a plain string so that pandas
        # and BeautifulSoup each get their own fresh input instead of sharing
        # one (possibly already consumed) file-like buffer.
        response = requests.get(url)
        response.encoding = "EUC-JP"
        page_text = response.text
        results = pd.read_html(io.StringIO(page_text))[0]
        soup = bs(page_text, "html.parser")

        # Horse and jockey IDs come from the links inside the result table.
        result_table = soup.find("table", attrs={"summary": "レース結果"})
        horse_id_list = self._linked_ids(result_table, "^/horse")
        jockey_id_list = self._linked_ids(result_table, "^/jockey")
        results["horse_id"] = horse_id_list
        results["jockey_id"] = jockey_id_list

        # Tidy up: strip spaces from column names and order rows by horse number.
        results.columns = results.columns.str.replace(" ", "")
        results = results.sort_values(by="馬番")
        results = results.reset_index(drop=True)

        # Race information line (course / weather / going), "/"-separated.
        lxml_data = html.fromstring(str(soup))
        raw_info1 = lxml_data.xpath(
            "//*[@id='main']/div/div/div/diary_snap/div/div/dl/dd/p/diary_snap_cut/span"
        )[0]
        raw_info_text1 = re.sub(r"\s", "", raw_info1.text)
        for item in raw_info_text1.split("/"):
            # Steeplechase: no direction, only the distance is extracted.
            if re.match(r"(障.*)[0-9]{,4}m", item):
                results["course_type"] = "障"
                results["course_way"] = "無"
                results["course_length"] = re.search(r"[0-9]{0,4}m", item).group()[:-1]
            # Turf or dirt: surface, direction and distance.
            elif re.match(r"(芝|ダ)(右|左|直線).*[0-9]{,4}m", item):
                # "直線" is shortened to "直" so the direction is one character.
                item_replace = item.replace(" ", "").replace("直線", "直")
                results["course_type"] = item_replace[0]
                results["course_way"] = item_replace[1]
                results["course_length"] = re.search(
                    r"[0-9]{0,4}m", item_replace
                ).group()[:-1]
            # Weather: keep the full value after the label (e.g. "小雨").
            elif "天候:" in item:
                results["weather"] = item.split("天候:", 1)[1]
            # Going: keep the full value after each label (e.g. "稍重", "不良").
            elif "芝:" in item or "ダート:" in item:
                states = self._condition_values(item)
                if any(results["course_type"] == "障"):
                    # First labelled value -> turf, last labelled value -> dirt.
                    results["state_grass"] = states[0]
                    results["state_dirt"] = states[-1]
                elif any(results["course_type"] == "芝"):
                    results["state_grass"] = states[-1]
                    results["state_dirt"] = "無"
                elif any(results["course_type"] == "ダ"):
                    results["state_grass"] = "無"
                    results["state_dirt"] = states[-1]

        # Second information line (date / meeting / class), space-separated.
        raw_info2 = lxml_data.xpath("//*[@id='main']/div/div/div/diary_snap/div/div/p")[
            0
        ]
        raw_info_text2 = raw_info2.text
        for item in raw_info_text2.split(" "):
            # Race class
            match_race_class = re.search(r"新馬|未勝利|1勝|2勝|3勝|オープン", item)
            if match_race_class:
                results["class"] = match_race_class.group()
            # Race date
            elif re.match(r"[0-9]{4}年[0-9]{,2}月[0-9]{,2}日", item):
                results["date"] = item
            # Racecourse: what is left after removing "N回" and "N日目".
            elif re.match(r"[0-9]*回.*[0-9]*日目", item):
                text = re.sub(r"[0-9]*回", "", item)
                text = re.sub(r"[0-9]*日目", "", text)
                results["place"] = text

        # A grade found in the race title overrides the class found above.
        race_name = lxml_data.xpath(
            '//*[@id="main"]/div/div/div/diary_snap/div/div/dl/dd/h1/text()'
        )[0]
        # NOTE(review): the bare "L" alternative matches any capital L in the
        # race name, not only a "(L)" listed-race marker - confirm intended.
        match_race_class = re.search(r"G1|G2|G3|L", race_name)
        if match_race_class:
            results["class"] = match_race_class.group()
        if "class" not in results.columns:
            # Dump the raw inputs to help diagnose an unrecognised class.
            print(raw_info_text1.split("/"))
            print(raw_info_text2.split(" "))
            print(race_name)
            raise ValueError("row of class is not exist")

        # Round number: the <dt> text with "R" and whitespace removed.
        race_round = lxml_data.xpath(
            '//*[@id="main"]/div/div/div/diary_snap/div/div/dl/dt'
        )[0].text
        race_round = re.sub(r"[R\s\n]", "", race_round)
        results["round"] = race_round
        return results

    def save_race_results_pkl(self, path: str):
        """Write the parsed result table to ``path`` as a pickle."""
        self.race_result.to_pickle(path)

    def save_race_results_csv(self, path: str):
        """Write the parsed result table to ``path`` as CSV."""
        self.race_result.to_csv(path)


# url = "https://db.netkeiba.com/race/202209050411/"  # 芝
# url = "https://db.netkeiba.com/race/202207050803/"  # ダート
# url = "https://db.netkeiba.com/race/202206040801/"  # 障害
# RaceResult(url).race_result

In [3]:
# Scrape one steeplechase race and display the parsed table.
url = "https://db.netkeiba.com/race/202206040801/"  # steeplechase
scraping = RaceResult(url=url)
scraping.race_result

Unnamed: 0,着順,枠番,馬番,馬名,性齢,斤量,騎手,タイム,着差,単勝,...,course_type,course_way,course_length,weather,state_grass,state_dirt,date,place,class,round
0,2,1,1,グラスディアブロ,牡5,60,大江原圭,3:14.8,クビ,4.7,...,障,無,2880,晴,良,良,2022年10月1日,中山,未勝利,1
1,10,2,2,フィアレスハート,セ5,60,上野翔,3:17.6,4,20.6,...,障,無,2880,晴,良,良,2022年10月1日,中山,未勝利,1
2,9,3,3,シーザワールド,牡9,60,石神深一,3:17.0,2.1/2,11.6,...,障,無,2880,晴,良,良,2022年10月1日,中山,未勝利,1
3,12,4,4,コルドゥーン,牡6,60,大庭和弥,3:18.3,3,93.6,...,障,無,2880,晴,良,良,2022年10月1日,中山,未勝利,1
4,3,5,5,スマートキャノン,牡3,56,小牧加矢,3:14.8,アタマ,4.9,...,障,無,2880,晴,良,良,2022年10月1日,中山,未勝利,1
5,11,5,6,アサクサトラマル,牡4,60,蓑島靖典,3:17.8,3/4,43.5,...,障,無,2880,晴,良,良,2022年10月1日,中山,未勝利,1
6,4,6,7,エクスパートラン,牡7,60,高田潤,3:14.9,3/4,13.8,...,障,無,2880,晴,良,良,2022年10月1日,中山,未勝利,1
7,1,6,8,メイショウアツイタ,牡4,60,難波剛健,3:14.7,,2.8,...,障,無,2880,晴,良,良,2022年10月1日,中山,未勝利,1
8,6,7,9,ルレーヴドゥリリ,牝5,58,植野貴也,3:16.2,6,6.0,...,障,無,2880,晴,良,良,2022年10月1日,中山,未勝利,1
9,7,7,10,ワンダーサーイター,牡5,60,草野太郎,3:16.5,1.3/4,125.7,...,障,無,2880,晴,良,良,2022年10月1日,中山,未勝利,1


## 実行


In [4]:
# レースIDの構造：西暦(４桁)＋レース場ID＋開催回数＋何日目＋ラウンド
# Build every candidate race-ID tail: racecourse ID + meeting number + day +
# round, each zero-padded to two digits. The year prefix is not part of these IDs.
race_id_list = [
    f"{place:02d}{kai:02d}{day:02d}{r:02d}"
    for place in range(1, 11)
    for kai in range(1, 7)
    for day in range(1, 13)
    for r in range(1, 13)
]
len(race_id_list)

8640

In [5]:
# Scrape every candidate 2022 race, skipping races that are already on disk.
# The existence check and the save target share one directory so that a
# re-run really resumes; it is the directory the later cells read from.
race_dir = "../data/Race-Results/2022"
os.makedirs(race_dir, exist_ok=True)
skip_race = []
for i in tqdm(race_id_list):
    pkl_path = f"{race_dir}/{i}.pkl"
    if os.path.isfile(pkl_path):
        continue
    try:
        scraping = RaceResult(f"https://db.netkeiba.com/race/2022{i}/")
        scraping.save_race_results_pkl(pkl_path)
        time.sleep(1)
    except IndexError:
        # An IndexError from the parser is treated as "this race ID has no
        # result page": remember the ID and move on.
        time.sleep(1)
        skip_race.append(f"{i}")
        continue
    except Exception:
        # Anything else is unexpected: report the race ID and stop.
        print(i)
        raise
# Record the skipped race IDs, one per line.
with open("../skip-race.txt", "w") as f:
    f.writelines(f"{i}\n" for i in skip_race)

100%|██████████| 8640/8640 [3:31:08<00:00,  1.47s/it]  


# 2.競走馬の過去成績


## パッケージ


In [1]:
import os
import time
import pandas as pd
import io

import requests
from tqdm import tqdm

## クラス


In [10]:
class HorseResult:
    def __init__(self, horse_path: str, save_path: str) -> None:
        self.race_result = pd.read_pickle(horse_path)
        self.horse_ids = self.race_result["horse_id"]
        self.save_path = save_path

    def __scraping(self, horse_id) -> pd.DataFrame | None:
        try:
            url = f"https://db.netkeiba.com/horse/{horse_id}"
            response = requests.get(url)
            response.encoding = "EUC-JP"
            html_string = io.StringIO(response.text)
            df = pd.read_html(html_string)[3]
            if df.columns[0] == "受賞歴":
                df = pd.read_html(html_string)[4]
            time.sleep(1)
            return df
        except Exception as e:
            print(horse_id)
            raise e

    def __save(self, data, name) -> None:
        if data is None:
            return
        df: pd.DataFrame = data
        df.to_pickle(f"{self.save_path}/{name}.pkl")

    def scrape_save(self) -> None:
        for horse_id in self.horse_ids:
            if os.path.isfile(f"{self.save_path}/{horse_id}.pkl"):
                continue
            df = self.__scraping(horse_id)
            self.__save(df, horse_id)

## 実行


In [6]:
# For every saved 2022 race result, scrape and store the history of its horses.
dir_path = "../data/Race-Results/2022"
save_path = "../data/Horse-Results"
dir_list = os.listdir("../data/Race-Results/2022")
for filename in tqdm(dir_list):
    try:
        horse_results = HorseResult(
            horse_path=f"{dir_path}/{filename}", save_path=save_path
        )
        horse_results.scrape_save()
    except Exception:
        # Report which race file was being processed, then propagate.
        print(filename)
        raise

100%|██████████| 3456/3456 [5:00:05<00:00,  5.21s/it]   


## 確認用


In [None]:
# Manual check: scrape the horses of a single saved race.
# The save directory is the same one the main run uses.
save_path = "../data/Horse-Results"
race_result_path = "../data/Race-Results/2022/01020305.pkl"
test = HorseResult(race_result_path, save_path=save_path)
test.scrape_save()

In [None]:
# Manual check: fetch one horse page and display its race-history table.
url = "https://db.netkeiba.com/horse/2020101533"
response = requests.get(url)
response.encoding = "EUC-JP"
html_string = io.StringIO(response.text)
# Parse once; the fallback table is taken from the same list.
tables = pd.read_html(html_string)
df = tables[3]
if df.columns[0] == "受賞歴":
    df = tables[4]
df

In [18]:
# Load one saved horse history and display it.
test = pd.read_pickle(
    "../data/Horse-Results/2020100678.pkl",
)
test

Unnamed: 0,日付,開催,天 気,R,レース名,映 像,頭 数,枠 番,馬 番,オ ッ ズ,...,着差,ﾀｲﾑ 指数,通過,ペース,上り,馬体重,厩舎 ｺﾒﾝﾄ,備考,勝ち馬 (2着馬),賞金
0,2023/09/16,4阪神3,晴,4,3歳以上1勝クラス,,18,8,18,18.4,...,0.6,**,14-14,34.1-34.9,34.3,442(-6),,,マイネルラッシュ,
1,2023/08/19,2札幌3,小雨,9,千歳特別(1勝クラス),,14,3,3,6.2,...,0.8,**,6-5-5,29.5-35.9,36.1,448(+4),,,ビジュノワール,
2,2023/07/23,1札幌2,晴,8,3歳以上1勝クラス,,11,6,6,3.5,...,0.8,**,5-5-5-4,35.3-34.8,34.9,444(+4),,,クレバーテースト,80.0
3,2023/05/13,1京都7,曇,9,あずさ賞(1勝クラス),,12,5,6,25.2,...,0.1,**,1-1-1-1,36.0-33.7,33.8,440(+4),,,リミットバスター,437.6
4,2023/04/22,1京都1,晴,9,あやめ賞(1勝クラス),,8,8,8,43.4,...,0.6,**,5-4,33.1-35.5,35.6,436(0),,,プッシュオン,272.6
5,2022/12/28,6阪神9,晴,6,2歳1勝クラス,,11,8,10,57.0,...,3.5,**,7-7-7-7,37.3-37.6,40.4,436(-4),,,ユティタム,
6,2022/12/04,6中京2,晴,9,こうやまき賞(1勝クラス),,12,2,2,10.3,...,0.9,**,4-5-4,35.2-35.3,35.8,440(+4),,,コレペティトール,
7,2022/09/03,2札幌7,晴,11,札幌2歳S(G3),,14,5,8,42.7,...,1.6,**,8-8-7-7,36.7-36.3,37.2,436(-4),,,ドゥーラ,
8,2022/08/13,2札幌1,曇,9,コスモス賞(OP),,9,5,5,7.0,...,1.0,**,8-8-7-6,36.8-36.2,36.2,440(+4),,,モリアーナ,240.0
9,2022/07/23,1札幌1,曇,1,2歳未勝利,,5,1,1,1.2,...,-0.1,**,2-2-2-2,38.3-36.1,35.9,436(0),,,(トーセンウォルト),520.0


# 3.血統情報


## パッケージ


In [1]:
import os
import time
import pandas as pd
import io
from bs4 import BeautifulSoup as bs
import pickle

import requests
from tqdm import tqdm

## クラス


In [2]:
class PedigreeInfo:
    """Scrape and store two pedigree IDs for every horse in one saved race result."""

    def __init__(self, horse_path: str, save_path: str) -> None:
        """Read the pickled race result and remember its horse IDs and the output directory."""
        self.race_result = pd.read_pickle(horse_path)
        self.horse_ids = self.race_result["horse_id"]
        self.save_path = save_path

    def __scrape(self, horse_id):
        """Return the IDs taken from the 1st and 5th links of the page's blood table.

        Any failure is reported with the horse ID and re-raised.
        """
        try:
            page = requests.get(f"https://db.netkeiba.com/horse/{horse_id}")
            page.encoding = "EUC-JP"
            soup = bs(io.StringIO(page.text), "html.parser")
            blood_table = soup.find("table", attrs={"class": "blood_table"})
            anchors = blood_table.find_all("a")
            # Presumably sire and dam's sire - verify against the page layout.
            peds = [
                anchor.get("href").replace("/horse/ped/", "").replace("/", "")
                for anchor in (anchors[0], anchors[4])
            ]
            # Pause between requests.
            time.sleep(1)
            return peds
        except Exception as e:
            print(horse_id)
            raise e

    def __save(self, data, name) -> None:
        """Pickle ``data`` as ``<save_path>/<name>.pickle`` unless it is ``None``."""
        if data is not None:
            with open(f"{self.save_path}/{name}.pickle", "wb") as f:
                pickle.dump(data, f)

    def scrape_save(self) -> None:
        """Scrape and save every horse that has no pickle in ``save_path`` yet."""
        for horse_id in self.horse_ids:
            already_saved = os.path.isfile(f"{self.save_path}/{horse_id}.pickle")
            if not already_saved:
                self.__save(self.__scrape(horse_id), horse_id)

## 実行


In [4]:
# For every saved 2022 race result, scrape and store the pedigree IDs of its horses.
dir_path = "../data/Race-Results/2022"
save_path = "../data/Pedigree"
dir_list = os.listdir("../data/Race-Results/2022")
for filename in tqdm(dir_list):
    try:
        pedigree = PedigreeInfo(
            horse_path=f"{dir_path}/{filename}", save_path=save_path
        )
        pedigree.scrape_save()
    except Exception:
        # Report which race file was being processed, then propagate.
        print(filename)
        raise

100%|██████████| 3456/3456 [14:19<00:00,  4.02it/s] 


## 確認


In [39]:
# Load one saved pedigree pickle and display its contents.
test_path = "../data/Pedigree/2020100678.pickle"
with open(test_path, mode="rb") as f:
    test = pickle.load(f)

test

['2011100655', '2002100816']

# 4.父・母父の過去成績


## パッケージ


In [1]:
import os
import time
import pandas as pd
import io
import pickle

import requests
from tqdm import tqdm

## クラス


In [2]:
class PedigreeResults:
    columns = [
        "日付",
        "開催",
        "天 気",
        "R",
        "レース名",
        "映 像",
        "頭 数",
        "枠 番",
        "馬 番",
        "オ ッ ズ",
        "人 気",
        "着 順",
        "騎手",
        "斤 量",
        "距離",
        "馬 場",
        "馬場 指数",
        "タイム",
        "着差",
        "ﾀｲﾑ 指数",
        "通過",
        "ペース",
        "上り",
        "馬体重",
        "厩舎 ｺﾒﾝﾄ",
        "備考",
        "勝ち馬 (2着馬)",
        "賞金",
    ]

    def __init__(self, horse_path: str, save_path: str) -> None:
        with open(horse_path, "rb") as f:
            self.horse_ids = pickle.load(f)
        self.save_path = save_path

    def __scraping(self, horse_id) -> pd.DataFrame | None:
        try:
            url = f"https://db.netkeiba.com/horse/{horse_id}"
            response = requests.get(url)
            response.encoding = "EUC-JP"
            html_string = io.StringIO(response.text)
            df = pd.read_html(html_string)[3]
            # 年度代表馬のテーブルだったら取り直す
            if df.columns[0] != "日付":
                time.sleep(1)
                df = pd.read_html(html_string)[4]
                # それでも過去成績が取れなければ0埋めのデータとする
                if df.columns[0] != "日付":
                    df = pd.DataFrame(0, index=range(1), columns=self.columns)
            time.sleep(1)
            return df
        except Exception as e:
            print(horse_id)
            raise e

    def __save(self, data, name) -> None:
        if data is None:
            return
        df: pd.DataFrame = data
        df.to_pickle(f"{self.save_path}/{name}.pkl")

    def scrape_save(self) -> None:
        for horse_id in self.horse_ids:
            if os.path.isfile(f"{self.save_path}/{horse_id}.pkl"):
                continue
            df = self.__scraping(horse_id)
            self.__save(df, horse_id)

## 実行


In [3]:
# For every saved pedigree pickle, scrape and store the history of the listed horses.
dir_path = "../data/Pedigree"
save_path = "../data/Pedigree-Results"
dir_list = os.listdir("../data/Pedigree/")
for filename in tqdm(dir_list):
    try:
        pedigree_results = PedigreeResults(
            horse_path=f"{dir_path}/{filename}", save_path=save_path
        )
        pedigree_results.scrape_save()
    except Exception:
        # Report which pedigree file was being processed, then propagate.
        print(filename)
        raise

  0%|          | 0/11557 [00:00<?, ?it/s]

100%|██████████| 11557/11557 [42:31<00:00,  4.53it/s]  


## 確認


In [43]:
# Manual check: fetch one horse page and apply the same table selection as
# PedigreeResults (4th table, then 5th, then a zero-filled placeholder).
url = "https://db.netkeiba.com/horse/000a000178"
# url = "https://db.netkeiba.com/horse/2019101760"
columns = [
    "日付",
    "開催",
    "天 気",
    "R",
    "レース名",
    "映 像",
    "頭 数",
    "枠 番",
    "馬 番",
    "オ ッ ズ",
    "人 気",
    "着 順",
    "騎手",
    "斤 量",
    "距離",
    "馬 場",
    "馬場 指数",
    "タイム",
    "着差",
    "ﾀｲﾑ 指数",
    "通過",
    "ペース",
    "上り",
    "馬体重",
    "厩舎 ｺﾒﾝﾄ",
    "備考",
    "勝ち馬 (2着馬)",
    "賞金",
]
response = requests.get(url)
response.encoding = "EUC-JP"
html_string = io.StringIO(response.text)
# Parse once; the fallback table is taken from the same list.
tables = pd.read_html(html_string)
df = tables[3]
# Not the history table (e.g. an awards table): try the next one.
if df.columns[0] != "日付":
    df = tables[4]
    # Still no race history: fall back to zero-filled data.
    if df.columns[0] != "日付":
        df = pd.DataFrame(0, index=range(1), columns=columns)

df

Unnamed: 0,日付,開催,天 気,R,レース名,映 像,頭 数,枠 番,馬 番,オ ッ ズ,...,着差,ﾀｲﾑ 指数,通過,ペース,上り,馬体重,厩舎 ｺﾒﾝﾄ,備考,勝ち馬 (2着馬),賞金
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Load one saved pedigree pickle and display its contents.
test_path = "../data/Pedigree/2019101760.pickle"
with open(test_path, mode="rb") as f:
    test = pickle.load(f)


test

['2013104704', '000a000013']

In [None]:
# Load one saved pedigree result and display it. Swap the commented line in
# to inspect the other file.
# test_path = "../data/Pedigree-Results/000a000178.pkl"
test_path = "../data/Pedigree-Results/1994108729.pkl"
test = pd.read_pickle(test_path)
test