# パッケージ


In [9]:
import pandas as pd
import datetime
import re
import time
import os
from typing import Literal
from tqdm import tqdm
import pickle
import numpy as np
import math
from sklearn.decomposition import PCA

# データの整形をするクラス


In [2]:
class HorseProcessor:
    def remove_str(x: any) -> str:
        x_str = str(x)
        is_contain_num = re.search(r"\d+", x_str)
        if is_contain_num:
            return is_contain_num.group()
        else:
            return "0"

    def __convert_date(x: str | int) -> int:
        """日付をその年の1日1月を基点とした日数に変換する

        Args:
            x (str | int): 日付(YYYY/MM/DD)

        Returns:
            int: 日数
        """
        year = re.sub(r"(\d{1,4})/.*/.*", r"\1", x)
        new_year = datetime.datetime(int(year), 1, 1)
        unix_year = int(time.mktime(new_year.timetuple()))
        date_format = datetime.datetime.strptime(x, "%Y/%m/%d")
        unix_date = int(time.mktime(date_format.timetuple()))
        return (unix_date - unix_year) // 86400

    @staticmethod
    def __transform_race_length(length: str | int) -> str:
        if isinstance(length, str):
            length = int(length)
        elif math.isnan(length):
            length = 0
        elif not isinstance(length, int):
            raise TypeError(
                f'"length" argument is expected to be of type int or str, got {type(length).__name__} instead. The value is {length}'
            )
        match length:
            case length if length < 1000:
                return "不明"
            case length if length <= 1300:
                return "S"
            case length if length <= 1899:
                return "M"
            case length if length <= 2100:
                return "I"
            case length if length <= 2700:
                return "L"
            case length if length > 2700:
                return "E"

    @staticmethod
    def __transform_held(held: str | int) -> str:
        if isinstance(held, int):
            held = str(held)
        trim_held = re.sub(r"\d*", "", held)
        if not trim_held in [
            "東京",
            "中山",
            "中京",
            "阪神",
            "札幌",
            "函館",
            "福島",
            "新潟",
            "京都",
            "小倉",
        ]:
            return "その他"
        return trim_held

    @staticmethod
    def __transform_race_name(race: str | int) -> str:
        # r"新馬|未勝利|1勝|2勝|3勝|オープン"
        if isinstance(race, int):
            race = str(race)
        if re.search(r".*(新馬|未勝利|1勝|2勝|3勝|OP|G1|G2|G3|L).*", race):
            transform_name = re.sub(
                r".*(新馬|未勝利|1勝|2勝|3勝|OP|G1|G2|G3|L).*", r"\1", race
            )
        else:
            transform_name = "その他"
        return transform_name

    @staticmethod
    def __extract_addition(df: pd.DataFrame) -> pd.DataFrame:
        """体重の増減を抽出する

        Args:
            df (pd.DataFrame): 変化対象のデータ

        Returns:
            pd.DataFrame: 変換後のデータ
        """
        weight = df["馬体重"]
        addition = weight.map(lambda x: re.sub(r".*\(([+-]\d{1,3}|0)\).*", r"\1", x))
        addition = addition.map(lambda x: re.sub(r"\+", "", x))
        return addition

    @staticmethod
    def __drop_columns(df: pd.DataFrame) -> pd.DataFrame:
        """不要なカラムを削除

        Args:
            df (pd.DataFrame): 成績データ

        Returns:
            pd.DataFrame: 削除後データ
        """
        df_processed = df[["開催", "天気", "レース名", "着順", "距離", "馬場"]]
        return df_processed

    @staticmethod
    def __divide_corse(df: pd.DataFrame) -> pd.DataFrame:
        df_divided = df
        df_divided["コース"] = df_divided["距離"].map(
            lambda x: x[0] if not isinstance(x, int) else 0
        )
        df_divided["距離"] = df_divided["距離"].map(
            lambda x: x[1:] if not isinstance(x, int) else 0
        )
        return df_divided

    @staticmethod
    def __delete_invalid_race(df: pd.DataFrame) -> pd.DataFrame:
        df = df.drop(index=df[df["着順"] == 0].index)
        df = df.drop(index=df[df["着順"] == "0"].index)
        return df

    @classmethod
    def __divide_horse_weight(cls, df: pd.DataFrame) -> pd.DataFrame:
        """馬体重を分ける

        Args:
            df (pd.DataFrame): 加工前データ

        Returns:
            pd.DataFrame: 加工後データ
        """
        df_divided = df
        df_divided["馬体重"] = df_divided["馬体重"].map(
            lambda x: x.replace("計不", "0(0)")
        )
        weight_addition = cls.__extract_addition(df_divided)
        df_divided["増減"] = weight_addition
        df_divided["馬体重"] = df_divided["馬体重"].map(
            lambda x: re.sub(r"\([+-]*\d+\)", "", x)
        )
        return df_divided

    @classmethod
    def process(cls, path: pd.DataFrame | str):
        if isinstance(path, str):
            df_raw = pd.read_pickle(path)
        elif isinstance(path, pd.DataFrame):
            df_raw = path
        else:
            raise TypeError(
                f'"path" argument is expected to be of type pd.DataFrame or str, got {type(path).__name__} instead'
            )
        # 欠損値を0埋め
        df_processed = df_raw.fillna(0)
        # カラム名の空白を削除
        df_processed.columns = df_processed.columns.str.replace(" ", "")

        df_processed = cls.__drop_columns(df_processed)
        df_processed["開催"] = df_processed["開催"].map(cls.__transform_held)
        df_processed["レース名"] = df_processed["レース名"].map(
            cls.__transform_race_name
        )
        df_processed = cls.__divide_corse(df_processed)
        df_processed["距離"] = df_processed["距離"].map(cls.__transform_race_length)
        df_processed["着順"] = df_processed["着順"].map(cls.remove_str)
        df_processed = cls.__delete_invalid_race(df_processed)
        df_processed = df_processed.replace(0, "不明")
        return df_processed.iloc[::-1].reset_index(drop=True)

In [218]:
with open("../Raw-Data/Pedigree/2011101125.pickle", "rb") as f:
    test = pickle.load(f)

test_df = pd.DataFrame()
for i in test:
    father_df = pd.read_pickle(f"../Raw-Data/Pedigree-Results/{i}.pkl")
    test = HorseProcessor.process(father_df)
    test_df = pd.concat([test_df, test])
test_df.reset_index(drop=True)

Unnamed: 0,開催,天気,レース名,着順,距離,馬場,コース
0,阪神,晴,新馬,3,I,良,芝
1,阪神,晴,新馬,16,I,良,芝
2,阪神,晴,未勝利,2,I,稍,芝
3,京都,晴,未勝利,2,L,良,芝
4,東京,晴,未勝利,1,L,良,芝
...,...,...,...,...,...,...,...
58,中山,曇,G1,1,L,稍,芝
59,阪神,晴,G1,5,L,良,芝
60,東京,晴,G1,1,I,良,芝
61,東京,曇,G1,3,L,重,芝


In [197]:
test = pd.read_pickle(f"../Raw-Data/Pedigree-Results/1977103827.pkl")
test = HorseProcessor.process(test)
test

Unnamed: 0,NaN,NaN.1,NaN.2,NaN.3,NaN.4,NaN.5,NaN.6
0,東京,晴,OP,3,E,良,芝
1,東京,晴,OP,6,L,良,芝
2,中山,晴,OP,3,L,良,芝


In [3]:
dir_list = os.listdir("../Raw-Data/Pedigree/")
df_integrated = pd.DataFrame()
for i in tqdm(dir_list):
    with open(f"../Raw-Data/Pedigree/{i}", "rb") as f:
        peds = pickle.load(f)
        for ped in peds:
            try:
                df = pd.read_pickle(f"../Raw-Data/Pedigree-Results/{ped}.pkl")
                df = HorseProcessor.process(df)
                df_integrated = pd.concat([df_integrated, df])
                df_integrated = df_integrated.reset_index(drop=True)
            except Exception as e:
                print(ped)
                raise Exception(e)

  0%|          | 0/11557 [00:00<?, ?it/s]

100%|██████████| 11557/11557 [17:07<00:00, 11.25it/s]


In [18]:
print(df_integrated["開催"].unique())
print(df_integrated["天気"].unique())
print(df_integrated["レース名"].unique())
print(df_integrated["着順"].unique())
print(df_integrated["距離"].unique())
print(df_integrated["馬場"].unique())
print(df_integrated["コース"].unique())
df_integrated.to_csv("../tmp/peds-results.csv")
df_integrated.to_pickle("../tmp/peds-results.pkl")
df_integrated

['京都' '阪神' '中山' '東京' '中京' '札幌' 'その他' '新潟' '小倉' '函館' '福島']
['曇' '晴' '雨' '不明' '小雨' '雪' '小雪']
['新馬' '未勝利' 'その他' 'G3' 'G1' 'G2' 'OP' 'L']
['2' '4' '1' '3' '5' '16' '8' '10' '7' '6' '12' '13' '11' '15' '9' '17'
 '14' '18' '19' '20']
['M' 'I' 'L' 'E' 'S' '不明']
['良' '稍' '重' '不' '不明']
['芝' 'ダ' '不明' '障']


Unnamed: 0,開催,天気,レース名,着順,距離,馬場,コース
0,京都,曇,新馬,2,M,良,芝
1,京都,晴,未勝利,4,M,良,ダ
2,京都,晴,未勝利,1,M,良,ダ
3,阪神,曇,その他,1,I,稍,芝
4,阪神,曇,G3,1,I,良,芝
...,...,...,...,...,...,...,...
288312,その他,不明,G1,1,M,稍,芝
288313,その他,不明,G1,1,M,良,芝
288314,その他,不明,G1,1,I,良,芝
288315,その他,不明,G1,1,I,重,芝


# テンプレート作成


In [5]:
place = [
    "札幌",
    "函館",
    "福島",
    "新潟",
    "中山",
    "東京",
    "中京",
    "京都",
    "阪神",
    "小倉",
    "その他",
]
race = ["重賞", "非重賞"]
length = ["S", "M", "I", "L", "E"]
state = ["良", "稍", "重", "不"]
seed = ["芝", "ダ", "障"]
win = ["1", "2", "3", "3<"]
columns = []
for p in place:
    for r in race:
        for l in length:
            for s in seed:
                for se in state:
                    for wi in win:
                        columns.append(f"{p}_{r}_{l}_{s}_{se}_{wi}")

columns = [i for i in columns if not re.match(r"札幌_重賞_L.*", i)]
columns = [i for i in columns if not re.match(r"札幌_重賞_(S|I)_ダ.*", i)]
columns = [i for i in columns if not re.match(r"札幌_非重賞_L_ダ.*", i)]
columns = [i for i in columns if not re.match(r"札幌_非*重賞_(S|M|I|L)_障.*", i)]
columns = [i for i in columns if not re.match(r"札幌_非*重賞_E.*", i)]

columns = [i for i in columns if not re.match(r"函館_重賞_[SMI]_ダ.*", i)]
columns = [i for i in columns if not re.match(r"函館_重賞_[LE].*", i)]
columns = [i for i in columns if not re.match(r"函館_非重賞_[IL]_ダ.*", i)]
columns = [i for i in columns if not re.match(r"函館_非重賞_E.*", i)]
columns = [i for i in columns if not re.match(r"函館_非*重賞_._障.*", i)]

columns = [i for i in columns if not re.match(r"福島_重賞_(S|L|E).*", i)]
columns = [i for i in columns if not re.match(r"福島_重賞_(M|I)_[ダ障].*", i)]
columns = [i for i in columns if not re.match(r"福島_重賞_._障.*", i)]
columns = [i for i in columns if not re.match(r"福島_非重賞_(I|L)_ダ.*", i)]
columns = [i for i in columns if not re.match(r"福島_非重賞_[SMIL]_障.*", i)]
columns = [i for i in columns if not re.match(r"福島_非重賞_E_[芝ダ].*", i)]

columns = [i for i in columns if not re.match(r"新潟_重賞_S_[ダ].*", i)]
columns = [i for i in columns if not re.match(r"新潟_非*重賞_[IL]_ダ.*", i)]
columns = [i for i in columns if not re.match(r"新潟_非*重賞_E_[芝ダ].*", i)]
columns = [i for i in columns if not re.match(r"新潟_非*重賞_._障.*", i)]

columns = [i for i in columns if not re.match(r"中山_重賞_[ILE]_ダ.*", i)]
columns = [i for i in columns if not re.match(r"中山_非*重賞_._障.*", i)]
columns = [i for i in columns if not re.match(r"中山_非重賞_[IE]_ダ.*", i)]
columns = [i for i in columns if not re.match(r"中山_非重賞_[E]_芝.*", i)]

columns = [i for i in columns if not re.match(r"東京_非*重賞_._障.*", i)]
columns = [i for i in columns if not re.match(r"東京_非*重賞_S_芝.*", i)]
columns = [i for i in columns if not re.match(r"東京_重賞_[LE]_ダ.*", i)]
columns = [i for i in columns if not re.match(r"東京_非重賞_[E]_ダ.*", i)]

columns = [i for i in columns if not re.match(r"中京_重賞_._障.*", i)]
columns = [i for i in columns if not re.match(r"中京_重賞_[SIE]_ダ.*", i)]
columns = [i for i in columns if not re.match(r"中京_重賞_[L]_芝.*", i)]
columns = [i for i in columns if not re.match(r"中京_非重賞_[IE]_ダ.*", i)]
columns = [i for i in columns if not re.match(r"中京_非重賞_E_芝.*", i)]
columns = [i for i in columns if not re.match(r"中京_非重賞_[SMIL]_障.*", i)]

columns = [i for i in columns if not re.match(r"京都_非*重賞_[LE]_ダ.*", i)]
columns = [i for i in columns if not re.match(r"京都_重賞_._障.*", i)]
columns = [i for i in columns if not re.match(r"京都_非重賞_[SMIL]_障.*", i)]

columns = [i for i in columns if not re.match(r"阪神_重賞_[SLE]_ダ.*", i)]
columns = [i for i in columns if not re.match(r"阪神_重賞_[SMIL]_障.*", i)]
columns = [i for i in columns if not re.match(r"阪神_非重賞_[SMIL]_障.*", i)]
columns = [i for i in columns if not re.match(r"阪神_非重賞_L_ダ.*", i)]
columns = [i for i in columns if not re.match(r"阪神_非重賞_E_[ダ芝].*", i)]

columns = [i for i in columns if not re.match(r"小倉_重賞_[SMI]_[ダ障].*", i)]
columns = [i for i in columns if not re.match(r"小倉_重賞_L_[芝障].*", i)]
columns = [i for i in columns if not re.match(r"小倉_重賞_E.*", i)]
columns = [i for i in columns if not re.match(r"小倉_非重賞_[SMIL]_障.*", i)]
columns = [i for i in columns if not re.match(r"小倉_非重賞_[IL]_ダ.*", i)]
columns = [i for i in columns if not re.match(r"小倉_非重賞_E_[ダ芝].*", i)]

columns = [i for i in columns if not re.match(r"その他_非*重賞_E_ダ.*", i)]
columns = [i for i in columns if not re.match(r"その他_非*重賞_._障.*", i)]
df_tmp = pd.DataFrame(columns=columns, index=[0]).fillna(0)
df_tmp

Unnamed: 0,札幌_重賞_S_芝_良_1,札幌_重賞_S_芝_良_2,札幌_重賞_S_芝_良_3,札幌_重賞_S_芝_良_3<,札幌_重賞_S_芝_稍_1,札幌_重賞_S_芝_稍_2,札幌_重賞_S_芝_稍_3,札幌_重賞_S_芝_稍_3<,札幌_重賞_S_芝_重_1,札幌_重賞_S_芝_重_2,...,その他_非重賞_E_芝_稍_3,その他_非重賞_E_芝_稍_3<,その他_非重賞_E_芝_重_1,その他_非重賞_E_芝_重_2,その他_非重賞_E_芝_重_3,その他_非重賞_E_芝_重_3<,その他_非重賞_E_芝_不_1,その他_非重賞_E_芝_不_2,その他_非重賞_E_芝_不_3,その他_非重賞_E_芝_不_3<
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
df = pd.read_pickle("../tmp/peds-results.pkl")
race = ["G3", "G1", "G2"]
for index, row in df.iterrows():
    col = [row["距離"], row["コース"]]
    if "不明" in col:
        continue
    w = row["着順"] if int(row["着順"]) <= 3 else "3<"
    s = row["馬場"] if row["馬場"] != "不明" else "良"
    r = "重賞" if row["レース名"] in race else "非重賞"
    col = f'{row["開催"]}_{r}_{row["距離"]}_{row["コース"]}_{s}_{w}'
    df_tmp[col] += 1

In [7]:
for i in df_tmp:
    tex = df_tmp[i][0]
    if tex == 0:
        if not re.search(r"(札幌|函館|福島|新潟|中山|東京|中京|京都|阪神|小倉).*", i):
            if not re.search(r"その他_非重賞_E.*", i):
                print(i)
df_tmp

その他_重賞_S_芝_稍_3
その他_重賞_S_芝_重_2
その他_重賞_S_芝_不_2
その他_重賞_S_芝_不_3
その他_重賞_M_芝_不_2
その他_重賞_M_芝_不_3
その他_重賞_I_芝_不_3<
その他_重賞_L_ダ_稍_3<
その他_重賞_L_ダ_重_1
その他_重賞_L_ダ_重_2
その他_重賞_L_ダ_不_2
その他_重賞_L_ダ_不_3
その他_重賞_E_芝_良_3
その他_重賞_E_芝_良_3<
その他_重賞_E_芝_稍_1
その他_重賞_E_芝_稍_2
その他_重賞_E_芝_稍_3
その他_重賞_E_芝_稍_3<
その他_重賞_E_芝_重_2
その他_重賞_E_芝_重_3
その他_重賞_E_芝_重_3<
その他_重賞_E_芝_不_1
その他_重賞_E_芝_不_2
その他_重賞_E_芝_不_3
その他_重賞_E_芝_不_3<
その他_非重賞_S_芝_稍_3
その他_非重賞_S_芝_重_1
その他_非重賞_S_芝_重_2
その他_非重賞_S_芝_重_3
その他_非重賞_S_芝_不_1
その他_非重賞_S_芝_不_2
その他_非重賞_S_芝_不_3
その他_非重賞_S_芝_不_3<
その他_非重賞_S_ダ_稍_2
その他_非重賞_M_芝_稍_3
その他_非重賞_M_芝_稍_3<
その他_非重賞_M_芝_重_3
その他_非重賞_M_芝_重_3<
その他_非重賞_M_芝_不_2
その他_非重賞_M_芝_不_3<
その他_非重賞_I_芝_良_2
その他_非重賞_I_芝_良_3
その他_非重賞_I_芝_稍_1
その他_非重賞_I_芝_稍_2
その他_非重賞_I_芝_稍_3
その他_非重賞_I_芝_重_2
その他_非重賞_I_芝_重_3
その他_非重賞_I_芝_重_3<
その他_非重賞_I_芝_不_1
その他_非重賞_I_芝_不_3
その他_非重賞_I_芝_不_3<
その他_非重賞_I_ダ_稍_1
その他_非重賞_I_ダ_稍_2
その他_非重賞_I_ダ_稍_3
その他_非重賞_I_ダ_重_1
その他_非重賞_I_ダ_重_2
その他_非重賞_I_ダ_重_3
その他_非重賞_I_ダ_不_3
その他_非重賞_L_芝_良_3
その他_非重賞_L_芝_良_3<
その他_非重賞_L_芝_稍_1
その他_非重賞_L_芝_稍_2
その他_非重賞_L_芝_稍_3
その他_

Unnamed: 0,札幌_重賞_S_芝_良_1,札幌_重賞_S_芝_良_2,札幌_重賞_S_芝_良_3,札幌_重賞_S_芝_良_3<,札幌_重賞_S_芝_稍_1,札幌_重賞_S_芝_稍_2,札幌_重賞_S_芝_稍_3,札幌_重賞_S_芝_稍_3<,札幌_重賞_S_芝_重_1,札幌_重賞_S_芝_重_2,...,その他_非重賞_E_芝_稍_3,その他_非重賞_E_芝_稍_3<,その他_非重賞_E_芝_重_1,その他_非重賞_E_芝_重_2,その他_非重賞_E_芝_重_3,その他_非重賞_E_芝_重_3<,その他_非重賞_E_芝_不_1,その他_非重賞_E_芝_不_2,その他_非重賞_E_芝_不_3,その他_非重賞_E_芝_不_3<
0,22,0,242,133,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
df_tmp.to_pickle("../template/pedigree_results_template.pcl")

# PCI モデル作成


## 父と母父の成績を集計するクラス


In [2]:
class PedigreeProcessor:
    race_class = ["G3", "G1", "G2"]

    def remove_str(x: any) -> str:
        """文字列の数字のみを抽出する

        Args:
            x (any): 文字列に変換できる値

        Returns:
            str: 抽出した数字（文字列）
        """
        x_str: str = str(x)
        is_contain_num: re.Match | None = re.search(r"\d+", x_str)
        if is_contain_num:
            return is_contain_num.group()
        else:
            return "0"

    @staticmethod
    def transform_race_length(
        length: str | int,
    ) -> Literal["S", "M", "I", "L", "E", "不明"]:
        """コースの距離をSMILE区分に変換する

        Args:
            length (str | int):数字のみで構成された文字列または整数

        Raises:
            TypeError: 文字でも整数でもない値が入った場合はエラー

        Returns:
            str: 「S」「M」「I」「L」「E」のいずれかの文字または「不明」
        """
        if isinstance(length, str):
            length: int = int(length)
        elif math.isnan(length):
            length: int = 0
        elif not isinstance(length, int):
            raise TypeError(
                f'"length" argument is expected to be of type int or str, got {type(length).__name__} instead. The value is {length}'
            )
        match length:
            case length if length < 1000:
                return "不明"
            case length if length <= 1300:
                return "S"
            case length if length <= 1899:
                return "M"
            case length if length <= 2100:
                return "I"
            case length if length <= 2700:
                return "L"
            case length if length > 2700:
                return "E"

    @staticmethod
    def transform_held(
        held: str | int,
    ) -> Literal[
        "東京",
        "中山",
        "中京",
        "阪神",
        "札幌",
        "函館",
        "福島",
        "新潟",
        "京都",
        "小倉",
        "不明",
    ]:
        """「競馬場名＋開催日」の書式の文字列から競馬場名を抽出する。中央の競馬場以外は「その他」に変換

        Args:
            held (str | int): 「競馬場名＋開催日」の書式の文字列

        Returns:
            str: 中央の競馬場名または「不明」
        """
        # 数値だった場合に文字列に変換する
        if isinstance(held, int):
            held: str = str(held)
        trim_held: str = re.sub(r"\d*", "", held)
        if not trim_held in [
            "東京",
            "中山",
            "中京",
            "阪神",
            "札幌",
            "函館",
            "福島",
            "新潟",
            "京都",
            "小倉",
        ]:
            return "その他"
        return trim_held

    @staticmethod
    def transform_race_name(
        race: str | int,
    ) -> Literal[
        "新馬", "未勝利", "1勝", "2勝", "3勝", "OP", "G1", "G2", "G3", "L", "不明"
    ]:
        """レースのクラスを分類する

        Args:
            race (str | int): レースのクラス

        Returns:
            str: クラス名もしくは「不明」
        """
        if isinstance(race, int):
            race = str(race)
        if re.search(r".*(新馬|未勝利|1勝|2勝|3勝|OP|G1|G2|G3|L).*", race):
            transform_name = re.sub(
                r".*(新馬|未勝利|1勝|2勝|3勝|OP|G1|G2|G3|L).*", r"\1", race
            )
        else:
            transform_name = "その他"
        return transform_name

    @staticmethod
    def drop_columns(df: pd.DataFrame) -> pd.DataFrame:
        """不要なカラムを削除

        Args:
            df (pd.DataFrame): 成績データ

        Returns:
            pd.DataFrame: 削除後データ
        """
        df_processed: pd.DataFrame = df[
            ["開催", "天気", "レース名", "着順", "距離", "馬場"]
        ]
        return df_processed

    @staticmethod
    def divide_corse(df: pd.DataFrame) -> pd.DataFrame:
        """データフレーム内の「距離」の値をコースと距離に分ける。それぞれの値は「コース」「距離」の値で入る

        Args:
            df (pd.DataFrame): 加工前の成績データ

        Returns:
            pd.DataFrame: 加工後のデータフレーム
        """
        df_divided: pd.DataFrame = df.copy()
        df_divided["コース"] = df_divided["距離"].map(
            lambda x: x[0] if not isinstance(x, int) else 0
        )
        df_divided["距離"] = df_divided["距離"].map(
            lambda x: x[1:] if not isinstance(x, int) else 0
        )
        return df_divided

    @staticmethod
    def delete_invalid_race(df: pd.DataFrame) -> pd.DataFrame:
        """着順が0となっているものは失格や中止のため除外する

        Args:
            df (pd.DataFrame): 除外前の成績データ

        Returns:
            pd.DataFrame: 除外後の成績データ
        """
        df: pd.DataFrame = df.drop(index=df[df["着順"] == 0].index)
        df = df.drop(index=df[df["着順"] == "0"].index)
        return df

    @staticmethod
    def preprocessing(df: pd.DataFrame) -> pd.DataFrame:
        """成績データの0埋めと不要な空白の削除を行う

        Args:
            df (pd.DataFrame): 変換前の成績データ

        Returns:
            pd.DataFrame: 変換後の成績データ
        """
        # 欠損値を0埋め
        df_preprocessed: pd.DataFrame = df.fillna(0)
        # カラム名の空白を削除
        df_preprocessed.columns = df_preprocessed.columns.str.replace(" ", "")
        return df_preprocessed

    @classmethod
    def totalling_pedigree_result(cls, df: pd.DataFrame) -> pd.DataFrame:
        """成績データを競馬場や馬場、着順等で分けて集計する

        Args:
            df (pd.DataFrame): 成績データ（時系列順）

        Returns:
            pd.DataFrame: 集計した成績データ
        """
        df_tmp: pd.DataFrame = pd.read_pickle(
            "../template/pedigree_results_template.pcl"
        )
        for index, row in df.iterrows():
            col: list[str] = [row["距離"], row["コース"]]
            if "不明" in col:
                continue
            rank: str = row["着順"] if int(row["着順"]) <= 3 else "3<"
            state: str = row["馬場"] if row["馬場"] != "不明" else "良"
            race_type: str = "重賞" if row["レース名"] in cls.race_class else "非重賞"
            col = f'{row["開催"]}_{race_type}_{row["距離"]}_{row["コース"]}_{state}_{rank}'
            df_tmp[col] += 1
        return df_tmp

    @classmethod
    def transform_data(cls, df_raw: pd.DataFrame) -> pd.DataFrame:
        """成績データの内容を集計出来る形に変換する

        Args:
            df_raw (pd.DataFrame):変換前データ

        Returns:
            pd.DataFrame: 変換誤データ
        """
        df = df_raw.copy()
        df_transformed = cls.drop_columns(df)
        df_transformed = cls.divide_corse(df_transformed)
        df_transformed["開催"] = df_transformed["開催"].map(cls.transform_held)
        df_transformed["レース名"] = df_transformed["レース名"].map(
            cls.transform_race_name
        )
        df_transformed["距離"] = df_transformed["距離"].map(cls.transform_race_length)
        df_transformed["着順"] = df_transformed["着順"].map(cls.remove_str)
        df_transformed = cls.delete_invalid_race(df_transformed)
        df_transformed = df_transformed.replace(0, "不明")
        return df_transformed

    @classmethod
    def transform_pedigree_results(cls, path: pd.DataFrame | str) -> pd.DataFrame:
        """成績データに関する処理を統合した関数

        Args:
            path (pd.DataFrame | str):未加工の成績データ

        Raises:
            TypeError: データフレームもしくはそれの場所を示す文字列出なかった場合はエラー

        Returns:
            pd.DataFrame: 加工後のデータ
        """
        if isinstance(path, str):
            df_raw: pd.DataFrame = pd.read_pickle(path)
        elif isinstance(path, pd.DataFrame):
            df_raw = path
        else:
            raise TypeError(
                f'"path" argument is expected to be of type pd.DataFrame or str, got {type(path).__name__} instead'
            )
        # 前処理
        df_processed: pd.DataFrame = cls.preprocessing(df_raw)
        # 変換
        df_processed = cls.transform_data(df_processed)
        return df_processed.iloc[::-1].reset_index(drop=True)

    @classmethod
    def process_pedigree_results(cls, path1: str, path2: str) -> pd.DataFrame:
        """関数を統合して、父と母父の成績データを変換して集計する一つの関数とした

        Args:
            path1 (str): 父の成績データ
            path2 (str): 母父の成績データ

        Returns:
            pd.DataFrame: 統合データ
        """
        df1_transformed = cls.transform_pedigree_results(path1)
        df2_transformed = cls.transform_pedigree_results(path2)
        df1_processed = cls.totalling_pedigree_result(df1_transformed)
        df2_processed = cls.totalling_pedigree_result(df2_transformed)
        return pd.concat([df1_processed, df2_processed], axis=1)

In [3]:
test_peds = pd.read_pickle("../Raw-Data/Pedigree/2011100643.pickle")
ped1 = f"../Raw-Data/Pedigree-Results/{test_peds[0]}.pkl"
ped2 = f"../Raw-Data/Pedigree-Results/{test_peds[1]}.pkl"
PedigreeProcessor.process_pedigree_results(ped1, ped2)

Unnamed: 0,札幌_重賞_S_芝_良_1,札幌_重賞_S_芝_良_2,札幌_重賞_S_芝_良_3,札幌_重賞_S_芝_良_3<,札幌_重賞_S_芝_稍_1,札幌_重賞_S_芝_稍_2,札幌_重賞_S_芝_稍_3,札幌_重賞_S_芝_稍_3<,札幌_重賞_S_芝_重_1,札幌_重賞_S_芝_重_2,...,その他_非重賞_E_芝_稍_3,その他_非重賞_E_芝_稍_3<,その他_非重賞_E_芝_重_1,その他_非重賞_E_芝_重_2,その他_非重賞_E_芝_重_3,その他_非重賞_E_芝_重_3<,その他_非重賞_E_芝_不_1,その他_非重賞_E_芝_不_2,その他_非重賞_E_芝_不_3,その他_非重賞_E_芝_不_3<
0,22,0,242,133,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 保存しているデータをから集計データを作成


In [4]:
dir_list = os.listdir("../Raw-Data/Pedigree/")
dfs = []
for i in tqdm(dir_list):
    peds = pd.read_pickle(f"../Raw-Data/Pedigree/{i}")
    ped1 = f"../Raw-Data/Pedigree-Results/{peds[0]}.pkl"
    ped2 = f"../Raw-Data/Pedigree-Results/{peds[1]}.pkl"
    result = PedigreeProcessor.process_pedigree_results(ped1, ped2)
    dfs.append(result)
df_integrated = pd.concat(dfs)
df_integrated

  0%|          | 0/11557 [00:00<?, ?it/s]

100%|██████████| 11557/11557 [30:21<00:00,  6.34it/s]


TypeError: unhashable type: 'DataFrame'

In [5]:
df_integrated = pd.concat(dfs)
df_integrated

Unnamed: 0,札幌_重賞_S_芝_良_1,札幌_重賞_S_芝_良_2,札幌_重賞_S_芝_良_3,札幌_重賞_S_芝_良_3<,札幌_重賞_S_芝_稍_1,札幌_重賞_S_芝_稍_2,札幌_重賞_S_芝_稍_3,札幌_重賞_S_芝_稍_3<,札幌_重賞_S_芝_重_1,札幌_重賞_S_芝_重_2,...,その他_非重賞_E_芝_稍_3,その他_非重賞_E_芝_稍_3<,その他_非重賞_E_芝_重_1,その他_非重賞_E_芝_重_2,その他_非重賞_E_芝_重_3,その他_非重賞_E_芝_重_3<,その他_非重賞_E_芝_不_1,その他_非重賞_E_芝_不_2,その他_非重賞_E_芝_不_3,その他_非重賞_E_芝_不_3<
0,22,0,242,133,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,22,0,242,133,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,22,0,242,133,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,22,0,242,133,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,22,0,242,133,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,22,0,242,133,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,22,0,242,133,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,22,0,242,133,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,22,0,242,133,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
df_integrated.to_pickle("../tmp/pedigree-results-list.pkl")
df_integrated.to_csv("../tmp/pedigree-results-list.csv")

## PCI のクラス


In [18]:
class PedigreePCA:
    def __init__(self, n_components: float, path: str):
        self.n_components = n_components
        self.pca = PCA(n_components=self.n_components)
        self.model_path = path

    def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """訓練データに対してPCAを実行し、累積寄与率と各成分の寄与率を計算する

        Args:
            df (pd.DataFrame): 訓練データ

        Returns:
            pd.DataFrame: 変換後のデータ
        """
        transformed_data = self.pca.fit_transform(df)
        print(f"累積寄与率: {self.pca.explained_variance_ratio_.sum()}")
        for i, ratio in enumerate(self.pca.explained_variance_ratio_, start=1):
            print(f"成分{i}の寄与率: {ratio}")
        return pd.DataFrame(transformed_data)

    def save_model(self):
        """モデルの保存"""
        with open(self.model_path, "wb") as f:
            pickle.dump(self.pca, f)
        print(f"モデルを{self.model_path}に保存しました。")

    def load_model(self):
        """モデルの読み込み"""
        with open(self.model_path, "rb") as f:
            self.pca = pickle.load(f)
        print(f"モデルを{self.model_path}から読み込みました。")

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """保存されたモデルを使用して、テストデータに対してPCAを実行する

        Args:
            df (pd.DataFrame): テストデータ

        Returns:
            pd.DataFrame: 変換後のデータ
        """
        transformed_data = self.pca.transform(df)
        return pd.DataFrame(transformed_data)

In [31]:
pca = PedigreePCA(44, "../models/pedigree_pca.pickle")
pca.fit_transform(df_integrated)

累積寄与率: 0.7053555124912831
成分1の寄与率: 0.05878595081361936
成分2の寄与率: 0.05682547503783853
成分3の寄与率: 0.04303053004958753
成分4の寄与率: 0.040232424812729116
成分5の寄与率: 0.03328558917342503
成分6の寄与率: 0.031445056894441104
成分7の寄与率: 0.028535570234271126
成分8の寄与率: 0.026713745285389973
成分9の寄与率: 0.021009890756906947
成分10の寄与率: 0.018474798213662193
成分11の寄与率: 0.018042410767877104
成分12の寄与率: 0.01716571736170339
成分13の寄与率: 0.016310544152464742
成分14の寄与率: 0.015331628327266231
成分15の寄与率: 0.014767904097451606
成分16の寄与率: 0.01446859705356478
成分17の寄与率: 0.013408149997614267
成分18の寄与率: 0.01285811279105538
成分19の寄与率: 0.012471113773857694
成分20の寄与率: 0.011808237937664539
成分21の寄与率: 0.011753463873754743
成分22の寄与率: 0.01088722271052209
成分23の寄与率: 0.010293160407136752
成分24の寄与率: 0.010104446615203835
成分25の寄与率: 0.009768361034959164
成分26の寄与率: 0.009670127252170351
成分27の寄与率: 0.009439841531706133
成分28の寄与率: 0.009105206702851585
成分29の寄与率: 0.008952064279267122
成分30の寄与率: 0.008845376183136374
成分31の寄与率: 0.008650282689315131
成分32の寄与率: 0.008405584896721458

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,34,35,36,37,38,39,40,41,42,43
0,-1.758073,-0.816981,-0.534953,-0.167826,0.197428,-0.748522,0.655999,0.517923,1.153270,0.469347,...,0.017728,0.403051,-0.062032,-0.046559,-0.359684,0.044179,0.603671,-0.332008,0.306281,-0.137777
1,-3.231542,-3.509132,1.417516,0.395583,-1.366171,2.098962,-0.451157,0.238396,-0.415770,0.066808,...,1.627011,-0.903610,0.885917,0.096029,-0.292099,-0.345785,1.477559,-0.496871,-1.208502,0.441047
2,-0.542454,-0.127821,-0.515847,-0.213358,0.152124,-0.548020,0.423740,-0.015442,-0.303366,-0.033855,...,-0.207145,-0.118608,0.103211,0.140209,0.141063,-0.077901,-0.112978,0.282679,0.174235,0.266160
3,0.329529,0.471578,-0.226403,-0.556549,-0.192012,-0.525452,-0.242760,-0.034942,0.064749,0.166293,...,-0.117718,-0.483779,-0.120040,0.057195,-0.189803,0.528940,-0.005800,-0.111326,-0.194126,-0.200654
4,-1.203814,-0.940009,0.004234,-0.229715,-0.837146,0.755340,-0.957484,-0.123109,-0.683080,-0.830701,...,-0.916216,-0.448199,1.085293,0.217152,0.678852,0.297082,0.657211,0.423697,-0.193420,-0.032871
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11552,-0.176354,0.658711,-0.626545,-0.277350,0.169366,-0.703898,0.084331,0.112494,-0.588310,-0.090526,...,0.016760,-0.104811,-0.050812,0.064274,-0.008557,0.018510,0.028291,-0.102020,-0.107578,0.204682
11553,-0.319622,2.026954,-0.816266,1.657158,-0.378679,0.183017,0.643458,0.277108,-0.268719,0.028914,...,0.334506,-1.191458,0.265012,0.218135,0.375541,-0.667304,-1.203959,0.336798,-0.596713,0.210286
11554,-0.173230,0.658516,-0.631805,-0.269097,0.175205,-0.705137,0.088044,0.113387,-0.589319,-0.095671,...,0.001960,-0.104581,-0.049329,0.058399,-0.016702,0.006437,0.022765,-0.102059,-0.108852,0.208881
11555,-0.114882,0.802692,-0.420293,0.018439,0.028057,-0.674824,0.190937,0.039141,-0.716796,0.031718,...,-0.082647,-0.001128,-0.019298,0.207808,0.080876,-0.076218,0.226152,0.018745,-0.020827,0.044403


In [32]:
pca.save_model()

モデルを../models/pedigree_pca.pickleに保存しました。
