# パッケージ


In [67]:
import pandas as pd
import datetime
import re
import os, time
from tqdm import tqdm
import category_encoders as ce
import pickle
import numpy as np
from sklearn.preprocessing import StandardScaler
import optuna
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import math
from sklearn.decomposition import PCA
from abc import ABC, abstractmethod
import torch.nn.functional as F

# レース結果を整形するクラス(仮)


In [9]:
class RaceResultsProcessor_:
    """レース結果をデータを整形する"""

    def __init__(self, path: str):
        self.results_raw = pd.read_pickle(path)
        self.results_processed = pd.read_pickle(path)
        # データの0埋めを行う
        self.results_processed.fillna(0, inplace=True)
        # 馬体重のカラムについては「0(0)」で埋める
        self.results_processed["馬体重"].replace(0, "0(0)", inplace=True)

    def drop_columns(self, columns: [str]) -> None:
        """不要なカラムを削除"""
        self.results_processed = self.results_processed.drop(columns=columns)

    def divide_weight_gender(self):
        """馬の性齢と馬体重を分割する"""
        self.results_processed["性別"] = self.results_processed["性齢"].str[0]
        self.results_processed["年齢"] = self.results_processed["性齢"].str[1:]
        self.results_processed["体重"] = self.results_processed["馬体重"].replace(
            to_replace=r"(\d+).*", value=r"\1", regex=True
        )
        self.results_processed["増減"] = self.results_processed["馬体重"].replace(
            to_replace=r"\d+\(\+{0,1}([-]{0,1}\d+)\)", value=r"\1", regex=True
        )

    def transform_rank(self):
        self.results_processed["3着以内"] = self.results_processed["着順"].apply(
            lambda x: 1 if isinstance(x, int) and x <= 3 else 0
        )

    def transform_date(self, date: str):
        """日付を変換して、その年の1月1日からの日数を計算する"""
        # 日付の形式を変換
        date_converted = datetime.datetime.strptime(date, "%Y年%m月%d日")
        # その年の1月1日を計算
        base_date = datetime.datetime(date_converted.year, 1, 1)
        # 日数の差を計算
        return (date_converted - base_date).days

    def __extraction_drop_columns(
        self, df: pd.DataFrame, columns: [str]
    ) -> (pd.DataFrame, pd.DataFrame):
        df_extraction = df.loc[:, columns]
        df_dropped = df.drop(columns=columns)
        return df_extraction, df_dropped

    def make_race_infos(self):
        """データをレースの情報、出走馬の情報、過去成績の3つに分ける"""
        drop_columns = [
            "馬名",
            "性齢",
            "騎手",
            "タイム",
            "着差",
            "人気",
            "調教師",
            "単勝",
            "jockey_id",
            "馬体重",
        ]
        race_info_columns = [
            "date",
            "round",
            "course_length",
            "course_type",
            "course_way",
            "weather",
            "state_grass",
            "state_dirt",
            "place",
            "class",
        ]
        self.divide_weight_gender()
        self.transform_rank()
        self.drop_columns(drop_columns)
        self.race_info, self.horse_info = self.__extraction_drop_columns(
            self.results_processed, race_info_columns
        )
        self.horse_results, self.horse_info = self.__extraction_drop_columns(
            self.horse_info, ["horse_id"]
        )

        self.race_info = self.race_info.loc[[0], :]
        self.race_info["date"] = self.transform_date(self.race_info.loc[0, "date"])

In [10]:
test = RaceResultsProcessor_("../Raw-Data/Race-Results/2022/01020607.pkl")
test.make_race_infos()

test.race_info

Unnamed: 0,date,round,course_length,course_type,course_way,weather,state_grass,state_dirt,place,class
0,239,7,1000,ダ,右,晴,無,良,札幌,1勝


In [11]:
results_path = "../Raw-Data/Race-Results/2022/"
dir_list = os.listdir(results_path)
df_list = []
for i in tqdm(dir_list):
    result = RaceResultsProcessor_(f"{results_path}{i}")
    result.make_race_infos()
    df_list.append(result.race_info)

df_integration = pd.concat(df_list)
df_integration.to_pickle("../tmp/race-infos.pkl")

100%|██████████| 3456/3456 [00:29<00:00, 115.51it/s]


In [12]:
print(df_integration.columns)
df_integration[df_integration["course_type"] == "障"]
"""
無              1693
良              1385
重               359
稍                18
不                 1
"""
# 足りないデータを追加
tmps = []
tmps.append(df_integration)
for i in ["無", "良", "重", "稍", "不"]:
    tmp = df_integration[-1:].copy()
    tmp["state_dirt"] = i
    tmps.append(tmp)

df_integration_new = pd.concat(tmps)
# いらないデータを置換
df_integration_new["course_way"].replace("無", "右", inplace=True)
df_integration_new["state_grass"].replace("無", "良", inplace=True)
df_integration_new["state_dirt"].replace("無", "良", inplace=True)
df_integration_new

Index(['date', 'round', 'course_length', 'course_type', 'course_way',
       'weather', 'state_grass', 'state_dirt', 'place', 'class'],
      dtype='object')


Unnamed: 0,date,round,course_length,course_type,course_way,weather,state_grass,state_dirt,place,class
0,203,1,1800,芝,右,曇,重,良,札幌,未勝利
0,203,2,1700,ダ,右,曇,良,重,札幌,未勝利
0,203,3,1500,芝,右,曇,重,良,札幌,未勝利
0,203,4,1200,芝,右,曇,重,良,札幌,未勝利
0,203,5,1700,ダ,右,曇,良,重,札幌,新馬
...,...,...,...,...,...,...,...,...,...,...
0,246,12,2600,芝,右,晴,良,良,小倉,1勝
0,246,12,2600,芝,右,晴,良,良,小倉,1勝
0,246,12,2600,芝,右,晴,良,重,小倉,1勝
0,246,12,2600,芝,右,晴,良,稍,小倉,1勝


In [14]:
# ワンホットエンコーディングを行うカラムを指定
columns_to_encode = [
    "course_type",
    "course_way",
    "weather",
    "state_grass",
    "state_dirt",
    "place",
    "class",
]

# エンコーダーのインスタンスを作成
encoder = ce.OneHotEncoder(
    cols=columns_to_encode, use_cat_names=True, handle_unknown="value"
)

# ワンホットエンコーディングを実行
df_integration_encoded = encoder.fit_transform(df_integration_new)
with open("../models/race_info_encoder.pickle", "wb") as f:
    pickle.dump(encoder, f)

df_integration_encoded.columns

Index(['date', 'round', 'course_length', 'course_type_芝', 'course_type_ダ',
       'course_type_障', 'course_way_右', 'course_way_左', 'course_way_直',
       'weather_曇', 'weather_晴', 'weather_雨', 'weather_雪', 'state_grass_重',
       'state_grass_良', 'state_grass_稍', 'state_grass_不', 'state_dirt_良',
       'state_dirt_重', 'state_dirt_稍', 'state_dirt_不', 'place_札幌', 'place_函館',
       'place_福島', 'place_新潟', 'place_東京', 'place_中山', 'place_中京', 'place_阪神',
       'place_小倉', 'class_未勝利', 'class_新馬', 'class_1勝', 'class_2勝', 'class_3勝',
       'class_G3', 'class_L', 'class_オープン', 'class_G2', 'class_G1'],
      dtype='object')

# 出走馬情報の標準化


In [154]:
class RaceResults:
    """レース結果をデータを整形する"""

    with open("../models/race_info_encoder.pickle", "rb") as f:
        encoder = pickle.load(f)

    def __init__(self, path: str):
        self.results_raw = pd.read_pickle(path)
        self.results_processed = pd.read_pickle(path)
        # データの0埋めを行う
        self.results_processed.fillna(0, inplace=True)
        # 馬体重のカラムについては「0(0)」で埋める
        self.results_processed["馬体重"].replace(0, "0(0)", inplace=True)
        with open("../models/race_info_encoder.pickle", "rb") as f:
            self.encoder = pickle.load(f)

    def read_df(path: str) -> pd.DataFrame:
        if not isinstance(path, str):
            raise TypeError(
                f'"path" argument is expected to be of type str, got {type(path).__name__} instead'
            )
        results_processed = pd.read_pickle(path)
        return results_processed

    def divide_weight_gender(df_raw: pd.DataFrame):
        """馬の性齢と馬体重を分割する"""
        df = df_raw.copy()
        gender = df["性齢"].str[0]
        df["牡"] = gender.map(lambda x: 1 if x == "牡" else 0)
        df["牝"] = gender.map(lambda x: 1 if x == "牝" else 0)
        df["セ"] = gender.map(lambda x: 1 if x == "セ" else 0)
        df["年齢"] = df["性齢"].str[1:]
        df["体重"] = df["馬体重"].replace(
            to_replace=r"(\d+).*", value=r"\1", regex=True
        )
        df["増減"] = df["馬体重"].replace(
            to_replace=r"\d+\(\+{0,1}([-]{0,1}\d+)\)", value=r"\1", regex=True
        )
        return df

    def transform_rank(df_raw: pd.DataFrame) -> pd.DataFrame:
        df = df_raw.copy()
        df["3着以内"] = df["着順"].apply(
            lambda x: 1 if isinstance(x, int) and x <= 3 else 0
        )
        return df

    def drop_columns(df_raw: pd.DataFrame, columns: [str]) -> pd.DataFrame:
        """不要なカラムを削除"""
        df = df_raw.drop(columns=columns)
        return df

    def transform_date(date: str):
        """日付を変換して、その年の1月1日からの週数を計算する"""
        # 日付の形式を変換
        date_converted = datetime.datetime.strptime(date, "%Y年%m月%d日")
        # その年の1月1日を計算
        base_date = datetime.datetime(date_converted.year, 1, 1)
        # 週数の差を計算
        return (date_converted - base_date).days // 7

    def extraction_drop_columns(
        df: pd.DataFrame, columns: [str]
    ) -> (pd.DataFrame, pd.DataFrame):
        df_extraction = df.loc[:, columns]
        df_dropped = df.drop(columns=columns)
        return df_extraction, df_dropped

    @classmethod
    def adapt_race_info(cls, df_raw: pd.DataFrame) -> pd.DataFrame:
        df = df_raw.loc[[0], :]
        df["date"] = cls.transform_date(df.loc[0, "date"])
        df["course_length"] = float(df.loc[0, "course_length"]) / 100

        df = cls.encoder.transform(df)
        return df

    @classmethod
    def make_infos(cls, path: str) -> {}:
        drop_columns = [
            "馬名",
            "性齢",
            "騎手",
            "タイム",
            "着差",
            "人気",
            "調教師",
            "単勝",
            "jockey_id",
            "馬体重",
            "着順",
        ]
        race_info_columns = [
            "date",
            "round",
            "course_length",
            "course_type",
            "course_way",
            "weather",
            "state_grass",
            "state_dirt",
            "place",
            "class",
        ]
        df = cls.read_df(path)
        # データの0埋めを行う
        df = df.fillna(0)
        # 馬体重のカラムについては「0(0)」で埋める
        df["馬体重"].replace(0, "0(0)", inplace=True)
        df["馬体重"].replace("計不", "0(0)", inplace=True)
        df = cls.divide_weight_gender(df)
        df = cls.transform_rank(df)
        df = cls.drop_columns(df, drop_columns)
        race_info, horse_info = cls.extraction_drop_columns(df, race_info_columns)
        horse_id, horse_info = cls.extraction_drop_columns(horse_info, ["horse_id"])

        # 標準化等の変換
        race_info = cls.adapt_race_info(race_info)
        return {
            "race": race_info,
            "horse": horse_info,
            "ids": list(horse_id.iloc[:, 0].values),
        }

In [155]:
test = RaceResults.make_infos("../Raw-Data/Race-Results/2022/01020607.pkl")
test["horse"]

Unnamed: 0,着順,枠番,馬番,斤量,性別,年齢,体重,増減,3着以内
0,6,1,1,51,牡,3,486,4,0
1,9,2,2,52,牝,3,476,6,0
2,7,3,3,54,セ,4,478,8,0
3,11,4,4,55,牝,5,470,6,0
4,3,5,5,55,牡,4,538,-18,1
5,8,5,6,57,牡,5,498,2,0
6,4,6,7,51,牝,3,442,0,0
7,1,6,8,52,牝,3,442,10,1
8,5,7,9,53,牡,3,510,4,0
9,12,7,10,52,牝,3,514,2,0


In [None]:
results_path = "../Raw-Data/Race-Results/2022/"
dir_list = os.listdir(results_path)
df_list = []
for i in tqdm(dir_list):
    result = RaceResults.make_infos(f"{results_path}{i}")
    df_list.append(result["horse"])

df_integration = pd.concat(df_list)
df_integration.to_pickle("../tmp/horse-info.pkl")
df_integration

In [None]:
df = pd.read_pickle("../tmp/horse-info.pkl")
scaler = StandardScaler()
# 標準化したいカラムを指定
columns_to_scale = ["体重", "増減"]
df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])

df

In [None]:
with open("../models/horse_info_scaler.pickle", "wb") as f:
    pickle.dump(scaler, f)

# データセットクラス


## VAE


In [28]:
class PositionalEncoding(nn.Module):
    def __init__(self, hidden_dim, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, hidden_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)

        # div_termの計算
        div_term = torch.exp(
            torch.arange(0, hidden_dim, 2).float() * (-math.log(10000.0) / hidden_dim)
        )

        # position * div_term のサイズ調整
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer("pe", pe)

    def forward(self, x):
        x = x + self.pe[: x.size(0), :]
        return self.dropout(x)


class Encoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, nheads, nlayers, dropout=0.1):
        super(Encoder, self).__init__()
        self.input_linear = nn.Linear(input_dim, hidden_dim)
        self.pos_encoder = PositionalEncoding(hidden_dim, dropout)
        encoder_layers = nn.TransformerEncoderLayer(
            hidden_dim, nheads, hidden_dim, dropout, batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, nlayers)

    def forward(self, src):
        src = self.input_linear(src)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src)
        return output


class Decoder(nn.Module):
    def __init__(self, hidden_dim, input_dim, nheads, nlayers, dropout=0.1):
        super(Decoder, self).__init__()
        self.pos_encoder = PositionalEncoding(hidden_dim, dropout)
        decoder_layers = nn.TransformerDecoderLayer(
            hidden_dim, nheads, hidden_dim, dropout, batch_first=True
        )
        self.transformer_decoder = nn.TransformerDecoder(decoder_layers, nlayers)
        self.decoder = nn.Linear(hidden_dim, input_dim)

    def forward(self, src):
        src = self.pos_encoder(src)
        output = self.transformer_decoder(src, src)
        output = self.decoder(output)
        return output


class TransformerVAE(nn.Module):
    def __init__(self, input_dim, hidden_dim, latent_dim, nheads, nlayers, dropout=0.1):
        super(TransformerVAE, self).__init__()
        self.encoder = Encoder(input_dim, hidden_dim, nheads, nlayers, dropout)
        self.decoder = Decoder(hidden_dim, input_dim, nheads, nlayers, dropout)
        self.fc_mu = nn.Linear(hidden_dim, latent_dim)
        self.fc_log_var = nn.Linear(hidden_dim, latent_dim)
        self.fc_out = nn.Linear(latent_dim, hidden_dim)

    def reparameterize(self, mu, log_var):
        std = torch.exp(0.5 * log_var)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, src):
        encoded = self.encoder(src)
        mu = self.fc_mu(encoded)
        log_var = self.fc_log_var(encoded)
        z = self.reparameterize(mu, log_var)
        z = self.fc_out(z)
        decoded = self.decoder(z)
        return decoded, mu, log_var

    def get_latent_val(self, src):
        encoded = self.encoder(src)
        val = self.fc_mu(encoded)
        return val


class VAE:
    def __init__(self, path: str, device=None) -> None:
        """VAEモデルを推論モードで立ち上げる

        Args:
            str: 保存したモデルのパス
            device (_type_, optional): cpuでモデルを使う場合は"cpu"を入れる. デフォルトはNone.
        """
        self.model = TransformerVAE(
            input_dim=67,
            hidden_dim=64,
            latent_dim=4,
            nheads=8,
            nlayers=8,
            dropout=0.1,
        )
        if device == "cpu":
            self.model.load_state_dict(
                torch.load(path, map_location=torch.device("cpu"))
            )
        else:
            self.model.load_state_dict(torch.load(path))
        self.model.eval()

    def transform(self, df_raw: pd.DataFrame) -> torch.Tensor:
        """VAEによる変換をする

        Args:
            df_raw (pd.DataFrame): 対象のデータフレーム

        Returns:
            torch.Tensor: 変換し１次元にしたデータ
        """
        df = df_raw.copy()
        df = df.iloc[:].astype("float32")
        # データフレームをテンソルに変換
        data = torch.tensor(df.values, dtype=torch.float32).unsqueeze(0)

        with torch.no_grad():
            encoded = self.model.get_latent_val(data)
        return torch.flatten(encoded)

    def process(self, dfs: list[pd.DataFrame]):
        processed_data = []
        for i in dfs:
            data_transform = self.transform(i)
            processed_data.append(data_transform)
        return processed_data

## 成績の処理


### スーパークラス


In [29]:
class ResultProcessor(ABC):
    @abstractmethod
    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        pass

    def read_results(path: str) -> pd.DataFrame:
        if isinstance(path, str):
            return pd.read_pickle(path)
        else:
            raise TypeError(
                f'"path" argument is expected to be of type str, got {type(path).__name__} instead'
            )

    def arrange_result(df_raw: pd.DataFrame) -> pd.DataFrame:
        """データフレームの欠損値の0埋めとカラム名の空白を消す

        Args:
            df_raw (pd.DataFrame): 対象データフレーム

        Returns:
            pd.DataFrame: 処理後のデータフレーム
        """
        df = df_raw.copy()
        # 欠損値を0埋め
        df = df.fillna(0)
        # カラム名の空白を削除
        df.columns = df.columns.str.replace(" ", "")
        df["日付"] = pd.to_datetime(df["日付"], format="%Y/%m/%d")
        return df

    def remove_str(x: any) -> str:
        """文字列から数値だけを取り出す

        Args:
            x (any): 処理する文字列

        Returns:
            str: 処理後の文字列（数字がなかった場合は"0"を返す）
        """
        x_str = str(x)
        is_contain_num = re.search(r"\d+", x_str)
        if is_contain_num:
            return is_contain_num.group()
        else:
            return "0"

    def convert_date(x: str) -> int:
        """日付をその年の1/1から数えた週数に変換する

        Args:
            x (str): 日付

        Returns:
            int: 週数
        """

        # その年の1月1日を計算
        base_date = datetime.datetime(x.year, 1, 1)
        # 週数の差を計算
        return (x - base_date).days // 7

    def transform_held(held: str) -> str:
        """開催場所の文字列から不要な文字を取り除く。中央以外は"その他"にする

        Args:
            held (str): 処理する文字列

        Returns:
            str: 処理後の文字列
        """
        trim_held = re.sub(r"\d*", "", held)
        if not trim_held in [
            "東京",
            "中山",
            "中京",
            "阪神",
            "札幌",
            "函館",
            "福島",
            "新潟",
            "京都",
            "小倉",
        ]:
            return "その他"
        return trim_held

    def transform_race_name(race: str) -> str:
        """レースのクラス分けをする。当てはまらないものは"その他"にする

        Args:
            race (str): レースクラスの文字列

        Returns:
            str: 処理後文字列
        """

        if re.search(r".*(新馬|未勝利|1勝|2勝|3勝|OP|G1|G2|G3|L).*", race):
            transform_name = re.sub(
                r".*(新馬|未勝利|1勝|2勝|3勝|OP|G1|G2|G3|L).*", r"\1", race
            )

        else:

            transform_name = "その他"
        return transform_name

    def extract_addition(df: pd.DataFrame) -> pd.DataFrame:

        weight = df["馬体重"]

        addition = weight.map(lambda x: re.sub(r".*\(([+-]\d{1,3}|0)\).*", r"\1", x))
        addition = addition.map(lambda x: re.sub(r"\+", "", x))
        return addition

    def drop_columns(df: pd.DataFrame, columns: [str]) -> pd.DataFrame:
        df_processed = df.drop(
            columns,
            axis=1,
        )

        return df_processed

    def divide_corse(df: pd.DataFrame) -> pd.DataFrame:
        df_divided = df

        df_divided["コース"] = df_divided["距離"].map(lambda x: x[0])
        df_divided["距離"] = df_divided["距離"].map(lambda x: int(x[1:]) / 100)
        return df_divided

    def divide_horse_weight(df: pd.DataFrame) -> pd.DataFrame:
        """馬体重と増減を分ける

        Args:
            df (pd.DataFrame): 対象のデータフレーム

        Returns:
            pd.DataFrame: 処理後のデータフレーム
        """
        df_divided = df

        df_divided["馬体重"] = df_divided["馬体重"].map(
            lambda x: x.replace("計不", "0(0)")
        )
        weight = df_divided["馬体重"]
        weight_addition = weight.map(
            lambda x: re.sub(r".*\(([+-]\d{1,3}|0)\).*", r"\1", x)
        )
        weight_addition = weight_addition.map(lambda x: re.sub(r"\+", "", x))
        df_divided["増減"] = weight_addition
        df_divided["馬体重"] = df_divided["馬体重"].map(
            lambda x: re.sub(r"\([+-]*\d+\)", "", x)
        )

        return df_divided

    def add_rows(df_raw: pd.DataFrame, rows: int) -> pd.DataFrame:
        """行の補填をする

        Args:
            df_raw (pd.DataFrame): 対象データフレーム
            rows (int): 補填する行数

        Returns:
            pd.DataFrame: 処理後のデータフレーム
        """
        df = df_raw.copy()
        df = pd.concat(
            [
                df,
                pd.DataFrame(np.zeros((rows, len(df.columns))), columns=df.columns),
            ],
            ignore_index=True,
        )
        return df

### 親成績の処理


In [30]:
class PedigreeResults(ResultProcessor):
    with open("../models/pedigree_pca.pickle", "rb") as f:
        pca = pickle.load(f)

    def arrange_result(df_raw: pd.DataFrame) -> pd.DataFrame:
        """データフレームの欠損値の0埋めとカラム名の空白を消す

        Args:
            df_raw (pd.DataFrame): 対象データフレーム

        Returns:
            pd.DataFrame: 処理後のデータフレーム
        """
        df = df_raw.copy()
        # 欠損値を0埋め
        df = df.fillna(0)
        # カラム名の空白を削除
        df.columns = df.columns.str.replace(" ", "")
        # df["日付"] = pd.to_datetime(df["日付"], format="%Y/%m/%d")
        return df

    def read_results(path: str) -> [pd.DataFrame]:
        if isinstance(path, str):
            ped_results = []
            with open(path, "rb") as f:
                peds = pickle.load(f)
            for i in peds:
                df = pd.read_pickle(f"../Raw-Data/Pedigree-Results/{i}.pkl")
                ped_results.append(df)
            return ped_results
        else:
            raise TypeError(
                f'"path" argument is expected to be of type str, got {type(path).__name__} instead'
            )

    def transform_race_length(length: str | int | float) -> str:
        if isinstance(length, str):
            length = int(length)

        elif math.isnan(length):
            length = 0
        elif not (isinstance(length, int) or isinstance(length, float)):
            raise TypeError(
                f'"length" argument is expected to be of type int or str, got {type(length).__name__} instead. The value is {length}'
            )
        match length:
            case length if length < 1000:
                return "不明"
            case length if length <= 1300:
                return "S"
            case length if length <= 1899:
                return "M"
            case length if length <= 2100:
                return "I"
            case length if length <= 2700:
                return "L"
            case length if length > 2700:
                return "E"

    def delete_invalid_race(df: pd.DataFrame) -> pd.DataFrame:
        df = df.drop(index=df[df["着順"] == 0].index)
        df = df.drop(index=df[df["着順"] == "0"].index)
        return df

    def divide_corse(df: pd.DataFrame) -> pd.DataFrame:
        df_divided = df
        match_str = r"[芝ダ障]{0,1}\d{1,4}"
        if len(df_divided) <= 1:
            df_divided["コース"] = 0
            df_divided["距離"] = 0
            return df_divided
        df_divided["コース"] = df_divided["距離"].map(
            lambda x: x[0] if isinstance(x, str) else 0
        )
        df_divided["距離"] = df_divided["距離"].map(
            lambda x: int(x[1:]) if isinstance(x, str) else 0
        )
        return df_divided

    def totalling_result(df: pd.DataFrame) -> pd.DataFrame:
        """成績データを競馬場や馬場、着順等で分けて集計する

        Args:
            df (pd.DataFrame): 成績データ（時系列順）

        Returns:
            pd.DataFrame: 集計した成績データ
        """
        df_tmp: pd.DataFrame = pd.read_pickle(
            "../template/pedigree_results_template.pcl"
        )
        for _, row in df.iterrows():
            col: list[str] = [row["距離"], row["コース"]]
            if "不明" in col:
                continue
            rank: str = row["着順"] if int(row["着順"]) <= 3 else "3<"
            state: str = row["馬場"] if row["馬場"] != "不明" else "良"
            race_type: str = (
                "重賞" if row["レース名"] in ["G3", "G1", "G2"] else "非重賞"
            )
            col = f'{row["開催"]}_{race_type}_{row["距離"]}_{row["コース"]}_{state}_{rank}'
            df_tmp[col] += 1
        return df_tmp

    @classmethod
    def modify(cls, df_raw: pd.DataFrame) -> pd.DataFrame:
        df = df_raw.copy()
        df = cls.arrange_result(df)
        # 加工
        df = df[["開催", "天気", "レース名", "着順", "距離", "馬場"]]
        df = cls.divide_corse(df)
        if len(df) > 1:
            df["開催"] = df["開催"].map(cls.transform_held)
            df["レース名"] = df["レース名"].map(cls.transform_race_name)
            df["距離"] = df["距離"].map(cls.transform_race_length)
            df["着順"] = df["着順"].map(cls.remove_str)
        df = cls.delete_invalid_race(df)
        df = df.replace(0, "不明")
        return df.iloc[::-1].reset_index(drop=True)

    @classmethod
    def pca_transform(cls, df: pd.DataFrame) -> pd.DataFrame:
        df_transform = cls.pca.transform(df)
        return pd.DataFrame(df_transform)

    @classmethod
    def transform(cls, path: str) -> pd.DataFrame:
        results_raws = cls.read_results(path)
        results = []
        for i in results_raws:
            results_df = cls.modify(i)
            results.append(cls.totalling_result(results_df))
        result = pd.concat(results, axis=1)
        return cls.pca_transform(result)

    @classmethod
    def process(cls, path: [str] or str) -> pd.DataFrame:
        if isinstance(path, list):
            results = []
            for i in path:
                results.append(cls.transform(i))
            result = pd.concat(results)
            rows = 18 - len(result)
            if rows > 0:
                result = cls.add_rows(result, rows)
            return result
        elif isinstance(path, str):
            return cls.transform(path)

### 過去成績の処理


In [31]:
class HorseResult(ResultProcessor):
    max_column = 10
    columns_to_scale = ["馬体重", "増減", "斤量"]
    with open("../models/horse_result_encoder.pickle", "rb") as f:
        encoder: ce.OneHotEncoder = pickle.load(f)
    with open("../models/horse_results_scaler.pickle", "rb") as f:
        scaler: StandardScaler = pickle.load(f)

    def select_newer_race(df_raw: pd.DataFrame, date: str) -> pd.DataFrame:
        """レース日より前の日付の成績を抽出する

        Args:
            df_raw (pd.DataFrame): 対象データフレーム
            date (str): 基準にする日付

        Returns:
            pd.DataFrame: 抽出したデータ
        """
        df = df_raw.copy()
        df = df[df["日付"] < datetime.datetime.strptime(date, "%Y年%m月%d日")]
        return df

    @classmethod
    def mapping_data(cls, df_raw: pd.DataFrame) -> pd.DataFrame:
        """データフレームの各カラムの値に対する一括処理をまとめた関数

        Args:
            df_raw (pd.DataFrame): 対象データフレーム

        Returns:
            pd.DataFrame: 処理後のデータフレーム
        """
        df = df_raw.copy()
        df["日付"] = df["日付"].map(cls.convert_date)
        df["開催"] = df["開催"].map(cls.transform_held)
        df["レース名"] = df["レース名"].map(cls.transform_race_name)
        df["馬番"] = df["馬番"].map(lambda x: 0 if x > 18 else x)
        df["着順"] = df["着順"].map(cls.remove_str)
        df["馬場"] = df["馬場"].replace(0, "不明")
        df["天気"] = df["天気"].replace(0, "不明")
        return df

    @classmethod
    def transform(cls, df_raw: pd.DataFrame, date: str) -> pd.DataFrame:
        """成績データを変換する

        Args:
            df_raw (pd.DataFrame): 成績データ
            date (str): 基準の日付。これより前のレース成績だけを扱う

        Returns:
            pd.DataFrame: 変換後データ
        """
        df = df_raw.copy()
        columns = [
            "賞金",
            "厩舎ｺﾒﾝﾄ",
            "備考",
            "勝ち馬(2着馬)",
            "着差",
            "ﾀｲﾑ指数",
            "通過",
            "ペース",
            "上り",
            "馬場指数",
            "タイム",
            "映像",
            "騎手",
            "オッズ",
            "人気",
        ]
        df = cls.arrange_result(df)
        if date:
            df = cls.select_newer_race(df, date)
        df = cls.drop_columns(df, columns)
        df = cls.divide_horse_weight(df)
        df = cls.divide_corse(df)
        df = cls.mapping_data(df)
        # 型をintにする
        df = df.astype({"R": int, "枠番": int})
        # 標準化
        df[cls.columns_to_scale] = cls.scaler.transform(df[cls.columns_to_scale])
        # ダミー変数化
        df = cls.encoder.transform(df)
        # 行数の調整
        shortage_rows = cls.max_column - len(df)
        if shortage_rows > 0:
            df = cls.add_rows(df, shortage_rows)
        df = df.head(cls.max_column)
        df = df.iloc[::-1].reset_index(drop=True)
        return df

    @classmethod
    def process(
        cls, path: list[str] | str, date=None
    ) -> list[pd.DataFrame] | pd.DataFrame:
        """成績データが複数かどうかで処理を分けるための関数

        Args:
            path (list[str]orstr): 成績データのファイルパス
            date (_type_, optional): 日付。デフォルトはNone.

        Returns:
            list[pd.DataFrame] or pd.DataFrame: 変換したデータはデータフレーム単一か、リストに入れて返す
        """
        if isinstance(path, list):
            dfs = []
            for i in path:
                df_raw = cls.read_results(i)
                df = df_raw.copy()
                df = cls.transform(df, date)
                dfs.append(df)
            return dfs
        else:
            df_raw = cls.read_results(path)
            df = df_raw.copy()
            df = cls.transform(df, date)
            return df

## レース結果の処理


In [32]:
class RaceResults:
    """レース結果をデータを整形する"""

    with open("../models/race_info_encoder.pickle", "rb") as f:
        encoder = pickle.load(f)

    with open("../models/horse_info_scaler.pickle", "rb") as f:
        scaler = pickle.load(f)

    def read_df(path: str) -> pd.DataFrame:
        """データフレームの読み込み

        Args:
            path (str): pickleのパス

        Raises:
            TypeError: 引数が文字列でなければエラーを出す

        Returns:
            pd.DataFrame: 読み込んだデータフレーム
        """
        if not isinstance(path, str):
            raise TypeError(
                f'"path" argument is expected to be of type str, got {type(path).__name__} instead'
            )
        results_processed = pd.read_pickle(path)
        return results_processed

    def divide_weight_gender(df_raw: pd.DataFrame) -> pd.DataFrame:
        """性齢の値を性別と年齢に分け、馬体重も体重と増減に分ける。性別はダミー変数化する

        Args:
            df_raw (pd.DataFrame): 対象データフレーム

        Returns:
            pd.DataFrame: 変換後データフレーム
        """
        df = df_raw.copy()
        gender = df["性齢"].str[0]
        df["牡"] = gender.map(lambda x: 1 if x == "牡" else 0)
        df["牝"] = gender.map(lambda x: 1 if x == "牝" else 0)
        df["セ"] = gender.map(lambda x: 1 if x == "セ" else 0)
        df["年齢"] = df["性齢"].str[1:]
        df["体重"] = df["馬体重"].replace(
            to_replace=r"(\d+).*", value=r"\1", regex=True
        )
        df["増減"] = df["馬体重"].replace(
            to_replace=r"\d+\(\+{0,1}([-]{0,1}\d+)\)", value=r"\1", regex=True
        )
        return df

    def transform_rank(df_raw: pd.DataFrame) -> pd.DataFrame:
        """着順のデータを３着以内かどうかの値にする。(3着以内であれば1、そうでなければ0)

        Args:
            df_raw (pd.DataFrame): 対象データフレーム

        Returns:
            pd.DataFrame: 変換後データ
        """
        df = df_raw.copy()
        df["3着以内"] = df["着順"].apply(
            lambda x: 1 if isinstance(x, int) and x <= 3 else 0
        )
        return df

    def drop_columns(df_raw: pd.DataFrame, columns: [str]) -> pd.DataFrame:
        """不要なカラムを削除する

        Args:
            df_raw (pd.DataFrame): 対象データフレーム
            columns (str]): 削除するカラム名

        Returns:
            pd.DataFrame: 削除後データフレーム
        """
        df = df_raw.drop(columns=columns)
        return df

    def transform_date(date: str) -> str:
        """日付を変換して、その年の1月1日からの週数を計算する

        Args:
            date (str): 日付の文字列（%Y年%m月%d日）

        Returns:
            str: 変換後の日付文字列
        """
        # 日付の形式を変換
        date_converted = datetime.datetime.strptime(date, "%Y年%m月%d日")
        # その年の1月1日を計算
        base_date = datetime.datetime(date_converted.year, 1, 1)
        # 週数の差を計算
        return (date_converted - base_date).days // 7

    def extraction_drop_columns(
        df: pd.DataFrame, columns: [str]
    ) -> (pd.DataFrame, pd.DataFrame):
        """データフレームをカラム指定で分割する

        Args:
            df (pd.DataFrame): 対象のデータフレーム
            pd ([str]): 分割するカラム名

        Returns:
            pd.DataFrame, pd.DataFrame): 指定したカラムを抽出したデータフレームと、それを取り除いたデータフレーム
        """
        df_extraction = df.loc[:, columns]
        df_dropped = df.drop(columns=columns)
        return df_extraction, df_dropped

    def add_rows(df_raw: pd.DataFrame, rows: int) -> pd.DataFrame:
        df = df_raw.copy()
        df = pd.concat(
            [
                df,
                pd.DataFrame(np.zeros((rows, len(df.columns))), columns=df.columns),
            ],
            ignore_index=True,
        )
        return df

    @classmethod
    def adapt_race_info(cls, df_raw: pd.DataFrame) -> pd.DataFrame:
        """レース情報の日付をその年の週数に、コースの長さのスケールを1/100にする。データ型も変更する

        Args:
            df_raw (pd.DataFrame): レース情報

        Returns:
            pd.DataFrame: 変換後のデータ
        """
        df = df_raw.loc[[0], :]
        df["date"] = cls.transform_date(df.loc[0, "date"])
        df["course_length"] = float(df.loc[0, "course_length"]) / 100
        df["round"] = df["round"].astype(float)

        df = cls.encoder.transform(df)
        return df

    @classmethod
    def horse_info_transform(cls, df_raw: pd.DataFrame) -> pd.DataFrame:
        """出走馬情報の標準化と足りない行の補填、型変換をする

        Args:
            df_raw (pd.DataFrame): 出走馬情報

        Returns:
            pd.DataFrame: 変換後のデータ
        """
        df = df_raw.copy()
        columns_to_scale = ["体重", "増減"]
        df[columns_to_scale] = cls.scaler.transform(df[columns_to_scale])
        shortage_rows = 18 - len(df)
        df = cls.add_rows(df, shortage_rows)
        df["年齢"] = df["年齢"].astype(float)
        return df

    @classmethod
    def make_infos(cls, path: str) -> {pd.DataFrame or str}:
        """レース結果をレース情報、出走馬情報、出走馬ID、レース日付、正解ラベルの5個に分ける

        Args:
            path (str): レース結果ファイルのパス

        Returns:
            {pd.DataFrame or str}: dictで保存。キーはそれぞれrace,horse,ids,date,label。date以外はデータフレーム
        """
        drop_columns = [
            "馬名",
            "性齢",
            "騎手",
            "タイム",
            "着差",
            "人気",
            "調教師",
            "単勝",
            "jockey_id",
            "馬体重",
            "着順",
        ]
        race_info_columns = [
            "date",
            "round",
            "course_length",
            "course_type",
            "course_way",
            "weather",
            "state_grass",
            "state_dirt",
            "place",
            "class",
        ]
        df_raw = cls.read_df(path)
        df = df_raw.copy()
        # データの0埋めを行う
        df = df.fillna(0)
        # 馬体重のカラムについては「0(0)」で埋める
        df["馬体重"].replace(0, "0(0)", inplace=True)
        df["馬体重"].replace("計不", "0(0)", inplace=True)
        df = cls.divide_weight_gender(df)
        df = cls.transform_rank(df)
        df = cls.drop_columns(df, drop_columns)
        race_info, horse_info = cls.extraction_drop_columns(df, race_info_columns)
        horse_id, horse_info = cls.extraction_drop_columns(horse_info, ["horse_id"])

        # 標準化等の変換
        race_info = cls.adapt_race_info(race_info)
        horse_info = cls.horse_info_transform(horse_info)
        return {
            "race": race_info,
            "horse": horse_info.drop(["3着以内"], axis=1),
            "ids": list(horse_id.iloc[:, 0].values),
            "date": df_raw.loc[0, "date"],
            "label": horse_info["3着以内"],
        }

# 確認


In [100]:
# レース結果
test = RaceResults.make_infos("../Raw-Data/Race-Results/2022/01020607.pkl")

# 親成績
test_ped_paths = [f"../Raw-Data/Pedigree/{i}.pickle" for i in test["ids"]]
test_ped = PedigreeResults.process(test_ped_paths)

# 過去成績
test_results_paths = [f"../Raw-Data/Horse-Results/{i}.pkl" for i in test["ids"]]
test_results = HorseResult.process(test_results_paths, test["date"])
vae = VAE("../models/horse_result_VAE.pth", "cpu")
vae_test = vae.process(test_results)
vae_len = 18 - len(vae_test)
for _ in range(vae_len):
    array_zeros = torch.zeros(40)
    vae_test.append(array_zeros)
vae_test = torch.cat(vae_test)

race_array = test["race"].values.flatten()
race = torch.tensor(race_array, dtype=torch.float32)
horse_array = test["horse"].values.flatten()
horse = torch.tensor(horse_array, dtype=torch.float32)
ped_array = test_ped.values.flatten()
peds = torch.tensor(ped_array, dtype=torch.float32)
"""
race:1*40
horse:18*9
peds:18*44
result:1*720 (18*4*10)
"""
print(vae_test.size())
print(race.size())
print(horse.size())
print(peds.size())

torch.Size([720])
torch.Size([40])
torch.Size([162])
torch.Size([792])


# 学習用データセットの作成


メモ  
入力データの繋ぎ方は  
1.レース情報  
2.出走馬情報  
3.過去成績  
4.親成績  
の順番でする。各情報を１次元の torch,Tensor 型にして結合する


In [37]:
def df_to_tensor_1d(df_raw: pd.DataFrame) -> torch.Tensor:
    df = df_raw.copy()
    df_array = df.values.flatten()
    return torch.tensor(df_array, dtype=torch.float32)


def add_tensor(tensors: list[torch.Tensor]) -> torch.Tensor:
    tensors_tmp = tensors
    add_num = 18 - len(tensors)
    for _ in range(add_num):
        array_zeros = torch.zeros(40)
        tensors_tmp.append(array_zeros)
    return torch.cat(tensors_tmp)


def make_train_data(path: str):
    vae = VAE("../models/horse_result_VAE.pth")
    data = RaceResults.make_infos(path)

    ped_paths = [f"../Raw-Data/Pedigree/{i}.pickle" for i in data["ids"]]
    results_paths = [f"../Raw-Data/Horse-Results/{i}.pkl" for i in data["ids"]]

    ped_raw = PedigreeResults.process(ped_paths)
    result_raw = HorseResult.process(results_paths)

    vae_raw = vae.process(result_raw)
    vae_result = add_tensor(vae_raw)
    print(data["race"].size)
    print(data["horse"].size)
    print(vae_result.size())
    print(ped_raw.size)
    print(data["label"].size)
    race = df_to_tensor_1d(data["race"])
    horse = df_to_tensor_1d(data["horse"])
    peds = df_to_tensor_1d(ped_raw)
    label = df_to_tensor_1d(data["label"])
    return {
        "label": label,  # 正解ラベル
        "race": race,  # レース情報
        "horse": horse,  # 出走馬
        "results": torch.cat([vae_result, peds]),  # 成績
    }


# レース結果
# test = RaceResults.make_infos("../Raw-Data/Race-Results/2022/01020607.pkl")
test = make_train_data("../Raw-Data/Race-Results/2022/01020607.pkl")
for i in test.values():
    if isinstance(i, torch.Tensor):
        print(i.size())

40
162
torch.Size([720])
792
18
torch.Size([18])
torch.Size([40])
torch.Size([162])
torch.Size([1512])


In [7]:
def df_to_tensor_1d(df_raw: pd.DataFrame) -> torch.Tensor:
    df = df_raw.copy()
    df_array = df.values.flatten()
    return torch.tensor(df_array, dtype=torch.float32)


def add_tensor(tensors: list[torch.Tensor]) -> torch.Tensor:
    tensors_tmp = tensors
    add_num = 18 - len(tensors)
    for _ in range(add_num):
        array_zeros = torch.zeros(40)
        tensors_tmp.append(array_zeros)
    return torch.cat(tensors_tmp)


def make_train_data(path: str):
    vae = VAE("../models/horse_result_VAE.pth")
    data = RaceResults.make_infos(path)

    ped_paths = [f"../Raw-Data/Pedigree/{i}.pickle" for i in data["ids"]]
    results_paths = [f"../Raw-Data/Horse-Results/{i}.pkl" for i in data["ids"]]

    ped_raw = PedigreeResults.process(ped_paths)
    result_raw = HorseResult.process(results_paths)

    vae_raw = vae.process(result_raw)
    vae_result = add_tensor(vae_raw)

    race = df_to_tensor_1d(data["race"])
    horse = df_to_tensor_1d(data["horse"])
    peds = df_to_tensor_1d(ped_raw)
    label = df_to_tensor_1d(data["label"])
    return {
        "label": label,  # 正解ラベル
        "race": race,  # レース情報
        "horse": horse,  # 出走馬
        "results": torch.cat([vae_result, peds]),  # 成績
    }


def save_pickle(path: str, data) -> None:
    with open(path, "wb") as f:
        pickle.dump(data, f)


results_path = "../Raw-Data/Race-Results/2022/"
dir_list = os.listdir(results_path)
for i in tqdm(dir_list):
    try:
        save_file_name = i.replace(".pkl", "")
        save_path = f"../Processed-Data/Race-Results/{save_file_name}.pickle"
        if os.path.exists(save_path):
            continue
        result_path = f"{results_path}{i}"
        result_processed = make_train_data(result_path)
        save_pickle(save_path, result_processed)
    except Exception as e:
        print(i)
        raise e

100%|██████████| 3456/3456 [15:03<00:00,  3.82it/s]   


In [23]:
test = "../Processed-Data/Race-Results/01010101.pickle"
with open(test, "rb") as f:
    data = pickle.load(f)
data
t = torch.cat([data["race"], data["horse"], data["results"]])
t.size()

torch.Size([1714])

In [7]:
results_path = "../Processed-Data/Race-Results/"
dir_list = os.listdir(results_path)
data_set = []
for i in tqdm(dir_list):
    path = f"../Processed-Data/Race-Results/{i}"
    with open(path, "rb") as f:
        data = pickle.load(f)
    data_modify = {
        "data": torch.cat([data["race"], data["horse"], data["results"]]),
        "label": data["label"],
    }
    data_set.append(data_modify)

  0%|          | 0/3456 [00:00<?, ?it/s]

100%|██████████| 3456/3456 [00:01<00:00, 2998.37it/s]


In [12]:
data_set[0]

{'data': tensor([29.,  1., 18.,  ...,  0.,  0.,  0.]),
 'label': tensor([1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])}

# 学習モデル作成


## データセットクラス


In [73]:
class CustomDataSet(Dataset):

    def __init__(self, data, is_file=False):
        """
        Args:
            file_paths (list of str): 学習用データファイルのパスのリスト
        """
        self.data = data
        self.file = is_file

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if self.file:
            file_path = self.data[idx]
            with open(file_path, "rb") as f:
                data = pickle.load(f)
            inputs = torch.cat([data["race"], data["horse"], data["results"]])
            labels = data["label"]
            return inputs, labels
        else:
            data_set = self.data[idx]
            return data_set["data"], data_set["label"]

## モデル


In [78]:
class NNClassifier(nn.Module):
    def __init__(self, input_size, output_size):
        super(NNClassifier, self).__init__()
        self.fc_in = nn.Linear(input_size, 1024)  # 入力層から隠れ層へ
        self.fc1 = nn.Linear(1024, 1024)
        self.fc2 = nn.Linear(1024, 1024)
        self.fc_act = nn.Mish()
        self.fc_sig = nn.Sigmoid()
        self.fc_out = nn.Linear(1024, output_size)  # 隠れ層から出力層へ

    def forward(self, x):
        out = self.fc_in(x)
        out = self.fc_act(out)
        out = self.fc1(out)
        out = self.fc_act(out)
        out = self.fc2(out)
        out = self.fc_act(out)
        out = self.fc_out(out)
        out = self.fc_sig(out)
        return out

## 学習


### 学習データの用意


In [79]:
# ファイルで学習させるとき
results_path = "../Processed-Data/Race-Results/"
dir_list_raw = os.listdir(results_path)
dir_list = list(map(lambda x: f"../Processed-Data/Race-Results/{x}", dir_list_raw))
dataset = CustomDataSet(dir_list, is_file=True)

"""
results_path = "../Processed-Data/Race-Results/"
dir_list = os.listdir(results_path)
data_set = []
for i in tqdm(dir_list):
    path = f"../Processed-Data/Race-Results/{i}"
    with open(path, "rb") as f:
        data = pickle.load(f)
    data_modify = {
        "data": torch.cat([data["race"], data["horse"], data["results"]]),
        "label": data["label"],
    }
    data_set.append(data_modify)
dataset = CustomDataSet(dir_list)
"""

dataset_size = len(dataset)
# 分割比率を設定 (例: 訓練:検証:テスト = 70%:15%:15%)
train_size = int(dataset_size * 0.7)
val_size = int(dataset_size * 0.15)
test_size = dataset_size - train_size - val_size  # 残りをテストセットとする

print(f"train:{train_size}")
print(f"val:{val_size}")
print(f"test:{test_size}")

# データセットをランダムに分割
train_dataset, val_dataset, test_dataset = random_split(
    dataset, [train_size, val_size, test_size]
)

batch = 64
# DataLoaderを作成
train_loader = DataLoader(
    train_dataset,
    batch_size=batch,
    shuffle=True,
    pin_memory=True,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch,
    shuffle=False,
    pin_memory=True,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch,
    shuffle=False,
    pin_memory=True,
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

train:2419
val:518
test:519
cuda


### モデルの用意


In [83]:
class WeightedBCELoss(nn.Module):
    def __init__(self, pos_weight=1.0):
        super(WeightedBCELoss, self).__init__()
        self.pos_weight = pos_weight

    def forward(self, logits, targets):
        # ラベルが1の要素に対して指定された重みを適用
        weights = torch.ones_like(targets) * self.pos_weight
        weights[targets == 0] = 1.0
        # 重み付きバイナリクロスエントロピー損失の計算
        loss = F.binary_cross_entropy_with_logits(logits, targets, weight=weights)

        return loss


# 1714


model = NNClassifier(1714, 18).to(device)



criterion = WeightedBCELoss(pos_weight=6.0)


optimizer = torch.optim.AdamW(model.parameters(), lr=0.00001)

In [84]:
def check_accuracy_topk(loader, model, k=3):
    num_correct = 0
    num_samples = 0
    model.eval()

    with torch.no_grad():
        for x, y in loader:
            x = x.to(device)
            y = y.to(device)
            scores = model(x)
            # 上位k個の予測を1に、それ以外を0にする
            topk_predictions = torch.zeros_like(scores, device=device)
            topk_vals, topk_indices = scores.topk(k, dim=1)
            # 上位k個の位置に1を設定
            topk_predictions.scatter_(1, topk_indices, 1)

            # yが1の値の時のみを考慮するために、topk_predictionsとyの論理ANDを取る
            correct_predictions = topk_predictions.bool() & y.bool()
            # 正解のカウント（yが1の場合のみ）
            num_correct += correct_predictions.type(torch.float).sum().item()
            # yが1の値の総数をカウント
            num_samples += y.sum().item()

        # 正解率の計算（yが1の場合のみに基づく）
        accuracy = (num_correct / num_samples * 100) if num_samples > 0 else 0
    model.train()
    return f"Accuracy: {accuracy:.2f}%"


num_epochs = 50  # エポック数


for epoch in range(num_epochs):

    for batch_idx, (data, targets) in enumerate(train_loader):
        x = data.to(device)
        y = targets.to(device)
        scores = model(x)
        loss = criterion(scores, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if (epoch + 1) % 5 == 0:

        print(
            f"Epoch [{epoch+1}/{num_epochs}]  Loss{loss}  {check_accuracy_topk(val_loader, model)}"
        )

Epoch [5/50]  Loss1.2255194187164307  Accuracy: 23.04%
Epoch [10/50]  Loss1.1354992389678955  Accuracy: 25.56%
Epoch [15/50]  Loss1.1959128379821777  Accuracy: 25.21%
Epoch [20/50]  Loss1.1577461957931519  Accuracy: 27.03%
Epoch [25/50]  Loss1.12587571144104  Accuracy: 26.19%
Epoch [30/50]  Loss1.157365083694458  Accuracy: 28.08%
Epoch [35/50]  Loss1.0958878993988037  Accuracy: 28.57%
Epoch [40/50]  Loss1.1195775270462036  Accuracy: 28.92%
Epoch [45/50]  Loss1.0984946489334106  Accuracy: 30.60%
Epoch [50/50]  Loss1.0982002019882202  Accuracy: 30.88%


### 評価


In [None]:
def check_accuracy(loader, model):
    num_correct = 0
    num_samples = 0
    model.eval()  # モデルを評価モードに設定

    with torch.no_grad():
        for x, y in loader:
            x = x.to(device=device)
            y = y.to(device=device)
            scores = model(x, mask=None)
            _, predictions = scores.max(1)
            num_correct += (predictions == y).sum()
            num_samples += predictions.size(0)

        print(f"Accuracy: {float(num_correct)/float(num_samples)*100:.2f}%")

    model.train()  # モデルを訓練モードに戻す


check_accuracy(test_loader, model)

# 2 値分類バージョン


## データセット作成


In [22]:
test = "../Processed-Data/Race-Results/01010101.pickle"
with open(test, "rb") as f:
    data = pickle.load(f)
data
t = torch.cat([data["race"], data["horse"], data["results"]])
data["label"]
count = 1
for i in data["label"]:
    horse_num = torch.Tensor([count % 19])
    count += 1
    print(torch.cat([horse_num, t]))

tensor([ 1., 29.,  1.,  ...,  0.,  0.,  0.])
tensor([ 2., 29.,  1.,  ...,  0.,  0.,  0.])
tensor([ 3., 29.,  1.,  ...,  0.,  0.,  0.])
tensor([ 4., 29.,  1.,  ...,  0.,  0.,  0.])
tensor([ 5., 29.,  1.,  ...,  0.,  0.,  0.])
tensor([ 6., 29.,  1.,  ...,  0.,  0.,  0.])
tensor([ 7., 29.,  1.,  ...,  0.,  0.,  0.])
tensor([ 8., 29.,  1.,  ...,  0.,  0.,  0.])
tensor([ 9., 29.,  1.,  ...,  0.,  0.,  0.])
tensor([10., 29.,  1.,  ...,  0.,  0.,  0.])
tensor([11., 29.,  1.,  ...,  0.,  0.,  0.])
tensor([12., 29.,  1.,  ...,  0.,  0.,  0.])
tensor([13., 29.,  1.,  ...,  0.,  0.,  0.])
tensor([14., 29.,  1.,  ...,  0.,  0.,  0.])
tensor([15., 29.,  1.,  ...,  0.,  0.,  0.])
tensor([16., 29.,  1.,  ...,  0.,  0.,  0.])
tensor([17., 29.,  1.,  ...,  0.,  0.,  0.])
tensor([18., 29.,  1.,  ...,  0.,  0.,  0.])


In [27]:
results_path = "../Processed-Data/Race-Results/"
dir_list = os.listdir(results_path)
count = 1
for i in tqdm(dir_list):
    raw_path = f"../Processed-Data/Race-Results/{i}"
    with open(raw_path, "rb") as f:
        data = pickle.load(f)

    static_data = torch.cat([data["race"], data["horse"], data["results"]])
    for i in data["label"]:
        horse_num = torch.Tensor([count % 19])
        new_data = {"input": torch.cat([horse_num, static_data]), "label": i}
        with open(
            f"../Processed-Data/Race-Results-one-output/{count}.pickle", "wb"
        ) as f:
            pickle.dump(new_data, f)
        count += 1

100%|██████████| 3456/3456 [00:53<00:00, 65.03it/s]


## データセットクラス


In [68]:
class CustomDataSet(Dataset):

    def __init__(self, data):
        """
        Args:
            file_paths (list of str): 学習用データファイルのパスのリスト
        """
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        file_path = self.data[idx]
        with open(file_path, "rb") as f:
            data = pickle.load(f)
        return data["input"], data["label"]

## NN モデル


In [69]:
class NN(nn.Module):
    def __init__(self, input_size, output_size):
        super(NN, self).__init__()
        self.fc_in = nn.Linear(input_size, 1024)  # 入力層から隠れ層へ
        self.fc1 = nn.Linear(1024, 1024)
        self.fc_act = nn.Mish()
        self.fc_sig = nn.Sigmoid()
        self.fc_out = nn.Linear(1024, output_size)  # 隠れ層から出力層へ

    def forward(self, x):
        out = self.fc_in(x)
        out = self.fc_act(out)
        out = self.fc1(out)
        out = self.fc_act(out)
        out = self.fc_out(out)
        out = self.fc_sig(out)
        return out

In [70]:
results_path = "../Processed-Data/Race-Results-one-output/"
dir_list = os.listdir(results_path)
file_list = list(map(lambda x: f"{results_path}{x}", dir_list))
dataset = CustomDataSet(file_list)
print(dataset[0][0].size())
print(dataset[0][1])

dataset_size = len(dataset)
# 分割比率を設定 (例: 訓練:検証:テスト = 70%:15%:15%)
train_size = int(dataset_size * 0.7)
val_size = int(dataset_size * 0.15)
test_size = dataset_size - train_size - val_size  # 残りをテストセットとする

print(f"train:{train_size}")
print(f"val:{val_size}")
print(f"test:{test_size}")

# データセットをランダムに分割
train_dataset, val_dataset, test_dataset = random_split(
    dataset, [train_size, val_size, test_size]
)

batch = 64
# DataLoaderを作成
train_loader = DataLoader(
    train_dataset,
    batch_size=batch,
    shuffle=True,
    pin_memory=True,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch,
    shuffle=False,
    pin_memory=True,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch,
    shuffle=False,
    pin_memory=True,
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

torch.Size([1715])
tensor(1.)
train:43545
val:9331
test:9332
cuda


In [71]:
class WeightedBCELoss(nn.Module):
    def __init__(self, pos_weight=6):
        super(WeightedBCELoss, self).__init__()
        self.pos_weight = pos_weight

    def forward(self, input, target):
        # ラベルが1の要素に対して指定された重みを適用
        weight = target * self.pos_weight + (1.0 - target)
        loss = nn.functional.binary_cross_entropy(input, target, weight=weight)
        return loss


# 1714
model = NN(1715, 1).to(device)

criterion = WeightedBCELoss(pos_weight=6)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.00001)

In [72]:
def calculate_accuracy(scores, targets):
    # スコアが0.5以上のものを1と予測
    predictions = scores >= 0.5
    correct = (predictions == targets).float()  # 正解の予測
    accuracy = correct.sum() / len(correct)
    return accuracy


def calculate_accuracy_for_label_one(scores, targets):
    # ラベルが1のデータのみを抽出
    mask = targets == 1
    correct = ((scores >= 0.5) == targets)[mask].float()  # 正解の予測
    accuracy = correct.sum() / len(correct) if len(correct) > 0 else 0
    return accuracy


def test(loader):
    model.eval()  # モデルを評価モードに設定
    with torch.no_grad():
        total_accuracy = 0
        label_one_accuracy = 0
        num_samples = 0
        for data, targets in loader:
            x = data.to(device)
            y = targets.unsqueeze(1).to(device)
            scores = model(x)
            total_accuracy += calculate_accuracy(scores, y)
            label_one_accuracy += calculate_accuracy_for_label_one(scores, y)
            num_samples += 1
        print(
            f"Epoch [{epoch+1}/{num_epochs}]  Loss: {loss.item()}, Total Val Accuracy: {total_accuracy / num_samples}, Label 1 Val Accuracy: {label_one_accuracy / num_samples}"
        )
    model.train()  # モデルを訓練モードに戻す


num_epochs = 25  # エポック数
for epoch in range(num_epochs):

    for batch_idx, (data, targets) in enumerate(train_loader):
        x = data.to(device)
        y = targets.unsqueeze(1).to(device)
        scores = model(x)
        loss = criterion(scores, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if (epoch + 1) % 5 == 0:
        test(val_loader)

Epoch [5/25]  Loss: 1.1987590789794922, Total Val Accuracy: 0.18549807369709015, Label 1 Val Accuracy: 0.9717885851860046
Epoch [10/25]  Loss: 1.5290355682373047, Total Val Accuracy: 0.368990033864975, Label 1 Val Accuracy: 0.5739591121673584
Epoch [15/25]  Loss: 1.024174690246582, Total Val Accuracy: 0.28751805424690247, Label 1 Val Accuracy: 0.7172467708587646
Epoch [20/25]  Loss: 1.3569706678390503, Total Val Accuracy: 0.3459533452987671, Label 1 Val Accuracy: 0.5355780720710754
Epoch [25/25]  Loss: 1.1005702018737793, Total Val Accuracy: 0.4014424681663513, Label 1 Val Accuracy: 0.3859107494354248
