# パッケージ


In [1]:
import pandas as pd
import datetime
import re
import time
import os
from tqdm import tqdm
import category_encoders as ce
import pickle
import numpy as np
from sklearn.preprocessing import StandardScaler

import optuna
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import math
from sklearn.model_selection import train_test_split

# データ加工(データフレーム内)


## one-hot-encoding のモデル作成


### データの整形を行うクラス(one-hot-encoding 前のため仮)


In [69]:
class HorseProcessor:
    def remove_str(x: any) -> str:
        x_str = str(x)
        is_contain_num = re.search(r"\d+", x_str)
        if is_contain_num:
            return is_contain_num.group()
        else:
            return "0"

    def convert_date(x: str | int) -> int:
        """日付をその年の1日1月を基点とした日数に変換する
        Args:
            x (str | int): 日付(YYYY/MM/DD)
        Returns:
            int: 日数
        """
        """日付を変換して、その年の1月1日からの週数を計算する"""
        # 日付の形式を変換
        date_converted = datetime.datetime.strptime(x, "%Y/%m/%d")
        # その年の1月1日を計算
        base_date = datetime.datetime(date_converted.year, 1, 1)
        # 週数の差を計算
        return (date_converted - base_date).days // 7

    @staticmethod
    def transform_held(held: str) -> str:
        trim_held = re.sub(r"\d*", "", held)
        if not trim_held in [
            "東京",
            "中山",
            "中京",
            "阪神",
            "札幌",
            "函館",
            "福島",
            "新潟",
            "京都",
            "小倉",
        ]:
            return "その他"
        return trim_held

    @staticmethod
    def transform_race_name(race: str) -> str:
        # r"新馬|未勝利|1勝|2勝|3勝|オープン"
        if re.search(r".*(新馬|未勝利|1勝|2勝|3勝|OP|G1|G2|G3|L).*", race):
            transform_name = re.sub(
                r".*(新馬|未勝利|1勝|2勝|3勝|OP|G1|G2|G3|L).*", r"\1", race
            )
        else:
            transform_name = "その他"
        return transform_name

    @staticmethod
    def extract_addition(df: pd.DataFrame) -> pd.DataFrame:
        """体重の増減を抽出する
        Args:
            df (pd.DataFrame): 変化対象のデータ
        Returns:
            pd.DataFrame: 変換後のデータ
        """
        weight = df["馬体重"]
        addition = weight.map(lambda x: re.sub(r".*\(([+-]\d{1,3}|0)\).*", r"\1", x))
        addition = addition.map(lambda x: re.sub(r"\+", "", x))
        return addition

    @staticmethod
    def drop_columns(df: pd.DataFrame) -> pd.DataFrame:
        """不要なカラムを削除
        Args:
            df (pd.DataFrame): 成績データ
        Returns:
            pd.DataFrame: 削除後データ
        """
        df_processed = df.drop(
            [
                "賞金",
                "厩舎ｺﾒﾝﾄ",
                "備考",
                "勝ち馬(2着馬)",
                "着差",
                "ﾀｲﾑ指数",
                "通過",
                "ペース",
                "上り",
                "馬場指数",
                "タイム",
                "映像",
                "騎手",
                "オッズ",
                "人気",
            ],
            axis=1,
        )
        return df_processed

    @staticmethod
    def divide_corse(df: pd.DataFrame) -> pd.DataFrame:
        df_divided = df
        df_divided["コース"] = df_divided["距離"].map(lambda x: x[0])
        df_divided["距離"] = df_divided["距離"].map(lambda x: int(x[1:]) / 100)
        return df_divided

    @classmethod
    def divide_horse_weight(cls, df: pd.DataFrame) -> pd.DataFrame:
        """馬体重を分ける
        Args:
            df (pd.DataFrame): 加工前データ
        Returns:
            pd.DataFrame: 加工後データ
        """
        df_divided = df
        df_divided["馬体重"] = df_divided["馬体重"].map(
            lambda x: x.replace("計不", "0(0)")
        )
        weight_addition = cls.extract_addition(df_divided)
        df_divided["増減"] = weight_addition
        df_divided["馬体重"] = df_divided["馬体重"].map(
            lambda x: re.sub(r"\([+-]*\d+\)", "", x)
        )
        return df_divided

    @classmethod
    def process(cls, path):
        df_raw = pd.read_pickle(path)
        df_processed = df_raw.copy()
        # カラムの整形・削除
        df_processed.columns = df_processed.columns.str.replace(" ", "")
        df_processed = cls.drop_columns(df_processed)
        # データの変換
        df_processed = cls.divide_horse_weight(df_processed)
        df_processed["日付"] = df_processed["日付"].map(cls.convert_date)
        df_processed["開催"] = df_processed["開催"].map(cls.transform_held)
        df_processed["レース名"] = df_processed["レース名"].map(cls.transform_race_name)
        df_processed = cls.divide_corse(df_processed)
        df_processed["馬番"] = df_processed["馬番"].map(lambda x: 0 if x > 18 else x)
        df_processed["着順"] = df_processed["着順"].map(cls.remove_str)
        # 欠損値の処理
        df_processed["馬場"] = df_processed["馬場"].fillna("不明")
        df_processed["天気"] = df_processed["天気"].fillna("不明")
        df_processed = df_processed.fillna(0)
        # 型をintにする
        df = df_processed.astype({"R": int, "枠番": int})
        # 開催,天 気,レース名
        return df

In [70]:
test = "../Raw-Data/Horse-Results/2017105082.pkl"
"""
['日付', '開催', '天 気', 'R', 'レース名', '映 像', '頭 数', '枠 番', '馬 番', 'オ ッ ズ',
  '人 気', '着 順', '騎手', '斤 量', '距離', '馬 場', '馬場 指数', 'タイム', '着差', 'ﾀｲﾑ 指数',
  '通過', 'ペース', '上り', '馬体重', '厩舎 ｺﾒﾝﾄ', '備考', '勝ち馬 (2着馬)', '賞金']
"""
df = HorseProcessor.process(test)

df

Unnamed: 0,日付,開催,天気,R,レース名,頭数,枠番,馬番,着順,斤量,距離,馬場,馬体重,増減,コース
0,47,東京,曇,12,G1,18,5,9,7,58,24.0,良,514,-12,芝
1,40,京都,曇,11,G2,14,2,2,7,59,24.0,重,526,6,芝
2,25,阪神,曇,11,G1,17,4,8,8,58,22.0,良,520,0,芝
3,11,その他,晴,0,G1,15,0,15,13,57,20.0,良,0,0,ダ
4,51,中山,晴,11,G1,16,3,6,10,57,25.0,良,518,0,芝
5,47,東京,晴,12,G1,18,3,6,1,57,24.0,良,518,0,芝
6,40,阪神,曇,11,G2,14,6,10,1,56,24.0,稍,518,8,芝
7,23,東京,曇,11,3勝,15,5,9,1,56,24.0,良,510,-2,芝
8,19,東京,曇,10,3勝,12,8,11,3,57,24.0,良,512,-2,芝
9,15,中山,曇,12,3勝,13,5,7,3,55,25.0,良,514,-2,芝


### 整形して 1 つのファイルに統合


In [71]:
dir_list = os.listdir("../Raw-Data/Horse-Results/")
df_integrated = pd.DataFrame()
tmp = []
for i in tqdm(dir_list):
    df_raw = HorseProcessor.process(f"../Raw-Data/Horse-Results/{i}")
    # df = HorseProcessor.divide_data_frame(df_raw)["str"]
    # df_integrated = pd.concat([df_integrated, df_raw], axis=0)
    tmp.append(df_raw)

df_integrated = pd.concat(tmp, axis=0)
df_integrated = df_integrated.drop_duplicates()
df_integrated = df_integrated.reset_index(drop=True)
df_integrated

100%|██████████| 11557/11557 [00:44<00:00, 257.28it/s]


Unnamed: 0,日付,開催,天気,R,レース名,頭数,枠番,馬番,着順,斤量,距離,馬場,馬体重,増減,コース
0,30,新潟,晴,8,G3,14,5,7,13,60.0,32.5,良,462,2,障
1,26,福島,晴,4,OP,13,7,11,7,60.0,27.5,良,460,-8,障
2,21,新潟,小雨,4,OP,14,4,5,9,60.0,28.9,稍,468,2,障
3,18,新潟,晴,1,OP,14,6,9,8,60.0,28.9,良,466,-8,障
4,46,東京,晴,8,OP,13,8,13,0,59.0,31.1,良,474,6,障
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206761,31,札幌,晴,10,2勝,11,1,1,1,55.0,15.0,良,472,0,芝
206762,28,札幌,晴,9,1勝,9,8,9,1,55.0,18.0,良,472,0,芝
206763,6,東京,晴,11,G3,12,7,9,9,56.0,18.0,良,472,2,芝
206764,50,阪神,晴,11,G1,17,7,14,3,55.0,16.0,良,470,6,芝


### 保存


In [72]:
df_integrated.to_csv("../tmp/horse-result-tmp.csv")
df_integrated.to_pickle("../tmp/horse-result-tmp.pkl")

In [34]:
tmp_data = pd.read_pickle("../tmp/horse-result-tmp.pkl")
print(tmp_data.columns)
"""
馬場
天気
"""
tmp_data["天気"] = tmp_data["天気"].replace("不明", "晴")
tmp_data["馬場"] = tmp_data["馬場"].replace("不明", "良")
tmp_data["開催"] = tmp_data["開催"].replace("その他", "福島")
tmp_data["レース名"] = tmp_data["レース名"].replace("その他", "未勝利")
tmp_data["馬番"] = tmp_data["馬番"].replace(0, 1)
tmp_data["枠番"] = tmp_data["枠番"].replace(0, 1)
tmp_data[["枠番"]].value_counts()

Index(['日付', '開催', '天気', 'R', 'レース名', '頭数', '枠番', '馬番', '着順', '斤量', '距離', '馬場',
       '馬体重', '増減', 'コース'],
      dtype='object')


枠番
8     31761
7     30649
6     28085
5     26451
4     23965
3     23031
2     21803
1     21021
Name: count, dtype: int64

### one-hot-encoding のモデル作成


In [67]:
tmp_data = pd.read_pickle("../tmp/horse-result-tmp.pkl")
tmp_data["天気"] = tmp_data["天気"].replace("不明", "晴")
tmp_data["馬場"] = tmp_data["馬場"].replace("不明", "良")
tmp_data["開催"] = tmp_data["開催"].replace("その他", "福島")
tmp_data["レース名"] = tmp_data["レース名"].replace("その他", "未勝利")
tmp_data["馬番"] = tmp_data["馬番"].replace(0, 1)
tmp_data["枠番"] = tmp_data["枠番"].replace(0, 1)

encoder = ce.OneHotEncoder(
    cols=["開催", "天気", "レース名", "馬場", "コース", "枠番", "馬番"],
    handle_unknown="value",
    use_cat_names=True,
)


data_one_hot = encoder.fit_transform(tmp_data)


df_one_hot = pd.DataFrame(data_one_hot)


df_one_hot.columns

Index(['日付', '開催_新潟', '開催_福島', '開催_東京', '開催_中京', '開催_小倉', '開催_京都', '開催_中山',
       '開催_札幌', '開催_函館', '開催_阪神', '天気_晴', '天気_小雨', '天気_曇', '天気_雨', '天気_小雪',
       '天気_雪', 'R', 'レース名_G3', 'レース名_OP', 'レース名_未勝利', 'レース名_新馬', 'レース名_G1',
       'レース名_G2', 'レース名_3勝', 'レース名_2勝', 'レース名_L', 'レース名_1勝', '頭数', '枠番_5.0',
       '枠番_7.0', '枠番_4.0', '枠番_6.0', '枠番_8.0', '枠番_1.0', '枠番_3.0', '枠番_2.0',
       '馬番_7.0', '馬番_11.0', '馬番_5.0', '馬番_9.0', '馬番_13.0', '馬番_1.0', '馬番_10.0',
       '馬番_12.0', '馬番_4.0', '馬番_14.0', '馬番_8.0', '馬番_6.0', '馬番_3.0', '馬番_2.0',
       '馬番_16.0', '馬番_15.0', '馬番_17.0', '馬番_18.0', '着順', '斤量', '距離', '馬場_良',
       '馬場_稍', '馬場_重', '馬場_不', '馬体重', '増減', 'コース_障', 'コース_ダ', 'コース_芝'],
      dtype='object')

In [68]:
with open("../models/horse_result_encoder.pickle", "wb") as f:
    pickle.dump(encoder, f)

In [69]:
tmp_data = pd.read_pickle("../tmp/horse-result-tmp.pkl")
tmp_data["馬番"].value_counts()

馬番
5     16781
2     16719
1     16680
6     16644
4     16639
7     16628
3     16626
8     15989
9     14858
10    13458
11    11341
12     9662
13     7383
14     6469
15     5276
16     4088
17      837
18      686
0         2
Name: count, dtype: int64

In [70]:
tmp_data = pd.read_pickle("../tmp/horse-result-tmp.pkl")
tmp_data["天気"] = tmp_data["天気"].replace("不明", "晴")
tmp_data["馬場"] = tmp_data["馬場"].replace("不明", "良")
tmp_data["開催"] = tmp_data["開催"].replace("その他", "福島")
tmp_data["レース名"] = tmp_data["レース名"].replace("その他", "未勝利")
tmp_data["馬番"] = tmp_data["馬番"].replace(0, 1)
tmp_data["枠番"] = tmp_data["枠番"].replace(0, 1)


scaler = StandardScaler()

columns_to_scale = ["馬体重", "増減", "斤量"]

tmp_data[columns_to_scale] = scaler.fit_transform(tmp_data[columns_to_scale])

tmp_data

Unnamed: 0,日付,開催,天気,R,レース名,頭数,枠番,馬番,着順,斤量,距離,馬場,馬体重,増減,コース
0,30,新潟,晴,8,G3,14,5,7,13,2.882668,32.5,良,-0.123278,0.206988,障
1,26,福島,晴,4,OP,13,7,11,7,2.882668,27.5,良,-0.164349,-1.235823,障
2,21,新潟,小雨,4,OP,14,4,5,9,2.882668,28.9,稍,-0.000064,0.206988,障
3,18,新潟,晴,1,OP,14,6,9,8,2.882668,28.9,良,-0.041135,-1.235823,障
4,46,東京,晴,8,OP,13,8,13,0,2.303276,31.1,良,0.123150,0.784113,障
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206761,31,札幌,晴,10,2勝,11,1,1,1,-0.014292,15.0,良,0.082078,-0.081574,芝
206762,28,札幌,晴,9,1勝,9,8,9,1,-0.014292,18.0,良,0.082078,-0.081574,芝
206763,6,東京,晴,11,G3,12,7,9,9,0.565100,18.0,良,0.082078,0.206988,芝
206764,50,阪神,晴,11,G1,17,7,14,3,-0.014292,16.0,良,0.041007,0.784113,芝


In [71]:
with open("../models/horse_results_scaler.pickle", "wb") as f:
    pickle.dump(scaler, f)

## 学習用のデータにする


### クラス


In [15]:
class HorseProcessor:
    max_rows = 10
    columns_to_scale = ["馬体重", "増減", "斤量"]
    with open("../models/horse_result_encoder.pickle", "rb") as f:
        encoder: ce.OneHotEncoder = pickle.load(f)
    with open("../models/horse_results_scaler.pickle", "rb") as f:
        scaler: StandardScaler = pickle.load(f)

    def remove_str(x: any) -> str:
        x_str = str(x)
        is_contain_num = re.search(r"\d+", x_str)
        if is_contain_num:
            return is_contain_num.group()
        else:
            return "0"

    def convert_date(x: str | int) -> int:
        """日付をその年の1日1月を基点とした週数に変換する

        Args:
            x (str | int): 日付(YYYY/MM/DD)

        Returns:
            int: 日数
        """
        # 日付の形式を変換
        date_converted = datetime.datetime.strptime(x, "%Y/%m/%d")
        # その年の1月1日を計算
        base_date = datetime.datetime(date_converted.year, 1, 1)
        # 週数の差を計算
        return (date_converted - base_date).days // 7

    @staticmethod
    def __transform_held(held: str) -> str:
        trim_held = re.sub(r"\d*", "", held)
        if not trim_held in [
            "東京",
            "中山",
            "中京",
            "阪神",
            "札幌",
            "函館",
            "福島",
            "新潟",
            "京都",
            "小倉",
        ]:
            return "その他"
        return trim_held

    @staticmethod
    def __transform_race_name(race: str) -> str:
        # r"新馬|未勝利|1勝|2勝|3勝|オープン"
        if re.search(r".*(新馬|未勝利|1勝|2勝|3勝|OP|G1|G2|G3|L).*", race):
            transform_name = re.sub(
                r".*(新馬|未勝利|1勝|2勝|3勝|OP|G1|G2|G3|L).*", r"\1", race
            )
        else:
            transform_name = "その他"
        return transform_name

    @staticmethod
    def __extract_addition(df: pd.DataFrame) -> pd.DataFrame:
        """体重の増減を抽出する

        Args:
            df (pd.DataFrame): 変化対象のデータ

        Returns:
            pd.DataFrame: 変換後のデータ
        """
        weight = df["馬体重"]
        addition = weight.map(lambda x: re.sub(r".*\(([+-]\d{1,3}|0)\).*", r"\1", x))
        addition = addition.map(lambda x: re.sub(r"\+", "", x))
        return addition

    @staticmethod
    def __drop_columns(df: pd.DataFrame) -> pd.DataFrame:
        """不要なカラムを削除

        Args:
            df (pd.DataFrame): 成績データ

        Returns:
            pd.DataFrame: 削除後データ
        """
        df_processed = df.drop(
            [
                "賞金",
                "厩舎ｺﾒﾝﾄ",
                "備考",
                "勝ち馬(2着馬)",
                "着差",
                "ﾀｲﾑ指数",
                "通過",
                "ペース",
                "上り",
                "馬場指数",
                "タイム",
                "映像",
                "騎手",
                "オッズ",
                "人気",
            ],
            axis=1,
        )
        return df_processed

    @staticmethod
    def __divide_corse(df: pd.DataFrame) -> pd.DataFrame:
        df_divided = df
        df_divided["コース"] = df_divided["距離"].map(lambda x: x[0])
        df_divided["距離"] = df_divided["距離"].map(lambda x: int(x[1:]) / 100)
        return df_divided

    @classmethod
    def __divide_horse_weight(cls, df: pd.DataFrame) -> pd.DataFrame:
        """馬体重を分ける

        Args:
            df (pd.DataFrame): 加工前データ

        Returns:
            pd.DataFrame: 加工後データ
        """
        df_divided = df
        df_divided["馬体重"] = df_divided["馬体重"].map(
            lambda x: x.replace("計不", "0(0)")
        )
        weight_addition = cls.__extract_addition(df_divided)
        df_divided["増減"] = weight_addition
        df_divided["馬体重"] = df_divided["馬体重"].map(
            lambda x: re.sub(r"\([+-]*\d+\)", "", x)
        )
        return df_divided

    @classmethod
    def process(cls, path):
        df_raw = pd.read_pickle(path).head(cls.max_rows)
        df_processed = df_raw.copy()
        # カラム名の空白を削除
        df_processed.columns = df_processed.columns.str.replace(" ", "")
        df_processed = cls.__drop_columns(df_processed)
        df_processed = cls.__divide_horse_weight(df_processed)
        df_processed["日付"] = df_processed["日付"].map(cls.convert_date)
        df_processed["開催"] = df_processed["開催"].map(cls.__transform_held)
        df_processed["レース名"] = df_processed["レース名"].map(
            cls.__transform_race_name
        )
        df_processed = cls.__divide_corse(df_processed)
        # df_processed["距離"] = df_processed["距離"].map(lambda x: int(x) / 100)
        df_processed["馬番"] = df_processed["馬番"].map(lambda x: 0 if x > 18 else x)
        df_processed["着順"] = df_processed["着順"].map(cls.remove_str)
        # 欠損値の処理
        df_processed["馬場"] = (
            df_processed["馬場"].fillna("不明").infer_objects(copy=False)
        )
        df_processed["天気"] = (
            df_processed["天気"].fillna("不明").infer_objects(copy=False)
        )
        df_processed = df_processed.fillna(0).infer_objects(copy=False)
        # 型をintにする
        df = df_processed.astype({"R": int, "枠番": int})
        # 標準化
        df[cls.columns_to_scale] = cls.scaler.transform(df[cls.columns_to_scale])
        # ダミー変数化
        df = cls.encoder.transform(df)
        # 行数を調整
        if len(df) < cls.max_rows:
            rows_to_add = cls.max_rows - len(df)
            # すべての項目が0の行を作成
            additional_rows = pd.DataFrame(
                np.zeros((rows_to_add, len(df.columns))), columns=df.columns
            )
            # 追加の行をDataFrameに結合
            df = pd.concat([df, additional_rows], ignore_index=True)
        return df.iloc[::-1].reset_index(drop=True)

In [16]:
test = "../Raw-Data/Horse-Results/2012101505.pkl"
df = HorseProcessor.process(test)


df

Unnamed: 0,日付,開催_新潟,開催_福島,開催_東京,開催_中京,開催_小倉,開催_京都,開催_中山,開催_札幌,開催_函館,...,距離,馬場_良,馬場_稍,馬場_重,馬場_不,馬体重,増減,コース_障,コース_ダ,コース_芝
0,27,0,0,0,0,1,0,0,0,0,...,17.0,0,0,1,0,0.78029,1.361237,0,1,0
1,34,0,0,0,0,1,0,0,0,0,...,28.6,1,0,0,0,0.739218,-0.370136,1,0,0
2,37,0,0,0,0,0,0,1,0,0,...,28.8,1,0,0,0,0.492791,-1.812948,1,0,0
3,3,0,0,0,0,1,0,0,0,0,...,28.6,1,0,0,0,0.492791,-0.081574,1,0,0
4,11,0,0,0,0,0,0,0,0,0,...,14.0,0,0,0,1,0.841896,2.371205,0,1,0
5,14,0,0,0,0,0,0,0,0,0,...,14.0,1,0,0,0,0.862432,0.062707,0,1,0
6,16,0,0,0,0,0,0,0,0,0,...,13.0,0,0,1,0,0.924039,0.351269,0,1,0
7,20,0,0,0,0,0,0,0,0,0,...,14.0,0,0,1,0,0.985646,0.351269,0,1,0
8,23,0,0,0,0,0,0,0,0,0,...,13.0,0,0,0,1,1.006181,0.062707,0,1,0
9,25,0,0,0,0,0,0,0,0,0,...,16.0,0,0,0,1,1.006181,-0.081574,0,1,0


### AE 学習用データの作成


In [17]:
dir_list = os.listdir("../Raw-Data/Horse-Results/")
df_integrated = pd.DataFrame()
for i in tqdm(dir_list):
    df = HorseProcessor.process(f"../Raw-Data/Horse-Results/{i}")
    df.to_pickle(f"../Processed-Data/Horse-Results/{i}")

  0%|          | 0/11557 [00:00<?, ?it/s]

100%|██████████| 11557/11557 [03:22<00:00, 56.95it/s]


# AE 作成


## モデル


### 位置エンコーディング


In [2]:
class PositionalEncoding(nn.Module):
    def __init__(self, hidden_dim, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, hidden_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)

        # div_termの計算
        div_term = torch.exp(
            torch.arange(0, hidden_dim, 2).float() * (-math.log(10000.0) / hidden_dim)
        )

        # position * div_term のサイズ調整
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer("pe", pe)

    def forward(self, x):
        x = x + self.pe[: x.size(0), :]
        return self.dropout(x)

### エンコーダ


In [3]:
class Encoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, nheads, nlayers, dropout=0.1):
        super(Encoder, self).__init__()
        self.input_linear = nn.Linear(input_dim, hidden_dim)
        self.pos_encoder = PositionalEncoding(hidden_dim, dropout)
        encoder_layers = nn.TransformerEncoderLayer(
            hidden_dim, nheads, hidden_dim, dropout, batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, nlayers)

    def forward(self, src):
        src = self.input_linear(src)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src)
        return output

### デコーダ


In [4]:
class Decoder(nn.Module):
    def __init__(self, hidden_dim, input_dim, nheads, nlayers, dropout=0.1):
        super(Decoder, self).__init__()
        self.pos_encoder = PositionalEncoding(hidden_dim, dropout)
        decoder_layers = nn.TransformerDecoderLayer(
            hidden_dim, nheads, hidden_dim, dropout, batch_first=True
        )
        self.transformer_decoder = nn.TransformerDecoder(decoder_layers, nlayers)
        self.decoder = nn.Linear(hidden_dim, input_dim)

    def forward(self, src):
        src = self.pos_encoder(src)
        output = self.transformer_decoder(src, src)
        output = self.decoder(output)
        return output

### VAE モデル


input_dim: 入力データの特徴量の次元数。この例では、各時系列データポイントが 25 個の特徴を持っていることを意味します。  
hidden_dim: Transformer モデル内の隠れ層の次元数。これは、モデル内部の各セルフアテンション層や全結合層のサイズを決定します。  
latent_dim: 潜在空間の次元数。VAE のエンコーダが出力する潜在変数の次元数です。  
nheads: マルチヘッドアテンションの「ヘッド」の数。これは、セルフアテンションを並列に行う際の分割数を指します。  
nlayers: Transformer 内のエンコーダ（およびデコーダ）層の数。モデルの深さを決定します。  
dropout: ドロップアウト率。過学習を防ぐために層間でランダムにノードを無効化する割合。


In [5]:
class TransformerVAE(nn.Module):
    def __init__(self, input_dim, hidden_dim, latent_dim, nheads, nlayers, dropout=0.1):
        super(TransformerVAE, self).__init__()
        self.encoder = Encoder(input_dim, hidden_dim, nheads, nlayers, dropout)
        self.decoder = Decoder(hidden_dim, input_dim, nheads, nlayers, dropout)
        self.fc_mu = nn.Linear(hidden_dim, latent_dim)
        self.fc_log_var = nn.Linear(hidden_dim, latent_dim)
        self.fc_out = nn.Linear(latent_dim, hidden_dim)

    def reparameterize(self, mu, log_var):
        std = torch.exp(0.5 * log_var)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, src):
        encoded = self.encoder(src)
        mu = self.fc_mu(encoded)
        log_var = self.fc_log_var(encoded)
        z = self.reparameterize(mu, log_var)
        z = self.fc_out(z)
        decoded = self.decoder(z)
        return decoded, mu, log_var

    def get_latent_val(self, src):
        encoded = self.encoder(src)
        val = self.fc_mu(encoded)
        return val

### VAE の損失関数


In [6]:
def vae_loss(recon_x, x, mu, log_var):
    recon_loss = F.mse_loss(recon_x, x, reduction="sum")
    kl_div = -0.5 * torch.sum(1 + log_var - mu.pow(2) - log_var.exp())
    return recon_loss + kl_div

### データセットクラス


In [7]:
class TimeSeriesDataset(Dataset):
    def __init__(self, file_paths):
        self.file_paths = file_paths

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        # CSVファイルからデータフレームを読み込む
        df = pd.read_pickle(self.file_paths[idx])
        df = df.astype("float32")
        # データフレームをテンソルに変換
        data_tensor = torch.tensor(df.values, dtype=torch.float32)
        return data_tensor

In [8]:
"""
Index(['日付', '開催_新潟', '開催_福島', '開催_東京', '開催_中京', '開催_小倉', '天気_晴', '天気_小雨',
       '天気_曇', 'R', 'レース名_G3', 'レース名_OP', '頭数', '枠番', '馬番', '着順', '斤量', '距離',
       '馬場_良', '馬場_稍', '馬体重', '増減', 'コース_障'],
      dtype='object')
"""

test_df = pd.read_pickle("../Processed-Data/Horse-Results/2011101814.pkl")

# データフレームの全ての値をfloat32型に変換
test_df = test_df.astype("float32")
# データフレームをテンソルに変換
data_tensor = torch.tensor(test_df.values, dtype=torch.float32)
data_tensor.size()

torch.Size([10, 67])

### 学習


#### optuna でハイパーパラメータ探索


In [9]:
file_names = os.listdir("../Processed-Data/Horse-Results/")
file_paths = list(map(lambda x: "../Processed-Data/Horse-Results/" + x, file_names))
train_paths, test_paths = train_test_split(file_paths, test_size=0.3)
train_paths, val_paths = train_test_split(train_paths, test_size=0.2)

# カスタムデータセットのインスタンス化
train_dataset = TimeSeriesDataset(train_paths)
val_dataset = TimeSeriesDataset(val_paths)
test_dataset = TimeSeriesDataset(test_paths)

# データローダーの設定
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [11]:
def objective(trial):
    # ハイパーパラメータの提案
    lr = trial.suggest_float("lr", 1e-5, 1e-2, log=True)
    hidden_dim = trial.suggest_categorical("hidden_dim", [8 * i for i in range(2, 13)])
    latent_dim = trial.suggest_int("latent_dim", 2, 20, log=True)

    # モデルとオプティマイザの設定
    model = TransformerVAE(
        input_dim=67,
        hidden_dim=hidden_dim,
        latent_dim=latent_dim,
        nheads=8,
        nlayers=8,
        dropout=0.1,
    )
    model.to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr)

    # 訓練ループ
    for epoch in range(15):  # エポック数は適宜調整
        model.train()
        for batch in train_loader:
            batch = batch.to(device)
            optimizer.zero_grad()
            recon_batch, mu, log_var = model(batch)
            loss = vae_loss(recon_batch, batch, mu, log_var)  # あなたの損失関数を使用
            loss.backward()
            optimizer.step()

    # 検証データセットでの性能評価
    # ここでは、簡単化のために最後の訓練損失を使用しています
    return loss.item()

In [12]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)  # 試行回数は適宜調整

# 最適なハイパーパラメータを取得
best_params = study.best_params
best_value = study.best_trial.value
print("最適なハイパーパラメータ:", best_params)
print("最適な試行の値:", best_value)

[I 2024-02-23 17:58:49,557] A new study created in memory with name: no-name-4de8b5d2-0f07-4bf7-9473-ffa31d2885d6
[I 2024-02-23 18:01:54,661] Trial 0 finished with value: 14289.3671875 and parameters: {'lr': 0.0005848000583139613, 'hidden_dim': 32, 'latent_dim': 2}. Best is trial 0 with value: 14289.3671875.
[I 2024-02-23 18:04:31,161] Trial 1 finished with value: 24731.24609375 and parameters: {'lr': 0.0014449486078287315, 'hidden_dim': 88, 'latent_dim': 12}. Best is trial 0 with value: 14289.3671875.
[I 2024-02-23 18:07:55,443] Trial 2 finished with value: 4884.84375 and parameters: {'lr': 0.00039184798884231393, 'hidden_dim': 32, 'latent_dim': 3}. Best is trial 2 with value: 4884.84375.
[I 2024-02-23 18:10:53,298] Trial 3 finished with value: 66491.4609375 and parameters: {'lr': 2.3855165476421653e-05, 'hidden_dim': 40, 'latent_dim': 15}. Best is trial 2 with value: 4884.84375.
[I 2024-02-23 18:13:44,336] Trial 4 finished with value: 24170.513671875 and parameters: {'lr': 0.00220969

最適なハイパーパラメータ: {'lr': 0.0003321518308021874, 'hidden_dim': 64, 'latent_dim': 4}
最適な試行の値: 2330.4013671875


最適なハイパーパラメータ: {'lr': 0.0003105892254637004, 'hidden_dim': 72, 'latent_dim': 10}  
最適な試行の値: 3516.364501953125  
最適なハイパーパラメータ: {'lr': 0.00018391539970447082, 'hidden_dim': 88, 'latent_dim': 4}  
最適な試行の値: 3377.569580078125  
{'lr': 0.00013470490850046185, 'hidden_dim': 88, 'latent_dim': 16}. Best is trial 0 with value: 2913.029052734375.  
最適なハイパーパラメータ: {'lr': 0.0003321518308021874, 'hidden_dim': 64, 'latent_dim': 4}  
最適な試行の値: 2330.4013671875


In [21]:
"""{'lr': 0.00015207365301991906, 'hidden_dim': 120, 'latent_dim': 23}
value: 885.03564453125.

{'lr': 0.0003215171404708647, 'hidden_dim': 48, 'latent_dim': 24}
"""

best_params = study.best_params
best_value = study.best_trial.value
print("最適なハイパーパラメータ:", best_params)
print("最適な試行の値:", best_value)
best_params

最適なハイパーパラメータ: {'lr': 0.0003105892254637004, 'hidden_dim': 72, 'latent_dim': 10}
最適な試行の値: 3516.364501953125


{'lr': 0.0003105892254637004, 'hidden_dim': 72, 'latent_dim': 10}

#### 本学習


In [12]:
file_names = os.listdir("../Processed-Data/Horse-Results/")
file_paths = list(map(lambda x: "../Processed-Data/Horse-Results/" + x, file_names))
train_paths, test_paths = train_test_split(file_paths, test_size=0.3)
train_paths, val_paths = train_test_split(train_paths, test_size=0.2)

# カスタムデータセットのインスタンス化
train_dataset = TimeSeriesDataset(train_paths)
val_dataset = TimeSeriesDataset(val_paths)
test_dataset = TimeSeriesDataset(test_paths)

# データローダーの設定
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# GPUが使える場合はGPU上で動かす
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# モデルのインスタンス化
model = TransformerVAE(
    input_dim=67,
    hidden_dim=64,
    latent_dim=4,
    nheads=8,
    nlayers=8,
    dropout=0.1,
).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0003321518308021874)

# エポック数
num_epochs = 1000
# 評価を行うエポック数
eval_interval = 100

# 訓練
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        recon_batch, mu, log_var = model(batch)
        loss = vae_loss(recon_batch, batch, mu, log_var)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
    # 一定のエポック数ごとに検証データセットでモデルを評価
    if epoch % eval_interval == 0 or epoch == num_epochs - 1:
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for val_batch in val_loader:
                val_batch = val_batch.to(device)
                recon_batch, mu, log_var = model(val_batch)
                loss = vae_loss(recon_batch, val_batch, mu, log_var)
                val_loss += loss.item()

        val_loss /= len(val_loader.dataset)
        print(
            f"Epoch {epoch}, Train Loss: {train_loss / len(train_loader.dataset)}, Val Loss: {val_loss}"
        )

Epoch 0, Train Loss: 9260.335116529517, Val Loss: 7096.69003012979
Epoch 100, Train Loss: 274.3910008128211, Val Loss: 232.7100547913126
Epoch 200, Train Loss: 193.23354074911143, Val Loss: 185.0399320812249
Epoch 300, Train Loss: 175.26785978654164, Val Loss: 172.33336044330386
Epoch 400, Train Loss: 170.4574132125541, Val Loss: 166.54922949339462
Epoch 500, Train Loss: 167.9102824327106, Val Loss: 166.1536800351514
Epoch 600, Train Loss: 170.4832520323547, Val Loss: 163.441719800197
Epoch 700, Train Loss: 164.41084558601597, Val Loss: 162.66561425660538
Epoch 800, Train Loss: 165.27810842967662, Val Loss: 162.05711713680083
Epoch 900, Train Loss: 167.21875290508856, Val Loss: 161.9586095633112
Epoch 999, Train Loss: 162.31711611027808, Val Loss: 159.79968276807787


In [13]:
model.eval()
test_loss = 0
with torch.no_grad():
    for test_batch in test_loader:
        test_batch = test_batch.to(device)
        recon_batch, mu, log_var = model(test_batch)
        loss = vae_loss(recon_batch, test_batch, mu, log_var)
        test_loss += loss.item()

test_loss /= len(test_loader.dataset)
print(f"Test Loss: {test_loss}.")

Test Loss: 160.59897741121955.


#### 保存


In [14]:
torch.save(model.state_dict(), "../models/horse_result_VAE.pth")

## 確認(実際に変換してみる)


horse_result_VAE1.pth : input_dim=67,hidden_dim=64,latent_dim=4,nheads=8,nlayers=8,dropout=0.1,


In [16]:
model = TransformerVAE(
    input_dim=67,
    hidden_dim=64,
    latent_dim=4,
    nheads=8,
    nlayers=8,
    dropout=0.1,
)
model.load_state_dict(torch.load("../models/horse_result_VAE.pth"))
model.eval()
test_df = pd.read_pickle("../Processed-Data/Horse-Results/2011101814.pkl")
test_df = test_df.iloc[:].astype("float32")
# データフレームをテンソルに変換
data = torch.tensor(test_df.values, dtype=torch.float32).unsqueeze(0)

with torch.no_grad():
    encoded = model.get_latent_val(data)

print(data.size())
print(encoded.size())
encoded = torch.flatten(encoded)
print(encoded.size())
encoded

torch.Size([1, 10, 67])
torch.Size([1, 10, 4])
torch.Size([40])


tensor([ 1.3240,  1.1904, -1.2016,  1.0077,  0.5459,  1.3108, -0.5373,  2.0632,
        -0.6467,  1.4909, -0.3631,  1.0998, -1.1117,  0.8077,  0.9746,  2.0135,
         2.5587,  0.8441, -0.3512,  2.1659,  0.5870,  1.3265, -0.4426,  1.1715,
         0.1295,  0.7271, -0.2655,  1.0376, -0.9515,  0.5080, -1.4459,  1.2903,
        -1.6778,  1.1907, -1.3257,  3.1523,  0.1215,  0.7278,  0.6286,  1.0269])