# パッケージ


In [1]:
import pandas as pd
import datetime
import re
import time
import os
from tqdm import tqdm
import category_encoders as ce
import pickle
import numpy as np
from sklearn.preprocessing import StandardScaler

import optuna
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import math
from sklearn.model_selection import train_test_split

# データ加工(データフレーム内)


## one-hot-encoding のモデル作成


### データの整形(仮)


In [28]:
class HorseProcessor:
    def remove_str(x: any) -> str:
        x_str = str(x)
        is_contain_num = re.search(r"\d+", x_str)
        if is_contain_num:
            return is_contain_num.group()
        else:
            return "0"

    def convert_date(x: str | int) -> int:
        """日付をその年の1日1月を基点とした日数に変換する
        Args:
            x (str | int): 日付(YYYY/MM/DD)
        Returns:
            int: 日数
        """
        year = re.sub(r"(\d{1,4})/.*/.*", r"\1", x)
        new_year = datetime.datetime(int(year), 1, 1)
        unix_year = int(time.mktime(new_year.timetuple()))
        date_format = datetime.datetime.strptime(x, "%Y/%m/%d")
        unix_date = int(time.mktime(date_format.timetuple()))
        return (unix_date - unix_year) // 86400

    @staticmethod
    def __transform_held(held: str) -> str:
        trim_held = re.sub(r"\d*", "", held)
        if not trim_held in [
            "東京",
            "中山",
            "中京",
            "阪神",
            "札幌",
            "函館",
            "福島",
            "新潟",
            "京都",
            "小倉",
        ]:
            return "その他"
        return trim_held

    @staticmethod
    def __transform_race_name(race: str) -> str:
        # r"新馬|未勝利|1勝|2勝|3勝|オープン"
        if re.search(r".*(新馬|未勝利|1勝|2勝|3勝|OP|G1|G2|G3|L).*", race):
            transform_name = re.sub(r".*(新馬|未勝利|1勝|2勝|3勝|OP|G1|G2|G3|L).*", r"\1", race)
        else:
            transform_name = "その他"
        return transform_name

    @staticmethod
    def __extract_addition(df: pd.DataFrame) -> pd.DataFrame:
        """体重の増減を抽出する
        Args:
            df (pd.DataFrame): 変化対象のデータ
        Returns:
            pd.DataFrame: 変換後のデータ
        """
        weight = df["馬体重"]
        addition = weight.map(lambda x: re.sub(r".*\(([+-]\d{1,3}|0)\).*", r"\1", x))
        addition = addition.map(lambda x: re.sub(r"\+", "", x))
        return addition

    @staticmethod
    def __drop_columns(df: pd.DataFrame) -> pd.DataFrame:
        """不要なカラムを削除
        Args:
            df (pd.DataFrame): 成績データ
        Returns:
            pd.DataFrame: 削除後データ
        """
        df_processed = df.drop(
            [
                "賞金",
                "厩舎 ｺﾒﾝﾄ",
                "備考",
                "勝ち馬 (2着馬)",
                "着差",
                "ﾀｲﾑ 指数",
                "通過",
                "ペース",
                "上り",
                "馬場 指数",
                "タイム",
                "映 像",
                "騎手",
                "オ ッ ズ",
                "人 気",
            ],
            axis=1,
        )
        return df_processed

    @staticmethod
    def __divide_corse(df: pd.DataFrame) -> pd.DataFrame:
        df_divided = df
        df_divided["コース"] = df_divided["距離"].map(lambda x: x[0])
        df_divided["距離"] = df_divided["距離"].map(lambda x: x[1:])
        return df_divided

    @classmethod
    def __divide_horse_weight(cls, df: pd.DataFrame) -> pd.DataFrame:
        """馬体重を分ける
        Args:
            df (pd.DataFrame): 加工前データ
        Returns:
            pd.DataFrame: 加工後データ
        """
        df_divided = df
        df_divided["馬体重"] = df_divided["馬体重"].map(lambda x: x.replace("計不", "0(0)"))
        weight_addition = cls.__extract_addition(df_divided)
        df_divided["増減"] = weight_addition
        df_divided["馬体重"] = df_divided["馬体重"].map(
            lambda x: re.sub(r"\([+-]*\d+\)", "", x)
        )
        return df_divided

    @classmethod
    def process(cls, path):
        df_raw = pd.read_pickle(path)
        df_processed = cls.__drop_columns(df_raw)
        df_processed = cls.__divide_horse_weight(df_processed)
        df_processed["日付"] = df_processed["日付"].map(cls.convert_date)
        df_processed["開催"] = df_processed["開催"].map(cls.__transform_held)
        df_processed["レース名"] = df_processed["レース名"].map(cls.__transform_race_name)
        df_processed = cls.__divide_corse(df_processed)
        df_processed["距離"] = df_processed["距離"].map(lambda x: int(x) / 100)
        df_processed["馬 番"] = df_processed["馬 番"].map(lambda x: 0 if x > 18 else x)
        df_processed["着 順"] = df_processed["着 順"].map(cls.remove_str)
        # カラム名の空白を削除
        df_processed.columns = df_processed.columns.str.replace(" ", "")
        # 欠損値を0埋め
        df_processed = df_processed.fillna(0)
        # 型をintにする
        df = df_processed.astype({"R": int, "枠番": int})
        # 開催,天 気,レース名
        return df

In [29]:
test = "../Raw-Data/Horse-Results/2017105082.pkl"
"""
['日付', '開催', '天 気', 'R', 'レース名', '映 像', '頭 数', '枠 番', '馬 番', 'オ ッ ズ',
  '人 気', '着 順', '騎手', '斤 量', '距離', '馬 場', '馬場 指数', 'タイム', '着差', 'ﾀｲﾑ 指数',
  '通過', 'ペース', '上り', '馬体重', '厩舎 ｺﾒﾝﾄ', '備考', '勝ち馬 (2着馬)', '賞金']
"""
df = HorseProcessor.process(test)

df

Unnamed: 0,日付,開催,天気,R,レース名,頭数,枠番,馬番,着順,斤量,距離,馬場,馬体重,増減,コース
0,329,東京,曇,12,G1,18,5,9,7,58,24.0,良,514,-12,芝
1,281,京都,曇,11,G2,14,2,2,7,59,24.0,重,526,6,芝
2,175,阪神,曇,11,G1,17,4,8,8,58,22.0,良,520,0,芝
3,83,その他,晴,0,G1,15,0,15,13,57,20.0,良,0,0,ダ
4,358,中山,晴,11,G1,16,3,6,10,57,25.0,良,518,0,芝
5,330,東京,晴,12,G1,18,3,6,1,57,24.0,良,518,0,芝
6,282,阪神,曇,11,G2,14,6,10,1,56,24.0,稍,518,8,芝
7,161,東京,曇,11,3勝,15,5,9,1,56,24.0,良,510,-2,芝
8,133,東京,曇,10,3勝,12,8,11,3,57,24.0,良,512,-2,芝
9,106,中山,曇,12,3勝,13,5,7,3,55,25.0,良,514,-2,芝


### 整形して 1 つのファイルに統合


In [31]:
dir_list = os.listdir("../Raw-Data/Horse-Results/")
df_integrated = pd.DataFrame()
for i in tqdm(dir_list):
    df_raw = HorseProcessor.process(f"../Raw-Data/Horse-Results/{i}")
    # df = HorseProcessor.divide_data_frame(df_raw)["str"]
    df_integrated = pd.concat([df_integrated, df_raw], axis=0)

df_integrated
df_integrated = df_integrated.drop_duplicates()
df_integrated = df_integrated.reset_index(drop=True)
df_integrated

  0%|          | 0/11557 [00:00<?, ?it/s]

  5%|▌         | 620/11557 [00:03<01:04, 169.33it/s]


KeyboardInterrupt: 

### 保存


In [32]:
df_integrated.to_csv("../tmp/horse-result-tmp.csv")
df_integrated.to_pickle("../tmp/horse-result-tmp.pkl")

### one-hot-encoding のモデル作成


In [33]:
tmp_data = pd.read_pickle("../tmp/horse-result-tmp.pkl")
encoder = ce.OneHotEncoder(
    cols=["開催", "天気", "レース名", "馬場", "コース", "枠番", "馬番"],
    handle_unknown="value",
    use_cat_names=True,
)
data_one_hot = encoder.fit_transform(tmp_data)
df_one_hot = pd.DataFrame(data_one_hot)
df_one_hot.columns

Index(['日付', '開催_新潟', '開催_福島', '開催_東京', '開催_中京', '開催_小倉', '開催_京都', '開催_中山',
       '開催_札幌', '開催_函館', '開催_阪神', '開催_その他', '天気_晴', '天気_小雨', '天気_曇', '天気_雨',
       '天気_小雪', '天気_0', '天気_雪', 'R', 'レース名_G3', 'レース名_OP', 'レース名_未勝利',
       'レース名_その他', 'レース名_新馬', 'レース名_G1', 'レース名_G2', 'レース名_3勝', 'レース名_2勝',
       'レース名_L', 'レース名_1勝', '頭数', '枠番_5.0', '枠番_7.0', '枠番_4.0', '枠番_6.0',
       '枠番_8.0', '枠番_1.0', '枠番_3.0', '枠番_2.0', '枠番_0.0', '馬番_7.0', '馬番_11.0',
       '馬番_5.0', '馬番_9.0', '馬番_13.0', '馬番_1.0', '馬番_10.0', '馬番_12.0', '馬番_4.0',
       '馬番_14.0', '馬番_8.0', '馬番_6.0', '馬番_3.0', '馬番_2.0', '馬番_16.0', '馬番_15.0',
       '馬番_17.0', '馬番_18.0', '着順', '斤量', '距離', '馬場_良', '馬場_稍', '馬場_重', '馬場_不',
       '馬場_0', '馬体重', '増減', 'コース_障', 'コース_ダ', 'コース_芝'],
      dtype='object')

In [34]:
with open("../models/horse_result_encoder.pickle", "wb") as f:
    pickle.dump(encoder, f)

In [35]:
tmp_data = pd.read_pickle("../tmp/horse-result-tmp.pkl")
tmp_data["頭数"].value_counts()

頭数
16    7483
15    2450
14    2366
12    2283
10    1893
11    1840
13    1763
18    1484
9     1419
8      896
17     460
7      386
6      118
0       31
5       28
4        2
19       1
20       1
Name: count, dtype: int64

In [37]:
tmp_data = pd.read_pickle("../tmp/horse-result-tmp.pkl")
scaler = StandardScaler()
columns_to_scale = ["日付", "馬体重", "増減", "斤量", "R", "頭数"]
tmp_data[columns_to_scale] = scaler.fit_transform(tmp_data[columns_to_scale])

tmp_data

Unnamed: 0,日付,開催,天気,R,レース名,頭数,枠番,馬番,着順,斤量,距離,馬場,馬体重,増減,コース
0,0.262117,新潟,晴,-0.152117,G3,0.156992,5,7,13,2.271237,32.5,良,-0.336885,0.198514,障
1,0.001020,福島,晴,-1.470710,OP,-0.182955,7,11,7,2.271237,27.5,良,-0.382504,-1.129751,障
2,-0.325353,新潟,小雨,-1.470710,OP,0.156992,4,5,9,2.271237,28.9,稍,-0.200029,0.198514,障
3,-0.521176,新潟,晴,-2.459656,OP,0.156992,6,9,8,2.271237,28.9,良,-0.245648,-1.129751,障
4,1.315834,東京,晴,-0.152117,OP,-0.182955,8,13,0,1.709437,31.1,良,-0.063174,0.729820,障
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37,1.483683,阪神,晴,-2.459656,未勝利,-0.522902,8,12,2,-0.537764,12.0,稍,-0.063174,0.464167,ダ
38,1.287859,京都,晴,-1.470710,未勝利,1.516780,8,16,7,-0.537764,12.0,良,-0.154411,0.198514,芝
39,1.026761,京都,曇,-2.130007,未勝利,-1.542743,4,4,3,-0.537764,12.0,良,-0.200029,-0.332792,芝
40,0.830938,阪神,雨,-1.800359,未勝利,-0.182955,4,4,4,-1.099564,12.0,重,-0.154411,-0.332792,芝


In [38]:
with open("../models/horse_results_scaler.pickle", "wb") as f:
    pickle.dump(scaler, f)

## 学習用のデータにする


### クラス


In [39]:
class HorseProcessor:
    max_rows = 10
    columns_to_scale = ["日付", "馬体重", "増減", "斤量", "R", "頭数"]
    with open("../models/horse_result_encoder.pickle", "rb") as f:
        encoder: ce.OneHotEncoder = pickle.load(f)
    with open("../models/horse_results_scaler.pickle", "rb") as f:
        scaler: StandardScaler = pickle.load(f)

    def remove_str(x: any) -> str:
        x_str = str(x)
        is_contain_num = re.search(r"\d+", x_str)
        if is_contain_num:
            return is_contain_num.group()
        else:
            return "0"

    def convert_date(x: str | int) -> int:
        """日付をその年の1日1月を基点とした日数に変換する

        Args:
            x (str | int): 日付(YYYY/MM/DD)

        Returns:
            int: 日数
        """
        year = re.sub(r"(\d{1,4})/.*/.*", r"\1", x)
        new_year = datetime.datetime(int(year), 1, 1)
        unix_year = int(time.mktime(new_year.timetuple()))
        date_format = datetime.datetime.strptime(x, "%Y/%m/%d")
        unix_date = int(time.mktime(date_format.timetuple()))
        return (unix_date - unix_year) // 86400

    @staticmethod
    def __transform_held(held: str) -> str:
        trim_held = re.sub(r"\d*", "", held)
        if not trim_held in [
            "東京",
            "中山",
            "中京",
            "阪神",
            "札幌",
            "函館",
            "福島",
            "新潟",
            "京都",
            "小倉",
        ]:
            return "その他"
        return trim_held

    @staticmethod
    def __transform_race_name(race: str) -> str:
        # r"新馬|未勝利|1勝|2勝|3勝|オープン"
        if re.search(r".*(新馬|未勝利|1勝|2勝|3勝|OP|G1|G2|G3|L).*", race):
            transform_name = re.sub(r".*(新馬|未勝利|1勝|2勝|3勝|OP|G1|G2|G3|L).*", r"\1", race)
        else:
            transform_name = "その他"
        return transform_name

    @staticmethod
    def __extract_addition(df: pd.DataFrame) -> pd.DataFrame:
        """体重の増減を抽出する

        Args:
            df (pd.DataFrame): 変化対象のデータ

        Returns:
            pd.DataFrame: 変換後のデータ
        """
        weight = df["馬体重"]
        addition = weight.map(lambda x: re.sub(r".*\(([+-]\d{1,3}|0)\).*", r"\1", x))
        addition = addition.map(lambda x: re.sub(r"\+", "", x))
        return addition

    @staticmethod
    def __drop_columns(df: pd.DataFrame) -> pd.DataFrame:
        """不要なカラムを削除

        Args:
            df (pd.DataFrame): 成績データ

        Returns:
            pd.DataFrame: 削除後データ
        """
        df_processed = df.drop(
            [
                "賞金",
                "厩舎 ｺﾒﾝﾄ",
                "備考",
                "勝ち馬 (2着馬)",
                "着差",
                "ﾀｲﾑ 指数",
                "通過",
                "ペース",
                "上り",
                "馬場 指数",
                "タイム",
                "映 像",
                "騎手",
                "オ ッ ズ",
                "人 気",
            ],
            axis=1,
        )
        return df_processed

    @staticmethod
    def __divide_corse(df: pd.DataFrame) -> pd.DataFrame:
        df_divided = df
        df_divided["コース"] = df_divided["距離"].map(lambda x: x[0])
        df_divided["距離"] = df_divided["距離"].map(lambda x: x[1:])
        return df_divided

    @classmethod
    def __divide_horse_weight(cls, df: pd.DataFrame) -> pd.DataFrame:
        """馬体重を分ける

        Args:
            df (pd.DataFrame): 加工前データ

        Returns:
            pd.DataFrame: 加工後データ
        """
        df_divided = df
        df_divided["馬体重"] = df_divided["馬体重"].map(lambda x: x.replace("計不", "0(0)"))
        weight_addition = cls.__extract_addition(df_divided)
        df_divided["増減"] = weight_addition
        df_divided["馬体重"] = df_divided["馬体重"].map(
            lambda x: re.sub(r"\([+-]*\d+\)", "", x)
        )
        return df_divided

    @classmethod
    def process(cls, path):
        df_raw = pd.read_pickle(path).head(cls.max_rows)
        df_processed = cls.__drop_columns(df_raw)
        df_processed = cls.__divide_horse_weight(df_processed)
        df_processed["日付"] = df_processed["日付"].map(cls.convert_date)
        df_processed["開催"] = df_processed["開催"].map(cls.__transform_held)
        df_processed["レース名"] = df_processed["レース名"].map(cls.__transform_race_name)
        df_processed = cls.__divide_corse(df_processed)
        df_processed["距離"] = df_processed["距離"].map(lambda x: int(x) / 100)
        df_processed["馬 番"] = df_processed["馬 番"].map(lambda x: 0 if x > 18 else x)
        df_processed["着 順"] = df_processed["着 順"].map(cls.remove_str)
        # カラム名の空白を削除
        df_processed.columns = df_processed.columns.str.replace(" ", "")
        # 欠損値を0埋め
        df_processed = df_processed.fillna(0)
        # 型をintにする
        df = df_processed.astype({"R": int, "枠番": int})
        # 標準化
        df[cls.columns_to_scale] = cls.scaler.transform(df[cls.columns_to_scale])
        # ダミー変数化
        df = cls.encoder.transform(df)
        # 行数を調整
        if len(df) < cls.max_rows:
            rows_to_add = cls.max_rows - len(df)
            # すべての項目が0の行を作成
            additional_rows = pd.DataFrame(
                np.zeros((rows_to_add, len(df.columns))), columns=df.columns
            )
            # 追加の行をDataFrameに結合
            df = pd.concat([df, additional_rows], ignore_index=True)
        return df.iloc[::-1].reset_index(drop=True)

In [40]:
test = "../Raw-Data/Horse-Results/2017105082.pkl"
df = HorseProcessor.process(test)


df

Unnamed: 0,日付,開催_新潟,開催_福島,開催_東京,開催_中京,開催_小倉,開催_京都,開催_中山,開催_札幌,開催_函館,...,馬場_良,馬場_稍,馬場_重,馬場_不,馬場_0,馬体重,増減,コース_障,コース_ダ,コース_芝
0,-0.707675,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0.8492,-0.332792,0,0,1
1,-0.455902,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0.803581,-0.332792,0,0,1
2,-0.194804,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0.757962,-0.332792,0,0,1
3,0.933512,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0.940437,0.995473,0,0,1
4,1.381109,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0.940437,-0.067139,0,0,1
5,1.642206,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0.940437,-0.067139,0,0,1
6,-0.922148,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,-10.874794,-0.067139,0,1,0
7,-0.064255,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0.986056,-0.067139,0,0,1
8,0.924187,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,1.122911,0.72982,0,0,1
9,1.371784,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0.8492,-1.661056,0,0,1


### AE 学習用データの作成


In [41]:
dir_list = os.listdir("../Raw-Data/Horse-Results/")
df_integrated = pd.DataFrame()
for i in tqdm(dir_list):
    df = HorseProcessor.process(f"../Raw-Data/Horse-Results/{i}")
    df.to_pickle(f"../Processed-Data/Horse-Results/{i}")

  0%|          | 0/11557 [00:00<?, ?it/s]

100%|██████████| 11557/11557 [02:25<00:00, 79.26it/s]


# AE 作成


## モデル


### 位置エンコーディング


In [2]:
class PositionalEncoding(nn.Module):
    def __init__(self, hidden_dim, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, hidden_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)

        # div_termの計算
        div_term = torch.exp(
            torch.arange(0, hidden_dim, 2).float() * (-math.log(10000.0) / hidden_dim)
        )

        # position * div_term のサイズ調整
        pe[:, 0::2] = torch.sin(position * div_term.unsqueeze(0))
        pe[:, 1::2] = torch.cos(position * div_term.unsqueeze(0))

        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer("pe", pe)

    def forward(self, x):
        x = x + self.pe[: x.size(0), :]
        return self.dropout(x)

### エンコーダ


In [3]:
class Encoder(nn.Module):
    def __init__(self, hidden_dim, nheads, nlayers, dropout=0.1):
        super(Encoder, self).__init__()
        self.pos_encoder = PositionalEncoding(hidden_dim, dropout)
        encoder_layers = nn.TransformerEncoderLayer(
            hidden_dim, nheads, hidden_dim, dropout, batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, nlayers)

    def forward(self, src):
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src)
        return output

### デコーダ


In [4]:
class Decoder(nn.Module):
    def __init__(self, hidden_dim, input_dim, nheads, nlayers, dropout=0.1):
        super(Decoder, self).__init__()
        self.pos_encoder = PositionalEncoding(hidden_dim, dropout)
        decoder_layers = nn.TransformerDecoderLayer(
            hidden_dim, nheads, hidden_dim, dropout, batch_first=True
        )
        self.transformer_decoder = nn.TransformerDecoder(decoder_layers, nlayers)
        self.decoder = nn.Linear(hidden_dim, input_dim)

    def forward(self, src):
        src = self.pos_encoder(src)
        output = self.transformer_decoder(src, src)
        output = self.decoder(output)
        return output

### VAE モデル


input_dim: 入力データの特徴量の次元数。この例では、各時系列データポイントが 25 個の特徴を持っていることを意味します。  
hidden_dim: Transformer モデル内の隠れ層の次元数。これは、モデル内部の各セルフアテンション層や全結合層のサイズを決定します。  
latent_dim: 潜在空間の次元数。VAE のエンコーダが出力する潜在変数の次元数です。  
nheads: マルチヘッドアテンションの「ヘッド」の数。これは、セルフアテンションを並列に行う際の分割数を指します。  
nlayers: Transformer 内のエンコーダ（およびデコーダ）層の数。モデルの深さを決定します。  
dropout: ドロップアウト率。過学習を防ぐために層間でランダムにノードを無効化する割合。


In [5]:
class TransformerVAE(nn.Module):
    def __init__(self, input_dim, hidden_dim, latent_dim, nheads, nlayers, dropout=0.1):
        super(TransformerVAE, self).__init__()
        self.input_linear = nn.Linear(input_dim, hidden_dim)
        self.encoder = Encoder(hidden_dim, nheads, nlayers, dropout)
        self.decoder = Decoder(hidden_dim, input_dim, nheads, nlayers, dropout)
        self.fc_mu = nn.Linear(hidden_dim, latent_dim)
        self.fc_log_var = nn.Linear(hidden_dim, latent_dim)
        self.fc_out = nn.Linear(latent_dim, hidden_dim)

    def reparameterize(self, mu, log_var):
        std = torch.exp(0.5 * log_var)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, src):
        src = self.input_linear(src)
        encoded = self.encoder(src)
        mu = self.fc_mu(encoded)
        log_var = self.fc_log_var(encoded)
        z = self.reparameterize(mu, log_var)
        z = self.fc_out(z)
        decoded = self.decoder(z)
        return decoded, mu, log_var

### VAE の損失関数


In [6]:
def vae_loss(recon_x, x, mu, log_var):
    recon_loss = F.mse_loss(recon_x, x, reduction="sum")
    kl_div = -0.5 * torch.sum(1 + log_var - mu.pow(2) - log_var.exp())
    return recon_loss + kl_div

### データセットクラス


In [7]:
class TimeSeriesDataset(Dataset):
    def __init__(self, file_paths):
        self.file_paths = file_paths

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        # CSVファイルからデータフレームを読み込む
        df = pd.read_pickle(self.file_paths[idx])
        df = df.iloc[:].astype("float32")
        # データフレームをテンソルに変換
        data_tensor = torch.tensor(df.values, dtype=torch.float32)
        return data_tensor

In [8]:
"""
Index(['日付', '開催_新潟', '開催_福島', '開催_東京', '開催_中京', '開催_小倉', '天気_晴', '天気_小雨',
       '天気_曇', 'R', 'レース名_G3', 'レース名_OP', '頭数', '枠番', '馬番', '着順', '斤量', '距離',
       '馬場_良', '馬場_稍', '馬体重', '増減', 'コース_障'],
      dtype='object')
"""
test_df = pd.read_pickle("../Processed-Data/Horse-Results/2011101814.pkl")


len(test_df.columns)

72

### 学習


#### optuna でハイパーパラメータ探索


In [29]:
file_names = os.listdir("../Processed-Data/Horse-Results/")
file_paths = list(map(lambda x: "../Processed-Data/Horse-Results/" + x, file_names))
train_paths, test_paths = train_test_split(file_paths, test_size=0.3)
train_paths, val_paths = train_test_split(train_paths, test_size=0.2)

# カスタムデータセットのインスタンス化
train_dataset = TimeSeriesDataset(train_paths)
val_dataset = TimeSeriesDataset(val_paths)
test_dataset = TimeSeriesDataset(test_paths)

# データローダーの設定
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [30]:
def objective(trial):
    # ハイパーパラメータの提案
    lr = trial.suggest_float("lr", 1e-5, 1e-2, log=True)
    hidden_dim = trial.suggest_categorical("hidden_dim", [8 * i for i in range(2, 17)])
    latent_dim = trial.suggest_int("latent_dim", 16, 36, log=True)

    # モデルとオプティマイザの設定
    model = TransformerVAE(
        input_dim=72,
        hidden_dim=hidden_dim,
        latent_dim=latent_dim,
        nheads=8,
        nlayers=8,
        dropout=0.1,
    )
    model.to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr)

    # 訓練ループ
    for epoch in range(15):  # エポック数は適宜調整
        model.train()
        for batch in train_loader:
            batch = batch.to(device)
            optimizer.zero_grad()
            recon_batch, mu, log_var = model(batch)
            loss = vae_loss(recon_batch, batch, mu, log_var)  # あなたの損失関数を使用
            loss.backward()
            optimizer.step()

    # 検証データセットでの性能評価
    # ここでは、簡単化のために最後の訓練損失を使用しています
    return loss.item()

In [31]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20)  # 試行回数は適宜調整

# 最適なハイパーパラメータを取得
best_params = study.best_params
best_params

[I 2024-01-14 20:06:43,319] A new study created in memory with name: no-name-536a45d9-4e36-49db-9aa8-99fcb752d354
[I 2024-01-14 20:09:26,661] Trial 0 finished with value: 2217.046875 and parameters: {'lr': 0.00040031166573357757, 'hidden_dim': 40, 'latent_dim': 21}. Best is trial 0 with value: 2217.046875.
[I 2024-01-14 20:11:57,169] Trial 1 finished with value: 3262.27880859375 and parameters: {'lr': 0.00024470041502269393, 'hidden_dim': 128, 'latent_dim': 23}. Best is trial 0 with value: 2217.046875.
[I 2024-01-14 20:14:43,208] Trial 2 finished with value: 3908.54638671875 and parameters: {'lr': 0.0012011823000395206, 'hidden_dim': 72, 'latent_dim': 16}. Best is trial 0 with value: 2217.046875.
[I 2024-01-14 20:17:26,469] Trial 3 finished with value: 2744.502197265625 and parameters: {'lr': 0.0004850247487994254, 'hidden_dim': 104, 'latent_dim': 31}. Best is trial 0 with value: 2217.046875.
[I 2024-01-14 20:20:10,231] Trial 4 finished with value: 3730.185546875 and parameters: {'lr':

{'lr': 0.000375832517132247, 'hidden_dim': 40, 'latent_dim': 25}

In [26]:
"""{'lr': 0.00015207365301991906, 'hidden_dim': 120, 'latent_dim': 23}
value: 885.03564453125.

{'lr': 0.0003215171404708647, 'hidden_dim': 48, 'latent_dim': 24}
"""
best_params = study.best_params
best_params

NameError: name 'study' is not defined

#### 本学習


In [8]:
file_names = os.listdir("../Processed-Data/Horse-Results/")
file_paths = list(map(lambda x: "../Processed-Data/Horse-Results/" + x, file_names))
train_paths, test_paths = train_test_split(file_paths, test_size=0.3)
train_paths, val_paths = train_test_split(train_paths, test_size=0.2)

# カスタムデータセットのインスタンス化
train_dataset = TimeSeriesDataset(train_paths)
val_dataset = TimeSeriesDataset(val_paths)
test_dataset = TimeSeriesDataset(test_paths)

# データローダーの設定
batch_size = 10
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# GPUが使える場合はGPU上で動かす
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# モデルのインスタンス化
model = TransformerVAE(
    input_dim=72,
    hidden_dim=40,
    latent_dim=24,
    nheads=8,
    nlayers=8,
    dropout=0.1,
).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.000375832517132247)

# エポック数
num_epochs = 100
# 評価を行うエポック数
eval_interval = 10

# 訓練
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        recon_batch, mu, log_var = model(batch)
        loss = vae_loss(recon_batch, batch, mu, log_var)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
    # 一定のエポック数ごとに検証データセットでモデルを評価
    if epoch % eval_interval == 0 or epoch == num_epochs - 1:
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for val_batch in val_loader:
                val_batch = val_batch.to(device)
                recon_batch, mu, log_var = model(val_batch)
                loss = vae_loss(recon_batch, val_batch, mu, log_var)
                val_loss += loss.item()

        val_loss /= len(val_loader.dataset)
        print(
            f"Epoch {epoch}, Train Loss: {train_loss / len(train_loader.dataset)}, Val Loss: {val_loss}"
        )

Epoch 0, Train Loss: 1167.0059476406454, Val Loss: 628.1757527317193
Epoch 10, Train Loss: 292.59296029223077, Val Loss: 288.32920446973503
Epoch 20, Train Loss: 195.35087657369024, Val Loss: 178.30867855039014
Epoch 30, Train Loss: 150.13185503161907, Val Loss: 137.15682526955055
Epoch 40, Train Loss: 133.40729812152824, Val Loss: 125.04862804436418
Epoch 50, Train Loss: 128.60618339353468, Val Loss: 122.0872663538005
Epoch 60, Train Loss: 126.59982178602128, Val Loss: 122.86725204335744
Epoch 70, Train Loss: 124.82836783781539, Val Loss: 121.98799550135439
Epoch 80, Train Loss: 124.18883349460347, Val Loss: 121.42508021332277
Epoch 90, Train Loss: 122.82567377964398, Val Loss: 120.00226750509405
Epoch 99, Train Loss: 122.09243264372945, Val Loss: 119.16302644897009


In [9]:
model.eval()
test_loss = 0
with torch.no_grad():
    for test_batch in test_loader:
        test_batch = test_batch.to(device)
        recon_batch, mu, log_var = model(test_batch)
        loss = vae_loss(recon_batch, test_batch, mu, log_var)
        test_loss += loss.item()

test_loss /= len(test_loader.dataset)
print(f"Test Loss: {test_loss}.")

Test Loss: 120.07420588474648.


#### 保存


In [10]:
torch.save(model.state_dict(), "../models/horse_result_VAE2.pth")

## 確認


horse_result_VAE1.pth : input_dim=72,hidden_dim=48,latent_dim=24,nheads=8,nlayers=8,dropout=0.1,


In [36]:
model = TransformerVAE(
    input_dim=72,
    hidden_dim=48,
    latent_dim=24,
    nheads=8,
    nlayers=8,
    dropout=0.1,
)
model.load_state_dict(torch.load("../models/horse_result_VAE1.pth"))
model.eval()
test_df = pd.read_pickle("../Processed-Data/Horse-Results/2011101814.pkl")
test_df = test_df.iloc[:].astype("float32")
# データフレームをテンソルに変換
data = torch.tensor(test_df.values, dtype=torch.float32)

with torch.no_grad():
    encoded = model.fc_mu(model.encoder(model.input_linear(data)))
# with torch.no_grad():
# decoded = model.decoder(encoded)

# print(encoded)
print(data.size())
print(encoded.size())

torch.Size([10, 72])
torch.Size([10, 10, 24])
