# パッケージ


In [28]:
import pandas as pd
import datetime
import re
import os, time
from tqdm import tqdm
import category_encoders as ce
import pickle
import numpy as np
from sklearn.preprocessing import StandardScaler
import optuna
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import math
from sklearn.decomposition import PCA
from abc import ABC, abstractmethod
import torch.nn.functional as F
from torch.utils.data.dataset import Subset
import random

# 複勝基準モデル作成


## データセットクラス


In [2]:
class CustomDataSet(Dataset):

    def __init__(self, data, is_file=False):
        """
        Args:
            file_paths (list of str): 学習用データファイルのパスのリスト
        """
        self.data = data
        self.file = is_file

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if self.file:
            file_path = self.data[idx]
            with open(file_path, "rb") as f:
                data = pickle.load(f)
            inputs = torch.cat([data["race"], data["horse"], data["results"]])
            labels = data["label"]
            return inputs, labels
        else:
            data_set = self.data[idx]
            return data_set["data"], data_set["label"]

## モデル


In [3]:
class NNClassifier(nn.Module):
    def __init__(self, input_size, output_size):
        super(NNClassifier, self).__init__()
        self.fc_in = nn.Linear(input_size, 1024)  # 入力層から隠れ層へ
        self.fc1 = nn.Linear(1024, 1024)
        self.fc2 = nn.Linear(1024, 1024)
        self.fc_act1 = nn.Mish()
        self.fc_act2 = nn.Mish()
        self.fc_act3 = nn.Mish()
        self.fc_sig = nn.Sigmoid()
        self.fc_out = nn.Linear(1024, output_size)  # 隠れ層から出力層へ

    def forward(self, x):
        out = self.fc_in(x)
        out = self.fc_act1(out)
        out = self.fc1(out)
        out = self.fc_act2(out)
        out = self.fc2(out)
        out = self.fc_act3(out)
        out = self.fc_out(out)
        out = self.fc_sig(out)
        return out

## 学習


### 学習データの用意


In [5]:
# ファイルで学習させるとき
results_path = "../Processed-Data/Race-Results/"
dir_list_raw = os.listdir(results_path)
dir_list = list(map(lambda x: f"../Processed-Data/Race-Results/{x}", dir_list_raw))
dataset = CustomDataSet(dir_list, is_file=True)

"""
results_path = "../Processed-Data/Race-Results/"
dir_list = os.listdir(results_path)
data_set = []
for i in tqdm(dir_list):
    path = f"../Processed-Data/Race-Results/{i}"
    with open(path, "rb") as f:
        data = pickle.load(f)
    data_modify = {
        "data": torch.cat([data["race"], data["horse"], data["results"]]),
        "label": data["label"],
    }
    data_set.append(data_modify)
dataset = CustomDataSet(dir_list)
"""

dataset_size = len(dataset)
# 分割比率を設定 (例: 訓練:検証:テスト = 70%:15%:15%)
train_size = int(dataset_size * 0.7)
val_size = int(dataset_size * 0.15)
test_size = dataset_size - train_size - val_size  # 残りをテストセットとする

print(f"train:{train_size}")
print(f"val:{val_size}")
print(f"test:{test_size}")

# データセットをランダムに分割
train_dataset, val_dataset, test_dataset = random_split(
    dataset, [train_size, val_size, test_size]
)

batch = 16
# DataLoaderを作成
train_loader = DataLoader(
    train_dataset,
    batch_size=batch,
    shuffle=True,
    pin_memory=True,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch,
    shuffle=False,
    pin_memory=True,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch,
    shuffle=False,
    pin_memory=True,
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

train:21765
val:4663
test:4665
cuda


### 各ラベルの分布を調べる


In [6]:
def get_all_files_in_directory(path):
    """
    指定したパスの配下にあるすべてのファイル（子孫ファイルも含む）のリストを返します。

    :param path: ファイルを検索するディレクトリのパス
    :return: ファイルのパスのリスト
    """
    files_list = []
    for root, dirs, files in os.walk(path):
        for file_name in files:
            files_list.append(f"{root}/{file_name}")
    return files_list


zero_distribution = torch.zeros(18)
one_distribution = torch.zeros(18)
dir_list = get_all_files_in_directory("../Processed-Data/Race-Results")
for i in tqdm(dir_list):
    with open(i, "rb") as f:
        data = pickle.load(f)
    labels = data["label"]
    # ラベルが1の位置をカウント
    one_distribution += labels
    # ラベルが0の位置をカウント（全ての要素から1のカウントを引く）
    zero_distribution += 1 - labels

# 0が1に対してどれくらい多いかの割合を計算
zero_to_one_ratio = zero_distribution / one_distribution
one_to_zero_ratio = one_distribution / zero_distribution

print("0が1に対してどれくらい多いかの割合:\n", zero_to_one_ratio)
print("その逆\n", one_to_zero_ratio)

  0%|          | 0/31093 [00:00<?, ?it/s]

 10%|█         | 3256/31093 [00:16<02:24, 192.95it/s]


KeyboardInterrupt: 

In [13]:
# テンソルの平均
weight_tensor_one = torch.tensor(
    [
        4.0467,
        3.8973,
        4.0508,
        3.7718,
        4.0142,
        3.8689,
        4.0264,
        4.0231,
        4.4075,
        4.6175,
        5.3236,
        5.6824,
        7.1395,
        8.0703,
        9.9560,
        12.4485,
        85.8520,
        96.1656,
    ],
    dtype=torch.float,
)
torch.mean(weight_tensor_one)

tensor(15.0757)

In [14]:
torch.ones_like(weight_tensor_one) * 15

tensor([15., 15., 15., 15., 15., 15., 15., 15., 15., 15., 15., 15., 15., 15.,
        15., 15., 15., 15.])

### モデルの用意


In [12]:
class WeightedBCELoss(nn.Module):
    def __init__(self):
        super(WeightedBCELoss, self).__init__()
        self.weight_tensor_one = torch.tensor(
            [
                4.0467,
                3.8973,
                4.0508,
                3.7718,
                4.0142,
                3.8689,
                4.0264,
                4.0231,
                4.4075,
                4.6175,
                5.3236,
                5.6824,
                7.1395,
                8.0703,
                9.9560,
                12.4485,
                85.8520,
                96.1656,
            ],
            dtype=torch.float,
        ).to("cuda" if torch.cuda.is_available() else "cpu")

    def forward(self, logits, targets):
        # ラベルが1の要素に対してweight_tensor_oneの重みを適用し、
        # ラベルが0の要素に対しては1の重みを適用
        weights = torch.where(
            targets == 1, self.weight_tensor_one, torch.ones_like(targets)
        )
        weights = weights.to(targets.device)  # weightsをtargetsと同じデバイスに移動
        # 重み付きバイナリクロスエントロピー損失の計算
        loss = F.binary_cross_entropy_with_logits(logits, targets, weight=weights)
        return loss


model = NNClassifier(1534, 18).to(device)
criterion = WeightedBCELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.00001)

In [13]:
def check_accuracy_topk(loader, model, k=3):
    num_correct_1 = 0
    num_samples_1 = 0
    num_correct_0 = 0
    num_samples_0 = 0
    model.eval()

    with torch.no_grad():
        for x, y in loader:
            x = x.to(device)
            y = y.to(device)
            scores = model(x)
            # 上位k個の予測を1に、それ以外を0にする
            topk_predictions = torch.zeros_like(scores, device=device)
            topk_vals, topk_indices = scores.topk(k, dim=1)
            # 上位k個の位置に1を設定
            topk_predictions.scatter_(1, topk_indices, 1)
            # 1の場合
            correct_predictions_1 = topk_predictions.bool() & y.bool()
            num_correct_1 += correct_predictions_1.type(torch.float).sum().item()
            num_samples_1 += y.sum().item()
            # 0の場合
            correct_predictions_0 = (~topk_predictions.bool()) & (~y.bool())
            num_correct_0 += correct_predictions_0.type(torch.float).sum().item()
            num_samples_0 += (1 - y).sum().item()
        # 正解率の計算
        accuracy_1 = (num_correct_1 / num_samples_1 * 100) if num_samples_1 > 0 else 0
        accuracy_0 = (num_correct_0 / num_samples_0 * 100) if num_samples_0 > 0 else 0
    model.train()
    return f"Accuracy for label 1: {accuracy_1:.2f}%\nAccuracy for label 0: {accuracy_0:.2f}%"


num_epochs = 500


for epoch in range(num_epochs):

    for batch_idx, (data, targets) in enumerate(train_loader):
        x = data.to(device)
        y = targets.to(device)
        scores = model(x)
        loss = criterion(scores, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if (epoch + 1) % 10 == 0:
        print(
            f"Epoch [{epoch+1}/{num_epochs}]  Loss{loss}\n{check_accuracy_topk(val_loader, model)}"
        )

Epoch [10/500]  Loss1.3499157428741455
Accuracy for label 1: 27.46%
Accuracy for label 0: 85.24%
Epoch [20/500]  Loss1.0067671537399292
Accuracy for label 1: 31.61%
Accuracy for label 0: 85.97%
Epoch [30/500]  Loss1.066394567489624
Accuracy for label 1: 35.12%
Accuracy for label 0: 86.59%
Epoch [40/500]  Loss0.9392207860946655
Accuracy for label 1: 36.92%
Accuracy for label 0: 86.91%
Epoch [50/500]  Loss0.9865322709083557
Accuracy for label 1: 36.89%
Accuracy for label 0: 86.91%
Epoch [60/500]  Loss0.9213132262229919
Accuracy for label 1: 36.92%
Accuracy for label 0: 86.91%
Epoch [70/500]  Loss1.3121676445007324
Accuracy for label 1: 36.57%
Accuracy for label 0: 86.85%
Epoch [80/500]  Loss0.9469384551048279
Accuracy for label 1: 36.15%
Accuracy for label 0: 86.77%
Epoch [90/500]  Loss0.9151275157928467
Accuracy for label 1: 36.19%
Accuracy for label 0: 86.78%
Epoch [100/500]  Loss0.9598709344863892
Accuracy for label 1: 36.04%
Accuracy for label 0: 86.76%
Epoch [110/500]  Loss0.8541965

KeyboardInterrupt: 

### 評価


In [11]:
def check_accuracy_topk(loader, model, k=3):
    num_correct_1 = 0
    num_samples_1 = 0
    num_correct_0 = 0
    num_samples_0 = 0
    model.eval()

    with torch.no_grad():
        for x, y in loader:
            x = x.to(device)
            y = y.to(device)
            scores = model(x)
            # 上位k個の予測を1に、それ以外を0にする
            topk_predictions = torch.zeros_like(scores, device=device)
            topk_vals, topk_indices = scores.topk(k, dim=1)
            # 上位k個の位置に1を設定
            topk_predictions.scatter_(1, topk_indices, 1)
            # 1の場合
            correct_predictions_1 = topk_predictions.bool() & y.bool()
            num_correct_1 += correct_predictions_1.type(torch.float).sum().item()
            num_samples_1 += y.sum().item()
            # 0の場合
            correct_predictions_0 = (~topk_predictions.bool()) & (~y.bool())
            num_correct_0 += correct_predictions_0.type(torch.float).sum().item()
            num_samples_0 += (1 - y).sum().item()
        # 正解率の計算
        accuracy_1 = (num_correct_1 / num_samples_1 * 100) if num_samples_1 > 0 else 0
        accuracy_0 = (num_correct_0 / num_samples_0 * 100) if num_samples_0 > 0 else 0
    model.train()
    return f"Accuracy for label 1: {accuracy_1:.2f}%\nAccuracy for label 0: {accuracy_0:.2f}%"



print(check_accuracy_topk(test_loader, model, 3))

'Accuracy for label 1: 35.36%\nAccuracy for label 0: 86.65%'

### モデル保存(状態のみ)


In [10]:
torch.save(model.state_dict(), "../models/nn_classifier_1.pth")

# アンサンブル NN モデル


## データセットクラス


In [14]:
class CustomDataSet(Dataset):

    def __init__(self, data, is_file=False):
        """
        Args:
            file_paths (list of str): 学習用データファイルのパスのリスト
        """
        self.data = data
        self.file = is_file

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if self.file:
            file_path = self.data[idx]
            with open(file_path, "rb") as f:
                data = pickle.load(f)
            inputs = torch.cat([data["race"], data["horse"], data["results"]])
            labels = data["label"]
            return inputs, labels
        else:
            data_set = self.data[idx]
            return data_set["data"], data_set["label"]

## モデル


In [29]:
class NNClassifier(nn.Module):
    def __init__(self, input_size, output_size):
        super(NNClassifier, self).__init__()
        self.rand_num = random.randint(8, 64)
        self.dropout = nn.Dropout(0.2)
        self.fc_in = nn.Linear(input_size, self.rand_num)  # 入力層から隠れ層へ
        self.fc1 = nn.Linear(self.rand_num, self.rand_num)
        self.fc2 = nn.Linear(self.rand_num, self.rand_num)
        self.fc_act1 = nn.Mish()
        self.fc_act2 = nn.Mish()
        self.fc_act3 = nn.Mish()
        self.fc_sig = nn.Sigmoid()
        self.fc_out = nn.Linear(self.rand_num, output_size)  # 隠れ層から出力層へ

    def forward(self, x):
        out = self.fc_in(x)
        out = self.fc_act1(out)
        out = self.dropout(out)
        out = self.fc1(out)
        out = self.fc_act2(out)
        out = self.dropout(out)
        out = self.fc2(out)
        out = self.fc_act3(out)
        out = self.dropout(out)
        out = self.fc_out(out)
        out = self.fc_sig(out)
        return out

## 学習


### 学習データの用意


In [26]:
# ファイルで学習させるとき
results_path = "../Processed-Data/Race-Results/"
dir_list_raw = os.listdir(results_path)
dir_list = list(map(lambda x: f"../Processed-Data/Race-Results/{x}", dir_list_raw))
dataset_top = CustomDataSet(dir_list, is_file=True)

dataset_size = len(dataset_top)
# 分割比率を設定 (例: 訓練:検証:テスト = 70%:15%:15%)
train_size = int(dataset_size * 0.7)
val_size = int(dataset_size * 0.15)
test_size = dataset_size - train_size - val_size  # 残りをテストセットとする

print(f"train:{train_size}")
print(f"val:{val_size}")
print(f"test:{test_size}")

# データセットをランダムに分割
train_dataset, val_dataset, test_dataset = random_split(
    dataset_top, [train_size, val_size, test_size]
)

# 訓練データセットを5つのサブセットに分割
sub_train_sizes = [int(train_size / 5) for _ in range(4)]  # 最初の4つのサイズ
sub_train_sizes.append(train_size - sum(sub_train_sizes))  # 最後のサブセットのサイズ

batch = 16

# 分割された訓練データセットのインデックスを生成
indices = torch.randperm(train_size).tolist()
sub_train_datasets = [
    Subset(
        train_dataset, indices[sum(sub_train_sizes[:i]) : sum(sub_train_sizes[: i + 1])]
    )
    for i in range(5)
]

# 各サブセットに対応するDataLoaderを作成
sub_train_loaders = [
    DataLoader(dataset, batch_size=batch, shuffle=True, pin_memory=True)
    for dataset in sub_train_datasets
]

# 各サブセットのDataLoaderを確認
for i, loader in enumerate(sub_train_loaders):
    print(f"Sub train loader {i}: {len(loader.dataset)} samples")

# 検証、確認用のDataLoaderを作成
val_loader = DataLoader(
    val_dataset,
    batch_size=batch,
    shuffle=False,
    pin_memory=True,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch,
    shuffle=False,
    pin_memory=True,
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

train:21765
val:4663
test:4665
Sub train loader 0: 4353 samples
Sub train loader 1: 4353 samples
Sub train loader 2: 4353 samples
Sub train loader 3: 4353 samples
Sub train loader 4: 4353 samples
cuda


### 独自の誤差関数とモデルの用意


In [30]:
class WeightedBCELoss(nn.Module):
    def __init__(self):
        super(WeightedBCELoss, self).__init__()
        self.weight_tensor_one = torch.tensor(
            [
                4.0467,
                3.8973,
                4.0508,
                3.7718,
                4.0142,
                3.8689,
                4.0264,
                4.0231,
                4.4075,
                4.6175,
                5.3236,
                5.6824,
                7.1395,
                8.0703,
                9.9560,
                12.4485,
                85.8520,
                96.1656,
            ],
            dtype=torch.float,
        ).to("cuda" if torch.cuda.is_available() else "cpu")

    def forward(self, logits, targets):
        # ラベルが1の要素に対してweight_tensor_oneの重みを適用し、
        # ラベルが0の要素に対しては1の重みを適用
        weights = torch.where(
            targets == 1, self.weight_tensor_one, torch.ones_like(targets)
        )
        weights = weights.to(targets.device)  # weightsをtargetsと同じデバイスに移動
        # 重み付きバイナリクロスエントロピー損失の計算
        loss = F.binary_cross_entropy_with_logits(logits, targets, weight=weights)
        return loss


# アンサンブル用にモデルを５個作る
models = [NNClassifier(1534, 18).to(device) for _ in range(5)]

### アンサンブル学習させる


In [31]:
num_epochs = 5
for count, model in enumerate(models):
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001)
    criterion = WeightedBCELoss().to(device)  # 損失関数も適切なデバイスに移動
    for epoch in range(num_epochs):
        for batch_idx, (data, targets) in enumerate(train_loader):
            x = data.to(device)
            y = targets.to(device)
            scores = model(x)
            loss = criterion(scores, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        if (epoch + 1) % 1 == 0:
            print(f"Model {count}, Epoch {epoch + 1} finished")

Model 0, Epoch 1 finished
Model 0, Epoch 2 finished
Model 0, Epoch 3 finished
Model 0, Epoch 4 finished
Model 0, Epoch 5 finished
Model 1, Epoch 1 finished
Model 1, Epoch 2 finished
Model 1, Epoch 3 finished
Model 1, Epoch 4 finished
Model 1, Epoch 5 finished
Model 2, Epoch 1 finished
Model 2, Epoch 2 finished
Model 2, Epoch 3 finished
Model 2, Epoch 4 finished
Model 2, Epoch 5 finished
Model 3, Epoch 1 finished
Model 3, Epoch 2 finished
Model 3, Epoch 3 finished
Model 3, Epoch 4 finished
Model 3, Epoch 5 finished
Model 4, Epoch 1 finished
Model 4, Epoch 2 finished
Model 4, Epoch 3 finished
Model 4, Epoch 4 finished
Model 4, Epoch 5 finished


In [32]:
def check_accuracy_topk_ensemble(loaders, models, k=3):
    num_correct_1 = 0
    num_samples_1 = 0
    num_correct_0 = 0
    num_samples_0 = 0

    # モデルを評価モードに設定
    for model in models:
        model.eval()

    with torch.no_grad():
        for x, y in loaders:
            x = x.to(device)
            y = y.to(device)
            # アンサンブルの予測を格納するリスト
            predictions = []
            for model in models:
                scores = model(x)
                predictions.append(scores)
            # 予測の平均を計算
            avg_predictions = torch.mean(torch.stack(predictions), dim=0)

            # 上位k個の予測を1に、それ以外を0にする
            topk_predictions = torch.zeros_like(avg_predictions, device=device)
            topk_vals, topk_indices = avg_predictions.topk(k, dim=1)
            topk_predictions.scatter_(1, topk_indices, 1)

            # 1の場合
            correct_predictions_1 = topk_predictions.bool() & y.bool()
            num_correct_1 += correct_predictions_1.type(torch.float).sum().item()
            num_samples_1 += y.sum().item()
            # 0の場合
            correct_predictions_0 = (~topk_predictions.bool()) & (~y.bool())
            num_correct_0 += correct_predictions_0.type(torch.float).sum().item()
            num_samples_0 += (1 - y).sum().item()

    # 正解率の計算
    accuracy_1 = (num_correct_1 / num_samples_1 * 100) if num_samples_1 > 0 else 0
    accuracy_0 = (num_correct_0 / num_samples_0 * 100) if num_samples_0 > 0 else 0

    # モデルを訓練モードに戻す
    for model in models:
        model.train()

    return f"Accuracy for label 1: {accuracy_1:.2f}%\nAccuracy for label 0: {accuracy_0:.2f}%"


# 使用例
print(check_accuracy_topk_ensemble(test_loader, models, 3))

Accuracy for label 1: 22.78%
Accuracy for label 0: 84.41%


In [None]:
for count, model in enumerate(models):
    # モデルの状態と乱数の値を保存
    model_info = {"state_dict": model.state_dict(), "rand_num": model.rand_num}
    torch.save(model_info, f"../models/nn_ensemble_{count}")

In [None]:
models = []
for i in range(5):
    # モデル情報のロード
    model_info = torch.load(f"../models/nn_ensemble_{i}")
    rand_num = model_info["rand_num"]

    # モデルのインスタンスを作成（適切なinput_sizeとoutput_sizeを指定）
    model = NNClassifier(1534, 18)
    model.rand_num = rand_num  # 乱数の値を設定

    # モデルの状態をロード
    model.load_state_dict(model_info["state_dict"])
    models.append(model)