In [1]:
import gym
from tensorflow import keras
from tensorflow.keras.optimizers.legacy import Adam
from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory
import matplotlib.pyplot as plt
#from keras.metrics import AUC

import numpy as np

from env.env import KeibaEnv
from tensorflow.keras.callbacks import Callback
import datetime

class ModelCheckpoint(Callback):
    """Periodically save the full model while an agent is training.

    Every ``interval`` finished episodes the wrapped model is written to
    ``filepath`` with a ``YYYYmmddHHMMSS`` timestamp appended, so each
    checkpoint gets its own path instead of overwriting the previous one.

    The save is unconditional.  ``best_reward`` and ``total_reward`` are
    bookkeeping for an improvement-gated variant that is currently disabled
    (see the commented-out code after ``on_episode_end``); ``total_reward``
    is therefore a running sum over all episodes seen so far.

    Args:
        model: Object exposing ``save(path)``; it is what gets checkpointed.
        filepath: Path prefix; the timestamp is appended directly to it.
        interval: Number of episodes between two checkpoints (must be > 0).
        verbose: When greater than 0, print a line for every checkpoint.

    Raises:
        ValueError: If ``interval`` is not positive.
    """

    def __init__(self, model, filepath, interval, verbose=1):
        super().__init__()
        if interval <= 0:
            # Fail fast: a zero interval would otherwise only blow up with a
            # ZeroDivisionError once the first (long) episode has finished.
            raise ValueError(
                'interval must be a positive number of episodes, '
                'got {!r}'.format(interval))
        # Stored under a private name; presumably to avoid clashing with the
        # ``model`` attribute the callback framework manages -- confirm.
        self._mine_model = model
        self.filepath = filepath
        self.interval = interval
        self.verbose = verbose
        self.total_episodes = 0
        self.best_reward = -np.inf
        self.total_reward = 0

    def on_episode_end(self, episode, logs=None):
        """Count the finished episode and checkpoint on every interval.

        Args:
            episode: Episode index supplied by the training loop (unused;
                the callback keeps its own ``total_episodes`` counter).
            logs: Mapping with the episode statistics.  ``episode_reward``
                is added to ``total_reward`` when present.
        """
        logs = logs or {}
        self.total_episodes += 1
        self.total_reward += logs.get('episode_reward', 0)
        if self.total_episodes % self.interval != 0:
            # Not a checkpoint episode.
            return

        stamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
        target = self.filepath + stamp
        self._mine_model.save(target)
        if self.verbose > 0:
            print('Episode {}: saved model to {}'.format(
                self.total_episodes, target))
#         if self.total_reward > self.best_reward:
#             if self.verbose > 0:
#                 print('Episode {}: Reward improved '
#                       'from {} to {}'.format(self.total_episodes,
#                                              self.best_reward,
#                                              self.total_reward))
#             self._mine_model.save(self.filepath, overwrite=True)
#             self.best_reward = self.total_reward
#             self.total_reward = 0

#         else:
#             self.total_reward = 0


# Create the game environment.
# The first list holds the yearly `*_train_binary.csv` files and the second
# the matching `*_result_wide.csv` files; range(2003, 2013) yields the years
# 2003 through 2012 inclusive, in ascending order.
train_years = range(2003, 2013)
env = KeibaEnv(
    [f"./../data/train/{year}_train_binary.csv" for year in train_years],
    [f"./../data/train/{year}_result_wide.csv" for year in train_years],
)

# Reset the environment to its initial state.
observation = env.reset()

# Read the number of discrete actions from the environment
# (4 for this game, per the original author's note).
nb_actions = env.action_space.n

# Resume from a previously saved model when one exists; otherwise build a
# fresh network.
try:
    model = keras.models.load_model('./../model/binary_model_wide')
except OSError:
    # Only a missing/unreadable saved model falls back to a new network.
    # Any other failure (or Ctrl-C) propagates, so a real load error can no
    # longer silently discard the trained model and have it overwritten by
    # the save at the end of training.
    print("create")
    # Build the Q-network with Keras: the flattened observation feeds four
    # identical L2-regularised ELU layers, then dropout, then one linear
    # output per action.
    model = keras.models.Sequential([
        keras.layers.Flatten(input_shape=(1,) + env.observation_space.shape),
        *[
            keras.layers.Dense(
                256,
                activation='elu',
                kernel_regularizer=keras.regularizers.l2(0.0001),
            )
            for _ in range(4)
        ],
        keras.layers.Dropout(0.3),
        keras.layers.Dense(nb_actions, activation="linear"),
    ])

model.summary()

# Replay memory that accumulates past experience; it is used to make
# training more stable.
memory = SequentialMemory(window_length=1, limit=50000)

# Action-selection policy: epsilon-greedy with the library defaults.
# (The earlier comment here described BoltzmannQPolicy, but the code
# instantiates EpsGreedyQPolicy.)
policy = EpsGreedyQPolicy()

# Assemble the DQN agent from the network, the memory and the policy.
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    policy=policy,
    target_model_update=1e-2,
)

# Compile the agent with the Adam optimizer.  The metrics reported during
# training are binary cross-entropy (on logits) and accuracy.
# NOTE(review): the network outputs Q-values rather than class logits, so
# these two metrics may not be meaningful -- confirm they are intended.
optimizer = Adam(learning_rate=1e-4)
tracked_metrics = [
    keras.losses.BinaryCrossentropy(from_logits=True, name='binary_crossentropy'),
    'accuracy',
]
dqn.compile(optimizer, metrics=tracked_metrics)

# Checkpoint the model every 10 episodes under a timestamped path.
cp_callback = ModelCheckpoint(
    model=model,
    filepath="./../checkpoint/20230411_binary_model_wide",
    interval=10,
)

# Start training.
# NOTE(review): log_interval=34462 appears to match the number of steps in
# one episode (one pass over the training data) -- confirm.
history = dqn.fit(
    env,
    nb_steps=30_000_000,
    callbacks=[cp_callback],
    log_interval=34462,
    visualize=False,
    verbose=1,
)



RANK_ONE_TWO_HORSE            :              0円 的中率 0.00%    回収率 0.00%    (0/0)
RANK_ONE_THREE_HORSE          :              0円 的中率 0.00%    回収率 0.00%    (0/0)
RANK_TWO_THREE_HORSE          :              0円 的中率 0.00%    回収率 0.00%    (0/0)
NO_ACITON                     :              0円 的中率 0.00%    回収率 0.00%    (0/0)
TOTAL                         :              0円 的中率 0.00%    回収率 0.00%    (0/0)
create
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten (Flatten)           (None, 40)                0         
                                                                 
 dense (Dense)               (None, 256)               10496     
                                                                 
 dense_1 (Dense)             (None, 256)               65792     
                                                                 
 dense_2 (Dense)             (None, 256)    

  updates=self.state_updates,


  997/34462 [..............................] - ETA: 1:43 - reward: -0.4804

  updates=self.state_updates,



RANK_ONE_TWO_HORSE            :        -319900円 的中率 9.70%    回収率 69.27%   (101/1041)
RANK_ONE_THREE_HORSE          :        -418700円 的中率 8.78%    回収率 57.28%   (86/980)
RANK_TWO_THREE_HORSE          :         106200円 的中率 8.17%    回収率 106.68%  (130/1591)
NO_ACITON                     :              0円 的中率 76.86%   回収率 0.00%    (23711/30850)
TOTAL                         :        -632400円 的中率 8.78%    回収率 82.49%   (317/3612)
1 episodes - episode_reward: 13594.000 [13594.000, 13594.000] - loss: 1.603 - binary_crossentropy: -281.587 - accuracy: 0.679 - mean_q: 34.040

Interval 2 (34462 steps performed)

RANK_ONE_TWO_HORSE            :         236300円 的中率 12.38%   回収率 127.87%  (105/848)
RANK_ONE_THREE_HORSE          :        -238900円 的中率 9.85%    回収率 72.94%   (87/883)
RANK_TWO_THREE_HORSE          :         -37900円 的中率 8.15%    回収率 95.59%   (70/859)
NO_ACITON                     :              0円 的中率 76.80%   回収率 0.00%    (24477/31872)
TOTAL                         :         -40500円 的中率 10.

In [None]:
# Weights-only export of the trained agent (disabled, kept for reference).
#dqn.save_weights('./../model/moving_test.hdf5', overwrite=True)

# Save the trained model.
model.save("./../model/binary_model_wide")

# Per-episode rewards recorded by the training run.
episode_rewards = history.history['episode_reward']
print(episode_rewards)

# Plot the reward of each episode (the step-count curve is disabled).
#plt.plot(history.history['nb_episode_steps'], label='nb_episode_steps')
plt.plot(episode_rewards, label='episode_reward')
plt.legend()
plt.show()

INFO:tensorflow:Assets written to: ./../model/binary_model_wide\assets
[13594.0, 15016.0, 15008.0, 15032.0, 14946.0, 14908.0, 15034.0, 14898.0, 14796.0, 14944.0, 14960.0, 14838.0, 14964.0, 14846.0, 15028.0, 14868.0, 14978.0, 14868.0, 14896.0, 14930.0, 14996.0, 14962.0, 14980.0, 14814.0, 14820.0, 14914.0, 14860.0, 14806.0, 14970.0, 14802.0, 14802.0, 15030.0, 14864.0, 14956.0, 15068.0, 15096.0, 14988.0, 14876.0, 14742.0, 15222.0, 14990.0, 14912.0, 14878.0, 14626.0, 14842.0, 14832.0, 14896.0, 14878.0, 15002.0, 14902.0, 14994.0, 14990.0, 14862.0, 15082.0, 15104.0, 14920.0, 14868.0, 15002.0, 14968.0, 14954.0, 14870.0, 14938.0, 15020.0, 14846.0, 14954.0, 14982.0, 14890.0, 14882.0, 14898.0, 15004.0, 14886.0, 15070.0, 14766.0, 15044.0, 14956.0, 14866.0, 14972.0, 14914.0, 15048.0, 15100.0, 15036.0, 14942.0, 15052.0, 15044.0, 14938.0, 14958.0, 15130.0, 15078.0, 14942.0, 14952.0, 14730.0, 15004.0, 15134.0, 14966.0, 15076.0, 14974.0, 14984.0, 15146.0, 15216.0, 15094.0, 15030.0, 15016.0, 15218.0, 1