In [1]:
from dataclasses import dataclass
import time
import random
from pathlib import Path
import shutil

import tensorflow as tf
import numpy as np
from tqdm import tqdm
import yaml

from rl.network import ResNet
from rl.mcts import MCTS
from rl.buffer import ReplayBuffer
from rl import game

with open("config.yaml", "r") as f:
    config = yaml.safe_load(f)

training_settings = config["training_settings"]
network_settings = config["network_settings"]
mcts_settings = config["mcts_settings"]
num_cpus = training_settings["num_cpus"]
n_episodes = training_settings["n_episodes"]
buffer_size = training_settings["buffer_size"]
batch_size = training_settings["batch_size"]
epochs_per_update = training_settings["epochs_per_update"]
update_period = training_settings["update_period"]
save_period = training_settings["save_period"]


@dataclass
class Sample:
    state: np.ndarray
    mcts_policy: np.ndarray
    reward: float


def selfplay(weights, test=False):
    """Perform a self-play game and collect training data."""
    record = []
    if test:
        state = game.get_initial_test_state()
    else:
        state = game.get_initial_state()
    game.reset_used_columns()
    network = ResNet(action_space=game.ACTION_SPACE)

    # Initialize network parameters
    network.predict(game.encode_state(state))
    network.set_weights(weights)

    mcts = MCTS(network=network)
    done = False
    total_score = 0
    step_count = 0
    prev_action = None

    while not done and step_count < game.MAX_STEPS:
        mcts_policy = mcts.search(
            root_state=state,
            num_simulations=mcts_settings["num_mcts_simulations"],
            prev_action=prev_action,
        )
        if prev_action is not None:
            indices = [i for i in range(game.ACTION_SPACE) if i != prev_action]
            prob = mcts_policy[indices]
            action = np.random.choice(indices, p=prob / prob.sum())
        else:
            indices = list(range(game.ACTION_SPACE))
            action = np.random.choice(indices, p=mcts_policy)
        record.append(Sample(state.copy(), mcts_policy, reward=None))
        state, done = game.step(state, action, prev_action, mcts_policy)
        prev_action = action
        # print(state, action_score, done)
        # total_score += action_score
        step_count += 1

    # The reward is calculated based on the final state
    reward = game.get_reward(state, total_score)

    # Assign the reward to each sample
    for sample in record:
        sample.reward = reward

    return record

In [2]:
with open("config.yaml", "r") as f:
    config = yaml.safe_load(f)

training_settings = config["training_settings"]
network_settings = config["network_settings"]
mcts_settings = config["mcts_settings"]
num_cpus = training_settings["num_cpus"]
n_episodes = training_settings["n_episodes"]
buffer_size = training_settings["buffer_size"]
batch_size = training_settings["batch_size"]
epochs_per_update = training_settings["epochs_per_update"]
update_period = training_settings["update_period"]
save_period = training_settings["save_period"]

# ray.init(num_cpus=num_cpus, num_gpus=1, local_mode=False)

logdir = Path("log")
if logdir.exists():
    shutil.rmtree(logdir)
summary_writer = tf.summary.create_file_writer(str(logdir))

game.initialize_game()  # Initialize game variables

network = ResNet(action_space=game.ACTION_SPACE)

dummy_state = game.encode_state(game.get_initial_state())
network.predict(dummy_state)

current_weights = network.get_weights()

optimizer = tf.keras.optimizers.Adam(learning_rate=network_settings["learning_rate"])

replay = ReplayBuffer(buffer_size=buffer_size)

# # Start self-play workers
work_in_progresses = [selfplay(current_weights, True)]

n_updates = 0
n = 0
while n <= n_episodes:
    for _ in range(update_period):
        # Wait for a self-play worker to finish
        finished = selfplay(current_weights, test=False)
        replay.add_record(finished)
        n += 1

    # Update network
    if len(replay) >= batch_size:
        num_iters = epochs_per_update * (len(replay) // batch_size)
        for i in range(num_iters):
            states, mcts_policy, rewards = replay.get_minibatch(batch_size=batch_size)
            with tf.GradientTape() as tape:
                p_pred, v_pred = network(states, training=True)
                value_loss = tf.square(rewards - v_pred)
                policy_loss = -tf.reduce_sum(
                    mcts_policy * tf.math.log(p_pred + 1e-5), axis=1, keepdims=True
                )
                loss = tf.reduce_mean(value_loss + policy_loss)
            grads = tape.gradient(loss, network.trainable_variables)
            grads, _ = tf.clip_by_global_norm(grads, 1.0)
            optimizer.apply_gradients(zip(grads, network.trainable_variables))
            n_updates += 1

            if i % 100 == 0:
                with summary_writer.as_default():
                    tf.summary.scalar(
                        "value_loss", tf.reduce_mean(value_loss), step=n_updates
                    )
                    tf.summary.scalar(
                        "policy_loss", tf.reduce_mean(policy_loss), step=n_updates
                    )

        current_weights = network.get_weights()

    if n % save_period == 0:
        network.save_weights(f"checkpoints/network_{n}.weights.h5")

Q: [0, 0, 0]
U: [0.0, 0.0, 0.0]
Scores: [0. 0. 0.]
Q: [0, 0, 0.9999998211860657]
U: [0.5153322301601074, 0.3532580507516364, 0.0657048595441281]
Scores: [0.51533223 0.35325805 1.06570468]
Q: [0, 0, 0.9999998211860657]
U: [0.7287898290203974, 0.49958232639044736, 0.0619471356540835]
Scores: [0.72878983 0.49958233 1.06194696]
Q: [0, 0, 0.9999998807907104]
U: [0.8925816054150846, 0.6118608920845793, 0.056902077517303355]
Scores: [0.89258161 0.61186089 1.05690196]
Q: [0, 0, 0.9999999105930328]
U: [1.0306644603202149, 0.7065161015032728, 0.052563887635302474]
Scores: [1.03066446 0.7065161  1.0525638 ]
Q: [0, 0, 0.9999999284744263]
U: [1.1523178976345676, 0.7899090150797298, 0.04897351079758209]
Scores: [1.1523179  0.78990902 1.04897344]
Q: [0.9999998211860657, 0, 0.9999999284744263]
U: [0.6311505059513814, 0.8653019718717126, 0.053647793168117054]
Scores: [1.63115033 0.86530197 1.05364772]
Q: [0.9999998211860657, 0, 0.9999999284744263]
U: [0.4544803078599812, 0.9346329509202638, 0.057946239

In [10]:
import numpy as np
import tensorflow as tf
from rl.network import ResNet
import rl.game as game

# ゲームの初期化
game.initialize_game()
game.reset_used_columns()

# ネットワークの構築と重みの読み込み
network = ResNet(action_space=game.ACTION_SPACE)
network.load_weights("checkpoints/network_200.weights.h5")

# 初期状態の生成
state = game.get_initial_test_state()
done = False
total_score = 0
step_count = 0
prev_action = None
print(state)
while not done and step_count < game.MAX_STEPS:
    # 状態のエンコードと次元の調整
    encoded_state = game.encode_state(state)
    input_state = np.expand_dims(encoded_state, axis=0)

    # モデルによる予測
    policy_output, value_output = network.predict(input_state)
    policy = policy_output.numpy()[0]
    if prev_action is not None:
        indices = [i for i in range(game.ACTION_SPACE) if i != prev_action]
        prob = policy[indices]
        action = np.random.choice(indices, p=prob / prob.sum())
    else:
        indices = list(range(game.ACTION_SPACE))
        action = np.random.choice(indices, p=policy)
    selected_action = game.ACTIONS[action]
    print(f"Step {step_count}: Selected action {selected_action}")

    # アクションの適用
    state, done = game.step(state, action, prev_action)
    prev_action = action
    step_count += 1

    # 現在の状態を表示（必要に応じて）
    print("Current state:\n", state)
    print("Game done:", done)
    print("-" * 50)

# ゲーム結果の表示
if done:
    print(
        f"Game finished successfully in {step_count} steps with total score: {total_score}"
    )
else:
    print(f"Game terminated after reaching the maximum steps ({game.MAX_STEPS}).")
    print(f"Total score: {total_score}")

[[0. 0. 1. 1.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [1. 1. 0. 0.]]
policy_output=<tf.Tensor: shape=(1, 3), dtype=float32, numpy=array([[1., 0., 0.]], dtype=float32)>
Step 0: Selected action (0, 1)
Current state:
 [[0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 0. 0.]
 [1. 1. 0. 0.]]
Game done: False
--------------------------------------------------
policy_output=<tf.Tensor: shape=(1, 3), dtype=float32, numpy=array([[1., 0., 0.]], dtype=float32)>


  action = np.random.choice(indices, p=prob/prob.sum())


ValueError: probabilities contain NaN