In [1]:
import os
import glob

from dataclasses import dataclass
from pathlib import Path
import shutil

import tensorflow as tf
import numpy as np
from tqdm import tqdm
import yaml

from rl.network import ResNet
from rl.mcts import MCTS
from rl.buffer import ReplayBuffer, Sample
from rl.game import Game, encode_state

with open("config.yaml", "r") as f:
    config = yaml.safe_load(f)

base_path = "graphs"
index = "20241130"
qubits = config["game_settings"]["N"]
training_settings = config["training_settings"]
network_settings = config["network_settings"]
mcts_settings = config["mcts_settings"]
num_cpus = training_settings["num_cpus"]
num_gpus = training_settings["num_gpus"]
n_episodes = training_settings["n_episodes"]
buffer_size = training_settings["buffer_size"]
batch_size = training_settings["batch_size"]
epochs_per_update = training_settings["epochs_per_update"]
update_period = training_settings["update_period"]
save_period = training_settings["save_period"]
eval_period = training_settings.get("eval_period", 100)

In [2]:
def selfplay(weights, qubits, current_episode, config):
    record = []
    game = Game(qubits, config)
    state = game.get_initial_state()
    game.reset_used_columns()
    network = ResNet(action_space=len(game.coupling_map), config=config)
    network.predict(encode_state(state, qubits))
    network.set_weights(weights)

    mcts = MCTS(qubits=qubits, network=network, config=config)
    done = False
    total_score = 0
    step_count = 0
    prev_action = None

    while not done and step_count < game.MAX_STEPS:
        mcts_policy = mcts.search(
            root_state=state,
            prev_action=prev_action,
            num_simulations=mcts_settings["num_mcts_simulations"],
        )
        if prev_action is not None:
            indices = [i for i in range(len(game.coupling_map)) if i != prev_action]
            prob = mcts_policy[indices]
            prob = prob/ prob.sum()
            # if use_network_policy:
            #     prob = np.ones(len(prob))/len(prob)
            action = np.random.choice(indices, p=prob)
        else:
            indices = list(range(len(game.coupling_map)))
            prob = mcts_policy
            # if use_network_policy:
            #     prob = np.ones(len(mcts_policy))/len(mcts_policy)
            action = np.random.choice(indices, p=prob)
        record.append(Sample(state.copy(), mcts_policy, reward=None))
        state, done, action_score = game.step(state, action, prev_action)
        prev_action = action
        # print(state, action_score, done)
        total_score += action_score
        step_count += 1

    reward = game.get_reward(state, total_score)
    for sample in record:
        sample.reward = reward
    return record

def evaluate_self_play(qubits, network, config):
    pattern = os.path.join(base_path, f"adj_matrix_{qubits}_*.npy")
    file_paths = glob.glob(pattern)
    avg_depth = []
    avg_counts = []
    for file_path in tqdm(file_paths):
        state = np.load(file_path)
        game = Game(qubits, config)
        swap_pairs = []
        done = False
        total_score = 0
        step_count = 0
        prev_action = None
        while not done and step_count < game.MAX_STEPS:
            encoded_state = encode_state(state, qubits)
            input_state = np.expand_dims(encoded_state, axis=0)
            policy_logits, value_output = network.predict(input_state)
            policy = tf.nn.softmax(policy_logits).numpy()[0]
            valid_actions = game.get_valid_actions(state, prev_action)
            policy = np.array([policy[a] if a in valid_actions else 0 for a in range(len(game.coupling_map))])
            policy_sum = np.sum(policy)
            if policy_sum > 0:
                policy /= policy_sum
            else:
                policy[valid_actions] = 1 / len(valid_actions)
            action = np.random.choice(len(policy), p=policy)
            selected_action = game.coupling_map[action]
            swap_pairs.append(selected_action)
            state, done, _ = game.step(state, action, prev_action)
            prev_action = action
            step_count += 1
        if not done:
            depth = game.MAX_STEPS
            swap_count = game.MAX_STEPS
        else:
            depth = game.current_layer
            swap_count = len(swap_pairs)
        avg_counts.append(swap_count)
        avg_depth.append(depth)
    return np.mean(avg_depth), np.mean(avg_counts)

logdir = Path("log")
if logdir.exists():
    shutil.rmtree(logdir)
summary_writer = tf.summary.create_file_writer(str(logdir))

game = Game(qubits, config)
network = ResNet(action_space=len(game.coupling_map), config=config)

dummy_state = encode_state(game.state, qubits)
network.predict(encode_state(game.state, qubits))
current_weights = network.get_weights()

optimizer = tf.keras.optimizers.Adam(learning_rate=network_settings["learning_rate"])

replay = ReplayBuffer(buffer_size=buffer_size)

n_updates = 0
n = 0

while n < n_episodes:
    for _ in tqdm(range(update_period)):
        finished = selfplay(current_weights,qubits,n, config)
        replay.add_record(finished)
        n += 1

    if len(replay) >= batch_size:
        num_iters = epochs_per_update * (len(replay) // batch_size)
        value_loss_weight = 1.0
        policy_loss_weight = 1.0
        for i in tqdm(range(num_iters)):
            states, mcts_policy, rewards = replay.get_minibatch(batch_size=batch_size)
            with tf.GradientTape() as tape:
                p_pred, v_pred = network(states, training=True)
                value_loss = tf.square(rewards - v_pred)
                policy_loss = -tf.reduce_sum(
                    mcts_policy * tf.math.log(p_pred + 1e-5), axis=1, keepdims=True
                )
                loss = tf.reduce_mean(value_loss_weight * value_loss + policy_loss_weight * policy_loss)
            grads = tape.gradient(loss, network.trainable_variables)
            grads, _ = tf.clip_by_global_norm(grads, 1.0)
            optimizer.apply_gradients(zip(grads, network.trainable_variables))
            n_updates += 1

            if i % 10 == 0:
                with summary_writer.as_default():
                    tf.summary.scalar(
                        "value_loss", tf.reduce_mean(value_loss), step=n_updates
                    )
                    tf.summary.scalar(
                        "policy_loss", tf.reduce_mean(policy_loss), step=n_updates
                    )

        current_weights = network.get_weights()

    # モデルの保存と評価
    if n % save_period == 0:
        network.save(f"checkpoints/network{qubits}_{index}_{n}.keras")
        network.save_weights(f"checkpoints/network{qubits}_{index}_{n}.weights.h5")

    # if n % eval_period == 0:
    #     depth, count = evaluate_self_play(qubits, network, config)
    #     print(f"Episode {n}: SWAP depth is {depth}, SWAP count is {count}")
    #     print("-" * 50)

100%|██████████| 10/10 [00:29<00:00,  2.92s/it]


In [4]:
import numpy as np
import tensorflow as tf
from rl.network import ResNet
import rl.game as game

game = Game(qubits, config)

network = ResNet(action_space=len(game.coupling_map),config=config)
network = tf.keras.models.load_model(f"checkpoints/network{qubits}_{index}_700.keras")

KeyError: 'action_space'

In [11]:
for _ in range(12):
    game = Game(qubits, config)

    state = game.state
    ans = []
    done = False
    total_score = 0
    step_count = 0
    prev_action = None
    print(state)
    while not done and step_count < game.MAX_STEPS:
        encoded_state = encode_state(state, qubits)
        input_state = np.expand_dims(encoded_state, axis=0)

        policy_output, value_output = network.predict(input_state)
        policy = policy_output[0]
        if prev_action is not None:
            indices = [i for i in range(len(game.coupling_map)) if i != prev_action]

            prob = policy.numpy()[indices]
            if prob.sum() < 1e-6:
                action = np.random.choice(indices)
            else:
                action = np.random.choice(indices, p=prob / prob.sum())
        else:
            indices = list(range(len(game.coupling_map)))
            action = np.random.choice(indices, p=policy)
        selected_action = game.coupling_map[action]
        ans.append(selected_action)
        state, done, _ = game.step(state, action, prev_action)
        prev_action = action
        step_count += 1

    if done:
        print(f"Game finished successfully in {step_count} steps with {ans}")
    else:
        print(f"Game terminated after reaching the maximum steps ({game.MAX_STEPS}).")
        print(f"Total score: {total_score}")

[[0. 0. 1. 0. 1. 1.]
 [0. 0. 0. 1. 1. 1.]
 [1. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 1.]
 [1. 1. 1. 0. 0. 0.]
 [1. 1. 0. 1. 0. 0.]]
Game terminated after reaching the maximum steps (25).
Total score: 0
[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 1.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]]
Game finished successfully in 4 steps with [(4, 5), (1, 2), (4, 5), (2, 3)]
[[0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [1. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]
Game finished successfully in 13 steps with [(4, 5), (0, 1), (4, 5), (2, 3), (4, 5), (0, 1), (4, 5), (2, 3), (4, 5), (3, 4), (4, 5), (2, 3), (4, 5)]
[[0. 0. 1. 0. 0. 1.]
 [0. 0. 0. 1. 0. 1.]
 [1. 0. 0. 0. 1. 1.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [1. 1. 1. 0. 0. 0.]]
Game finished successfully in 18 steps with [(4, 5), (2, 3), (4, 5), (3, 4), (4, 5), (2, 3), (4, 5), (3, 4), (4, 5), (0, 1), (4, 5), (1, 2), (4, 5), (2, 3), (4, 5), (1, 2), (4, 5), (3, 4)]
[[0. 0. 

In [6]:
evaluate_self_play(qubits,network,config)

(11.633333333333333, 18.066666666666666)