In [1]:
import os
import glob

from dataclasses import dataclass
from pathlib import Path
import shutil

import tensorflow as tf
import numpy as np
from tqdm import tqdm
import yaml
import ray

from rl.network import ResNet
from rl.mcts import MCTS
from rl.buffer import ReplayBuffer, Sample
from rl.game import Game, encode_state

# Load the experiment configuration. The encoding is given explicitly so the
# file is decoded the same way regardless of the platform's locale default.
with open("config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Directory searched by evaluate_self_play() for "adj_matrix_<N>_*.npy" files.
base_path = "graphs"
# Tag embedded in checkpoint file names.
index = "20241130"
qubits = config["game_settings"]["N"]

# Sub-sections of the configuration.
training_settings = config["training_settings"]
network_settings = config["network_settings"]
mcts_settings = config["mcts_settings"]

# Resources handed to ray.init().
num_cpus = training_settings["num_cpus"]
num_gpus = training_settings["num_gpus"]

# Training-loop parameters.
n_episodes = training_settings["n_episodes"]
buffer_size = training_settings["buffer_size"]
batch_size = training_settings["batch_size"]
epochs_per_update = training_settings["epochs_per_update"]
update_period = training_settings["update_period"]
save_period = training_settings["save_period"]
# Optional key; falls back to 100 when it is absent from the config.
eval_period = training_settings.get("eval_period", 100)

@ray.remote(num_cpus=1)
def selfplay(weights, qubits, current_episode, config):
    """Play one self-play episode in a Ray worker and return its samples.

    A fresh Game and ResNet are created inside the task. The network is
    called once on the encoded initial state before `weights` is loaded into
    it. At every step MCTS produces a policy, an action is sampled from that
    policy, and a copy of the pre-move state is stored together with the
    policy. The episode ends when the game reports done or when
    `game.MAX_STEPS` moves have been played; every stored sample then
    receives the same final reward.

    Args:
        weights: Weights passed to `network.set_weights`.
        qubits: Problem size forwarded to Game, encode_state and MCTS.
        current_episode: Not used inside this function.
        config: Configuration forwarded to Game, ResNet and MCTS.

    Returns:
        list of Sample, one per move played, in playing order.
    """
    game = Game(qubits, config)
    state = game.get_initial_state()
    game.reset_used_columns()

    # One forward pass on a dummy input, then overwrite the fresh weights.
    network = ResNet(action_space=len(game.coupling_map), config=config)
    network(encode_state(state, qubits)[np.newaxis, ...])
    network.set_weights(weights)

    mcts = MCTS(qubits=qubits, network=network, config=config)

    trajectory = []
    score_sum = 0
    last_action = None
    finished = False

    # Exactly one sample is appended per move, so len(trajectory) is the
    # number of moves played so far.
    while not (finished or len(trajectory) >= game.MAX_STEPS):
        search_policy = mcts.search(
            root_state=state,
            prev_action=last_action,
            step_count=len(trajectory),
        )
        chosen = np.random.choice(len(search_policy), p=search_policy)
        trajectory.append(Sample(state.copy(), search_policy, reward=None))
        state, finished, step_score = game.step(state, chosen, last_action)
        last_action = chosen
        score_sum += step_score

    # The reward is only known at the end; back-fill it into every sample.
    final_reward = game.get_reward(state, score_sum)
    for sample in trajectory:
        sample.reward = final_reward
    return trajectory

def _restrict_to_valid(policy, valid_actions, n_actions):
    """Zero out invalid actions in `policy` and renormalise it to sum to 1.

    Falls back to a uniform distribution over the valid actions when the
    network assigns them no probability mass at all.

    Raises:
        ValueError: if none of the actions 0..n_actions-1 is valid.
    """
    mask = np.isin(np.arange(n_actions), list(valid_actions))
    if not mask.any():
        raise ValueError("no valid actions available for the current state")
    # Work in float64 so the normalised vector sums to 1 within the tolerance
    # np.random.choice checks for.
    probs = np.asarray(policy, dtype=np.float64)[:n_actions]
    masked = np.where(mask, probs, 0.0)
    total = masked.sum()
    if total > 0:
        return masked / total
    masked[mask] = 1.0 / mask.sum()
    return masked


def evaluate_self_play(qubits, network, config):
    """Roll out the raw network policy on every saved evaluation instance.

    Each file matching ``<base_path>/adj_matrix_<qubits>_*.npy`` is loaded as
    an initial state. Actions are sampled from the softmax of the network's
    policy output, restricted to the actions returned by
    `game.get_valid_actions`, until the game reports done or `game.MAX_STEPS`
    moves have been played.

    Args:
        qubits: Problem size; selects the files and is forwarded to Game and
            encode_state.
        network: Object whose `predict` returns `(policy_logits, value)`.
        config: Configuration forwarded to Game.

    Returns:
        Tuple `(mean_depth, mean_swap_count)` over all instances. Episodes
        that do not finish count as `game.MAX_STEPS` for both metrics. Both
        values are nan when no file matches the pattern.

    Raises:
        ValueError: if a state is reached in which no action is valid.
    """
    pattern = os.path.join(base_path, f"adj_matrix_{qubits}_*.npy")
    file_paths = glob.glob(pattern)
    if not file_paths:
        # np.mean([]) would also yield nan, but only after a RuntimeWarning;
        # return the same result explicitly.
        return np.nan, np.nan

    depths = []
    swap_counts = []
    for file_path in file_paths:
        state = np.load(file_path)
        game = Game(qubits, config)
        n_actions = len(game.coupling_map)
        swap_pairs = []
        done = False
        step_count = 0
        prev_action = None
        while not done and step_count < game.MAX_STEPS:
            encoded_state = encode_state(state, qubits)
            input_state = np.expand_dims(encoded_state, axis=0)
            policy_logits, _ = network.predict(input_state)
            policy = tf.nn.softmax(policy_logits).numpy()[0]
            valid_actions = game.get_valid_actions(state, prev_action)
            policy = _restrict_to_valid(policy, valid_actions, n_actions)
            action = np.random.choice(n_actions, p=policy)
            swap_pairs.append(game.coupling_map[action])
            state, done, _ = game.step(state, action, prev_action)
            prev_action = action
            step_count += 1

        # Record circuit depth and SWAP count for this instance.
        if done:
            depth = game.current_layer
            swap_count = len(swap_pairs)
        else:
            # Unfinished episodes are penalised with the step limit.
            depth = game.MAX_STEPS
            swap_count = game.MAX_STEPS
        swap_counts.append(swap_count)
        depths.append(depth)
    return np.mean(depths), np.mean(swap_counts)

# --- Runtime and logging setup ----------------------------------------------
ray.init(num_cpus=num_cpus, num_gpus=num_gpus, local_mode=False)
print(ray.available_resources())
logdir = Path("log")
if logdir.exists():
    # Start every run with an empty TensorBoard log directory.
    shutil.rmtree(logdir)
summary_writer = tf.summary.create_file_writer(str(logdir))

# Checkpoints are written below "checkpoints/"; create it up front so the
# first save cannot fail on a missing directory.
Path("checkpoints").mkdir(parents=True, exist_ok=True)

# --- Model, optimizer and replay buffer -------------------------------------
game = Game(qubits, config)
network = ResNet(action_space=len(game.coupling_map), config=config)

dummy_state = encode_state(game.state, qubits)
network(dummy_state[np.newaxis, ...])  # build the model

# Put the weights into the Ray object store once so that all self-play tasks
# of a round share the same object reference.
current_weights = network.get_weights()
current_weights_id = ray.put(current_weights)

optimizer = tf.keras.optimizers.Adam(learning_rate=network_settings["learning_rate"])

replay = ReplayBuffer(buffer_size=buffer_size)

n_updates = 0  # number of gradient steps taken so far
n = 0  # number of finished self-play episodes

# --- Main loop: self-play round -> network update -> save / evaluate --------
while n < n_episodes:
    # Launch one round of self-play tasks that all use the current weights.
    work_in_progresses = [
        selfplay.remote(current_weights_id, qubits, n + i, config)
        for i in range(update_period)
    ]
    # Collect the episodes one by one as they finish.
    for _ in tqdm(range(update_period)):
        finished, work_in_progresses = ray.wait(work_in_progresses, num_returns=1)
        record = ray.get(finished[0])
        replay.add_record(record)
        n += 1

    # Update the network once the buffer holds at least one full batch.
    if len(replay) >= batch_size:
        num_iters = epochs_per_update * (len(replay) // batch_size)
        value_loss_weight = 1.0
        policy_loss_weight = 1.0
        for i in range(num_iters):
            states, mcts_policy, rewards = replay.get_minibatch(batch_size=batch_size)
            with tf.GradientTape() as tape:
                p_logits, v_pred = network(states, training=True)
                # Policy loss: cross-entropy between the MCTS policy and the
                # network output, which this op treats as unnormalised logits.
                policy_loss = tf.nn.softmax_cross_entropy_with_logits(
                    labels=mcts_policy, logits=p_logits
                )
                policy_loss = tf.reduce_mean(policy_loss)
                # Value loss: mean squared error. The targets are reshaped to
                # the prediction's shape so that a (batch,) reward vector can
                # never broadcast against a (batch, 1) prediction into a
                # (batch, batch) matrix; this is a no-op when shapes match.
                value_targets = tf.reshape(
                    tf.cast(rewards, v_pred.dtype), tf.shape(v_pred)
                )
                value_loss = tf.reduce_mean(tf.square(value_targets - v_pred))
                # Regularization losses collected by the model; tf.add_n
                # rejects an empty list, so fall back to zero in that case.
                reg_terms = network.losses
                if reg_terms:
                    reg_loss = tf.add_n(reg_terms)
                else:
                    reg_loss = tf.zeros((), dtype=v_pred.dtype)
                loss = policy_loss_weight * policy_loss + value_loss_weight * value_loss + reg_loss
            grads = tape.gradient(loss, network.trainable_variables)
            grads, _ = tf.clip_by_global_norm(grads, 1.0)
            optimizer.apply_gradients(zip(grads, network.trainable_variables))
            n_updates += 1

            # Log every 10th iteration of this update phase.
            if i % 10 == 0:
                with summary_writer.as_default():
                    tf.summary.scalar("value_loss", value_loss, step=n_updates)
                    tf.summary.scalar("policy_loss", policy_loss, step=n_updates)
                    tf.summary.scalar("total_loss", loss, step=n_updates)
                    tf.summary.scalar("reg_loss", reg_loss, step=n_updates)

        # Publish the updated weights for the next self-play round.
        current_weights = network.get_weights()
        current_weights_id = ray.put(current_weights)

    # Save and evaluate the model. n grows by update_period per round, so only
    # multiples of update_period are ever tested against the two periods.
    if n % save_period == 0:
        network.save(f"checkpoints/network{qubits}_{index}_{n}.keras")
        network.save_weights(f"checkpoints/network{qubits}_{index}_{n}.weights.h5")

    if n % eval_period == 0:
        depth, count = evaluate_self_play(qubits, network, config)
        print(f"Episode {n}: SWAP depth is {depth}, SWAP count is {count}")
        print("-" * 50)

2024-11-30 15:22:18,107	INFO worker.py:1807 -- Started a local Ray instance. View the dashboard at http://127.0.0.1:8265


{'CPU': 8.0, 'memory': 42176108954.0, 'node:127.0.0.1': 1.0, 'node:__internal_head__': 1.0, 'object_store_memory': 2147483648.0}


100%|██████████| 50/50 [00:51<00:00,  1.03s/it]
100%|██████████| 50/50 [00:19<00:00,  2.58it/s]


Episode 100: SWAP depth is 8.7, SWAP count is 16.0
--------------------------------------------------


100%|██████████| 50/50 [00:27<00:00,  1.84it/s]
100%|██████████| 50/50 [00:32<00:00,  1.52it/s]


Episode 200: SWAP depth is 8.533333333333333, SWAP count is 14.866666666666667
--------------------------------------------------


100%|██████████| 50/50 [00:31<00:00,  1.61it/s]
100%|██████████| 50/50 [00:27<00:00,  1.83it/s]


Episode 300: SWAP depth is 7.266666666666667, SWAP count is 12.9
--------------------------------------------------


100%|██████████| 50/50 [00:31<00:00,  1.58it/s]
100%|██████████| 50/50 [00:35<00:00,  1.42it/s]


Episode 400: SWAP depth is 7.266666666666667, SWAP count is 13.666666666666666
--------------------------------------------------


100%|██████████| 50/50 [00:36<00:00,  1.37it/s]


In [3]:
import numpy as np
import tensorflow as tf
from rl.network import ResNet
import rl.game as game

# NOTE: this assignment rebinds `game`, replacing the `rl.game` module alias
# imported above with a Game instance. Game, encode_state, qubits, config and
# index come from the training cell.
game = Game(qubits, config)

network = ResNet(action_space=len(game.coupling_map), config=config)
# Run one forward pass on the encoded initial state so the model is built
# before weights are loaded into it, mirroring what the training cell does
# before calling set_weights().
dummy_state = encode_state(game.state, qubits)
network(dummy_state[np.newaxis, ...])
# Alternative: restore the whole saved model instead of only its weights.
# network = tf.keras.models.load_model(f"checkpoints/network{qubits}_{index}_700.keras")
network.load_weights(f"checkpoints/network{qubits}_{index}_700.weights.h5")

In [14]:
# Sample 12 rollouts of the loaded policy, each starting from game.state, and
# report whether the game finished within game.MAX_STEPS moves.
# NOTE(review): the same Game instance is reused for all rollouts and is not
# reset in between, whereas selfplay() calls reset_used_columns() and
# evaluate_self_play() builds a new Game per episode - confirm whether step()
# keeps per-episode state inside the Game object.
for _ in range(12):
    state = game.state
    ans = []  # coupling-map entries chosen so far, in order
    done = False
    total_score = 0
    step_count = 0
    prev_action = None
    print(state)
    while not done and step_count < game.MAX_STEPS:
        encoded_state = encode_state(state, qubits)
        input_state = np.expand_dims(encoded_state, axis=0)

        # NOTE(review): the policy output is used directly as probabilities
        # here, while evaluate_self_play() applies a softmax to the output of
        # the same call - confirm which of the two matches the network.
        policy_output, _ = network.predict(input_state)
        policy = policy_output[0]
        if prev_action is not None:
            # Never repeat the previous action.
            indices = [i for i in range(len(game.coupling_map)) if i != prev_action]

            prob = policy[indices]
            if prob.sum() < 1e-6:
                # Practically no mass left on the remaining actions: pick one
                # uniformly at random.
                action = np.random.choice(indices)
            else:
                action = np.random.choice(indices, p=prob / prob.sum())
        else:
            indices = list(range(len(game.coupling_map)))
            action = np.random.choice(indices, p=policy)
        selected_action = game.coupling_map[action]
        ans.append(selected_action)
        # Keep the per-step score so the total reported below is meaningful;
        # it used to be discarded, which left total_score at 0.
        state, done, action_score = game.step(state, action, prev_action)
        total_score += action_score
        prev_action = action
        step_count += 1

    if done:
        print(f"Game finished successfully in {step_count} steps with {ans}")
    else:
        print(f"Game terminated after reaching the maximum steps ({game.MAX_STEPS}).")
        print(f"Total score: {total_score}")

[[0. 0. 1. 1. 1. 1. 0.]
 [0. 0. 0. 1. 0. 1. 1.]
 [1. 0. 0. 0. 1. 1. 0.]
 [1. 1. 0. 0. 0. 1. 1.]
 [1. 0. 1. 0. 0. 0. 1.]
 [1. 1. 1. 1. 0. 0. 0.]
 [0. 1. 0. 1. 1. 0. 0.]]
Game terminated after reaching the maximum steps (25).
Total score: 0
[[0. 0. 1. 1. 1. 1. 0.]
 [0. 0. 0. 1. 0. 1. 1.]
 [1. 0. 0. 0. 1. 1. 0.]
 [1. 1. 0. 0. 0. 1. 1.]
 [1. 0. 1. 0. 0. 0. 1.]
 [1. 1. 1. 1. 0. 0. 0.]
 [0. 1. 0. 1. 1. 0. 0.]]
Game terminated after reaching the maximum steps (25).
Total score: 0
[[0. 0. 1. 1. 1. 1. 0.]
 [0. 0. 0. 1. 0. 1. 1.]
 [1. 0. 0. 0. 1. 1. 0.]
 [1. 1. 0. 0. 0. 1. 1.]
 [1. 0. 1. 0. 0. 0. 1.]
 [1. 1. 1. 1. 0. 0. 0.]
 [0. 1. 0. 1. 1. 0. 0.]]
Game terminated after reaching the maximum steps (25).
Total score: 0
[[0. 0. 1. 1. 1. 1. 0.]
 [0. 0. 0. 1. 0. 1. 1.]
 [1. 0. 0. 0. 1. 1. 0.]
 [1. 1. 0. 0. 0. 1. 1.]
 [1. 0. 1. 0. 0. 0. 1.]
 [1. 1. 1. 1. 0. 0. 0.]
 [0. 1. 0. 1. 1. 0. 0.]]
Game terminated after reaching the maximum steps (25).
Total score: 0
[[0. 0. 1. 1. 1. 1. 0.]
 [0. 0. 0. 1. 0. 1. 