In [1]:
import os

from libs.rlinml.agent import Agent
from utils import prepare_device, prepare_env, save_reward_change, load_reward_change, plot_reward_change

In [4]:
import numpy as np
import math
import time
from itertools import count
import qiskit

In [3]:
# Build the CartPole-v1 environment through the project helper.
env = prepare_env('CartPole-v1')

# Ask the project helper for the compute device and its name.
device, device_name = prepare_device()

# Episode budget: 6000 when the device is named "cuda", 2000 for anything else.
# Every name other than "cuda" is reported as "cpu" by the message below.
print("device_name:cuda" if device_name == "cuda" else "device_name:cpu")
num_episodes = 6000 if device_name == "cuda" else 2000

device_name:cpu


In [5]:
# Settings handed to the Agent constructor; their meaning is defined by the
# Agent implementation in libs.rlinml.agent.
# NOTE(review): `qiskit.Aer` is not available in Qiskit >= 1.0 — confirm the
# pinned Qiskit version before re-running this cell.
hyper_params = {
    'GAMMA': 0.99,
    'BATCH_SIZE': 16,
    'MEMORY_SIZE': 10000,
    'c_depth': 5,
    'backend': qiskit.Aer.get_backend('statevector_simulator'),
    'shots': 1,
}
model_type = "hybrid"

# Create the agent for the environment and device prepared earlier.
agent = Agent(env, hyper_params, model_type, device)

# --- Bookkeeping shared with the training loop ---
total_steps = 0  # running count of environment steps across all episodes
reward_change = []  # per episode: how long the cart-pole was kept upright

# --- Update cadence, measured in environment steps ---
STEPS_PER_UPDATE = 10  # train the model every x steps (the original comment also notes the value 1)
STEPS_PER_TARGET_UPDATE = 30  # update the target model every x steps (the original comment also notes the value 1)

# --- Epsilon-greedy exploration schedule ---
epsilon = 1.0  # current epsilon-greedy parameter (starting value)
print("The epislon was initialized to " + str(epsilon))
epsilon_min = 0.01  # minimum value epsilon is allowed to reach
decay_epsilon = 0.99  # decay rate applied to epsilon

The initial parameters of the nets:
OrderedDict([('custom_multiply_layer1.weight', tensor([1., 1., 1., 1.])), ('re_uploading_PQC_layer.thetas', tensor([0.2145, 1.7541, 1.8549, 0.3300, 2.0042, 1.1117, 1.0628, 2.3753, 1.0560,
        0.9189, 0.7430, 1.8682, 0.6279, 1.0090, 3.0084, 2.1423, 2.4867, 0.0782,
        0.0432, 2.5322, 0.0427, 1.3226, 2.7323, 0.0964, 1.8358, 0.4877, 1.9398,
        1.2508, 0.6110, 2.7681, 0.5107, 2.5807, 1.3975, 0.7577, 1.6016, 2.0821,
        1.6162, 2.9922, 0.2042, 0.1317, 1.0983, 2.3725, 0.6005, 0.2829, 1.4809,
        2.4460, 1.2713, 0.1079, 0.3398, 0.1672, 0.6277, 2.4331, 3.0493, 0.5349,
        1.3874, 2.6574, 0.4520, 1.4431, 2.7301, 2.1936, 1.9479, 1.9185, 0.7471,
        2.8767, 0.5414, 0.8636, 1.2191, 2.0101, 2.6371, 0.2666, 1.5765, 2.2905])), ('custom_multiply_layer2.weight', tensor([1., 1.]))])
The epislon was initialized to 1.0


In [None]:
# Wall-clock timestamp taken immediately before training starts.
start = time.time()

for i_episode in range(num_episodes):
    # Start a fresh episode; the first element of reset()'s result is the state.
    state = env.reset()[0]

    for t in count():
        action = agent.get_action(state, epsilon)
        observation, reward, terminated, truncated, _ = env.step(action.item())
        done = terminated or truncated

        # A terminated episode has no successor state, which is stored as None;
        # a merely truncated episode still records the observation.
        next_state = None if terminated else observation

        # Remember the transition, then advance to the next state.
        agent.store_experience(state, action, next_state, reward)
        state = next_state

        total_steps += 1
        # One optimisation step on the policy network every STEPS_PER_UPDATE steps.
        if not total_steps % STEPS_PER_UPDATE:
            agent.optimize_model()

        # Refresh the target network's weights every STEPS_PER_TARGET_UPDATE steps.
        if not total_steps % STEPS_PER_TARGET_UPDATE:
            agent.update_target_network()

        if not done:
            continue

        # Episode finished: decay epsilon (never below epsilon_min) and record
        # the number of steps this episode lasted.
        epsilon = max(epsilon * decay_epsilon, epsilon_min)
        reward_change.append(t + 1)
        break

    # Report only on every 10th episode.
    if (i_episode + 1) % 10:
        continue

    avg_rewards = np.mean(reward_change[-10:])
    print("Episode {}/{}, average last 10 rewards {}".format(i_episode+1, num_episodes, avg_rewards))
    # Stop early once the mean of the last 10 recorded episodes reaches 500.0.
    if avg_rewards >= 500.0:
        break

end = time.time()
print('Complete')
print("It takes {} minitues".format((end-start)/60))

# Persist the reward history and the model into the current working directory.
# PATH stays defined at notebook level because the reload cell reads it.
PATH = os.getcwd()
print(PATH)
save_reward_change(reward_change, PATH, file_name="reward_change_hybrid.txt")
agent.save_model(PATH)

In [None]:
# Plot the per-episode values accumulated in reward_change via the project helper.
plot_reward_change(reward_change)

In [None]:
# Read the reward history back from the file name used when saving it.
# PATH is assigned in the training cell, so that cell must have run first.
reward_change_loaded = load_reward_change(PATH, file_name="reward_change_hybrid.txt")

In [None]:
# Show the reloaded reward history so it can be compared with the saved run.
print(reward_change_loaded)