# Install and import packages
--------

In [None]:
#import gym
import gymnasium as gym
import torch.optim as optim
from stable_baselines3 import A2C, PPO
from stable_baselines3.common.evaluation import evaluate_policy
import numpy as np
import matplotlib.pyplot as plt
from stable_baselines3.common.callbacks import BaseCallback

from gymnasium_robotics.core import GoalEnv

#from stable_baselines3 import HerReplayBuffer

import qas_gym
from qas_gym import envs
import sys
import os

# Basic Environment
------
Create your gym environment:

In [None]:
# Parameters
# Registered environment id; other ids mentioned for this notebook are
# 'NoisyThreeQubit-v0' and 'BasicThreeQubit-v0'.
env_name = 'BasicNQubit-v0'
fidelity_threshold = 0.95
reward_penalty = 0.01
max_timesteps = 20
seed = 1

# Target vector with 2**4 complex entries: amplitude 1/sqrt(2) on the first and
# last entries, zero everywhere else.
target = np.zeros(2**4, dtype=complex)
target[[0, -1]] = 1. / np.sqrt(2)

# Environment
env = gym.make(
    env_name,
    target=target,
    fidelity_threshold=fidelity_threshold,
    reward_penalty=reward_penalty,
    max_timesteps=max_timesteps,
)
# NOTE(review): noise keywords (error_single, error_multi, error_rate) are not
# passed here; presumably they belong to the noisy env variants — confirm.

# Reset once with the fixed seed, then show the action/observation spaces.
observation, info = env.reset(seed=seed)
print(env.action_space)
print(env.observation_space)


Display the action gates:

In [None]:
# List every action index next to the gate it maps to in the underlying env.
action_gates = env.unwrapped.action_gates
for idx, gate in enumerate(action_gates):
    line = 'Action({:02d}) --> {}'.format(idx, gate)
    print(line)

Display the state observables:

In [None]:
# List every state index next to the observable it maps to in the underlying env.
state_observables = env.unwrapped.state_observables
for idx, observable in enumerate(state_observables):
    line = 'State({:02d}) --> {}'.format(idx, observable)
    print(line)

# A2C Agent
------

In [None]:
# A2C hyperparameters
gamma, learning_rate = 0.98, 0.0001
policy_kwargs = {'optimizer_class': optim.Adam}  # use Adam as the optimizer class

# Build the A2C agent on the environment created above; training metrics are
# written under logs/ for TensorBoard.
# NOTE(review): unlike the PPO agent in this notebook, no `seed` is passed
# here — confirm that is intended.
a2c_model = A2C(
    "MultiInputPolicy",
    env,
    learning_rate=learning_rate,
    gamma=gamma,
    tensorboard_log='logs/',
    policy_kwargs=policy_kwargs,
)


In [None]:
# Train the A2C agent for 550,000 timesteps, displaying a progress bar.
a2c_model.learn(total_timesteps=550000, progress_bar=True)

In [None]:
import time
from IPython.display import clear_output

# Roll out one episode with the trained A2C policy, rendering each step.
# Gymnasium API: reset() returns (observation, info) and step() returns
# (observation, reward, terminated, truncated, info). Only the observation
# may be passed to predict(), and the episode ends when either flag is set.
state, info = env.reset()
done = False       # episode terminated
truncated = False  # episode cut short (e.g. step limit reached)
while not (done or truncated):
    action, _states = a2c_model.predict(state)
    state, reward, done, truncated, info = env.step(action)
    clear_output(wait=True)  # redraw in place instead of stacking outputs
    env.render()
    time.sleep(1)            # pause so each rendered step stays visible
print(info)

In [None]:
# Persist the trained A2C agent under the name "a2c_qas_v0".
a2c_model.save("a2c_qas_v0")

# PPO Model
------

In [None]:
# PPO hyperparameters
gamma, learning_rate = 0.97, 0.0004
n_epochs, clip_range = 4, 0.2
policy_kwargs = {'optimizer_class': optim.Adam}  # use Adam as the optimizer class

# Build the PPO agent on the shared environment, seeded with the notebook
# seed; training metrics are written under logs/ for TensorBoard.
# batch_size is not passed, so the library default applies (128 was a
# commented-out alternative).
ppo_model = PPO(
    "MultiInputPolicy",
    env,
    learning_rate=learning_rate,
    gamma=gamma,
    n_epochs=n_epochs,
    clip_range=clip_range,
    seed=seed,
    verbose=1,
    tensorboard_log='logs/',
    policy_kwargs=policy_kwargs,
)

In [None]:
# Train the PPO agent for 180,000 timesteps, displaying a progress bar.
# No callback is passed to learn() here.
ppo_model.learn(total_timesteps=180000, progress_bar=True)

In [None]:
import time
from IPython.display import clear_output
import gymnasium as gym

# Roll out a single episode with the trained PPO policy, rendering each step.
# To evaluate a saved agent instead, rebuild the environment with gym.make(...)
# and load the model with PPO.load(...) before running this cell.

state, info = env.reset()
print("Initial state:", state)  # Debug: show the first observation

# Buffers reserved for per-step metrics; nothing is appended to them here.
entanglement_list_pre, fidelity_list_pre = [], []

done = truncated = False
while not (done or truncated):
    # Stochastic (non-deterministic) action selection, to allow exploration.
    action, _states = ppo_model.predict(state, deterministic=False)
    next_state, reward, done, truncated, info = env.step(action)

    clear_output(wait=True)  # redraw in place instead of stacking outputs
    env.render()
    time.sleep(1)            # pause so each rendered step stays visible

    # Carry the new observation into the next iteration.
    state = next_state
    print("Current state:", state)  # Debug: show the current observation

print(info)

In [None]:
# Plot the fidelity and concurrence histories recorded by the environment,
# side by side. The histories are read from env.unwrapped (as done for
# action_gates / state_observables): they are custom attributes of the
# underlying environment, and the wrappers added by gym.make() do not
# reliably forward custom attributes.
plt.figure(figsize=(12, 6))

# Left panel: fidelity history.
plt.subplot(1, 2, 1)
plt.plot(env.unwrapped.fidelity_values)
plt.title("Fidelity per Episode")
plt.xlabel("Rollout")
plt.ylabel("Fidelity")

# Right panel: concurrence (entanglement) history.
plt.subplot(1, 2, 2)
plt.plot(env.unwrapped.concurrence_values)
plt.title("entanglement per Episode")
plt.xlabel("Rollout")
plt.ylabel("Entanglement")

plt.tight_layout()
plt.show()

In [None]:
# Persist the trained PPO agent; the file name records the run settings.
ppo_model.save("ppo_qas_cluster_step180000_New_concurrence_gamma0.97lr0.0004_seed=1")

In [None]:
# Load a saved PPO agent and attach the current environment to it.
# NOTE(review): "model_path" looks like a placeholder — point it at a real
# checkpoint (e.g. one written by ppo_model.save(...)) before running.
model = PPO.load("model_path", env=env)

# Continue training
model.learn(total_timesteps=10000)

In [None]:
# Train the in-memory PPO agent for 20,000 more timesteps;
# reset_num_timesteps=False keeps the running timestep counter instead of
# restarting it from zero.
ppo_model.learn(total_timesteps=20000, reset_num_timesteps=False)

# Results
------

In [None]:
# Load the TensorBoard notebook extension and open it on the runs under logs/.
%load_ext tensorboard
%tensorboard --logdir=logs/

In [25]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Draw the environment's entanglement and fidelity history plots.
# The plotting helpers are called on env.unwrapped (as done for action_gates /
# state_observables): they are custom methods of the underlying environment,
# and the wrappers added by gym.make() do not reliably forward them.
env.unwrapped.plot_entangle_list()
env.unwrapped.plot_fidelity_list()

In [None]:
import numpy as np
import os

# Results
------

In [None]:
# Save the concurrence (entanglement) history recorded by the environment.
save_dir = './entangle_log'
# exist_ok=True creates the directory when missing and is a no-op otherwise,
# avoiding the check-then-create race of os.path.exists() + os.makedirs().
os.makedirs(save_dir, exist_ok=True)

# Store
# NOTE(review): the file name encodes seed=2 while this notebook sets seed=1
# — confirm the name matches the run being saved.
file_path = os.path.join(save_dir, 'entangle_list_gam0.97_lr0.0004_cluster_no_entangle_seed=2time0.7_fin.npy')  # build the output path
# Read from env.unwrapped: the history is a custom attribute of the underlying
# environment, which gym.make() wrappers do not reliably forward.
np.save(file_path, env.unwrapped.concurrence_values)
print(f"entangle_list has been saved to '{file_path}'.")

# Load
#loaded_entangle_list = np.load(file_path)
#print("Loaded entangle_list:", loaded_entangle_list)

In [None]:
# Save the fidelity history recorded by the environment.
save_dir = './fidelity_log'
# exist_ok=True creates the directory when missing and is a no-op otherwise,
# avoiding the check-then-create race of os.path.exists() + os.makedirs().
os.makedirs(save_dir, exist_ok=True)

# Store
# NOTE(review): the file name encodes seed=2 while this notebook sets seed=1
# — confirm the name matches the run being saved.
file_path = os.path.join(save_dir, 'fidelity_list_gam0.97_lr0.0004_cluster_no_entangle_seed=2time0.7_fin.npy')  # build the output path
# Read from env.unwrapped: the history is a custom attribute of the underlying
# environment, which gym.make() wrappers do not reliably forward.
np.save(file_path, env.unwrapped.fidelity_values)
# The message names the fidelity list (it previously said "entangle_list").
print(f"fidelity_list has been saved to '{file_path}'.")

# Load
#loaded_fidelity_list = np.load(file_path)
#print("Loaded fidelity_list:", loaded_fidelity_list)

In [None]:
# Load the TensorBoard notebook extension and open it on the runs under logs/.
%load_ext tensorboard
%tensorboard --logdir=logs/