In [1]:
!pip install tf_agents
!pip install tf-agents[reverb]



In [2]:
import tensorflow as tf
import tf_agents
import reverb
import numpy as np


from tf_agents.drivers import py_driver
from tf_agents.agents.dqn import dqn_agent
from tf_agents.environments import tf_py_environment
from tf_agents.specs import tensor_spec
from tf_agents.networks import q_network
from tf_agents.networks import sequential
from tf_agents.utils import common
from tf_agents.replay_buffers import reverb_replay_buffer
from tf_agents.replay_buffers import reverb_utils
from tf_agents.policies import py_tf_eager_policy
from tf_agents.policies import epsilon_greedy_policy




import sys
sys.path.append('/content/drive/MyDrive/E6885_Project')
import SortWaterEnv

In [3]:
# Display the TensorFlow version string of the running kernel.
tf.version.VERSION

'2.14.0'

In [4]:
# --- Training schedule ---
num_iterations = 1_000  # passes through the collect-then-train loop

# --- Experience collection / replay buffer ---
# Size of the warm-up collection before training (intended use; confirm where
# this value is consumed).
initial_collect_steps = 100
collect_steps_per_iteration = 1  # `max_steps` of the collect driver per iteration
replay_buffer_max_length = 100_000  # `max_size` of the Reverb table

# --- Optimisation and logging ---
batch_size = 64  # `sample_batch_size` of the replay-buffer dataset
learning_rate = 0.001  # learning rate handed to the Adam optimizer
log_interval = 200  # print the training loss every this many train steps

# --- Evaluation ---
num_eval_episodes = 10  # episodes averaged per evaluation
eval_interval = 1_000  # evaluate the agent's policy every this many train steps

In [5]:
############# create training and evaluation environments #############
num_bottles, water_level = 5, 4
_puzzle_config = {'num_bottles': num_bottles, 'water_level': water_level}

# Three separate Python environments built from the same puzzle settings.
# `env` is used unwrapped; the other two are wrapped for TensorFlow below.
env = SortWaterEnv.WaterSortEnv(**_puzzle_config)
train_py_env = SortWaterEnv.WaterSortEnv(**_puzzle_config)
eval_py_env = SortWaterEnv.WaterSortEnv(**_puzzle_config)

# TF wrappers around the training and evaluation Python environments.
train_env, eval_env = (
    tf_py_environment.TFPyEnvironment(py_env)
    for py_env in (train_py_env, eval_py_env)
)

In [6]:
############ network size and action count for the DQN agent ############
# Passed as `fc_layer_params` when the Q-network is built.
fc_layer_params = (100, 50)

# Convert the environment's action spec into a tensor spec and count the
# integer actions in its inclusive [minimum, maximum] range.
action_tensor_spec = tensor_spec.from_spec(env.action_spec())
_lowest_action = action_tensor_spec.minimum
_highest_action = action_tensor_spec.maximum
num_actions = _highest_action - _lowest_action + 1

In [7]:
# Customized Q-network
class MaskedQNetwork(q_network.QNetwork):
  """QNetwork that is constructed from a tuple of specs.

  Only the first element of the tuple is forwarded to `q_network.QNetwork`;
  the remaining elements are not used by this class. The forward pass is the
  parent's, unchanged.
  """

  def __init__(self, input_tensor_spec, action_spec, fc_layer_params=(100,), **kwargs):
    """Builds the parent QNetwork from the first spec in `input_tensor_spec`.

    Args:
      input_tensor_spec: Indexable collection of specs; element 0 is used as
        the network's observation spec.
      action_spec: Action spec forwarded to the parent constructor.
      fc_layer_params: Forwarded to the parent constructor.
      **kwargs: Any further keyword arguments for the parent constructor.
    """
    # Pull the observation spec out of the input_tensor_spec tuple.
    net_input_spec = input_tensor_spec[0]

    # Let the base-class constructor build the network.
    super(MaskedQNetwork, self).__init__(
        net_input_spec,
        action_spec,
        fc_layer_params=fc_layer_params,
        **kwargs)

  def call(self, observation, step_type=None, network_state=(), training=False):
    """Runs the parent's forward pass and returns its result unchanged."""
    # Delegate directly to the parent's call to process the observation.
    parent_call = super(MaskedQNetwork, self).call
    return parent_call(observation, step_type, network_state, training)

In [8]:
# Specs of the TF training environment; the observation spec is indexed by key.
action_spec = train_env.action_spec()
observation_spec = train_env.observation_spec()

# Build the Q-network from the (observation, action_mask) spec pair.
_spec_pair = (
    observation_spec['observation'],
    observation_spec['action_mask'],
)
q_net = MaskedQNetwork(_spec_pair, action_spec, fc_layer_params=fc_layer_params)

In [9]:
def observation_and_action_constraint_splitter(obs):
  """Splits an observation mapping into its two parts.

  Args:
    obs: Mapping with the keys 'observation' and 'action_mask'.

  Returns:
    A tuple `(obs['observation'], obs['action_mask'])`.
  """
  network_input = obs['observation']
  allowed_actions = obs['action_mask']
  return network_input, allowed_actions

In [10]:
# Optimizer for the Q-network, using the configured learning rate.
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Counter handed to the agent as its train-step counter.
train_step_counter = tf.Variable(0)

# DQN agent on the training environment's specs. The splitter tells the agent
# how to separate the dict observation into network input and action mask.
agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter,
    observation_and_action_constraint_splitter=observation_and_action_constraint_splitter
)

# Initialize the agent before its policies are used or `train` is called, as
# the TF-Agents usage pattern requires. This call was previously missing.
agent.initialize()

In [11]:
# Illustration only: the two policies the agent exposes.
#   - agent.collect_policy: used while collecting experience
#   - agent.policy: used during evaluation
collect_policy, eval_policy = agent.collect_policy, agent.policy

In [12]:
# Example: how to build an epsilon-greedy policy by hand on top of the agent's
# greedy policy. The following example cells use the resulting policy object.
#
# The class is imported by name so that this cell can be re-run: the assignment
# at the bottom rebinds `epsilon_greedy_policy` from the imported module to the
# policy object, after which `epsilon_greedy_policy.EpsilonGreedyPolicy` no
# longer resolves.
from tf_agents.policies.epsilon_greedy_policy import EpsilonGreedyPolicy

base_policy = agent.policy
epsilon = 0.1  # e.g. use 0.1 as the epsilon value
# NOTE: the variable name is kept because later cells call
# `epsilon_greedy_policy.action(...)` on it.
epsilon_greedy_policy = EpsilonGreedyPolicy(base_policy, epsilon=epsilon)

In [13]:
# Example: let the epsilon-greedy policy choose an action on a fresh environment.
example_py_env = SortWaterEnv.WaterSortEnv(
    num_bottles=num_bottles,
    water_level=water_level,
)
example_env = tf_py_environment.TFPyEnvironment(example_py_env)

# Reset to obtain the first time step, show it, then query the policy
# (the resulting PolicyStep is displayed as the cell output).
time_step = example_env.reset()
print(time_step)
epsilon_greedy_policy.action(time_step)

TimeStep(
{'discount': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>,
 'observation': {'action_mask': <tf.Tensor: shape=(1, 20), dtype=bool, numpy=
array([[False,  True,  True, False, False, False, False, False,  True,
        False,  True, False,  True, False,  True, False,  True, False,
         True,  True]])>,
                 'observation': <tf.Tensor: shape=(1, 5, 4), dtype=int32, numpy=
array([[[1, 3, 0, 0],
        [2, 2, 2, 1],
        [3, 0, 0, 0],
        [3, 0, 0, 0],
        [1, 1, 2, 3]]], dtype=int32)>},
 'reward': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.], dtype=float32)>,
 'step_type': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>})


PolicyStep(action=<tf.Tensor: shape=(1,), dtype=int32, numpy=array([14], dtype=int32)>, state=(), info=())

In [14]:
def compute_avg_return(environment, policy, num_episodes=10):
  """Runs `policy` for several episodes and averages the summed rewards.

  Args:
    environment: Object providing `reset()` and `step(action)`, each returning
      a time step with `is_last()` and `reward`.
    policy: Object whose `action(time_step)` result has an `action` attribute.
    num_episodes: Number of episodes to run.

  Returns:
    Element 0 of `.numpy()` of the total return divided by `num_episodes`.
  """
  returns_sum = 0.0

  for _episode in range(num_episodes):
    current = environment.reset()
    collected = 0.0

    # Step until the environment reports the final time step of the episode.
    while True:
      if current.is_last():
        break
      chosen = policy.action(current)
      current = environment.step(chosen.action)
      collected += current.reward

    returns_sum += collected

  mean_return = returns_sum / num_episodes
  return mean_return.numpy()[0]

In [15]:
# Example: average return of the epsilon-greedy policy on the evaluation
# environment (the value is displayed as the cell output).
compute_avg_return(
    eval_env,
    epsilon_greedy_policy,
    num_episodes=num_eval_episodes,
)

-19.9

In [16]:
################# Replay buffer #################
table_name = 'uniform_table'

# Table signature: the agent's collect-data spec converted to tensor specs,
# with an outer dimension added.
replay_buffer_signature = tensor_spec.add_outer_dim(
    tensor_spec.from_spec(agent.collect_data_spec))

# Reverb table with uniform sampling, FIFO removal, a capacity of
# `replay_buffer_max_length` and a MinSize(1) rate limiter.
table = reverb.Table(
    table_name,
    max_size=replay_buffer_max_length,
    sampler=reverb.selectors.Uniform(),
    remover=reverb.selectors.Fifo(),
    rate_limiter=reverb.rate_limiters.MinSize(1),
    signature=replay_buffer_signature,
)

# Reverb server created in this process, hosting the single table.
reverb_server = reverb.Server([table])

# The buffer and the observer that writes into it use the same sequence length.
_sequence_length = 2

replay_buffer = reverb_replay_buffer.ReverbReplayBuffer(
    agent.collect_data_spec,
    table_name=table_name,
    sequence_length=_sequence_length,
    local_server=reverb_server,
)

rb_observer = reverb_utils.ReverbAddTrajectoryObserver(
    replay_buffer.py_client,
    table_name,
    sequence_length=_sequence_length,
)

In [17]:
# Display the spec of the data the agent collects; the replay buffer and its
# table signature are built from this spec.
agent.collect_data_spec

_TupleWrapper(Trajectory(
{'action': BoundedTensorSpec(shape=(), dtype=tf.int32, name='action', minimum=array(0, dtype=int32), maximum=array(19, dtype=int32)),
 'discount': BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
 'next_step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type'),
 'observation': DictWrapper({'observation': BoundedTensorSpec(shape=(5, 4), dtype=tf.int32, name='observation', minimum=array(0, dtype=int32), maximum=array(3, dtype=int32)), 'action_mask': TensorSpec(shape=(20,), dtype=tf.bool, name='action_mask')}),
 'policy_info': (),
 'reward': TensorSpec(shape=(), dtype=tf.float32, name='reward'),
 'step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type')}))

In [18]:
# Display the field names of the collect-data spec.
agent.collect_data_spec._fields

('step_type',
 'observation',
 'action',
 'policy_info',
 'next_step_type',
 'reward',
 'discount')

In [20]:
# Example: drive the Python training environment for 20 steps with the
# epsilon-greedy policy; the driver passes what it collects to `rb_observer`.
time_step = train_py_env.reset()

_example_policy = py_tf_eager_policy.PyTFEagerPolicy(
    epsilon_greedy_policy, use_tf_function=True)
_example_driver = py_driver.PyDriver(
    train_py_env,
    _example_policy,
    [rb_observer],
    max_steps=20,
)

# The value returned by `run` is displayed as the cell output.
_example_driver.run(time_step)

time_step:  TimeStep(
{'discount': array(1., dtype=float32),
 'observation': {'action_mask': array([ True, False,  True,  True, False, False,  True,  True, False,
       False, False, False, False,  True, False,  True, False,  True,
       False,  True]),
                 'observation': array([[1, 1, 3, 2],
       [2, 0, 0, 0],
       [3, 3, 3, 1],
       [1, 2, 0, 0],
       [2, 0, 0, 0]], dtype=int32)},
 'reward': array(0., dtype=float32),
 'step_type': array(0, dtype=int32)})
action_step:  PolicyStep(action=array(0, dtype=int32), state=(), info=())
time_step:  TimeStep(
{'discount': array(1., dtype=float32),
 'observation': {'action_mask': array([False, False, False, False, False, False,  True,  True, False,
       False, False, False, False,  True, False,  True, False,  True,
       False,  True]),
                 'observation': array([[1, 1, 3, 0],
       [2, 2, 0, 0],
       [3, 3, 3, 1],
       [1, 2, 0, 0],
       [2, 0, 0, 0]], dtype=int32)},
 'reward': array(-1., dtype=float

(TimeStep(
 {'discount': array(1., dtype=float32),
  'observation': {'action_mask': array([False, False,  True, False, False, False, False,  True, False,
        False, False, False,  True, False, False, False, False,  True,
        False, False]),
                  'observation': array([[1, 1, 0, 0],
        [3, 3, 0, 0],
        [2, 2, 2, 2],
        [1, 1, 0, 0],
        [3, 3, 0, 0]], dtype=int32)},
  'reward': array(-1., dtype=float32),
  'step_type': array(1, dtype=int32)}),
 ())

In [21]:
# Display the current state of the unwrapped `env` instance.
env.get_state()

{'observation': array([[1, 3, 1, 0],
        [2, 2, 2, 1],
        [3, 3, 3, 2],
        [1, 0, 0, 0],
        [0, 0, 0, 0]], dtype=int32),
 'action_mask': array([False, False,  True,  True,  True, False,  True,  True, False,
        False, False,  True,  True, False, False,  True, False, False,
        False, False])}

In [22]:
# Dataset over the replay buffer: batches of `batch_size` samples, two steps
# per sample, read with three parallel calls.
_sampled = replay_buffer.as_dataset(
    sample_batch_size=batch_size,
    num_steps=2,
    num_parallel_calls=3,
)
# Prefetch up to three elements.
dataset = _sampled.prefetch(3)

# Show the dataset (and its element spec) as the cell output.
dataset

<_PrefetchDataset element_spec=(Trajectory(
{'action': TensorSpec(shape=(64, 2), dtype=tf.int32, name=None),
 'discount': TensorSpec(shape=(64, 2), dtype=tf.float32, name=None),
 'next_step_type': TensorSpec(shape=(64, 2), dtype=tf.int32, name=None),
 'observation': DictWrapper({'observation': TensorSpec(shape=(64, 2, 5, 4), dtype=tf.int32, name=None), 'action_mask': TensorSpec(shape=(64, 2, 20), dtype=tf.bool, name=None)}),
 'policy_info': (),
 'reward': TensorSpec(shape=(64, 2), dtype=tf.float32, name=None),
 'step_type': TensorSpec(shape=(64, 2), dtype=tf.int32, name=None)}), SampleInfo(key=TensorSpec(shape=(64, 2), dtype=tf.uint64, name=None), probability=TensorSpec(shape=(64, 2), dtype=tf.float64, name=None), table_size=TensorSpec(shape=(64, 2), dtype=tf.int64, name=None), priority=TensorSpec(shape=(64, 2), dtype=tf.float64, name=None), times_sampled=TensorSpec(shape=(64, 2), dtype=tf.int32, name=None)))>

In [23]:
# Iterator over the replay-buffer dataset; the training loop calls
# `next(iterator)` to fetch one batch of experience per train step.
iterator = iter(dataset)

In [24]:
# demo: only 1 episode
# Wrap `agent.train` in a TF function only once. After wrapping, the attribute
# is expected to expose `get_concrete_function`, so re-running this cell (or
# another cell that wraps `agent.train`) does not stack a second wrapper.
if not hasattr(agent.train, 'get_concrete_function'):
  agent.train = common.function(agent.train)

# Reset the train step counter.
agent.train_step_counter.assign(0)

# Average return of the agent's policy over a single evaluation episode.
avg_return = compute_avg_return(eval_env, agent.policy, 1)

In [25]:
try:
  %%time
except:
  pass

# (Optional) Optimize by wrapping some of the code in a graph using TF function.
agent.train = common.function(agent.train)

# Reset the train step.
agent.train_step_counter.assign(0)

# Evaluate the agent's policy once before training.
avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
returns = [avg_return]

# Reset the environment.
time_step = train_py_env.reset()

# Create a driver to collect experience.
collect_driver = py_driver.PyDriver(
    train_py_env,
    py_tf_eager_policy.PyTFEagerPolicy(
      agent.collect_policy, use_tf_function=True),
    [rb_observer],
    max_steps=collect_steps_per_iteration)

iter_count = 0
for _ in range(num_iterations):
  iter_count += 1
  print('iter_count: ',iter_count)
  # Collect a few steps and save to the replay buffer.
  time_step, _ = collect_driver.run(time_step)

  # Sample a batch of data from the buffer and update the agent's network.
  experience, unused_info = next(iterator)
  train_loss = agent.train(experience).loss

  step = agent.train_step_counter.numpy()

  if step % log_interval == 0:
    print('step = {0}: loss = {1}'.format(step, train_loss))

  if step % eval_interval == 0:
    avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
    print('step = {0}: Average Return = {1}'.format(step, avg_return))
    returns.append(avg_return)

iter_count:  1
time_step:  TimeStep(
{'discount': array(1., dtype=float32),
 'observation': {'action_mask': array([False, False, False, False,  True, False,  True, False,  True,
       False,  True, False, False, False, False, False,  True, False,
       False,  True]),
                 'observation': array([[0, 0, 0, 0],
       [2, 2, 2, 1],
       [3, 3, 3, 1],
       [0, 0, 0, 0],
       [1, 3, 2, 1]], dtype=int32)},
 'reward': array(0., dtype=float32),
 'step_type': array(0, dtype=int32)})
action_step:  PolicyStep(action=array(19, dtype=int32), state=(), info=())


Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.foldr(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.foldr(fn, elems))


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 'reward': array(-1., dtype=float32),
 'step_type': array(1, dtype=int32)})
action_step:  PolicyStep(action=array(3, dtype=int32), state=(), info=())
iter_count:  674
time_step:  TimeStep(
{'discount': array(1., dtype=float32),
 'observation': {'action_mask': array([False, False, False, False,  True,  True, False, False,  True,
       False, False, False,  True, False, False, False,  True, False,
       False, False]),
                 'observation': array([[0, 0, 0, 0],
       [2, 2, 3, 2],
       [3, 3, 2, 0],
       [3, 0, 0, 0],
       [1, 1, 1, 1]], dtype=int32)},
 'reward': array(-1., dtype=float32),
 'step_type': array(1, dtype=int32)})
action_step:  PolicyStep(action=array(5, dtype=int32), state=(), info=())
iter_count:  675
time_step:  TimeStep(
{'discount': array(1., dtype=float32),
 'observation': {'action_mask': array([False, False, False, False,  True, False,  True, False,  True,
       False, False, False,  

In [None]:
# Display how many training iterations the loop above has started.
iter_count