In [None]:
import numpy as np

import spin_simulation as ss

import tensorflow as tf

from tf_agents.agents.dqn import dqn_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import q_network, q_rnn_network
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common

from environments import spin_sys_discrete

In [None]:
# Re-execute the already-imported environment module so that edits to its
# source file take effect without restarting the kernel.
import importlib
importlib.reload(spin_sys_discrete)

In [None]:
# Number of passes through the training loop (one train call per pass).
num_iterations = 10000 # @param {type:"integer"}

# Data-collection settings.
# NOTE(review): `initial_collect_steps` is not referenced by any visible cell.
initial_collect_steps = 1000  # @param {type:"integer"}
# Environment steps collected with the collect policy in each training iteration.
collect_steps_per_iteration = 1  # @param {type:"integer"}
# Passed to the DQN agent as its `n_step_update` argument.
n_step_update = 2 # @param {type:"integer"}
# Passed to the replay buffer as `max_length`.
replay_buffer_max_length = 100000  # @param {type:"integer"}

# Optimisation and logging settings.
# `batch_size` is the sample batch size drawn from the replay buffer dataset.
batch_size = 64  # @param {type:"integer"}
learning_rate = 1e-3  # @param {type:"number"}
# The training loss is printed whenever the train step is a multiple of this.
log_interval = 200  # @param {type:"integer"}

# Evaluation settings.
# Episodes averaged for the random-policy baseline and the pre-training evaluation.
num_eval_episodes = 10  # @param {type:"integer"}
# The policy is evaluated whenever the train step is a multiple of this.
eval_interval = 1000  # @param {type:"integer"}

In [None]:
# Spin-system parameters: N spins give a state space of dimension 2**N.
N = 4
dim = 2 ** N

# Interaction parameters (units are not stated in this notebook).
delta = 500
coupling = 1e3

# Total-spin operators and the target Hamiltonian built from them.
X, Y, Z = ss.get_total_spin(N=N, dim=dim)
H_target = ss.get_H_WHH_0(X, Y, Z, delta)

In [None]:
# Build the training and evaluation environments from ONE shared configuration.
# The system parameters come from the variables defined above (N, dim,
# coupling, delta) rather than repeated literals, so the environments can
# never drift out of sync with the H_target, X and Y that were computed from
# those same variables.
env_kwargs = dict(
    N=N,
    dim=dim,
    coupling=coupling,
    delta=delta,
    H_target=H_target,
    X=X,
    Y=Y,
    delay=5e-6,
    pulse_width=0,
    delay_after=True,
)

# Two independent instances: one for collecting training data, one for evaluation.
train_py_env = spin_sys_discrete.SpinSystemDiscreteEnv(**env_kwargs)
eval_py_env = spin_sys_discrete.SpinSystemDiscreteEnv(**env_kwargs)

print('Observation Spec:')
print(eval_py_env.time_step_spec().observation)

print('Reward Spec:')
print(eval_py_env.time_step_spec().reward)

print('Action Spec:')
print(eval_py_env.action_spec())

# Wrap the Python environments so they can be driven with TensorFlow tensors.
train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

Define the q network.

I've been trying to use a Q-RNN, but I don't know what the behavior of that is exactly, so I'm trying both.

In [None]:
# Online Q-network, sized from the training environment's observation and action specs.
q_net = q_network.QNetwork(train_env.observation_spec(), train_env.action_spec())

In [None]:
# Separate network instance with the same specs, handed to the agent as its target network.
target_q_net = q_network.QNetwork(train_env.observation_spec(), train_env.action_spec())

In [None]:
# Sanity check: feed the Q-network a single all-zero float32 input and display
# element [0] of what it returns, converted to a NumPy array.
# NOTE(review): the (1, 5, 5) shape is hard-coded; it presumably corresponds to
# train_env.observation_spec() with a leading batch dimension of 1 — confirm.
#q_net.summary()
q_net(np.zeros((1,5,5), dtype="float32"))[0].numpy()

In [None]:
# Recurrent alternative to `q_net`, built from the same specs.
q_rnn_net = q_rnn_network.QRnnNetwork(train_env.observation_spec(), train_env.action_spec())

Create optimizer and agent. **Make sure to change the q_network arg to the proper network above**.

In [None]:
# Counter handed to the agent as its train-step counter.
train_step_counter = tf.Variable(0)

# Adam optimizer using the learning rate from the hyperparameter cell.
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

# DQN agent wired to the online / target Q-networks defined above.
agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    target_q_network=target_q_net,
    target_update_period=10,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    gamma=0.99,
    n_step_update=n_step_update,
    train_step_counter=train_step_counter,
)

agent.initialize()

In [None]:
# Keep handles to the agent's two policies: `policy` for evaluation and
# `collect_policy` for gathering training data.
eval_policy, collect_policy = agent.policy, agent.collect_policy

In [None]:
# Random policy over the training environment's specs; used below as a
# baseline and to seed the replay buffer.
random_policy = random_tf_policy.RandomTFPolicy(
    train_env.time_step_spec(),
    train_env.action_spec(),
)

In [None]:
def compute_avg_return(environment, policy, num_episodes=10):
    """Run `policy` in `environment` and return the mean undiscounted episode return.

    Args:
        environment: Environment exposing `reset()`, `step(action)` and
            `batch_size`.
        policy: Policy exposing `get_initial_state(batch_size)` and
            `action(time_step, policy_state=...)`; the returned step must
            carry `.action` and `.state`.
        num_episodes: Number of episodes to average over. Must be positive.

    Returns:
        The average return of the first batch element, as a NumPy scalar.

    Raises:
        ValueError: If `num_episodes` is not positive.
    """
    if num_episodes <= 0:
        raise ValueError(f"num_episodes must be positive, got {num_episodes}")

    total_return = 0.0
    for _ in range(num_episodes):

        time_step = environment.reset()
        # Thread the policy state through the episode so stateful policies work too.
        policy_state = policy.get_initial_state(environment.batch_size)
        episode_return = 0.0

        while not time_step.is_last():
            action_step = policy.action(time_step, policy_state=policy_state)
            policy_state = action_step.state
            time_step = environment.step(action_step.action)
            episode_return += time_step.reward
        total_return += episode_return

    avg_return = total_return / num_episodes
    # np.asarray accepts both tensors and plain floats. The latter occurs when
    # every episode terminates right after reset, so no reward is ever added
    # and the accumulator is still the Python float it started as.
    return np.asarray(avg_return).reshape(-1)[0]

In [None]:
# Baseline: average return of the random policy on the evaluation environment.
compute_avg_return(eval_env, random_policy, num_episodes=num_eval_episodes)

In [None]:
# TODO include other metrics

Create the replay buffer, and define methods to collect data.

In [None]:
# Uniform replay buffer storing items that match the agent's collect data spec.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    max_length=replay_buffer_max_length,
    batch_size=train_env.batch_size,
    data_spec=agent.collect_data_spec,
)

In [None]:
def collect_step(environment, policy, policy_state, buffer):
    """Advance `environment` by one step under `policy` and store the transition.

    Args:
        environment: Environment providing `current_time_step()` and `step()`.
        policy: Policy whose `action(time_step, policy_state)` picks the action.
        policy_state: State passed to the policy for this step.
        buffer: Replay buffer receiving the transition via `add_batch`.

    Returns:
        The policy state produced by this step (`action_step.state`).
    """
    before = environment.current_time_step()
    chosen = policy.action(before, policy_state)
    after = environment.step(chosen.action)

    # Package (time step, action, next time step) as a trajectory and store it.
    buffer.add_batch(trajectory.from_transition(before, chosen, after))

    return chosen.state

def collect_data(env, policy, buffer, steps, policy_state = None):
    """Collect `steps` transitions from `env` with `policy` into `buffer`.

    If `policy_state` is None, the policy's initial state for
    `env.batch_size` is used as the starting state. The state is threaded
    from one step to the next; nothing is returned.
    """
    state = policy_state
    if state is None:
        state = policy.get_initial_state(env.batch_size)

    for _step_index in range(steps):
        state = collect_step(env, policy, state, buffer)

In [None]:
# Seed the replay buffer with 100 transitions gathered by the random policy.
# NOTE(review): the step count is hard-coded; `initial_collect_steps` is
# defined among the hyperparameters but is not used here — confirm which value
# is intended.
collect_data(train_env, random_policy, replay_buffer, steps=100)

In [None]:
#iter(replay_buffer.as_dataset()).next()

Define a dataset object, which samples the replay buffer and generates trajectories (a series of time steps and actions).

In [None]:
# The agent is configured with `n_step_update`, so every training sample must
# span n_step_update + 1 consecutive time steps (n transitions need n + 1
# steps). A hard-coded window of 2 is only correct for one-step updates.
# Dataset generates trajectories with shape [B x (n_step_update + 1) x ...]
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3,
    sample_batch_size=batch_size,
    num_steps=n_step_update + 1).prefetch(3)


dataset

In [None]:
# Iterator over the replay-buffer dataset; the training loop pulls one batch
# from it per iteration with `next(iterator)`.
iterator = iter(dataset)

print(iterator)

In [None]:
#iterator.next()

## Train the agent

In [None]:
# (Optional) Optimize by wrapping some of the code in a graph using TF function.
agent.train = common.function(agent.train)

# Reset the train step
agent.train_step_counter.assign(0)

# Evaluate the agent's policy once before training.
avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
returns = [avg_return]
print(returns)

# TODO

- Looks like Q network is always increasing values, even though rewards are always ~0. Understand why this is.
- Network is just flattening input, not taking advantage of sequential structure (need LSTM...). Create custom Q network.
- NOT WORKING!!!
- Try out PPO...

In [None]:
# Start from a fresh episode and a fresh collect-policy state.
train_env.reset()
policy_state = agent.collect_policy.get_initial_state(train_env.batch_size)

# Training stops early once an evaluation return exceeds this value.
target_return = 50

for _ in range(num_iterations):

    # Collect a few steps using collect_policy and save to the replay buffer.
    # (Distinct loop variable so the outer loop's `_` is not shadowed.)
    for _collect_index in range(collect_steps_per_iteration):
        policy_state = collect_step(train_env, agent.collect_policy, policy_state, replay_buffer)

    # Sample a batch of data from the buffer and update the agent's network.
    experience, unused_info = next(iterator)
    train_loss = agent.train(experience).loss

    step = agent.train_step_counter.numpy()

    if step % log_interval == 0:
        print(f'step = {step}: loss = {train_loss}')

    if step % eval_interval == 0:
        # A single evaluation episode is used here to keep the loop fast.
        avg_return = compute_avg_return(eval_env, agent.policy, 1)
        print(f'step = {step}: Average Return = {avg_return}')
        # Record the result BEFORE the early-exit check so the evaluation that
        # triggers the stop is not lost from `returns`.
        returns.append(avg_return)
        if avg_return > target_return:
            break

## Evaluate the agent

Roll out one evaluation episode with the trained policy to see which pulse sequence it performs.

In [None]:
# Roll out a single episode on the evaluation environment with the agent's
# policy, printing the chosen action, the reward and the running return at
# every step.
time_step = eval_env.reset()
episode_return = 0.0
policy_state = agent.policy.get_initial_state(eval_env.batch_size)
while not time_step.is_last():
    action_step = agent.policy.action(time_step, policy_state = policy_state)
    # Carry the policy state forward to the next step.
    policy_state = action_step.state
    time_step = eval_env.step(action_step.action)
    episode_return += time_step.reward
    print(f"action: {action_step.action}, reward: {time_step.reward}, return: {episode_return}")

In [None]:
# Inspect a nested submodule of the agent's policy.
# NOTE(review): the [1] indices rely on the order in which submodules happen
# to be enumerated — presumably this selects part of the Q-network; confirm.
agent.policy.submodules[1].submodules[1]