In [6]:
import tensorflow as tf
import numpy as np
import gym
import rl_utils
import matplotlib.pyplot as plt

from tensorflow import keras
from tensorflow.keras import layers


print(tf.__version__)
print(gym.__version__)

logdir = 'logs/scalars'
file_writer = tf.summary.create_file_writer(logdir + "/AC_tf")
file_writer.set_as_default()

# tf.summary.trace_on(graph=True)

%tensorboard --logdir logs/scalars

2.8.0
0.21.0


UsageError: Line magic function `%tensorboard` not found.


In [7]:
# --- Reproducibility ---------------------------------------------------
tf.random.set_seed(0)

# --- Environment ---------------------------------------------------------
env_name = 'CartPole-v0'
env = gym.make(env_name)
env.seed(0)

# Network input/output sizes are read from the environment's spaces.
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

# --- Training hyper-parameters -------------------------------------------
num_episodes = 1000
hidden_dim = 128
gamma = 0.98       # discount factor
actor_lr = 1e-3
critic_lr = 1e-2   # the critic learns with a larger step size than the actor


In [8]:
# Functional graph shared by the actor and the critic: one hidden trunk
# feeding two heads.
#
# `shape` must be a tuple; `(state_dim)` is just an int, so the trailing comma
# is required to express "a vector of length state_dim".
inputs_layer = keras.Input(shape=(state_dim,))

# Hidden width comes from the `hidden_dim` hyper-parameter instead of a
# duplicated literal, so changing the config cell actually changes the network.
common_layer = layers.Dense(hidden_dim, activation='relu')(inputs_layer)

# Policy head: a probability for each discrete action.
actor_layer = layers.Dense(action_dim, activation='softmax')(common_layer)

# Value head: linear output. A ReLU here would clamp V(s) at zero and give the
# critic a zero gradient whenever the pre-activation goes negative.
critic_layer = layers.Dense(1)(common_layer)

In [12]:
class ActorCritic:
    """One-step (TD(0)) actor-critic agent.

    The actor and the critic are two ``keras.Model`` views over the
    notebook-level functional graph (``inputs_layer`` feeding ``actor_layer``
    and ``critic_layer``), so any layer those two heads have in common is
    shared between the models. Each model is trained by its own Adam optimizer.
    """

    def __init__(self, actor_lr, critic_lr, gamma) -> None:
        """Build both models and their optimizers.

        Args:
            actor_lr: learning rate for the actor's Adam optimizer.
            critic_lr: learning rate for the critic's Adam optimizer.
            gamma: discount factor used when forming the TD target.
        """
        self.actor = keras.Model(inputs=inputs_layer, outputs=actor_layer)
        self.critic = keras.Model(inputs=inputs_layer, outputs=critic_layer)

        self.actor_optimizer = keras.optimizers.Adam(learning_rate=actor_lr)
        self.critic_optimizer = keras.optimizers.Adam(learning_rate=critic_lr)

        self.gamma = gamma

    def take_action(self, state):
        """Sample one action from the actor's distribution for a single state.

        Args:
            state: one observation (no batch dimension).

        Returns:
            The sampled action index as a NumPy integer scalar.
        """
        # Add the batch dimension in NumPy, then hand a single array to TF.
        batch = np.expand_dims(np.asarray(state, dtype=np.float32), axis=0)
        probs = self.actor(tf.convert_to_tensor(batch))
        # tf.random.categorical expects unnormalised log-probabilities
        # (logits); passing raw probabilities would sample from
        # softmax(probs), which is not the policy.
        logits = tf.math.log(probs)
        action = tf.random.categorical(logits, 1)[0][0].numpy()
        return action

    def update(self, transition_dict):
        """Run one actor step and one critic step on a batch of transitions.

        Args:
            transition_dict: mapping with the keys 'states', 'actions',
                'rewards', 'next_states' and 'dones', each holding one entry
                per transition.
        """
        # Convert through NumPy: it casts bool/int entries to the requested
        # dtype, which tf.constant may refuse for Python bools, and it avoids
        # converting a Python list of arrays element by element.
        states = tf.convert_to_tensor(
            np.asarray(transition_dict['states'], dtype=np.float32))
        next_states = tf.convert_to_tensor(
            np.asarray(transition_dict['next_states'], dtype=np.float32))
        actions = tf.reshape(
            tf.convert_to_tensor(np.asarray(transition_dict['actions'], dtype=np.int32)),
            [-1, 1])
        rewards = tf.reshape(
            tf.convert_to_tensor(np.asarray(transition_dict['rewards'], dtype=np.float32)),
            [-1, 1])
        dones = tf.reshape(
            tf.convert_to_tensor(np.asarray(transition_dict['dones'], dtype=np.float32)),
            [-1, 1])

        # Persistent tape: two separate gradient calls are made below.
        with tf.GradientTape(persistent=True) as tape:
            values = self.critic(states)

            # The bootstrap target is treated as a constant (semi-gradient TD),
            # so the critic loss only differentiates through `values`.
            td_target = tf.stop_gradient(
                rewards + self.gamma * self.critic(next_states) * (1 - dones))

            # The advantage estimate only weights the policy gradient; without
            # stop_gradient the actor loss would also push gradients through
            # the critic head and the layers shared with it.
            td_error = tf.stop_gradient(td_target - values)

            # batch_dims=1 selects, for each row, the probability of the action
            # taken in that row -> shape [N, 1]. Without it tf.gather returns
            # every row's probability for every action index -> [N, N, 1].
            probs = tf.gather(self.actor(states), actions, axis=1, batch_dims=1)
            log_probs = tf.math.log(probs)
            actor_loss = tf.reduce_mean(-log_probs * td_error)

            critic_loss = tf.reduce_mean(tf.losses.MSE(td_target, values))

        # Both gradients are computed before either optimizer changes a weight.
        actor_grad = tape.gradient(actor_loss, self.actor.trainable_variables)
        critic_grad = tape.gradient(critic_loss, self.critic.trainable_variables)
        # Release the resources held by the persistent tape.
        del tape

        self.actor_optimizer.apply_gradients(zip(actor_grad, self.actor.trainable_variables))
        self.critic_optimizer.apply_gradients(zip(critic_grad, self.critic.trainable_variables))

In [1]:
# Build the agent from the configured hyper-parameters.
agent = ActorCritic(actor_lr=actor_lr, critic_lr=critic_lr, gamma=gamma)

# Run on-policy training; the helper's result is kept as `return_list`
# for the plotting cell.
return_list = rl_utils.train_on_policy_agent(env, agent, num_episodes)



NameError: name 'ActorCritic' is not defined

In [None]:
# return_list = []

# for i in range(10):
#     for i_episode in range(int(num_episodes/10)):
#         episode_return = 0
#         transition_dict = {'states': [], 'actions': [],
#                                 'next_states': [], 'rewards': [], 'dones': []}
#         state = env.reset()
#         done = False
#         while not done:
#             action = agent.take_action(state)
#             next_state, reward, done, _ = env.step(action)
#             transition_dict['states'].append(state)
#             transition_dict['actions'].append(action)
#             transition_dict['next_states'].append(next_state)
#             transition_dict['rewards'].append(reward)
#             transition_dict['dones'].append(done)
#             state = next_state
#             episode_return += reward
#         return_list.append(episode_return)
#         agent.update(transition_dict)


In [None]:
def _show_returns(x_values, y_values):
    """Plot one returns-per-episode curve with the shared labels and show it."""
    plt.plot(x_values, y_values)
    plt.xlabel('Episodes')
    plt.ylabel('Returns')
    plt.title('Actor-Critic on {}'.format(env_name))
    plt.show()


# Raw per-episode returns.
episodes_list = list(range(len(return_list)))
_show_returns(episodes_list, return_list)

# Same curve after rl_utils.moving_average with a window argument of 9.
mv_return = rl_utils.moving_average(return_list, 9)
_show_returns(episodes_list, mv_return)


---

In [None]:
# Scratch check: turn a small Python list into a NumPy array and print it.
a = list(range(1, 6))
a1 = np.array(a)
print(a1)

In [None]:
# Scratch: reset the TF seed and rebuild a freshly seeded environment for the
# experiments below (this rebinds `env_name` and `env`).
tf.random.set_seed(0)

env_name = 'CartPole-v0'
env = gym.make(env_name)
env.seed(0)

In [None]:
# Scratch: set the global TF seed once, then draw two uniform samples in a row.
tf.random.set_seed(1234)
print(tf.random.uniform(shape=[1]))  # first draw after seeding  ('A1')
print(tf.random.uniform(shape=[1]))  # second draw after seeding ('A2')


In [None]:
# Scratch: dtype of a scalar tensor built from a Python float, and the result
# of adding a Python int to it.
s = tf.constant(1.0)
print(s.dtype)

s += 1
print(s)

In [None]:
# Scratch: wrap one fresh observation in a leading batch dimension as a
# float32 tensor, then add a scalar to it.
state = tf.constant([env.reset()], dtype=tf.float32)
print(state)

state += 1.0
print(state)

In [None]:
# Scratch: collect two consecutive observations (the reset state and the state
# after taking action 0) in the same dict-of-lists layout used for transitions.
s_dict = {'states': [env.reset()]}

state, _, _, _ = env.step(0)
s_dict['states'].append(state)

print(s_dict['states'])

In [None]:
# Stand-alone actor model over the shared graph, used to sanity-check a
# batched forward pass.
actor_net = keras.Model(inputs_layer, actor_layer)

# Stack the observations recorded in s_dict into one float32 batch
# (two rows, one per recorded state).
s1 = tf.constant(s_dict['states'], dtype=tf.float32)
print(s1)

# One row of softmax outputs per input row.
a1 = actor_net(s1)
print(a1)

In [None]:
# Autodiff probe: is a gradient still available when the taped result is
# wrapped in tf.constant?
v = tf.Variable([0.0])
with tf.GradientTape() as g:
    # NOTE(review): unclear from here whether tf.constant(...) keeps the sum
    # connected to the tape — that is what this cell is testing.
    loss = tf.constant(v + v)
# Displays d(loss)/dv as a NumPy array. If the tape returns None for the
# gradient, the .numpy() call raises AttributeError instead.
g.gradient(loss, v).numpy()


In [None]:
# Autodiff probe: gradient with respect to a plain tensor rather than a
# tf.Variable.
q = tf.constant([0.0])
with tf.GradientTape() as g:
    # q is never passed to g.watch(...) inside this block.
    loss = (q + q)
# NOTE(review): presumably the tape only tracks watched tensors and trainable
# variables, in which case this prints None — confirm by running.
print(g.gradient(loss, q))

In [None]:
# Probe: what kind of object results from adding a Python scalar to a
# tf.Variable? The following cell prints both values for comparison.
q1 = tf.Variable([0.0])
# NOTE(review): presumably q2 is a plain tensor rather than a Variable — confirm.
q2 = q1 + 1


In [None]:
# Show the variable, the derived value, its NumPy view and its shape,
# one per line.
print(q1, q2, q2.numpy(), q2.shape, sep='\n')

In [None]:
# Autodiff probe: which kinds of objects receive a gradient from a tape that
# has no explicit watch() calls?
# A trainable variable
x0 = tf.Variable(3.0, name='x0')
# Not trainable
x1 = tf.Variable(3.0, name='x1', trainable=False)
# Not a Variable: A variable + tensor returns a tensor.
x2 = tf.Variable(2.0, name='x2') + 1.0
# Not a variable
x3 = tf.constant(3.0, name='x3')

with tf.GradientTape() as tape:
  # y is built from x0, x1 and x2 only; x3 does not appear in it.
  y = (x0**2) + (x1**2) + (x2**2)

# One gradient entry per requested source, in the same order.
grad = tape.gradient(y, [x0, x1, x2, x3])

# NOTE(review): presumably only x0 (the trainable Variable) yields a tensor
# and the other three print None — confirm by running.
for g in grad:
  print(g)


In [None]:
# Autodiff probe: two tapes opened in the same `with` statement, each
# explicitly watching a different constant tensor.
x0 = tf.constant(0.0)
x1 = tf.constant(0.0)

with tf.GradientTape() as tape0, tf.GradientTape() as tape1:
  # Constants have to be watched explicitly; tape0 watches only x0 and
  # tape1 watches only x1.
  tape0.watch(x0)
  tape1.watch(x1)

  y0 = tf.math.sin(x0)
  y1 = tf.nn.sigmoid(x1)

  y = y0 + y1

  # Reduce to a single value so it could serve as a gradient target.
  ys = tf.reduce_sum(y)

# NOTE(review): no tape.gradient(...) call follows in this cell, so neither
# tape is actually queried here.


In [None]:
# Two samples with three targets each.
y_true = [[0., 1. ,1], [0., 0. ,1]]
y_pred = [[1., 1. ,1], [1., 0., 1]]
# Using the 'none' reduction type (NOT the default 'auto'/'sum_over_batch_size'):
# the loss is not averaged over the batch.
# NOTE(review): presumably this yields one MSE value per sample (row) — confirm.
mse = tf.keras.losses.MeanSquaredError(reduction='none')
mse(y_true, y_pred).numpy()
