In [37]:
## setup

import gym

import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Dense, BatchNormalization, \
    Concatenate
from tensorflow.keras.optimizers import Adam

In [39]:
## hyperparameters

# https://gym.openai.com/envs/Pendulum-v0/
ENV = 'Pendulum-v0'

# exploration noise: defaults / arguments of the Ornstein-Uhlenbeck process
THETA = 0.15
DT = 0.01
STD = 0.2

# experience replay: number of stored transitions and sampled batch size
BUFFER_CAPACITY = 100_000
BATCH_SIZE = 64

# learning rates handed to the Adam optimizers
CRITIC_LR = 2e-3
ACTOR_LR = 1e-3

# training schedule, discount factor and target-network update rate
# (presumably consumed by the training loop - verify against the caller)
EPISODES = 100
GAMMA = 0.99
TAU = 5e-3

In [9]:
## environment

env = gym.make(ENV)

# sizes of the observation vector and of the action vector
num_states = env.observation_space.shape[0]
num_actions = env.action_space.shape[0]

# limits of the first component of the continuous action
lower_bound = env.action_space.low[0]
upper_bound = env.action_space.high[0]

# report the dimensions and limits read from the environment
print(f'state space: {num_states}')
print(f'action space: {num_actions}')
print(f'continuous action max: {upper_bound}')
print(f'continuous action min: {lower_bound}')

state space: 3
action space: 1
continuous action max: 2.0
continuous action min: -2.0


In [11]:
## ornstein-uhlenbeck process

class OUActionNoise:
    """
    Ornstein-Uhlenbeck process used as the exploration noise process.
    It produces temporally correlated noise in order to explore well
    in physical environments that have momentum.
    In the DDPG paper, theta = 0.15, sigma = 0.2

    https://en.wikipedia.org/wiki/Ornstein%E2%80%93Uhlenbeck_process

    Each call advances the discretized process by one step:

        x = x_prev + theta * (mean - x_prev) * dt + std * sqrt(dt) * N(0, 1)

    mean = value the process is pulled toward (array or scalar);
           its shape is the shape of every noise sample
    std = scale of the random term
    theta = strength of the pull toward mean
    dt = size of one time step
    x_initial = optional starting value; zeros shaped like mean when None
    """
    def __init__(self, mean, std, theta = THETA, dt = DT, x_initial = None):
        self.theta = theta
        self.mean = mean
        self.std = std
        self.dt = dt
        self.x_initial = x_initial
        self.reset()

    def __call__(self):
        """
        Advance the process by one step and return the new noise value.
        """
        # np.shape also accepts scalars and lists, unlike the .shape attribute
        x = (
            self.x_prev
             + self.theta * (self.mean - self.x_prev) * self.dt
             + self.std * np.sqrt(self.dt) * np.random.normal(size = np.shape(self.mean))
        )
        # it makes next noise dependent on current noise
        self.x_prev = x
        return x

    def reset(self):
        """
        Restart the process from x_initial, or from zeros when it is None.
        """
        if self.x_initial is not None:
            self.x_prev = self.x_initial
        else:
            self.x_prev = np.zeros_like(self.mean)

In [29]:
## actor network and critic network

def get_actor():
    """
    Build the Actor (policy) network.
    Its output is used to select the action. Here it is continuous.

    Makes tf.keras.Model by Functional API.

    The kernel of the last layer is initialized between -0.003 and 0.003.
    This prevents us from getting 1 or -1 output values in the initial stages.
    1 or -1 would squash our gradients to zero, as we use tanh activation.

    kernel_initializer defines the way to set the initial random weights of Keras layers.
    tanh, the hyperbolic tangent activation function, produces numbers between -1 and 1.
    Because the Pendulum environment has action space -2 to 2, outputs are multiplied by upper_bound (2).

    Returns a model that takes num_states inputs and gives num_actions outputs.
    """
    last_init = tf.random_uniform_initializer(minval = -0.003, maxval = 0.003)
    inputs = Input(shape = (num_states,))
    out = Dense(512, activation = 'relu')(inputs)
    out = BatchNormalization()(out)
    out = Dense(512, activation = 'relu')(out)
    out = BatchNormalization()(out)
    # one output unit per action dimension instead of a hard-coded 1,
    # matching the action input width used by get_critic
    outputs = Dense(num_actions, activation = 'tanh', kernel_initializer = last_init)(out)
    # scale the tanh range (-1, 1) to the action range of the environment
    outputs = outputs * upper_bound
    model = Model(inputs, outputs)
    return model

def get_critic():
    """
    Build the Critic network, which estimates the action-value.
    Makes tf.keras.Model by Functional API.

    The model takes two inputs, [state, action], and returns one value.
    State and action go through separate layers before being concatenated.
    """
    # state as input
    # shape must be a tuple: (num_states) is just an int, (num_states,) is a 1-tuple
    state_input = Input(shape = (num_states,))
    state_out = Dense(16, activation = 'relu')(state_input)
    state_out = BatchNormalization()(state_out)
    state_out = Dense(32, activation = 'relu')(state_out)
    state_out = BatchNormalization()(state_out)
    # action as input
    action_input = Input(shape = (num_actions,))
    action_out = Dense(32, activation = 'relu')(action_input)
    action_out = BatchNormalization()(action_out)
    # both are passed through separate layers before concatenating
    concat = Concatenate()([state_out, action_out])
    out = Dense(512, activation = 'relu')(concat)
    out = BatchNormalization()(out)
    out = Dense(512, activation = 'relu')(out)
    out = BatchNormalization()(out)
    # single linear unit: the estimated value of the (state, action) pair
    outputs = Dense(1)(out)
    model = Model([state_input, action_input], outputs)
    return model

In [24]:
## experience replay buffer

class Buffer:
    """
    Experience replay buffer

    Stores (state, action, reward, next_state) transitions in fixed-size
    NumPy arrays and updates the Actor and Critic from sampled batches.
    Once buffer_capacity transitions are stored, the oldest are overwritten.
    """
    def __init__(self, buffer_capacity = BUFFER_CAPACITY, batch_size = BATCH_SIZE,
                 gamma = GAMMA):
        """
        buffer_capacity = maximum number of stored transitions
        batch_size = number of transitions sampled by learn
        gamma = discount factor applied to the target Critic's value
        """
        self.buffer_capacity = buffer_capacity
        self.batch_size = batch_size
        self.gamma = gamma
        # initialize buffer_counter which is incremented by record method
        self.buffer_counter = 0
        self.state_buffer = np.zeros((self.buffer_capacity, num_states))
        self.action_buffer = np.zeros((self.buffer_capacity, num_actions))
        self.reward_buffer = np.zeros((self.buffer_capacity, 1))
        self.next_state_buffer = np.zeros((self.buffer_capacity, num_states))

    def record(self, obs_tuple):
        """
        Store one transition given as (state, action, reward, next_state).

        When buffer_counter >= buffer_capacity,
        index wraps around to start from 0 again by %
        """
        index = self.buffer_counter % self.buffer_capacity
        self.state_buffer[index] = obs_tuple[0]
        self.action_buffer[index] = obs_tuple[1]
        self.reward_buffer[index] = obs_tuple[2]
        self.next_state_buffer[index] = obs_tuple[3]
        self.buffer_counter += 1

    def learn(self):
        """
        This method computes the losses on one sampled batch and
        updates the parameters of the Critic and then the Actor.
        It does nothing while the buffer is still empty.
        """
        # only sample from the part of the buffer that has been filled
        record_range = min(self.buffer_counter, self.buffer_capacity)
        if record_range == 0:
            # np.random.choice cannot draw from an empty range
            return
        # indices are drawn with replacement
        batch_indices = np.random.choice(record_range, self.batch_size)

        # each batch has batch_size rows; the columns are num_states,
        # num_actions or 1, following the buffer it is taken from
        state_batch = tf.convert_to_tensor(self.state_buffer[batch_indices])
        action_batch = tf.convert_to_tensor(self.action_buffer[batch_indices])
        reward_batch = tf.convert_to_tensor(self.reward_buffer[batch_indices])
        # the buffers are float64 NumPy arrays; rewards are cast to float32 so
        # they can be added to the Critic output (float32 under Keras defaults)
        reward_batch = tf.cast(reward_batch, dtype = tf.float32)
        next_state_batch = tf.convert_to_tensor(self.next_state_buffer[batch_indices])

        # NOTE(review): the models are called without training=True, so the
        # BatchNormalization layers presumably run in inference mode here -
        # confirm that this is intended.

        # update Critic
        # the target y depends only on the target networks, so it is computed
        # outside the tape; gradients are taken for critic_model alone
        target_actions = target_actor(next_state_batch)
        y = reward_batch + self.gamma * target_critic([next_state_batch, target_actions])
        with tf.GradientTape() as tape:
            critic_value = critic_model([state_batch, action_batch])
            # loss function calculates mean squared loss
            critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))
        # calculate gradient
        critic_gradient = tape.gradient(critic_loss, critic_model.trainable_variables)
        # Adam optimizer applies the gradients
        critic_optimizer.apply_gradients(
            zip(critic_gradient, critic_model.trainable_variables)
        )

        # update Actor
        with tf.GradientTape() as tape:
            actions = actor_model(state_batch)
            critic_value = critic_model([state_batch, actions])
            # negative value because the optimizer minimizes, while we want
            # to maximize the value given by the Critic
            actor_loss = -tf.math.reduce_mean(critic_value)
        # calculate gradient
        actor_gradient = tape.gradient(actor_loss, actor_model.trainable_variables)
        # Adam optimizer
        actor_optimizer.apply_gradients(
            zip(actor_gradient, actor_model.trainable_variables)
        )

In [23]:
## helper functions

def update_target(tau):
    """
    Slowly move the target networks toward the online networks.

    Every target weight becomes online * tau + target * (1 - tau),
    first for the Critic and then for the Actor.
    """
    def blend(online, target):
        # pair each online weight with the target weight at the same position
        return [
            online_w * tau + target_w * (1 - tau)
            for online_w, target_w in zip(online.weights, target.weights)
        ]

    # update target Critic
    target_critic.set_weights(blend(critic_model, target_critic))
    # update target Actor
    target_actor.set_weights(blend(actor_model, target_actor))

def policy(state, noise_object):
    """
    Pick an action for state: the Actor's output plus exploration noise,
    clipped to the legal action range.

    Returns a one-element list holding the squeezed action.
    """
    # Actor output with all size-1 dimensions removed, as a NumPy value
    actor_output = tf.squeeze(actor_model(state)).numpy()
    # perturb the action with one sample of the noise process
    noisy_action = actor_output + noise_object()
    # keep the action inside [lower_bound, upper_bound]
    bounded_action = np.clip(noisy_action, lower_bound, upper_bound)
    return [np.squeeze(bounded_action)]

In [41]:
## training

# initialize
# one noise component per action dimension (np.zeros(1) for Pendulum)
ou_noise = OUActionNoise(mean = np.zeros(num_actions), std = float(STD))
actor_model = get_actor()
critic_model = get_critic()
target_actor = get_actor()
target_critic = get_critic()
# target networks start as exact copies of the online networks
target_actor.set_weights(actor_model.get_weights())
target_critic.set_weights(critic_model.get_weights())
critic_optimizer = Adam(CRITIC_LR)
actor_optimizer = Adam(ACTOR_LR)
buffer = Buffer(BUFFER_CAPACITY, BATCH_SIZE)
# store reward history of each episode
ep_reward_list = []
# store average reward history of last few episodes
avg_reward_list = []

In [40]:
## test get_weights()

# print(type(actor_model.get_weights()))
# print(len(actor_model.get_weights()))
# print(actor_model.get_weights()[0].shape)