In [1]:
import tensorflow as tf
import gym
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
import time
import scipy.signal




Init Plugin
Init Graph Optimizer
Init Kernel


# Storage Model

In [2]:
def discounted_cumulative_sums(x, discount):
    # Discounted cumulative sums of vectors for computing rewards-to-go and advantage estimates
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]

def logprobabilities(logits, a):
    # Compute the log-probabilities of taking actions a by using the logits (i.e. the output of the actor)
    logprobabilities_all = tf.nn.log_softmax(logits)
    logprobability = tf.reduce_sum(
        tf.one_hot(a, num_actions) * logprobabilities_all, axis=1
    )
    return logprobability

class Trajectory_Storage:
    # T for storing trajectories
    def __init__(self, observation_dimensions, size, gamma=0.99, lam=0.95):
        # T initialization
        self.observation_buffer = np.zeros(
            (size, observation_dimensions), dtype=np.float32
        )
        self.action_buffer = np.zeros(size, dtype=np.int32)
        self.advantage_buffer = np.zeros(size, dtype=np.float32)
        self.reward_buffer = np.zeros(size, dtype=np.float32)
        self.return_buffer = np.zeros(size, dtype=np.float32)
        self.value_buffer = np.zeros(size, dtype=np.float32)
        self.logprobability_buffer = np.zeros(size, dtype=np.float32)
        self.gamma, self.lam = gamma, lam
        self.pointer, self.trajectory_start_index = 0, 0

    def store(self, observation, action, reward, value, logprobability):
        # Append one step of agent-environment interaction
        self.observation_buffer[self.pointer] = observation
        self.action_buffer[self.pointer] = action
        self.reward_buffer[self.pointer] = reward
        self.value_buffer[self.pointer] = value
        self.logprobability_buffer[self.pointer] = logprobability
        self.pointer += 1

    def finish_trajectory(self, last_value=0):
        # Finish the trajectory by computing advantage estimates and rewards-to-go
        path_slice = slice(self.trajectory_start_index, self.pointer)
        rewards = np.append(self.reward_buffer[path_slice], last_value)
        values = np.append(self.value_buffer[path_slice], last_value)

        deltas = rewards[:-1] + self.gamma * values[1:] - values[:-1]

        self.advantage_buffer[path_slice] = discounted_cumulative_sums(
            deltas, self.gamma * self.lam
        )
        self.return_buffer[path_slice] = discounted_cumulative_sums(
            rewards, self.gamma
        )[:-1]

        self.trajectory_start_index = self.pointer

    def get(self):
        # Get all data of the buffer and normalize the advantages
        self.pointer, self.trajectory_start_index = 0, 0
        advantage_mean, advantage_std = (
            np.mean(self.advantage_buffer),
            np.std(self.advantage_buffer),
        )
        self.advantage_buffer = (self.advantage_buffer - advantage_mean) / advantage_std
        return (
            self.observation_buffer,
            self.action_buffer,
            self.advantage_buffer,
            self.return_buffer,
            self.logprobability_buffer,
        )


# Actor Model


In [3]:
class Actor(Model):
    def __init__(self, learning_rate, optimizer):
        super(Actor, self).__init__()


        self.l = [
            Dense(64, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.02)),
            Dense(32, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.02)),
            Dense(1, activation="sigmoid", kernel_regularizer=tf.random_normal_initializer(stddev=0.02))
        ]

    @tf.function        
    def call(self, x):
        for l in self.l:
            x = l(x)
        return x

        

@tf.function
def sample_action(observation):
    logits = actor(observation)
   # tf.print(type(logits))
    action = tf.squeeze(tf.random.categorical(logits, 1), axis=1)
   # tf.print(action)
    return logits, action

# Critic Model

In [4]:
class Critic(Model):
    def __init__(self, learning_rate, optimizer):
        super(Critic, self).__init__()

        self.l = [
            Dense(64, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.02)),
            Dense(32, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.02)),
            Dense(1, activation="tanh", kernel_regularizer=tf.random_normal_initializer(stddev=0.02))
        ]

    @tf.function 
    def call(self, x):
        for l in self.l:
            x = l(x)
        return x

# Update Policy Function

Training the Actor Model Using the typical PPO-Clipping Method

In [5]:
# Train the policy by maxizing the PPO-Clip objective
@tf.function
def train_policy(
    observation_buffer, action_buffer, logprobability_buffer, advantage_buffer
):

    with tf.GradientTape() as tape:  # Record operations for automatic differentiation.
        ratio = tf.exp(
            logprobabilities(actor(observation_buffer), action_buffer)
            - logprobability_buffer
        )
        min_advantage = tf.where(
            advantage_buffer > 0,
            (1 + clip_ratio) * advantage_buffer,
            (1 - clip_ratio) * advantage_buffer,
        )

        policy_loss = -tf.reduce_mean(
            tf.minimum(ratio * advantage_buffer, min_advantage)
        )
    policy_grads = tape.gradient(policy_loss, actor.trainable_variables)
    optimizer.apply_gradients(zip(policy_grads, [actor.trainable_variables]))

    kl = tf.reduce_mean(
        logprobability_buffer
        - logprobabilities(actor(observation_buffer), action_buffer)
    )
    kl = tf.reduce_sum(kl)
    return kl


# Train the value function by regression on mean-squared error
@tf.function
def train_value_function(observation_buffer, return_buffer):
    with tf.GradientTape() as tape:  # Record operations for automatic differentiation.
        value_loss = tf.reduce_mean((return_buffer - critic(observation_buffer)) ** 2)
    value_grads = tape.gradient(value_loss, critic.trainable_variables)
    optimizer_2.apply_gradients(zip(value_grads, critic.trainable_variables))

In [6]:
# define Hyperparameters
epochs = 20
steps_per_epoch = 2000
lr_actor = 0.03
lr_critic = 0.03
train_policy_iterations = 20
train_value_iterations = 20
clip_ratio = 0.2
target_kl = 0.01
optimizer = Adam
optimizer_2 = Adam

render = True

# Test Inits 

In [7]:
tf.keras.backend.clear_session()

# define environment
env = gym.make("CartPole-v1")
# get observation_dims and amount of possible actions (1 for CartPole-v1)
observation_dimensions = env.observation_space.shape[0]
num_actions = env.action_space.n

# create Storage for observations, actions, rewards etc during trajectory
T = Trajectory_Storage(observation_dimensions=observation_dimensions, size=steps_per_epoch)

# init the actor and critics model
observation_input = Input(shape=(observation_dimensions,), dtype=tf.float32)
actor = Actor(learning_rate=lr_actor, optimizer=Adam)
critic = Critic(learning_rate=lr_critic, optimizer=Adam)

# Initialize the observation, episode return and episode length
observation, episode_return, episode_length = env.reset(), 0, 0

Metal device set to: Apple M1

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



2022-03-16 11:34:30.168631: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-03-16 11:34:30.168762: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


# Training Loop

In [8]:
for epoch in range(epochs):
     # Initialize the sum of the returns, lengths and number of episodes for each epoch
    sum_return = 0
    sum_length = 0
    num_episodes = 0

    # Iterate over the steps of each epoch
    for t in range(steps_per_epoch):
        if render:
            env.render()

        # Get the logits, action, and take one step in the environment
        #print(observation)
        observation = observation.reshape(1, -1)
        
        #print(observation)
        logits, action = sample_action(observation)
        print(logits)
        observation_new, reward, done, _ = env.step(action[0].numpy())
        episode_return += reward
        episode_length += 1
        print(observation)
        # Get the value and log-probability of the action
        value_t = critic(observation)
        logprobability_t = logprobabilities(logits, action)

        # Store obs, act, rew, v_t, logp_pi_t
        T.store(observation, action, reward, value_t, logprobability_t)

        # Update the observation
        observation = observation_new

        # Finish trajectory if reached to a terminal state
        terminal = done
        if terminal or (t == steps_per_epoch - 1):
            last_value = 0 if done else critic(observation.reshape(1, -1))
            T.finish_trajectory(last_value)
            sum_return += episode_return
            sum_length += episode_length
            num_episodes += 1
            observation, episode_return, episode_length = env.reset(), 0, 0

    # Get values from the buffer
    (
        observation_buffer,
        action_buffer,
        advantage_buffer,
        return_buffer,
        logprobability_buffer,
    ) = T.get()

    # Update the policy and implement early stopping using KL divergence
    for _ in range(train_policy_iterations):
        kl = train_policy(
            observation_buffer, action_buffer, logprobability_buffer, advantage_buffer
        )
        if kl > 1.5 * target_kl:
            # Early Stopping
            break

    # Update the value function
    for _ in range(train_value_iterations):
        train_value_function(observation_buffer, return_buffer)

    # Print mean return and length for each epoch
    print(
        f" Epoch: {epoch + 1}. Mean Return: {sum_return / num_episodes}. Mean Length: {sum_length / num_episodes}"
    )

2022-03-16 11:34:31.690465: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2022-03-16 11:34:31.693946: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-03-16 11:34:31.694792: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-03-16 11:34:31.772392: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


tf.Tensor([[0.4999987]], shape=(1, 1), dtype=float32)
[[0.03657964 0.00617251 0.00056629 0.01699234]]
tf.Tensor([[0.49999875]], shape=(1, 1), dtype=float32)
[[ 0.03670309 -0.18895756  0.00090614  0.30985388]]
tf.Tensor([[0.4999693]], shape=(1, 1), dtype=float32)
[[ 0.03292394 -0.38409242  0.00710322  0.6028224 ]]
tf.Tensor([[0.4999349]], shape=(1, 1), dtype=float32)
[[ 0.02524209 -0.579313    0.01915967  0.8977343 ]]
tf.Tensor([[0.49989155]], shape=(1, 1), dtype=float32)
[[ 0.01365583 -0.7746893   0.03711435  1.1963776 ]]
tf.Tensor([[0.4998449]], shape=(1, 1), dtype=float32)
[[-0.00183796 -0.9702715   0.0610419   1.500458  ]]
tf.Tensor([[0.49980155]], shape=(1, 1), dtype=float32)
[[-0.02124339 -1.1660794   0.09105106  1.811558  ]]
tf.Tensor([[0.4997567]], shape=(1, 1), dtype=float32)
[[-0.04456498 -1.3620903   0.12728222  2.1310885 ]]
tf.Tensor([[0.49970192]], shape=(1, 1), dtype=float32)
[[-0.07180678 -1.5582243   0.169904    2.4602296 ]]
tf.Tensor([[0.49998796]], shape=(1, 1), dtype=

TypeError: in user code:

    /var/folders/jk/h_92czjx5jjcf631wf82ht9c0000gn/T/ipykernel_71471/582144715.py:22 train_policy  *
        optimizer.apply_gradients(zip(policy_grads, [actor.trainable_variables]))

    TypeError: apply_gradients() missing 1 required positional argument: 'grads_and_vars'
