In [75]:
import tensorflow as tf
import gym
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
import time
import scipy.signal
from tqdm import tqdm




# Storage Model

In [76]:
def discounted_cumulative_sums(x, discount):
    # Discounted cumulative sums of vectors for computing rewards-to-go and advantage estimates
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]

def logprobabilities(logits, a):
    # Compute the log-probabilities of taking actions a by using the logits (i.e. the output of the actor)
    logprobabilities_all = tf.nn.log_softmax(logits)
    logprobability = tf.reduce_sum(
        tf.one_hot(a, num_actions) * logprobabilities_all, axis=1
    )
    return logprobability

class Trajectory_Storage:
    # T for storing trajectories
    def __init__(self, observation_dimensions, size, gamma=0.99, lam=0.95):
        # T initialization
        self.observation_buffer = np.zeros(
            (size, observation_dimensions), dtype=np.float32
        )
        self.action_buffer = np.zeros(size, dtype=np.int32)
        self.advantage_buffer = np.zeros(size, dtype=np.float32)
        self.reward_buffer = np.zeros(size, dtype=np.float32)
        self.return_buffer = np.zeros(size, dtype=np.float32)
        self.value_buffer = np.zeros(size, dtype=np.float32)
        self.logprobability_buffer = np.zeros(size, dtype=np.float32)
        self.gamma, self.lam = gamma, lam
        self.pointer, self.trajectory_start_index = 0, 0

    def store(self, observation, action, reward, value, logprobability):
        # Append one step of agent-environment interaction
        self.observation_buffer[self.pointer] = observation
        self.action_buffer[self.pointer] = action
        self.reward_buffer[self.pointer] = reward
        self.value_buffer[self.pointer] = value
        self.logprobability_buffer[self.pointer] = logprobability
        self.pointer += 1

    def finish_trajectory(self, last_value=0):
        # Finish the trajectory by computing advantage estimates and rewards-to-go
        path_slice = slice(self.trajectory_start_index, self.pointer)
        rewards = np.append(self.reward_buffer[path_slice], last_value)
        values = np.append(self.value_buffer[path_slice], last_value)

        deltas = rewards[:-1] + self.gamma * values[1:] - values[:-1]

        self.advantage_buffer[path_slice] = discounted_cumulative_sums(
            deltas, self.gamma * self.lam
        )
        self.return_buffer[path_slice] = discounted_cumulative_sums(
            rewards, self.gamma
        )[:-1]

        self.trajectory_start_index = self.pointer

    def get(self):
        # Get all data of the buffer and normalize the advantages
        self.pointer, self.trajectory_start_index = 0, 0
        advantage_mean, advantage_std = (
            np.mean(self.advantage_buffer),
            np.std(self.advantage_buffer),
        )
        self.advantage_buffer = (self.advantage_buffer - advantage_mean) / advantage_std
        return (
            self.observation_buffer,
            self.action_buffer,
            self.advantage_buffer,
            self.return_buffer,
            self.logprobability_buffer,
        )


# Actor Model


In [77]:
class Actor(Model):
    def __init__(self):
        super(Actor, self).__init__()


        self.l = [
            Dense(64, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.02)),
            Dense(32, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.02)),
            Dense(2, activation="sigmoid", kernel_regularizer=tf.random_normal_initializer(stddev=0.02))
        ]

    #@tf.function        
    def call(self, x):
        for l in self.l:
            x = l(x)
        return x

        

#@tf.function
def sample_action(observation):
    logits = actor(observation)
   # tf.print(type(logits))
    action = tf.squeeze(tf.random.categorical(logits, 1), axis=1)
   # tf.print(action)
    return logits, action

# Critic Model

In [78]:
class Critic(Model):
    def __init__(self):
        super(Critic, self).__init__()

        self.l = [
            Dense(64, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.02)),
            Dense(32, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.02)),
            Dense(1, activation="tanh", kernel_regularizer=tf.random_normal_initializer(stddev=0.02))
        ]

    #@tf.function 
    def call(self, x):
        for l in self.l:
            x = l(x)
        return x

# Update Policy Function

Training the Actor Model Using the typical PPO-Clipping Method

In [79]:
# Train the policy by maxizing the PPO-Clip objective
#@tf.function
def train_policy(
    observation_buffer, action_buffer, logprobability_buffer, advantage_buffer
):

    with tf.GradientTape() as tape:  # Record operations for automatic differentiation.
        ratio = tf.exp(
            logprobabilities(actor(observation_buffer), action_buffer)
            - logprobability_buffer
        )
        min_advantage = tf.where(
            advantage_buffer > 0,
            (1 + clip_ratio) * advantage_buffer,
            (1 - clip_ratio) * advantage_buffer,
        )

        policy_loss = -tf.reduce_mean(
            tf.minimum(ratio * advantage_buffer, min_advantage)
        )
    policy_grads = tape.gradient(policy_loss, actor.trainable_variables)
    # print("Policy grads: ")
    # print(policy_grads)
    # print("Actor Variables:")
    # print([actor.trainable_variables])
    # print(type(policy_grads), type(actor.trainable_variables))
    optimizer.apply_gradients(zip(policy_grads, actor.trainable_variables))

    kl = tf.reduce_mean(
        logprobability_buffer
        - logprobabilities(actor(observation_buffer), action_buffer)
    )
    kl = tf.reduce_sum(kl)
    return kl


# Train the value function by regression on mean-squared error
#@tf.function
def train_value_function(observation_buffer, return_buffer):
    with tf.GradientTape() as tape:  # Record operations for automatic differentiation.
        value_loss = tf.reduce_mean((return_buffer - critic(observation_buffer)) ** 2)
    value_grads = tape.gradient(value_loss, critic.trainable_variables)
    optimizer_2.apply_gradients(zip(value_grads, critic.trainable_variables))

In [80]:
# define Hyperparameters
epochs = 20
steps_per_epoch = 4000
lr_actor = 0.03
lr_critic = 0.03
train_policy_iterations = 20
train_value_iterations = 20
clip_ratio = 0.2
target_kl = 0.01
optimizer = Adam()
optimizer_2 = Adam()

render = False

# Test Inits 

In [81]:
tf.keras.backend.clear_session()

# define environment
env = gym.make("CartPole-v1")
# get observation_dims and amount of possible actions (1 for CartPole-v1)
observation_dimensions = env.observation_space.shape[0]
num_actions = env.action_space.n

# create Storage for observations, actions, rewards etc during trajectory
T = Trajectory_Storage(observation_dimensions=observation_dimensions, size=steps_per_epoch)

# init the actor and critics model
observation_input = Input(shape=(observation_dimensions,), dtype=tf.float32)
actor = Actor()
critic = Critic()

# Initialize the observation, episode return and episode length
observation, episode_return, episode_length = env.reset(), 0, 0

# Training Loop

In [82]:
for epoch in range(epochs):
     # Initialize the sum of the returns, lengths and number of episodes for each epoch
    sum_return = 0
    sum_length = 0
    num_episodes = 0

    # Iterate over the steps of each epoch
    for t in tqdm(range(steps_per_epoch)):
        if render:
            env.render()

        # Get the logits, action, and take one step in the environment
        #print(observation)
        observation = observation.reshape(1, -1)
        
        #print(observation)
        logits, action = sample_action(observation)
        #print(logits)
        observation_new, reward, done, _ = env.step(action[0].numpy())
        episode_return += reward
        episode_length += 1
        #print(observation)
        # Get the value and log-probability of the action
        value_t = critic(observation)
        logprobability_t = logprobabilities(logits, action)

        # Store obs, act, rew, v_t, logp_pi_t
        T.store(observation, action, reward, value_t, logprobability_t)

        # Update the observation
        observation = observation_new

        # Finish trajectory if reached to a terminal state
        terminal = done
        if terminal or (t == steps_per_epoch - 1):
            last_value = 0 if done else critic(observation.reshape(1, -1))
            T.finish_trajectory(last_value)
            sum_return += episode_return
            sum_length += episode_length
            num_episodes += 1
            observation, episode_return, episode_length = env.reset(), 0, 0

    # Get values from the buffer
    (
        observation_buffer,
        action_buffer,
        advantage_buffer,
        return_buffer,
        logprobability_buffer,
    ) = T.get()

    # Update the policy and implement early stopping using KL divergence
    for _ in range(train_policy_iterations):
        kl = train_policy(
            observation_buffer, action_buffer, logprobability_buffer, advantage_buffer
        )
        if kl > 1.5 * target_kl:
            # Early Stopping
            break

    # Update the value function
    for _ in range(train_value_iterations):
        train_value_function(observation_buffer, return_buffer)

    # Print mean return and length for each epoch
    print(
        f" Epoch: {epoch + 1}. Mean Return: {sum_return / num_episodes}. Mean Length: {sum_length / num_episodes}"
    )

100%|██████████| 4000/4000 [00:21<00:00, 189.86it/s]


 Epoch: 1. Mean Return: 23.952095808383234. Mean Length: 23.952095808383234


100%|██████████| 4000/4000 [00:20<00:00, 196.85it/s]


 Epoch: 2. Mean Return: 23.952095808383234. Mean Length: 23.952095808383234


100%|██████████| 4000/4000 [00:19<00:00, 203.91it/s]


 Epoch: 3. Mean Return: 25.974025974025974. Mean Length: 25.974025974025974


100%|██████████| 4000/4000 [00:19<00:00, 200.67it/s]


 Epoch: 4. Mean Return: 33.61344537815126. Mean Length: 33.61344537815126


100%|██████████| 4000/4000 [00:19<00:00, 202.53it/s]


 Epoch: 5. Mean Return: 40.816326530612244. Mean Length: 40.816326530612244


100%|██████████| 4000/4000 [00:19<00:00, 201.43it/s]


 Epoch: 6. Mean Return: 47.05882352941177. Mean Length: 47.05882352941177


100%|██████████| 4000/4000 [00:19<00:00, 201.65it/s]


 Epoch: 7. Mean Return: 54.794520547945204. Mean Length: 54.794520547945204


100%|██████████| 4000/4000 [00:19<00:00, 202.72it/s]


 Epoch: 8. Mean Return: 75.47169811320755. Mean Length: 75.47169811320755


100%|██████████| 4000/4000 [00:19<00:00, 201.41it/s]


 Epoch: 9. Mean Return: 68.96551724137932. Mean Length: 68.96551724137932


100%|██████████| 4000/4000 [00:19<00:00, 202.01it/s]


 Epoch: 10. Mean Return: 76.92307692307692. Mean Length: 76.92307692307692


100%|██████████| 4000/4000 [00:19<00:00, 201.92it/s]


 Epoch: 11. Mean Return: 95.23809523809524. Mean Length: 95.23809523809524


100%|██████████| 4000/4000 [00:24<00:00, 162.05it/s]


 Epoch: 12. Mean Return: 86.95652173913044. Mean Length: 86.95652173913044


100%|██████████| 4000/4000 [00:21<00:00, 182.53it/s]


 Epoch: 13. Mean Return: 88.88888888888889. Mean Length: 88.88888888888889


100%|██████████| 4000/4000 [00:20<00:00, 194.56it/s]


 Epoch: 14. Mean Return: 105.26315789473684. Mean Length: 105.26315789473684


100%|██████████| 4000/4000 [00:20<00:00, 195.73it/s]


 Epoch: 15. Mean Return: 111.11111111111111. Mean Length: 111.11111111111111


100%|██████████| 4000/4000 [00:21<00:00, 187.41it/s]


 Epoch: 16. Mean Return: 105.26315789473684. Mean Length: 105.26315789473684


100%|██████████| 4000/4000 [00:20<00:00, 190.88it/s]


 Epoch: 17. Mean Return: 121.21212121212122. Mean Length: 121.21212121212122


100%|██████████| 4000/4000 [00:21<00:00, 190.38it/s]


 Epoch: 18. Mean Return: 108.10810810810811. Mean Length: 108.10810810810811


100%|██████████| 4000/4000 [00:20<00:00, 192.28it/s]


 Epoch: 19. Mean Return: 117.6470588235294. Mean Length: 117.6470588235294


100%|██████████| 4000/4000 [00:21<00:00, 190.09it/s]


 Epoch: 20. Mean Return: 121.21212121212122. Mean Length: 121.21212121212122
