In [1]:
import tensorflow as tf
import gym
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, ReLU
from tensorflow.keras.optimizers import Adam
import time
import scipy.signal
from tqdm import tqdm




# Storage Model

In [2]:
def discounted_cumulative_sums(x, discount):
    # Discounted cumulative sums of vectors for computing rewards-to-go and advantage estimates
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]

def logprobabilities(logits, a):
    #print(f'action: {a}') #[1]
    # Compute the log-probabilities of taking actions a by using the logits (i.e. the output of the actor)
    logprobabilities_all = tf.nn.log_softmax(logits)
    #[-1.3234,-1.123455,-1.753454,-1,23494]
    logprobability = tf.reduce_sum(
        tf.one_hot(a, num_actions) * logprobabilities_all, axis=1
    )
    # print(f'Logits and action{logits, a}')

    # print(f'Logprobs_All {logprobabilities_all}')
    # print(f'Log-Prob: {logprobability}')
    return logprobability

class Trajectory_Storage:
    # T for storing trajectories
    def __init__(self, observation_dimensions, size, gamma=0.99, lam=0.95):
        # T initialization
        self.observation_buffer = np.zeros(
            (size, observation_dimensions), dtype=np.float32
        )
        self.action_buffer = np.zeros(size, dtype=np.int32)
        self.advantage_buffer = np.zeros(size, dtype=np.float32)
        self.reward_buffer = np.zeros(size, dtype=np.float32)
        self.return_buffer = np.zeros(size, dtype=np.float32)
        self.value_buffer = np.zeros(size, dtype=np.float32)
        self.logprobability_buffer = np.zeros(size, dtype=np.float32)
        self.gamma, self.lam = gamma, lam
        self.pointer, self.trajectory_start_index = 0, 0

    def store(self, observation, action, reward, value, logprobability):
        # Append one step of agent-environment interaction
        self.observation_buffer[self.pointer] = observation
        self.action_buffer[self.pointer] = action
        self.reward_buffer[self.pointer] = reward
        self.value_buffer[self.pointer] = value
        self.logprobability_buffer[self.pointer] = logprobability
        self.pointer += 1

    def finish_trajectory(self, last_value=0):
        # Finish the trajectory by computing advantage estimates and rewards-to-go
        path_slice = slice(self.trajectory_start_index, self.pointer)
        rewards = np.append(self.reward_buffer[path_slice], last_value)
        values = np.append(self.value_buffer[path_slice], last_value)

        deltas = rewards[:-1] + self.gamma * values[1:] - values[:-1]

        self.advantage_buffer[path_slice] = discounted_cumulative_sums(
            deltas, self.gamma * self.lam
        )
        self.return_buffer[path_slice] = discounted_cumulative_sums(
            rewards, self.gamma
        )[:-1]

        self.trajectory_start_index = self.pointer

    def get(self):
        # Get all data of the buffer and normalize the advantages
        self.pointer, self.trajectory_start_index = 0, 0
        advantage_mean, advantage_std = (
            np.mean(self.advantage_buffer),
            np.std(self.advantage_buffer),
        )
        self.advantage_buffer = (self.advantage_buffer - advantage_mean) / advantage_std
        return (
            self.observation_buffer,
            self.action_buffer,
            self.advantage_buffer,
            self.return_buffer,
            self.logprobability_buffer,
        )


# Actor Model


In [3]:
class Actor(Model):
    def __init__(self):
        super(Actor, self).__init__()


        self.l = [
            Dense(128, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(128, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(64, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(4, activation="softmax")
        ]

    #@tf.function        
    def call(self, x):
        for l in self.l:
            x = l(x)
        return x

        

#@tf.function
def sample_action(observation):
    logits = actor(observation)
   # tf.print(type(logits))
    action = tf.squeeze(tf.random.categorical(logits, 1), axis=1)
   # tf.print(action)
    return logits, action

# Critic Model

In [4]:
class Critic(Model):
    def __init__(self):
        super(Critic, self).__init__()

        self.l = [
            Dense(128, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(128, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(64, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(1, activation="tanh", kernel_regularizer=tf.random_normal_initializer(stddev=0.01))
        ]

    #@tf.function 
    def call(self, x):
        for l in self.l:
            x = l(x)
        return x

# Update Policy Function

Training the Actor Model Using the typical PPO-Clipping Method

In [5]:
# Train the policy by maxizing the PPO-Clip objective
#@tf.function
def train_policy(
    observation_buffer, action_buffer, logprobability_buffer, advantage_buffer
):

    with tf.GradientTape() as tape:  # Record operations for automatic differentiation.
        ratio = tf.exp(
            logprobabilities(actor(observation_buffer), action_buffer)
            - logprobability_buffer
        )
        # print(f'ratio: {ratio}')
        min_advantage = tf.where(
            advantage_buffer > 0,
            (1 + clip_ratio) * advantage_buffer,
            (1 - clip_ratio) * advantage_buffer,
        )
        # take the minimum of the clipped advantage and normal surrogate objective
        policy_loss = -tf.reduce_mean(
            tf.minimum(ratio * advantage_buffer, min_advantage)
        )
    print(f'Policy Loss: {policy_loss}')
    print(tape)
    

    print(policy_loss)
    print(type(policy_loss))
    policy_grads = tape.gradient(policy_loss, actor.trainable_variables)
    #print(f' Plicy Grads: {policy_grads}')

    print(actor.trainable_variables)
    
    optimizer.apply_gradients(zip(policy_grads, actor.trainable_variables))

    kl = tf.reduce_mean(
        logprobability_buffer
        - logprobabilities(actor(observation_buffer), action_buffer)
    )
    kl = tf.reduce_sum(kl)
    return kl


# Train the value function by regression on mean-squared error
#@tf.function
def train_value_function(observation_buffer, return_buffer):
    with tf.GradientTape() as tape:  # Record operations for automatic differentiation.
        print(type(critic(observation_buffer)))
        value_loss = tf.reduce_mean((return_buffer - critic(observation_buffer)) ** 2) # MSE to update Critics Value function
    print(f'Value Loss: {value_loss}')
    value_grads = tape.gradient(value_loss, critic.trainable_variables)
    

    optimizer_2.apply_gradients(zip(value_grads, critic.trainable_variables))




In [6]:
def train_critic(observation_buffer, action_buffer, logprobability_buffer, advantage_buffer):
    with tf.GradientTape() as tape:
        ratio = tf.exp(logprobabilities(critic(observation_buffer), ))
    return 

In [7]:
# define Hyperparameters
epochs = 1000
steps_per_epoch = 1000 # ~10 Episodes per epoch, then compute new parameters (smaller batching)
lr_actor = 3e-4
lr_critic = lr_actor
train_policy_iterations = 1
train_value_iterations = 80
clip_ratio = 0.2
target_kl = 0.01
optimizer = Adam()
optimizer_2 = Adam()

render = False

# Test Inits 

In [8]:
tf.keras.backend.clear_session()

# define environment
env = gym.make("LunarLander-v2")
# get observation_dims and amount of possible actions (1 for CartPole-v1)
observation_dimensions = env.observation_space.shape[0]
num_actions = env.action_space.n

# create Storage for observations, actions, rewards etc during trajectory
T = Trajectory_Storage(observation_dimensions=observation_dimensions, size=steps_per_epoch)

# init the actor and critics model
observation_input = Input(shape=(observation_dimensions,), dtype=tf.float32)
actor = Actor()
critic = Critic()

# Initialize the observation, episode return and episode length
observation, episode_return, episode_length = env.reset(), 0, 0

# Training Loop

In [9]:
episodes_total = 0
for epoch in range(epochs): 
     # Initialize the sum of the returns, lengths and number of episodes for each epoch
    sum_return = 0
    sum_length = 0
    num_episodes = 0

    # Iterate over the steps of each epoch
    for t in tqdm(range(steps_per_epoch)):
        if render:
            env.render()

        # Get the logits, action, and take one step in the environment
        #print(observation)
        observation = observation.reshape(1, -1)
        #print(type(observation))
        
        #print(observation)
        logits, action = sample_action(observation)
        #print(action)
        observation_new, reward, done, _ = env.step(action[0].numpy())
        episode_return += reward
        episode_length += 1
        #print(observation)
        # Get the value and log-probability of the action
        value_t = critic(observation)
        logprobability_t = logprobabilities(logits, action)
        #print(f' Log Prob: {logprobability_t}')

        # Store obs, act, rew, v_t, logp_pi_t
        T.store(observation, action, reward, value_t, logprobability_t)

        # Update the observation
        observation = observation_new

        # Finish trajectory if reached to a terminal state
        terminal = done
        if terminal or (t == steps_per_epoch - 1):
            last_value = 0 if done else critic(observation.reshape(1, -1))
            T.finish_trajectory(last_value)
            sum_return += episode_return
            sum_length += episode_length
            num_episodes += 1
            observation, episode_return, episode_length = env.reset(), 0, 0

    # Get values from the buffer
    (
        observation_buffer,
        action_buffer,
        advantage_buffer,
        return_buffer,
        logprobability_buffer,
    ) = T.get()
    #print(f'Advantage Buffer { advantage_buffer}')

    # Update the policy and implement early stopping using KL divergence
    for _ in range(train_policy_iterations):
        kl = train_policy(
            observation_buffer, action_buffer, logprobability_buffer, advantage_buffer
        )
        if kl > 1.5 * target_kl:
            # Early Stopping
            break
    
    # Update the value function
    for _ in range(train_value_iterations):
        print("Type observation")
        print(type(observation_buffer))
        print("Type observation")
        print(type(return_buffer))
        train_value_function(observation_buffer, return_buffer)

    episodes_total += num_episodes
    # Print mean return and length for each epoch
    print(
        f" Epoch: {epoch + 1}. Mean Return: {sum_return / num_episodes}. Mean Length: {sum_length / num_episodes}. Num Episodes: {num_episodes}. Total episodes: {episodes_total}"
    )
    

100%|██████████| 1000/1000 [00:04<00:00, 200.08it/s]


Policy Loss: -1.1444091896350983e-08
<tensorflow.python.eager.backprop.GradientTape object at 0x000001B508162940>
tf.Tensor(-1.1444092e-08, shape=(), dtype=float32)
<class 'tensorflow.python.framework.ops.EagerTensor'>
<tensorflow.python.eager.tape.Tape object at 0x000001B50815FF70>
Flat Targets
[<tf.Tensor: shape=(), dtype=float32, numpy=-1.1444092e-08>]
Flat Sources
[<tf.Tensor: shape=(), dtype=resource, value=<Resource Tensor>>, <tf.Tensor: shape=(), dtype=resource, value=<Resource Tensor>>, <tf.Tensor: shape=(), dtype=resource, value=<Resource Tensor>>, <tf.Tensor: shape=(), dtype=resource, value=<Resource Tensor>>, <tf.Tensor: shape=(), dtype=resource, value=<Resource Tensor>>, <tf.Tensor: shape=(), dtype=resource, value=<Resource Tensor>>, <tf.Tensor: shape=(), dtype=resource, value=<Resource Tensor>>, <tf.Tensor: shape=(), dtype=resource, value=<Resource Tensor>>]
Output gradients
None
UnconnectedGradients.NONE
[<tf.Tensor: shape=(8, 128), dtype=float32, numpy=
array([[ 5.217717

100%|██████████| 1000/1000 [00:03<00:00, 298.41it/s]


Policy Loss: 8.201599399626502e-08
<tensorflow.python.eager.backprop.GradientTape object at 0x000001B5081B6C70>
tf.Tensor(8.2015994e-08, shape=(), dtype=float32)
<class 'tensorflow.python.framework.ops.EagerTensor'>
<tensorflow.python.eager.tape.Tape object at 0x000001B50815FE20>
Flat Targets
[<tf.Tensor: shape=(), dtype=float32, numpy=8.2015994e-08>]
Flat Sources
[<tf.Tensor: shape=(), dtype=resource, value=<Resource Tensor>>, <tf.Tensor: shape=(), dtype=resource, value=<Resource Tensor>>, <tf.Tensor: shape=(), dtype=resource, value=<Resource Tensor>>, <tf.Tensor: shape=(), dtype=resource, value=<Resource Tensor>>, <tf.Tensor: shape=(), dtype=resource, value=<Resource Tensor>>, <tf.Tensor: shape=(), dtype=resource, value=<Resource Tensor>>, <tf.Tensor: shape=(), dtype=resource, value=<Resource Tensor>>, <tf.Tensor: shape=(), dtype=resource, value=<Resource Tensor>>]
Output gradients
None
UnconnectedGradients.NONE
[<tf.Tensor: shape=(8, 128), dtype=float32, numpy=
array([[-3.0719737e-0

In [None]:
import matplotlib.pyplot as plt

plt.bar(range(epochs), mean_returns)
plt.ylabel('Average Return')
plt.show()

In [None]:
# test the trained variable 

test_length = 100
passed = []

observation= env.reset()
reward_sum = 0
done = False
for i in tqdm(range(test_length)):
    observation = env.reset()
    terminal = False
    done = False
    reward_sum = 0

    while not terminal:

        env.render()
        
        observation = observation.reshape(1, -1)
            
        logits, action = sample_action(observation)
        
        observation_new, reward, done, _ = env.step(action[0].numpy())
        reward_sum += reward
        terminal = done 
        observation = observation_new
   
   
    passed.append(reward_sum)



print(f'Average reward over 100 episodes {sum(passed)/test_length}.')