### Packages

In [704]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import gym
import scipy.signal
import time
from tensorflow.keras import Model
import matplotlib.pyplot as plt
import random
import tensorflow_probability as tfp

### Env Setup

In [1046]:
problem = "Hopper-v3"
env = gym.make(problem)

num_states = env.observation_space.shape[0]
num_actions = env.action_space.shape[0]
upper_bound = env.action_space.high[0]
lower_bound = env.action_space.low[0]

num_states, num_states, upper_bound, lower_bound

EPSILON = 1e-10

### Actor Model

In [1068]:
class Actor(Model):

    def __init__(self, action_dimensions):
        super().__init__()
        self.action_dim = action_dimensions
        self.sample_dist = tfp.distributions.MultivariateNormalDiag(loc=tf.zeros(num_actions),
                                                                    scale_diag=tf.ones(num_actions))
        self.dense1_layer = layers.Dense(256, activation="relu")
        self.dense2_layer = layers.Dense(256, activation="relu")
        self.mean_layer = layers.Dense(self.action_dim)
        self.stdev_layer = layers.Dense(self.action_dim)

    def call(self, state, eval_mode=False):

        a1 = self.dense1_layer(state)
        a2 = self.dense2_layer(a1)
        mu = self.mean_layer(a2)

        log_sigma = self.stdev_layer(a2)
        sigma = tf.exp(log_sigma)

        dist = tfp.distributions.MultivariateNormalDiag(loc=mu, scale_diag=sigma)

        if eval_mode:
            action_ = mu
        else:
            action_ = tf.math.add(mu, tf.math.multiply(sigma, tf.expand_dims(self.sample_dist.sample(), 0)))
 
        action = tf.tanh(action_)

        log_pi_ = dist.log_prob(action_)     
        log_pi = log_pi_ - tf.reduce_sum(tf.math.log(tf.clip_by_value(1 - action**2, EPSILON, 1.0)), axis=1)
        
        return action*upper_bound, log_pi


#### Testing

In [1069]:
actor_test = Actor(num_actions)

In [1070]:
obs = env.reset()
obs

array([ 1.25089022e+00, -1.03124107e-03, -4.10838194e-03,  4.27967210e-03,
       -8.51193861e-04,  2.23144475e-03,  3.69466683e-03,  2.38772747e-03,
        7.14047048e-04,  1.07847778e-03,  2.19576116e-03])

In [1071]:
tf_obs = tf.expand_dims(obs, 0)
tf_obs
a_test, log_a_test = actor_test(tf_obs)


In [1072]:
actor_test.summary()

Model: "actor_155"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_1105 (Dense)          multiple                  3072      
                                                                 
 dense_1106 (Dense)          multiple                  65792     
                                                                 
 dense_1107 (Dense)          multiple                  771       
                                                                 
 dense_1108 (Dense)          multiple                  771       
                                                                 
Total params: 70,406
Trainable params: 70,406
Non-trainable params: 0
_________________________________________________________________


### Critic Model

In [1073]:
class Critic_Wrapper():
    def __init__(self, state_dim):
        self.s_dim=state_dim
        
    def get_critic(self):
        # State as input
        state_input = layers.Input(shape=(self.s_dim))
        state_out = layers.Dense(256, activation="relu")(state_input)
        # state_out = layers.Dense(32, activation="relu")(state_out)

        out = layers.Dense(256, activation="relu")(state_out)
        outputs = layers.Dense(1, dtype='float64')(out)

        # Outputs single value for give state-action
        model = tf.keras.Model([state_input], outputs)

        return model


#### Testing

In [1074]:
critic_gen = Critic_Wrapper(num_states)
critic_test = critic_gen.get_critic()

In [1075]:
obs = env.reset()
obs

array([ 1.24508256e+00, -4.97162901e-03, -2.35789093e-03,  4.62150897e-03,
       -1.64061563e-03, -2.85269475e-03,  2.34583407e-03, -3.23493562e-04,
        4.80751533e-03, -2.23360398e-03, -4.86993917e-03])

In [1076]:
tf_obs = tf.expand_dims(obs, 0)
a_test, log_a_test = actor_test(tf_obs)
tf_obs

<tf.Tensor: shape=(1, 11), dtype=float64, numpy=
array([[ 1.24508256e+00, -4.97162901e-03, -2.35789093e-03,
         4.62150897e-03, -1.64061563e-03, -2.85269475e-03,
         2.34583407e-03, -3.23493562e-04,  4.80751533e-03,
        -2.23360398e-03, -4.86993917e-03]])>

In [1077]:
v_test = tf.squeeze(critic_test([tf_obs]))
v_test

<tf.Tensor: shape=(), dtype=float64, numpy=-0.07684647377848594>

In [1078]:
critic_test.summary()

Model: "model_147"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_190 (InputLayer)      [(None, 11)]              0         
                                                                 
 dense_1109 (Dense)          (None, 256)               3072      
                                                                 
 dense_1110 (Dense)          (None, 256)               65792     
                                                                 
 dense_1111 (Dense)          (None, 1)                 257       
                                                                 
Total params: 69,121
Trainable params: 69,121
Non-trainable params: 0
_________________________________________________________________


### Replay Buffer

In [1079]:
class Buffer:

    def __init__(self, observation_dimensions, action_dimensions, size, minibatch_size=256, gamma=0.99, lam=0.95):

        self.observation_buffer = np.zeros(
            (size, observation_dimensions), dtype=np.float32
        )
        self.action_buffer = np.zeros((size, action_dimensions), dtype=np.float32)
        self.reward_buffer = np.zeros(size, dtype=np.float32)
        self.logprobability_buffer = np.zeros(size, dtype=np.float32)
        
        self.gamma, self.lam = gamma, lam
        self.batch_size = minibatch_size
        
        self.buffer_cap = size
        self.pointer = 0
        self.trajectory_start_indices = []
        self.trajectory_start_indices.append(0)

    def store(self, observation, action, reward, logprobability, done):

        self.observation_buffer[self.pointer] = observation
        self.action_buffer[self.pointer] = action
        self.reward_buffer[self.pointer] = reward
        self.logprobability_buffer[self.pointer] = logprobability
        self.pointer += 1
        if done and not self.pointer > self.buffer_cap-1:
            self.trajectory_start_indices.append(self.pointer)


    def get(self):
        # Get all data of the buffer
        if self.trajectory_start_indices[-1] == self.buffer_cap-1:
            rindex = np.random.choice(range(len(self.trajectory_start_indices)-1), self.batch_size)
        else:
            rindex = np.random.choice(range(len(self.trajectory_start_indices)), self.batch_size)
        
        isolated_obs=[]
        isolated_a=[]
        isolated_r=[]
        isolated_log_a=[]
        for ri in rindex:
            
            if  ri == len(self.trajectory_start_indices)-1:
                isolated_obs.append(self.observation_buffer[self.trajectory_start_indices[ri]:
                                                       self.buffer_cap])
                isolated_a.append(self.action_buffer[self.trajectory_start_indices[ri]:
                                                       self.buffer_cap])
                isolated_r.append(self.reward_buffer[self.trajectory_start_indices[ri]:
                                                       self.buffer_cap])
                isolated_log_a.append(self.logprobability_buffer[self.trajectory_start_indices[ri]:
                                                       self.buffer_cap])
                
            else:
                isolated_obs.append(self.observation_buffer[self.trajectory_start_indices[ri]:
                                                       self.trajectory_start_indices[ri+1]])
                isolated_a.append(self.action_buffer[self.trajectory_start_indices[ri]:
                                                       self.trajectory_start_indices[ri+1]])
                isolated_r.append(self.reward_buffer[self.trajectory_start_indices[ri]:
                                                       self.trajectory_start_indices[ri+1]])
                isolated_log_a.append(self.logprobability_buffer[self.trajectory_start_indices[ri]:
                                                       self.trajectory_start_indices[ri+1]])

        return (
            isolated_obs,
            isolated_a,
            isolated_r,
            isolated_log_a,
        )
    
    def batch_sample(self, critic_handle):
        s_b, a_b, r_b, l_b = self.get()
        ss_b = []
        as_b = []
        rs_b = []
        ls_b = []
        adv_b = []
        ret_b = []
        sample_idxs = [np.random.choice(range(len(a)-1)) for a in s_b]
        
        for i in range(self.batch_size):
            ss_b.append(s_b[i][sample_idxs[i]])
            as_b.append(a_b[i][sample_idxs[i]])
            rs_b.append(r_b[i][sample_idxs[i]])
            ls_b.append(l_b[i][sample_idxs[i]])
            adv_b.append(self.adv_t(r_b[i][sample_idxs[i]:-1],
                                      critic_handle,
                                      s_b[i][sample_idxs[i]:-1],
                                      s_b[i][sample_idxs[i]+1:]))
            ret_b.append(self.ret_t(r_b[i][sample_idxs[i]:]))
        return (
            tf.convert_to_tensor(ss_b),
            tf.convert_to_tensor(as_b),
            tf.convert_to_tensor(adv_b),
            tf.convert_to_tensor(ret_b),
            tf.convert_to_tensor(ls_b),
            tf.convert_to_tensor(rs_b)
        )
        
    def adv_t(self, r_t, vf, s_t, s_t1):
        ite_gamma_lam = [(self.gamma*self.lam)**i for i in range(len(r_t))]
        delta_ts = r_t + self.gamma*tf.squeeze(vf(s_t1)) - tf.squeeze(vf(s_t))

        return np.sum(np.multiply(ite_gamma_lam, delta_ts))
    
    def ret_t(self, r_t):
        ite_gamma = [self.gamma**i for i in range(len(r_t))]
        
        return np.sum(np.multiply(ite_gamma, r_t))
    
    def clear(self):
        self.pointer = 0
        self.trajectory_start_indices = []
        self.trajectory_start_indices.append(0)

#### Testing

In [1080]:
buffer = Buffer(num_states, num_actions, 100, 5)
actor_test = Actor(num_actions)
critic_gen = Critic_Wrapper(num_states)
critic_test = critic_gen.get_critic()

In [1081]:
obs = env.reset()
buffer.clear()
for x in range(100):
    tf_obs = tf.expand_dims(obs, 0)
    a, log_a = actor_test(tf_obs)
    a = a[0]
    obs_new, r, d, _ = env.step(a)
    
    buffer.store(obs, a, r, log_a, d)
    if d:
        obs = env.reset()
    else:
        obs = obs_new
    
print(buffer.trajectory_start_indices)

[0, 10, 23, 33, 51, 62, 75, 85, 97]


In [1082]:
s_b, a_b, r_b, l_b = buffer.get()
# np.array(s_b, dtype=object).shape, np.array(a_b, dtype=object).shape, np.array(r_b, dtype=object).shape, np.array(l_b, dtype=object).shape
# np.array(s_b, dtype=object), np.array(a_b, dtype=object), np.array(r_b, dtype=object), np.array(l_b, dtype=object)
# print("trajectory len: ", len(s_b[0][0]))


In [1083]:
buffer.batch_sample(critic_test)

(<tf.Tensor: shape=(5, 11), dtype=float32, numpy=
 array([[ 1.2376944e+00, -3.0259125e-02, -1.4985656e-02, -4.8460253e-02,
         -1.9187263e-03, -3.0959690e-01, -5.6984997e-01, -1.9444648e+00,
         -4.6029386e-01, -3.4490345e+00,  1.6275064e+00],
        [ 1.2524438e+00,  3.1704935e-03, -3.2406251e-03,  1.4117083e-03,
          4.8490930e-03,  7.0585421e-04,  1.6089141e-03,  1.5156346e-03,
          4.6885465e-03, -2.6915667e-03, -3.4802600e-03],
        [ 1.2482835e+00, -1.1809201e-02, -2.3270547e-03, -8.1993453e-03,
         -5.0022192e-03, -6.9500931e-02, -1.5239210e-01, -1.2530603e+00,
         -1.5351959e+00,  9.0561099e-02,  9.1368988e-02],
        [ 1.2418537e+00, -6.5774612e-02, -6.5061979e-02, -9.5020160e-03,
         -1.2787361e-02, -2.5728980e-01, -3.5327318e-01, -2.3560221e+00,
         -2.0448375e+00, -1.2154794e+00, -1.9028935e+00],
        [ 1.2401736e+00, -1.7281789e-02, -1.9822193e-03, -2.0817978e-02,
          2.2801356e-02, -4.3799173e-02, -3.6060297e-01, -1.1

### PPO

In [1085]:
class PPO:
    
    def __init__(self, env, observation_dimensions, action_dimensions, horizon,
                 minibatch_size=256, gamma=0.99, lam=0.95, diagnostic_length=1, lr=3e-4):
        
        self.env = env
        self.actor = Actor(action_dimensions)
        self.critic_gen = Critic_Wrapper(observation_dimensions)
        self.critic = self.critic_gen.get_critic()
        self.buffer = Buffer(observation_dimensions, action_dimensions, horizon, minibatch_size, gamma, lam)
        
        self.p_opt= tf.keras.optimizers.Adam(learning_rate=lr,
                                                            )
        self.v_opt= tf.keras.optimizers.Adam(learning_rate=lr,
                                                            )
        self.clip_epsilon = 0.2
        
        self.diagnostics_buffer = []
        self.diagno_index = 0
        self.diagno_length = diagnostic_length
        
        self.gamma, self.lam, self.horizon = gamma, lam, horizon
        
    def train(self, iterations, epochs=20):
        
        for i in range(iterations):
            
            obs = self.env.reset()
            
            for t in range(self.horizon):
                
                tf_obs = tf.expand_dims(obs, 0)
                a, log_a = self.actor(tf_obs)
                a=a[0]
            
                obs_new, r, d, _ = self.env.step(a)
                
                self.buffer.store(obs, a, r, log_a, d)
                
                if d:
                    obs = self.env.reset()
                else:
                    obs = obs_new

            for _ in range(epochs):
                (
                    obs_b,
                    a_b,
                    adv_b,
                    ret_b,
                    log_b,
                    r_b,
                ) = self.buffer.batch_sample(self.critic)
                self.update(obs_b, adv_b, log_b, ret_b)
            self.show_diagnostics()    
            self.buffer.clear()
            
    def update(self, obs_b, adv_b, log_b, ret_b):
        with tf.GradientTape() as tape:
            a, log_a = self.actor(obs_b)
            ratio = tf.exp(log_a - log_b)
            c_ratio = tf.clip_by_value(ratio, 1.0-self.clip_epsilon, 1.0+self.clip_epsilon)

            rt_at = tf.minimum(tf.math.multiply(ratio, tf.cast(adv_b, tf.float32)), 
                               tf.math.multiply(c_ratio, tf.cast(adv_b, tf.float32)))

            L_theta_clip = -tf.reduce_mean(rt_at)
        J_theta_clip = tape.gradient(L_theta_clip, self.actor.trainable_variables)
        self.p_opt.apply_gradients(zip(J_theta_clip, self.actor.trainable_variables))
        
        with tf.GradientTape() as tape1:
            v_theta = tf.squeeze(self.critic(obs_b))
            v_mse = tf.reduce_mean((v_theta - ret_b)**2)
        J_phi = tape1.gradient(v_mse, self.critic.trainable_variables)
        self.v_opt.apply_gradients(zip(J_phi, self.critic.trainable_variables))
        self.record_diagnostics(["policy loss: ", np.array(L_theta_clip), "value loss: ", np.array(v_mse)])
        
    def record_diagnostics(self, data):
        if len(self.diagnostics_buffer) == self.diagno_length:
            self.diagnostics_buffer[self.diagno_index] = data

        if len(self.diagnostics_buffer) < self.diagno_length:
            self.diagnostics_buffer.append(data)
        self.diagno_index = (self.diagno_index+1)%self.diagno_length
    def show_diagnostics(self):
        for i in range(len(self.diagnostics_buffer)):
            print(self.diagnostics_buffer[(self.diagno_index+i)%len(self.diagnostics_buffer)])
            
    def save_weights(self, a_path, c_path):
        raise NotImplementedError

    def load_weights(self, a_path, c_path):
        raise NotImplementedError

#### Testing

In [None]:
ppo1 = PPO(env, num_states, num_actions, 1000)
ppo1.train(200, 100)

['policy loss: ', array(-12.2777605, dtype=float32), 'value loss: ', array(431.26966668)]
['policy loss: ', array(-2.2781205, dtype=float32), 'value loss: ', array(58.27813721)]
['policy loss: ', array(-2.4102178, dtype=float32), 'value loss: ', array(95.39632108)]
['policy loss: ', array(-4.154267, dtype=float32), 'value loss: ', array(234.18128393)]
['policy loss: ', array(0.04454153, dtype=float32), 'value loss: ', array(155.26749576)]
['policy loss: ', array(-2.2033563, dtype=float32), 'value loss: ', array(152.80047098)]
['policy loss: ', array(-0.6193781, dtype=float32), 'value loss: ', array(23.75075243)]
['policy loss: ', array(0.15399389, dtype=float32), 'value loss: ', array(100.26242441)]
['policy loss: ', array(-1.0843996, dtype=float32), 'value loss: ', array(53.32222099)]
['policy loss: ', array(0.12840986, dtype=float32), 'value loss: ', array(108.94982341)]
['policy loss: ', array(0.20509718, dtype=float32), 'value loss: ', array(85.00054821)]
