### Packages

In [7]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import gym
import scipy.signal
import time
from tensorflow.keras import Model
import matplotlib.pyplot as plt
import random
import tensorflow_probability as tfp

### Env Setup

In [3]:
problem = "Hopper-v3"
env = gym.make(problem)

num_states = env.observation_space.shape[0]
num_actions = env.action_space.shape[0]
upper_bound = env.action_space.high[0]
lower_bound = env.action_space.low[0]

EPSILON = 1e-10

### Actor Model

In [4]:
class Actor(Model):

    def __init__(self, action_dimensions, action_bound):
        super().__init__()
        self.action_dim, self.upper_bound = action_dimensions, action_bound
        self.sample_dist = tfp.distributions.MultivariateNormalDiag(loc=tf.zeros(self.action_dim),
                                                                    scale_diag=tf.ones(self.action_dim))
        self.dense1_layer = layers.Dense(256, activation="relu")
        self.dense2_layer = layers.Dense(256, activation="relu")
        self.mean_layer = layers.Dense(self.action_dim)
        self.stdev_layer = layers.Dense(self.action_dim)

    def call(self, state, eval_mode=False):

        a1 = self.dense1_layer(state)
        a2 = self.dense2_layer(a1)
        mu = self.mean_layer(a2)

        log_sigma = self.stdev_layer(a2)
        sigma = tf.exp(log_sigma)
        sigma = tf.clip_by_value(sigma, EPSILON, 2.718)

        dist = tfp.distributions.MultivariateNormalDiag(loc=mu, scale_diag=sigma)
        
        if eval_mode:
            action_ = mu
        else:
            action_ = tf.math.add(mu, tf.math.multiply(sigma, tf.expand_dims(self.sample_dist.sample(), 0)))
 
        action = tf.tanh(action_)

        log_pi_ = dist.log_prob(action_)     
        log_pi = log_pi_ - tf.reduce_sum(tf.math.log(tf.clip_by_value(1 - action**2, EPSILON, 1.0)), axis=1)
        
        return action*self.upper_bound, log_pi, dist.entropy()


#### Testing

In [8]:
actor_test = Actor(num_actions, upper_bound)

In [9]:
obs = env.reset()
obs
tf_obs = tf.expand_dims(obs, 0)
tf_obs
a_test, log_a_test, h_a_test = actor_test(tf_obs)

In [10]:
obs_new, _, _, _ = env.step(a_test[0])
tf_obs_new = tf.expand_dims(obs_new, 0)
a_test, log_a_test, h_a_test = actor_test(tf.convert_to_tensor([tf_obs, tf_obs_new]))

In [11]:
actor_test.summary()

Model: "actor_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             multiple                  3072      
                                                                 
 dense_5 (Dense)             multiple                  65792     
                                                                 
 dense_6 (Dense)             multiple                  771       
                                                                 
 dense_7 (Dense)             multiple                  771       
                                                                 
Total params: 70,406
Trainable params: 70,406
Non-trainable params: 0
_________________________________________________________________


### Critic Model

In [12]:
class Critic_Wrapper():
    def __init__(self, state_dim):
        self.s_dim=state_dim
        
    def get_critic(self):
        # State as input
        state_input = layers.Input(shape=(self.s_dim))
        state_out = layers.Dense(256, activation="relu")(state_input)
        # state_out = layers.Dense(32, activation="relu")(state_out)

        out = layers.Dense(256, activation="relu")(state_out)
        outputs = layers.Dense(1, dtype='float64')(out)

        # Outputs single value for give state-action
        model = tf.keras.Model([state_input], outputs)

        return model


#### Testing

In [13]:
critic_gen = Critic_Wrapper(num_states)
critic_test = critic_gen.get_critic()

In [14]:
obs = env.reset()
obs

array([ 1.24501840e+00,  5.15365080e-04,  1.54120394e-03, -3.70868586e-03,
        1.55022966e-03, -4.18266036e-04,  2.57481618e-03,  2.93924384e-03,
       -4.18489937e-03,  1.51059561e-03,  4.68795802e-03])

In [15]:
tf_obs = tf.expand_dims(obs, 0)
a_test, log_a_test, h_a_test = actor_test(tf_obs)
tf_obs

<tf.Tensor: shape=(1, 11), dtype=float64, numpy=
array([[ 1.24501840e+00,  5.15365080e-04,  1.54120394e-03,
        -3.70868586e-03,  1.55022966e-03, -4.18266036e-04,
         2.57481618e-03,  2.93924384e-03, -4.18489937e-03,
         1.51059561e-03,  4.68795802e-03]])>

In [16]:
v_test = tf.squeeze(critic_test([tf_obs]))
v_test

<tf.Tensor: shape=(), dtype=float64, numpy=0.06604118909886372>

In [18]:
critic_test.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 11)]              0         
                                                                 
 dense_8 (Dense)             (None, 256)               3072      
                                                                 
 dense_9 (Dense)             (None, 256)               65792     
                                                                 
 dense_10 (Dense)            (None, 1)                 257       
                                                                 
Total params: 69,121
Trainable params: 69,121
Non-trainable params: 0
_________________________________________________________________


### Replay Buffer

In [19]:
class Buffer:

    def __init__(self, observation_dimensions, action_dimensions, size, minibatch_size=256, gamma=0.99, lam=0.95):

        self.observation_buffer = np.zeros(
            (size, observation_dimensions), dtype=np.float32
        )
        self.action_buffer = np.zeros((size, action_dimensions), dtype=np.float32)
        self.reward_buffer = np.zeros(size, dtype=np.float32)
        self.logprobability_buffer = np.zeros(size, dtype=np.float32)
        
        self.gamma, self.lam = gamma, lam
        self.batch_size = minibatch_size
        
        self.buffer_cap = size
        self.pointer = 0
        self.trajectory_start_indices = []
        self.trajectory_start_indices.append(0)

    def store(self, observation, action, reward, logprobability, done):

        self.observation_buffer[self.pointer] = observation
        self.action_buffer[self.pointer] = action
        self.reward_buffer[self.pointer] = reward
        self.logprobability_buffer[self.pointer] = logprobability
        self.pointer += 1
        if done and not self.pointer > self.buffer_cap-1:
            self.trajectory_start_indices.append(self.pointer)


    def get(self):
        # Get all data of the buffer
        if self.trajectory_start_indices[-1] == self.buffer_cap-1:
            rindex = np.random.choice(range(len(self.trajectory_start_indices)-1), self.batch_size)
        else:
            rindex = np.random.choice(range(len(self.trajectory_start_indices)), self.batch_size)
        
        isolated_obs=[]
        isolated_a=[]
        isolated_r=[]
        isolated_log_a=[]
        for ri in rindex:
            
            if  ri == len(self.trajectory_start_indices)-1:
                isolated_obs.append(self.observation_buffer[self.trajectory_start_indices[ri]:
                                                       self.buffer_cap])
                isolated_a.append(self.action_buffer[self.trajectory_start_indices[ri]:
                                                       self.buffer_cap])
                isolated_r.append(self.reward_buffer[self.trajectory_start_indices[ri]:
                                                       self.buffer_cap])
                isolated_log_a.append(self.logprobability_buffer[self.trajectory_start_indices[ri]:
                                                       self.buffer_cap])
                
            else:
                isolated_obs.append(self.observation_buffer[self.trajectory_start_indices[ri]:
                                                       self.trajectory_start_indices[ri+1]])
                isolated_a.append(self.action_buffer[self.trajectory_start_indices[ri]:
                                                       self.trajectory_start_indices[ri+1]])
                isolated_r.append(self.reward_buffer[self.trajectory_start_indices[ri]:
                                                       self.trajectory_start_indices[ri+1]])
                isolated_log_a.append(self.logprobability_buffer[self.trajectory_start_indices[ri]:
                                                       self.trajectory_start_indices[ri+1]])

        return (
            isolated_obs,
            isolated_a,
            isolated_r,
            isolated_log_a,
        )
    
    def batch_sample(self, critic_handle):
        s_b, a_b, r_b, l_b = self.get()
        ss_b = []
        as_b = []
        rs_b = []
        ls_b = []
        adv_b = []
        ret_b = []
        sample_idxs = [np.random.choice(range(len(a)-1)) for a in s_b]
        
        for i in range(self.batch_size):
            ss_b.append(s_b[i][sample_idxs[i]])
            as_b.append(a_b[i][sample_idxs[i]])
            rs_b.append(r_b[i][sample_idxs[i]])
            ls_b.append(l_b[i][sample_idxs[i]])
            adv_b.append(self.adv_t(r_b[i][sample_idxs[i]:-1],
                                      critic_handle,
                                      s_b[i][sample_idxs[i]:-1],
                                      s_b[i][sample_idxs[i]+1:]))
            ret_b.append(self.ret_t(r_b[i][sample_idxs[i]:]))
        return (
            tf.convert_to_tensor(ss_b),
            tf.convert_to_tensor(as_b),
            tf.convert_to_tensor(adv_b),
            tf.convert_to_tensor(ret_b),
            tf.convert_to_tensor(ls_b),
            tf.convert_to_tensor(rs_b)
        )
        
    def adv_t(self, r_t, vf, s_t, s_t1):
        ite_gamma_lam = [(self.gamma*self.lam)**i for i in range(len(r_t))]
        delta_ts = r_t + self.gamma*tf.squeeze(vf(s_t1)) - tf.squeeze(vf(s_t))

        return np.sum(np.multiply(ite_gamma_lam, delta_ts))
    
    def ret_t(self, r_t):
        ite_gamma = [self.gamma**i for i in range(len(r_t))]
        
        return np.sum(np.multiply(ite_gamma, r_t))
    
    def clear(self):
        self.pointer = 0
        self.trajectory_start_indices = []
        self.trajectory_start_indices.append(0)

#### Testing

In [20]:
buffer = Buffer(num_states, num_actions, 100, 5)
actor_test = Actor(num_actions, upper_bound)
critic_gen = Critic_Wrapper(num_states)
critic_test = critic_gen.get_critic()

In [21]:
obs = env.reset()
buffer.clear()
for x in range(100):
    tf_obs = tf.expand_dims(obs, 0)
    a, log_a, h_a = actor_test(tf_obs)
    a = a[0]
    obs_new, r, d, _ = env.step(a)
    
    buffer.store(obs, a, r, log_a, d)
    if d:
        obs = env.reset()
    else:
        obs = obs_new
    
print(buffer.trajectory_start_indices)

[0, 14, 28, 60, 74, 84, 97]


In [22]:
buffer.batch_sample(critic_test)

(<tf.Tensor: shape=(5, 11), dtype=float32, numpy=
 array([[ 1.24030161e+00, -5.81849068e-02, -5.85331880e-02,
         -3.59406322e-02,  6.55986294e-02, -1.68974608e-01,
         -4.99858081e-01, -2.47236943e+00, -2.76374245e+00,
         -3.40221137e-01,  8.56482446e-01],
        [ 1.24557483e+00, -2.56889015e-02, -3.31838280e-02,
          1.84853328e-03, -1.52500365e-02, -1.42861128e-01,
         -1.83005795e-01, -1.18709588e+00, -1.09199917e+00,
         -4.16546851e-01, -2.14080667e+00],
        [ 1.24432516e+00, -5.54031320e-02, -5.30344062e-02,
         -2.16464493e-02,  4.37290780e-03, -2.92652011e-01,
         -2.82289386e-01, -2.13829947e+00, -1.26871884e+00,
         -2.22442412e+00, -1.02577460e+00],
        [ 1.23615491e+00, -2.75857691e-02, -1.34947700e-02,
         -3.98095623e-02, -8.06286260e-02, -2.44291142e-01,
         -3.17295462e-01, -1.75193965e+00, -1.66425800e+00,
         -2.83549607e-01, -3.75329781e+00],
        [ 1.25126958e+00, -8.24463132e-05, -9.14781447

### PPO

In [23]:
class PPO:
    
    def __init__(self, env, observation_dimensions, action_dimensions, action_bound, horizon,
                 minibatch_size=256, gamma=0.99, lam=0.95, diagnostic_length=1, lr=3e-4):
        
        self.env = env
        self.actor = Actor(action_dimensions, action_bound)
        self.critic_gen = Critic_Wrapper(observation_dimensions)
        self.critic = self.critic_gen.get_critic()
        self.buffer = Buffer(observation_dimensions, action_dimensions, horizon, minibatch_size, gamma, lam)
        
        self.p_opt= tf.keras.optimizers.Adam(learning_rate=lr,
                                                            )
        self.v_opt= tf.keras.optimizers.Adam(learning_rate=lr,
                                                            )
        self.clip_epsilon = 0.2
        
        self.diagnostics_buffer = []
        self.diagno_index = 0
        self.diagno_length = diagnostic_length
        
        self.gamma, self.lam, self.horizon = gamma, lam, horizon
        
    def train(self, iterations, epochs=20):
        
        for i in range(iterations):
            
            obs = self.env.reset()
            
            for t in range(self.horizon):
                
                tf_obs = tf.expand_dims(obs, 0)
                a, log_a, h_a = self.actor(tf_obs)
                a=a[0]
            
                obs_new, r, d, _ = self.env.step(a)
                
                self.buffer.store(obs, a, r, log_a, d)
                
                if d:
                    obs = self.env.reset()
                else:
                    obs = obs_new

            for _ in range(epochs):
                (
                    obs_b,
                    a_b,
                    adv_b,
                    ret_b,
                    log_b,
                    r_b,
                ) = self.buffer.batch_sample(self.critic)
                self.update(obs_b, adv_b, log_b, ret_b)
            self.show_diagnostics()    
            self.buffer.clear()
            
    def update(self, obs_b, adv_b, log_b, ret_b):
        with tf.GradientTape() as tape:
            a, log_a, h_a = self.actor(obs_b)
            ratio = tf.exp(log_a - log_b)
            c_ratio = tf.clip_by_value(ratio, 1.0-self.clip_epsilon, 1.0+self.clip_epsilon)

            rt_at = tf.minimum(tf.math.multiply(ratio, tf.cast(adv_b, tf.float32)), 
                               tf.math.multiply(c_ratio, tf.cast(adv_b, tf.float32)))
            
            L_theta_clip = -tf.reduce_mean(rt_at+h_a)
        J_theta_clip = tape.gradient(L_theta_clip, self.actor.trainable_variables)
        self.p_opt.apply_gradients(zip(J_theta_clip, self.actor.trainable_variables))
        
        with tf.GradientTape() as tape1:
            v_theta = tf.squeeze(self.critic(obs_b))
            v_mse = tf.reduce_mean((v_theta - ret_b)**2)
        J_phi = tape1.gradient(v_mse, self.critic.trainable_variables)
        self.v_opt.apply_gradients(zip(J_phi, self.critic.trainable_variables))
        self.record_diagnostics(["policy loss: ", np.array(L_theta_clip), "value loss: ", np.array(v_mse)])
        
    def record_diagnostics(self, data):
        if len(self.diagnostics_buffer) == self.diagno_length:
            self.diagnostics_buffer[self.diagno_index] = data

        if len(self.diagnostics_buffer) < self.diagno_length:
            self.diagnostics_buffer.append(data)
        self.diagno_index = (self.diagno_index+1)%self.diagno_length
    def show_diagnostics(self):
        for i in range(len(self.diagnostics_buffer)):
            print(self.diagnostics_buffer[(self.diagno_index+i)%len(self.diagnostics_buffer)])
            
    def save_weights(self, a_path, c_path):
        self.actor.save_weights(a_path)
        print("Saved actor weights")
        self.critic.save_weights(c_path)
        print("Saved critic weights")

    def load_weights(self, a_path, c_path):
        try:
            self.actor.load_weights(a_path)
            print("Loaded actor weights")
            self.critic.load_weights(c_path)
            print("Loaded critic weights")
        except ValueError:
            print("ERROR: Please make sure weights are saved as .ckpt")
            
        
    def eval_rollout(self, eval_env):
        eps_r = 0
        eval_obs = eval_env.reset()
        
        while True:
#             eval_env.render()

            tf_eval_obs = tf.expand_dims(tf.convert_to_tensor(eval_obs), 0)

            eval_a, eval_log_a, eval_h_a = self.actor(tf_eval_obs, eval_mode=True)

            eval_a = eval_a[0]

            eval_obs_new, eval_r, eval_d, _ = eval_env.step(eval_a)

            eps_r += eval_r

            if eval_d:
                break
                
            eval_obs = eval_obs_new
            
        print("rollout episodic reward: ", eps_r)


#### Testing

In [24]:
ppo1 = PPO(env, num_states, num_actions, upper_bound, 100)
ppo1.train(1, 1)

['policy loss: ', array(-6.724763, dtype=float32), 'value loss: ', array(28.13324966)]


In [25]:
ppo1.save_weights('./weights/a_test.ckpt', './weights/c_test.ckpt')

Saved actor weights
Saved critic weights


In [26]:
ppo2 = PPO(env, num_states, num_actions, upper_bound, 500)


In [27]:
ppo2.load_weights('./weights/a_test.ckpt', './weights/c_test.ckpt')

Loaded actor weights
Loaded critic weights


In [28]:
ppo2.train(1, 5)

['policy loss: ', array(-8.533433, dtype=float32), 'value loss: ', array(39.21980823)]


In [29]:
ppo2.eval_rollout(env)

rollout episodic reward:  15.432678150137994
