In [18]:
import gym
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import Model
import numpy as np
import os
import json
import random
import tensorflow_probability as tfp
from tensorflow.keras import regularizers
import glfw

### Env Setup

In [19]:
problem = "Hopper-v3"
env = gym.make(problem)

num_states = env.observation_space.shape[0]
num_actions = env.action_space.shape[0]
upper_bound = env.action_space.high[0]
lower_bound = env.action_space.low[0]

### Actor Model

#### Note:

Same as PPO actor. Gaussian policy

In [20]:
EPSILON = 1e-10

class Actor(Model):

    def __init__(self, action_dimensions, action_bound):
        super().__init__()
        self.action_dim, self.upper_bound = action_dimensions, action_bound
        self.sample_dist = tfp.distributions.MultivariateNormalDiag(loc=tf.zeros(self.action_dim),
                                                                    scale_diag=tf.ones(self.action_dim))
        self.dense1_layer = layers.Dense(64, activation="relu")
        self.dense2_layer = layers.Dense(64, activation="relu")
        self.mean_layer = layers.Dense(self.action_dim)
        self.stdev_layer = layers.Dense(self.action_dim)

    def call(self, state, eval_mode=False):

        a1 = self.dense1_layer(state)
        a2 = self.dense2_layer(a1)
        mu = self.mean_layer(a2)

        log_sigma = self.stdev_layer(a2)
        sigma = tf.exp(log_sigma)
        sigma = tf.clip_by_value(sigma, EPSILON, 2.718)

        dist = tfp.distributions.MultivariateNormalDiag(loc=mu, scale_diag=sigma)
        
        if eval_mode:
            action_ = mu
        else:
            action_ = tf.math.add(mu, tf.math.multiply(sigma, tf.expand_dims(self.sample_dist.sample(), 0)))
 
        action = tf.tanh(action_)

        log_pi_ = dist.log_prob(action_)     
        log_pi = log_pi_ - tf.reduce_sum(tf.math.log(tf.clip_by_value(1 - action**2, EPSILON, 1.0)), axis=1)
        
        return action*self.upper_bound, log_pi


In [21]:
actor_test = Actor(num_actions, upper_bound)

In [22]:
obs = env.reset()
obs
tf_obs = tf.expand_dims(obs, 0)
tf_obs
a_test, log_a_test = actor_test(tf_obs)
print(a_test, log_a_test)

tf.Tensor([[-0.9719501   0.84101444  0.5297327 ]], shape=(1, 3), dtype=float32) tf.Tensor([-0.8370919], shape=(1,), dtype=float32)


In [23]:
actor_test.summary()

Model: "actor_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_8 (Dense)             multiple                  768       
                                                                 
 dense_9 (Dense)             multiple                  4160      
                                                                 
 dense_10 (Dense)            multiple                  195       
                                                                 
 dense_11 (Dense)            multiple                  195       
                                                                 
Total params: 5,318
Trainable params: 5,318
Non-trainable params: 0
_________________________________________________________________


### Critic Wrapper

#### Note:

Different from PPO, critic evaluate state-action pairs.

In [24]:
class Critic_Wrapper():
    def __init__(self, state_dim, action_dim):
        self.s_dim=state_dim
        self.a_dim=action_dim
        
    def get_critic(self):
        # State as input
        state_input = layers.Input(shape=(self.s_dim))
        state_out = layers.Dense(32, activation="relu")(state_input)

        # Action as input
        action_input = layers.Input(shape=(self.a_dim))
        action_out = layers.Dense(32, activation="relu")(action_input)

        # Concatenating
        concat = layers.Concatenate()([state_out, action_out])
        out = layers.Dense(64, activation="relu")(concat)
        outputs = tf.squeeze(layers.Dense(1)(out))

        # Outputs single value for give state-action
        model = tf.keras.Model([state_input, action_input], outputs)

        return model

In [25]:
critic_gen = Critic_Wrapper(num_states, num_actions)
critic_test = critic_gen.get_critic()

In [26]:
obs = env.reset()
obs

array([ 1.25401705e+00, -1.82400569e-03,  1.70315749e-03, -2.07642951e-03,
       -2.80744649e-03,  4.37307766e-03, -2.14324323e-03,  9.02038884e-04,
        3.32820211e-03, -1.83249526e-03, -1.83357552e-03])

In [27]:
tf_obs = tf.expand_dims(obs, 0)
a_test, log_a_test = actor_test(tf_obs)
tf_obs

<tf.Tensor: shape=(1, 11), dtype=float64, numpy=
array([[ 1.25401705e+00, -1.82400569e-03,  1.70315749e-03,
        -2.07642951e-03, -2.80744649e-03,  4.37307766e-03,
        -2.14324323e-03,  9.02038884e-04,  3.32820211e-03,
        -1.83249526e-03, -1.83357552e-03]])>

In [28]:
v_test = critic_test([tf_obs, a_test])
v_test

<tf.Tensor: shape=(), dtype=float32, numpy=-0.08869143>

In [29]:
obs_new, _, _, _ = env.step(a_test[0])
tf_obs_new = tf.expand_dims(obs_new, 0)
statex2 = tf.convert_to_tensor([obs, obs_new])
print(statex2.shape)

(2, 11)


In [30]:
a_2, loga_2 = actor_test(statex2)

print(a_2.shape, loga_2.shape)

(2, 3) (2,)


In [31]:
v_2 = critic_test([statex2, a_2])
print(v_2.shape)

(2,)


In [32]:
critic_test.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 11)]         0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 3)]          0           []                               
                                                                                                  
 dense_12 (Dense)               (None, 32)           384         ['input_3[0][0]']                
                                                                                                  
 dense_13 (Dense)               (None, 32)           128         ['input_4[0][0]']                
                                                                                            

### Replay Buffer

In [33]:
class Buffer:
    def __init__(self, obs_dim, a_dim, buffer_capacity=100000, batch_size=256):
        
        self.obs_dim = obs_dim
        self.a_dim = a_dim
        self.buffer_capacity = buffer_capacity
        self.batch_size = batch_size
        self.buffer_counter = 0

        self.state_buffer = np.zeros((self.buffer_capacity, self.obs_dim))
        self.action_buffer = np.zeros((self.buffer_capacity, self.a_dim))
        self.reward_buffer = np.zeros((self.buffer_capacity, 1))
        self.next_state_buffer = np.zeros((self.buffer_capacity, self.obs_dim))
        self.done_buffer = np.zeros((self.buffer_capacity, 1))

    def record(self, obs_tuple):
        index = self.buffer_counter % self.buffer_capacity

        self.state_buffer[index] = obs_tuple[0]
        self.action_buffer[index] = obs_tuple[1]
        self.reward_buffer[index] = obs_tuple[2]
        self.next_state_buffer[index] = obs_tuple[3]
        self.done_buffer[index] = obs_tuple[4]
        self.buffer_counter += 1
        
    def sample(self):
        record_range = min(self.buffer_counter, self.buffer_capacity)
        batch_indices = np.random.choice(record_range, self.batch_size)

        state_batch = tf.convert_to_tensor(self.state_buffer[batch_indices])
        action_batch = tf.convert_to_tensor(self.action_buffer[batch_indices])
        reward_batch = tf.squeeze(tf.convert_to_tensor(self.reward_buffer[batch_indices]))
        next_state_batch = tf.convert_to_tensor(self.next_state_buffer[batch_indices])
        done_batch = tf.squeeze(tf.convert_to_tensor(self.done_buffer[batch_indices]))
        
        return (state_batch,
               action_batch,
               reward_batch,
               next_state_batch,
               done_batch)


In [34]:
buffer1 = Buffer(num_states, num_actions, 100, 10)

In [35]:
prev_obs = env.reset()

for i in range(buffer1.buffer_capacity):
    a, _ = actor_test(tf.expand_dims(prev_obs, 0))
    obs, r, d, _ = env.step(a[0])
    
    buffer1.record((prev_obs, a[0], r, obs, d))
    
    prev_obs = obs
    

In [36]:
buffer1.sample()

(<tf.Tensor: shape=(10, 11), dtype=float64, numpy=
 array([[ 1.00420107e+00, -2.06170734e+00, -1.95111438e+00,
          3.76500399e-03, -5.14848656e-01, -9.16336247e-01,
         -1.96551198e+00, -1.00000000e+01, -1.00000000e+01,
         -9.33190497e-02, -7.16679968e+00],
        [ 1.46937006e-01, -4.19061693e+00, -2.58921517e+00,
         -1.73384490e-02, -7.91540417e-01, -4.91787990e-02,
          9.17468702e-02, -2.22760193e+00,  3.46213781e-01,
         -6.39974549e-01, -4.49070893e-01],
        [ 6.85111071e-01, -3.37399353e+00, -2.59992060e+00,
         -1.45989893e-01, -7.95965668e-01, -6.68122644e-01,
         -2.36964855e+00, -9.51589820e+00, -5.53315352e+00,
         -1.17231744e+00, -1.08068486e+00],
        [ 1.19148413e+00, -4.88641252e-01, -5.30868315e-01,
          1.21776081e-02, -3.42407863e-02, -1.39810334e+00,
         -5.14817619e-01, -8.11386107e+00, -7.10229354e+00,
         -1.96788365e+00,  1.99344729e-01],
        [ 7.41495188e-01, -3.18896483e+00, -2.5151017

### Soft Actor Critic

In [40]:
class SAC:
    
    def __init__(self, env, observation_dimensions, action_dimensions, action_bound, buffer_capacity,
                 minibatch_size=256, gamma=0.99, tau=0.95, lr=3e-4):
        
        self.env = env
        
        self.a = Actor(action_dimensions, action_bound)
        self.c_gen = Critic_Wrapper(observation_dimensions, action_dimensions)
        self.c1 = self.c_gen.get_critic()
        self.c2 = self.c_gen.get_critic()
        self.tc1 = self.c_gen.get_critic()
        self.tc2 = self.c_gen.get_critic()
        
        self.tc1.set_weights(self.c1.get_weights())
        self.tc2.set_weights(self.c2.get_weights())

        self.te = -np.prod(action_dimensions)
        self.alpha = tf.Variable(0.0, dtype=tf.float32)
        
        self.a_opt = tf.keras.optimizers.Adam(learning_rate=lr)
        self.c1_opt = tf.keras.optimizers.Adam(learning_rate=lr)
        self.c2_opt = tf.keras.optimizers.Adam(learning_rate=lr)                                                  
        self.alpha_opt = tf.keras.optimizers.Adam(learning_rate=lr)   
        
        self.buffer = Buffer(observation_dimensions, action_dimensions, buffer_capacity, minibatch_size)
        
        self.gamma, self.tau = gamma, tau
        
    def train(self, max_env_step):
        t = 0
        epo = 0
        while t < max_env_step:
            p_s = self.env.reset()
            a_losses = []
            c1_losses = []
            c2_losses = []
            alpha_losses = []
            while True:
                a, log_a = self.a(tf.expand_dims(p_s, 0))
                a=a[0]
                s, r, d, _ = self.env.step(a)
                end = 0 if d else 1
                
                self.buffer.record((p_s, a, r, s, end))
                data = self.buffer.sample()
                
                a_loss, c1_loss, c2_loss, alpha_loss = self.update(data)
                
                a_losses.append(a_loss.numpy())
                c1_losses.append(c1_loss.numpy())
                c2_losses.append(c2_loss.numpy())
                alpha_losses.append(alpha_loss.numpy())
                
                t = t+1
                
                if d:
                    break
                p_s = s
                
            print("Epoch {:04d}".format(epo), "Policy Avg. Loss: ", np.mean(a_losses), 
                  ", Critic 1 Avg. Loss: ",  np.mean(c1_losses), 
                  ", Critic 2 Avg. Loss: ",  np.mean(c2_losses), 
                  ", Alpha 1 Avg. Loss: ",  np.mean(alpha_losses), flush=True)
            epo = epo+1


    @tf.function
    def update(self, data):
        s_b, a_b, r_b, ns_b, d_b = data
        with tf.GradientTape() as tape_c1, tf.GradientTape() as tape_c2:
            q1 = self.c1([s_b, a_b])
            q2 = self.c2([s_b, a_b])
            na, nlog_a = self.a(ns_b)
            
            tq1 = self.tc1([ns_b, na])
            tq2 = self.tc2([ns_b, na])
            
            min_qt = tf.math.minimum(tq1,tq2)
            
            soft_qt = min_qt - (self.alpha*nlog_a)
            
            y = tf.stop_gradient(r_b+self.gamma*d_b*tf.cast(soft_qt, dtype=tf.float64))
            
            L_c1 = 0.5*tf.reduce_mean((y-tf.cast(q1, dtype=tf.float64))**2)
            L_c2 = 0.5*tf.reduce_mean((y-tf.cast(q2, dtype=tf.float64))**2)
        c1_grad = tape_c1.gradient(L_c1, self.c1.trainable_variables)
        c2_grad = tape_c2.gradient(L_c2, self.c2.trainable_variables)
        
        self.c1_opt.apply_gradients(zip(c1_grad, self.c1.trainable_variables))
        self.c2_opt.apply_gradients(zip(c2_grad, self.c2.trainable_variables))
        
        for (tc1w, c1w) in zip(self.tc1.variables, self.c1.variables):
            tc1w.assign(tc1w*self.tau + c1w*(1.0-self.tau))
        for (tc2w, c2w) in zip(self.tc2.variables, self.c2.variables):
            tc2w.assign(tc2w*self.tau + c2w*(1.0-self.tau))
            
        with tf.GradientTape() as tape_a, tf.GradientTape() as tape_alpha:
            a, log_a = self.a(s_b)
            qa1 = self.c1([s_b, a])
            qa2 = self.c2([s_b, a])
            
            soft_qa = tf.math.minimum(qa1,qa2)

            L_a = -tf.reduce_mean(soft_qa-self.alpha*log_a)
            L_alpha = -tf.reduce_mean(self.alpha*tf.stop_gradient(log_a + self.te))
        grad_a = tape_a.gradient(L_a, self.a.trainable_variables)
        grad_alpha = tape_alpha.gradient(L_alpha, [self.alpha])
        self.a_opt.apply_gradients(zip(grad_a, self.a.trainable_variables))
        self.alpha_opt.apply_gradients(zip(grad_alpha, [self.alpha]))
        
        return L_a, L_c1, L_c2, L_alpha
    
    def save_weights(self, dir_path):
        cp = tf.train.Checkpoint(step=self.alpha)
        self.a.save_weights(dir_path+"/a.ckpt")
        print("Saved actor weights", flush=True)
        self.c1.save_weights(dir_path+"/c1.ckpt")
        print("Saved critic 1 weights", flush=True)
        self.c2.save_weights(dir_path+"/c2.ckpt")
        print("Saved critic 2 weights", flush=True)
        cp.save(dir_path+"/alpha")
        print("Saved alpha weights", flush=True)

    def load_weights(self, dir_path):
        try:
            cp = tf.train.Checkpoint(step=self.alpha)
            self.a.load_weights(dir_path+"/a.ckpt")
            print("Loaded actor weights", flush=True)
            self.c1.load_weights(dir_path+"/c1.ckpt")
            print("Loaded critic 1 weights", flush=True)
            self.c2.load_weights(dir_path+"/c2.ckpt")
            print("Loaded critic 2 weights", flush=True)
            cp.restore(dir_path+"/alpha-1")
            print("Loaded alpha weights", flush=True)
            self.tc1.set_weights(self.c1.get_weights())
            self.tc2.set_weights(self.c2.get_weights())

        except ValueError:
            print("ERROR: Please make sure weights are saved as .ckpt", flush=True)
            
    def eval_rollout(self, problem):
        eps_r = 0
        eval_env = gym.make(problem)
        eval_obs = eval_env.reset()

        while True:
            eval_env.render()

            tf_eval_obs = tf.expand_dims(tf.convert_to_tensor(eval_obs), 0)

            eval_a, eval_log_a = self.a(tf_eval_obs, eval_mode=True)

            eval_a = eval_a[0]

            eval_obs_new, eval_r, eval_d, _ = eval_env.step(eval_a)

            eps_r += eval_r

            if eval_d:
                break
                
            eval_obs = eval_obs_new

        glfw.destroy_window(eval_env.viewer.window)
        eval_env.close()
        print("rollout episodic reward: ", eps_r, flush=True)
        
        return eps_r


In [41]:
sac1 = SAC(env, num_states, num_actions, upper_bound, 1000000)

In [42]:
sac1.train(1000)

Epoch 0000 Policy Avg. Loss:  0.35969567 , Critic 1 Avg. Loss:  0.1721895517200507 , Critic 2 Avg. Loss:  0.17765680691402877 , Alpha 1 Avg. Loss:  -0.0061950134
Epoch 0001 Policy Avg. Loss:  -0.764436 , Critic 1 Avg. Loss:  0.13202453921192908 , Critic 2 Avg. Loss:  0.15646447713014325 , Alpha 1 Avg. Loss:  -0.041318085
Epoch 0002 Policy Avg. Loss:  -2.67977 , Critic 1 Avg. Loss:  0.2942117204913821 , Critic 2 Avg. Loss:  0.27354679253686137 , Alpha 1 Avg. Loss:  0.033842966
Epoch 0003 Policy Avg. Loss:  -4.350079 , Critic 1 Avg. Loss:  0.42745284538773104 , Critic 2 Avg. Loss:  0.4160114465068275 , Alpha 1 Avg. Loss:  0.15859634
Epoch 0004 Policy Avg. Loss:  -5.480203 , Critic 1 Avg. Loss:  0.6094894317218219 , Critic 2 Avg. Loss:  0.5686217444182767 , Alpha 1 Avg. Loss:  0.043561686
Epoch 0005 Policy Avg. Loss:  -6.350147 , Critic 1 Avg. Loss:  0.7244659166415823 , Critic 2 Avg. Loss:  0.7043358277428651 , Alpha 1 Avg. Loss:  -0.249476
Epoch 0006 Policy Avg. Loss:  -6.617394 , Criti

In [43]:
sac1.save_weights("/home/tony/rl_models/MA2C/sac/devel/weights")

Saved actor weights
Saved critic 1 weights
Saved critic 2 weights
Saved alpha weights


In [44]:
print(sac1.alpha)

<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=0.14045984>


In [45]:
sac2 = SAC(env, num_states, num_actions, upper_bound, 1000000)

In [46]:
sac2.load_weights("/home/tony/rl_models/MA2C/sac/devel/weights")

Loaded actor weights
Loaded critic 1 weights
Loaded critic 2 weights
Loaded alpha weights


In [47]:
sac2.alpha

<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=0.14045984>

In [48]:
sac2.train(500)

Epoch 0000 Policy Avg. Loss:  -23.318287 , Critic 1 Avg. Loss:  0.838302681979099 , Critic 2 Avg. Loss:  0.898270274492932 , Alpha 1 Avg. Loss:  -0.4271607
Epoch 0001 Policy Avg. Loss:  -20.082031 , Critic 1 Avg. Loss:  3.106233222639545 , Critic 2 Avg. Loss:  3.004582268098048 , Alpha 1 Avg. Loss:  -0.17873415
Epoch 0002 Policy Avg. Loss:  -19.528559 , Critic 1 Avg. Loss:  3.3412309989369233 , Critic 2 Avg. Loss:  3.4306676653636536 , Alpha 1 Avg. Loss:  -0.22907627
Epoch 0003 Policy Avg. Loss:  -19.577868 , Critic 1 Avg. Loss:  3.645833821023125 , Critic 2 Avg. Loss:  3.646975291057017 , Alpha 1 Avg. Loss:  -0.042650133
Epoch 0004 Policy Avg. Loss:  -19.931627 , Critic 1 Avg. Loss:  4.450653939904594 , Critic 2 Avg. Loss:  4.445639566240925 , Alpha 1 Avg. Loss:  -0.0013434519
Epoch 0005 Policy Avg. Loss:  -20.515638 , Critic 1 Avg. Loss:  5.364437359725915 , Critic 2 Avg. Loss:  5.233526948583891 , Alpha 1 Avg. Loss:  0.055890635
Epoch 0006 Policy Avg. Loss:  -21.156134 , Critic 1 Av

In [49]:
sac2.eval_rollout(problem)

Creating window glfw
rollout episodic reward:  32.67973995901737


32.67973995901737