In [3]:
import gym
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import Model
import numpy as np
import os
import json
import random
import tensorflow_probability as tfp
from tensorflow.keras import regularizers
import glfw

### Env Setup

In [4]:
problem = "Hopper-v3"
env = gym.make(problem)

num_states = env.observation_space.shape[0]
num_actions = env.action_space.shape[0]
upper_bound = env.action_space.high[0]
lower_bound = env.action_space.low[0]

### Actor Model

#### Note:

Same as PPO actor. Gaussian policy

In [5]:
EPSILON = 1e-10

class Actor(Model):

    def __init__(self, action_dimensions, action_bound):
        super().__init__()
        self.action_dim, self.upper_bound = action_dimensions, action_bound
        self.sample_dist = tfp.distributions.MultivariateNormalDiag(loc=tf.zeros(self.action_dim),
                                                                    scale_diag=tf.ones(self.action_dim))
        self.dense1_layer = layers.Dense(64, activation="relu")
        self.dense2_layer = layers.Dense(64, activation="relu")
        self.mean_layer = layers.Dense(self.action_dim)
        self.stdev_layer = layers.Dense(self.action_dim)

    def call(self, state, eval_mode=False):

        a1 = self.dense1_layer(state)
        a2 = self.dense2_layer(a1)
        mu = self.mean_layer(a2)

        log_sigma = self.stdev_layer(a2)
        sigma = tf.exp(log_sigma)
        sigma = tf.clip_by_value(sigma, EPSILON, 2.718)

        dist = tfp.distributions.MultivariateNormalDiag(loc=mu, scale_diag=sigma)
        
        if eval_mode:
            action_ = mu
        else:
            action_ = tf.math.add(mu, tf.math.multiply(sigma, tf.expand_dims(self.sample_dist.sample(), 0)))
 
        action = tf.tanh(action_)

        log_pi_ = dist.log_prob(action_)     
        log_pi = log_pi_ - tf.reduce_sum(tf.math.log(tf.clip_by_value(1 - action**2, EPSILON, 1.0)), axis=1)
        
        return action*self.upper_bound, log_pi


In [6]:
actor_test = Actor(num_actions, upper_bound)

2022-08-30 18:11:31.393652: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-30 18:11:31.409017: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-30 18:11:31.409454: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-30 18:11:31.411937: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [7]:
obs = env.reset()
obs
tf_obs = tf.expand_dims(obs, 0)
tf_obs
a_test, log_a_test = actor_test(tf_obs)
print(a_test, log_a_test)

tf.Tensor([[0.8582691  0.60904384 0.00860028]], shape=(1, 3), dtype=float32) tf.Tensor([-1.4346583], shape=(1,), dtype=float32)


2022-08-30 18:11:32.214426: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


In [8]:
actor_test.summary()

Model: "actor"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               multiple                  768       
                                                                 
 dense_1 (Dense)             multiple                  4160      
                                                                 
 dense_2 (Dense)             multiple                  195       
                                                                 
 dense_3 (Dense)             multiple                  195       
                                                                 
Total params: 5,318
Trainable params: 5,318
Non-trainable params: 0
_________________________________________________________________


### Critic Wrapper

#### Note:

Different from PPO, critic evaluate state-action pairs.

In [9]:
class Critic_Wrapper():
    def __init__(self, state_dim, action_dim):
        self.s_dim=state_dim
        self.a_dim=action_dim
        
    def get_critic(self):
        # State as input
        state_input = layers.Input(shape=(self.s_dim))
        state_out = layers.Dense(32, activation="relu")(state_input)

        # Action as input
        action_input = layers.Input(shape=(self.a_dim))
        action_out = layers.Dense(32, activation="relu")(action_input)

        # Concatenating
        concat = layers.Concatenate()([state_out, action_out])
        out = layers.Dense(64, activation="relu")(concat)
        outputs = tf.squeeze(layers.Dense(1)(out))

        # Outputs single value for give state-action
        model = tf.keras.Model([state_input, action_input], outputs)

        return model

In [10]:
critic_gen = Critic_Wrapper(num_states, num_actions)
critic_test = critic_gen.get_critic()

In [11]:
obs = env.reset()
obs

array([ 1.25289728e+00, -1.24963452e-03, -4.69490706e-03,  2.35503294e-03,
       -1.52612720e-03,  2.06331169e-04, -3.52340801e-03, -6.81939228e-04,
        4.28433440e-03,  1.21088773e-03,  4.06597211e-03])

In [12]:
tf_obs = tf.expand_dims(obs, 0)
a_test, log_a_test = actor_test(tf_obs)
tf_obs

<tf.Tensor: shape=(1, 11), dtype=float64, numpy=
array([[ 1.25289728e+00, -1.24963452e-03, -4.69490706e-03,
         2.35503294e-03, -1.52612720e-03,  2.06331169e-04,
        -3.52340801e-03, -6.81939228e-04,  4.28433440e-03,
         1.21088773e-03,  4.06597211e-03]])>

In [13]:
v_test = critic_test([tf_obs, a_test])
v_test

<tf.Tensor: shape=(), dtype=float32, numpy=0.16794941>

In [14]:
obs_new, _, _, _ = env.step(a_test[0])
tf_obs_new = tf.expand_dims(obs_new, 0)
statex2 = tf.convert_to_tensor([obs, obs_new])
print(statex2.shape)

(2, 11)


In [15]:
a_2, loga_2 = actor_test(statex2)

print(a_2.shape, loga_2.shape)

(2, 3) (2,)


In [16]:
v_2 = critic_test([statex2, a_2])
print(v_2.shape)

(2,)


In [17]:
critic_test.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 11)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 3)]          0           []                               
                                                                                                  
 dense_4 (Dense)                (None, 32)           384         ['input_1[0][0]']                
                                                                                                  
 dense_5 (Dense)                (None, 32)           128         ['input_2[0][0]']                
                                                                                              

### Replay Buffer

In [18]:
class Buffer:
    def __init__(self, obs_dim, a_dim, buffer_capacity=100000, batch_size=256):
        
        self.obs_dim = obs_dim
        self.a_dim = a_dim
        self.buffer_capacity = buffer_capacity
        self.batch_size = batch_size
        self.buffer_counter = 0

        self.state_buffer = np.zeros((self.buffer_capacity, self.obs_dim))
        self.action_buffer = np.zeros((self.buffer_capacity, self.a_dim))
        self.reward_buffer = np.zeros((self.buffer_capacity, 1))
        self.next_state_buffer = np.zeros((self.buffer_capacity, self.obs_dim))
        self.done_buffer = np.zeros((self.buffer_capacity, 1))

    def record(self, obs_tuple):
        index = self.buffer_counter % self.buffer_capacity

        self.state_buffer[index] = obs_tuple[0]
        self.action_buffer[index] = obs_tuple[1]
        self.reward_buffer[index] = obs_tuple[2]
        self.next_state_buffer[index] = obs_tuple[3]
        self.done_buffer[index] = obs_tuple[4]
        self.buffer_counter += 1
        
    def sample(self):
        record_range = min(self.buffer_counter, self.buffer_capacity)
        batch_indices = np.random.choice(record_range, self.batch_size)

        state_batch = tf.convert_to_tensor(self.state_buffer[batch_indices])
        action_batch = tf.convert_to_tensor(self.action_buffer[batch_indices])
        reward_batch = tf.squeeze(tf.convert_to_tensor(self.reward_buffer[batch_indices]))
        next_state_batch = tf.convert_to_tensor(self.next_state_buffer[batch_indices])
        done_batch = tf.squeeze(tf.convert_to_tensor(self.done_buffer[batch_indices]))
        
        return (state_batch,
               action_batch,
               reward_batch,
               next_state_batch,
               done_batch)


In [19]:
buffer1 = Buffer(num_states, num_actions, 100, 10)

In [20]:
prev_obs = env.reset()

for i in range(buffer1.buffer_capacity):
    a, _ = actor_test(tf.expand_dims(prev_obs, 0))
    obs, r, d, _ = env.step(a[0])
    
    buffer1.record((prev_obs, a[0], r, obs, d))
    
    prev_obs = obs
    

In [21]:
buffer1.sample()

(<tf.Tensor: shape=(10, 11), dtype=float64, numpy=
 array([[ 1.18538629e+00, -2.43833702e-01,  1.64513053e-03,
         -4.31658168e-01,  1.54749131e-01, -5.63677487e-01,
         -4.55972140e-01, -2.16385357e+00, -1.97891503e-02,
         -3.75202752e+00,  2.04522712e+00],
        [ 1.05683490e+00, -6.10759501e-01,  9.50808670e-03,
         -9.71017115e-01,  3.58387415e-01, -3.97285040e-01,
         -4.43918546e-01, -1.15316134e+00, -7.75623207e-02,
         -1.63858226e+00,  1.49019619e+00],
        [ 1.14626738e+00, -3.59140357e-01,  1.93936090e-03,
         -6.30134803e-01,  2.76810101e-01, -3.90021481e-01,
         -4.24060350e-01, -1.15520027e+00, -1.74236089e-02,
         -1.74144622e+00,  5.39387238e-01],
        [ 1.24455034e+00, -5.48225928e-03,  2.68298116e-03,
         -5.37246848e-03, -5.24586539e-06, -7.42097135e-02,
         -2.55240233e-01, -3.27880187e-01,  2.43391650e-01,
         -1.23259300e+00,  1.51508172e+00],
        [ 1.07632367e+00, -5.61102814e-01,  1.5111619

### Soft Actor Critic

In [22]:
class SAC:
    
    def __init__(self, env, observation_dimensions, action_dimensions, action_bound, buffer_capacity,
                 minibatch_size=256, gamma=0.99, tau=0.95, lr=3e-4):
        
        self.env = env
        
        self.a = Actor(action_dimensions, action_bound)
        self.c_gen = Critic_Wrapper(observation_dimensions, action_dimensions)
        self.c1 = self.c_gen.get_critic()
        self.c2 = self.c_gen.get_critic()
        self.tc1 = self.c_gen.get_critic()
        self.tc2 = self.c_gen.get_critic()
        
        self.tc1.set_weights(self.c1.get_weights())
        self.tc2.set_weights(self.c2.get_weights())

        self.te = -np.prod(action_dimensions)
        self.alpha = tf.Variable(0.0, dtype=tf.float32)
        
        self.a_opt = tf.keras.optimizers.Adam(learning_rate=lr)
        self.c1_opt = tf.keras.optimizers.Adam(learning_rate=lr)
        self.c2_opt = tf.keras.optimizers.Adam(learning_rate=lr)                                                  
        self.alpha_opt = tf.keras.optimizers.Adam(learning_rate=lr)   
        
        self.buffer = Buffer(observation_dimensions, action_dimensions, buffer_capacity, minibatch_size)
        
        self.gamma, self.tau = gamma, tau
        
    def train(self, max_env_step):
        t = 0
        epo = 0
        while t < max_env_step:
            p_s = self.env.reset()
            a_losses = []
            c1_losses = []
            c2_losses = []
            alpha_losses = []
            while True:
                a, log_a = self.a(tf.expand_dims(p_s, 0))
                a=a[0]
                s, r, d, _ = self.env.step(a)
                end = 0 if d else 1
                
                self.buffer.record((p_s, a, r, s, end))
                data = self.buffer.sample()
                
                a_loss, c1_loss, c2_loss, alpha_loss = self.update(data)
                
                a_losses.append(a_loss.numpy())
                c1_losses.append(c1_loss.numpy())
                c2_losses.append(c2_loss.numpy())
                alpha_losses.append(alpha_loss.numpy())
                
                t = t+1
                
                if d:
                    break
                p_s = s
                
            print("Epoch {:04d}".format(epo), "Policy Avg. Loss: ", np.mean(a_losses), 
                  ", Critic 1 Avg. Loss: ",  np.mean(c1_losses), 
                  ", Critic 2 Avg. Loss: ",  np.mean(c2_losses), 
                  ", Alpha 1 Avg. Loss: ",  np.mean(alpha_losses), flush=True)
            epo = epo+1


    @tf.function
    def update(self, data):
        s_b, a_b, r_b, ns_b, d_b = data
        with tf.GradientTape() as tape_c1, tf.GradientTape() as tape_c2:
            q1 = self.c1([s_b, a_b])
            q2 = self.c2([s_b, a_b])
            na, nlog_a = self.a(ns_b)
            
            tq1 = self.tc1([ns_b, na])
            tq2 = self.tc2([ns_b, na])
            
            min_qt = tf.math.minimum(tq1,tq2)
            
            soft_qt = min_qt - (self.alpha*nlog_a)
            
            y = tf.stop_gradient(r_b+self.gamma*d_b*tf.cast(soft_qt, dtype=tf.float64))
            
            L_c1 = 0.5*tf.reduce_mean((y-tf.cast(q1, dtype=tf.float64))**2)
            L_c2 = 0.5*tf.reduce_mean((y-tf.cast(q2, dtype=tf.float64))**2)
        c1_grad = tape_c1.gradient(L_c1, self.c1.trainable_variables)
        c2_grad = tape_c2.gradient(L_c2, self.c2.trainable_variables)
        
        self.c1_opt.apply_gradients(zip(c1_grad, self.c1.trainable_variables))
        self.c2_opt.apply_gradients(zip(c2_grad, self.c2.trainable_variables))
        
        for (tc1w, c1w) in zip(self.tc1.variables, self.c1.variables):
            tc1w.assign(tc1w*self.tau + c1w*(1.0-self.tau))
        for (tc2w, c2w) in zip(self.tc2.variables, self.c2.variables):
            tc2w.assign(tc2w*self.tau + c2w*(1.0-self.tau))
            
        with tf.GradientTape() as tape_a, tf.GradientTape() as tape_alpha:
            a, log_a = self.a(s_b)
            qa1 = self.c1([s_b, a])
            qa2 = self.c2([s_b, a])
            
            soft_qa = tf.reduce_mean([qa1,qa2], axis=0)
            
            L_a = -tf.reduce_mean(soft_qa-self.alpha*log_a)
            L_alpha = -tf.reduce_mean(self.alpha*tf.stop_gradient(log_a + self.te))
        grad_a = tape_a.gradient(L_a, self.a.trainable_variables)
        grad_alpha = tape_alpha.gradient(L_alpha, [self.alpha])
        self.a_opt.apply_gradients(zip(grad_a, self.a.trainable_variables))
        self.alpha_opt.apply_gradients(zip(grad_alpha, [self.alpha]))
        
        return L_a, L_c1, L_c2, L_alpha
    
    def save_weights(self, dir_path):
        self.a.save_weights(dir_path+"/a.ckpt")
        print("Saved actor weights", flush=True)
        self.c1.save_weights(dir_path+"/c1.ckpt")
        print("Saved critic 1 weights", flush=True)
        self.c2.save_weights(dir_path+"/c2.ckpt")
        print("Saved critic 2 weights", flush=True)

    def load_weights(self, dir_path):
        try:
            self.a.load_weights(dir_path+"/a.ckpt")
            print("Loaded actor weights", flush=True)
            self.c1.load_weights(dir_path+"/c1.ckpt")
            print("Loaded critic 1 weights", flush=True)
            self.c2.load_weights(dir_path+"/c2.ckpt")
            print("Loaded critic 2 weights", flush=True)
        except ValueError:
            print("ERROR: Please make sure weights are saved as .ckpt", flush=True)
            
    def eval_rollout(self, problem):
        eps_r = 0
        eval_env = gym.make(problem)
        eval_obs = eval_env.reset()

        while True:
            eval_env.render()

            tf_eval_obs = tf.expand_dims(tf.convert_to_tensor(eval_obs), 0)

            eval_a, eval_log_a = self.a(tf_eval_obs, eval_mode=True)

            eval_a = eval_a[0]

            eval_obs_new, eval_r, eval_d, _ = eval_env.step(eval_a)

            eps_r += eval_r

            if eval_d:
                break
                
            eval_obs = eval_obs_new

        glfw.destroy_window(eval_env.viewer.window)
        eval_env.close()
        print("rollout episodic reward: ", eps_r, flush=True)


In [23]:
sac1 = SAC(env, num_states, num_actions, upper_bound, 1000000)

In [24]:
sac1.train(1000)

Instructions for updating:
`scale_identity_multiplier` is deprecated; please combine it into `scale_diag` directly instead.
Epoch 0000 Policy Avg. Loss:  -0.17323576 , Critic 1 Avg. Loss:  0.23493227020204865 , Critic 2 Avg. Loss:  0.22517719571923336 , Alpha 1 Avg. Loss:  -0.016376026
Epoch 0001 Policy Avg. Loss:  -0.4421635 , Critic 1 Avg. Loss:  0.1458823643086784 , Critic 2 Avg. Loss:  0.17533609958233584 , Alpha 1 Avg. Loss:  -0.02802293
Epoch 0002 Policy Avg. Loss:  -0.69007957 , Critic 1 Avg. Loss:  0.1272448478755582 , Critic 2 Avg. Loss:  0.1835665959286314 , Alpha 1 Avg. Loss:  -0.031876553
Epoch 0003 Policy Avg. Loss:  -1.0074254 , Critic 1 Avg. Loss:  0.1579946972764552 , Critic 2 Avg. Loss:  0.2038181972036123 , Alpha 1 Avg. Loss:  -0.031908013
Epoch 0004 Policy Avg. Loss:  -1.3389686 , Critic 1 Avg. Loss:  0.19787382896290398 , Critic 2 Avg. Loss:  0.23847185554383746 , Alpha 1 Avg. Loss:  -0.0073544085
Epoch 0005 Policy Avg. Loss:  -1.788686 , Critic 1 Avg. Loss:  0.3116

Epoch 0049 Policy Avg. Loss:  -16.847857 , Critic 1 Avg. Loss:  3.3761103894118674 , Critic 2 Avg. Loss:  3.296152030299844 , Alpha 1 Avg. Loss:  0.11224912
Epoch 0050 Policy Avg. Loss:  -16.121824 , Critic 1 Avg. Loss:  3.034699540538434 , Critic 2 Avg. Loss:  3.0491944833148543 , Alpha 1 Avg. Loss:  -0.16300718
Epoch 0051 Policy Avg. Loss:  -16.246952 , Critic 1 Avg. Loss:  2.5643210310657985 , Critic 2 Avg. Loss:  2.613567626371913 , Alpha 1 Avg. Loss:  -0.2824378
Epoch 0052 Policy Avg. Loss:  -16.179592 , Critic 1 Avg. Loss:  2.700810109046626 , Critic 2 Avg. Loss:  2.7536229158395846 , Alpha 1 Avg. Loss:  -0.28990152


In [25]:
sac1.save_weights("/Users/anthonylaw/Desktop/Work_Research/rl_models/MA2C/sac/devel/test_out/weights")

PermissionDeniedError: /Users; Permission denied

In [None]:
sac2 = SAC(env, num_states, num_actions, upper_bound, 1000000)

In [None]:
sac2.load_weights("/Users/anthonylaw/Desktop/Work_Research/rl_models/MA2C/sac/devel/test_out/weights")

In [None]:
sac2.train(500)