In [2]:
import gym
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import Model
import numpy as np
import os
import json
import random
import tensorflow_probability as tfp
from tensorflow.keras import regularizers
import glfw

### Env Setup

In [3]:
problem = "Hopper-v3"
env = gym.make(problem)

num_states = env.observation_space.shape[0]
num_actions = env.action_space.shape[0]
upper_bound = env.action_space.high[0]
lower_bound = env.action_space.low[0]

### Actor Model

#### Note:

Same as PPO actor. Gaussian policy

In [4]:
EPSILON = 1e-10

class Actor(Model):

    def __init__(self, action_dimensions, action_bound):
        super().__init__()
        self.action_dim, self.upper_bound = action_dimensions, action_bound
        self.sample_dist = tfp.distributions.MultivariateNormalDiag(loc=tf.zeros(self.action_dim),
                                                                    scale_diag=tf.ones(self.action_dim))
        self.input_batch_norm = layers.BatchNormalization()
        self.dense1_layer = layers.Dense(64, activation="relu")
        self.dense2_layer = layers.Dense(64, activation="relu")
        self.mean_layer = layers.Dense(self.action_dim)
        self.stdev_layer = layers.Dense(self.action_dim)

    def call(self, state, eval_mode=False):
        norm_state = self.input_batch_norm(state)
        a1 = self.dense1_layer(norm_state)
        a2 = self.dense2_layer(a1)
        mu = self.mean_layer(a2)

        log_sigma = self.stdev_layer(a2)
        sigma = tf.exp(log_sigma)
        sigma = tf.clip_by_value(sigma, EPSILON, 2.718)

        dist = tfp.distributions.MultivariateNormalDiag(loc=mu, scale_diag=sigma)
        
        if eval_mode:
            action_ = mu
        else:
            action_ = tf.math.add(mu, tf.math.multiply(sigma, tf.expand_dims(self.sample_dist.sample(), 0)))
 
        action = tf.tanh(action_)

        log_pi_ = dist.log_prob(action_)     
        log_pi = log_pi_ - tf.reduce_sum(tf.math.log(tf.clip_by_value(1 - action**2, EPSILON, 1.0)), axis=1)
        
        return action*self.upper_bound, log_pi


In [6]:
actor_test = Actor(num_actions, upper_bound)

In [8]:
obs = env.reset()
obs
tf_obs = tf.expand_dims(obs, 0)
tf_obs
a_test, log_a_test = actor_test(tf_obs)
print(a_test, log_a_test)

tf.Tensor([[-0.05144804 -0.75096476  0.933185  ]], shape=(1, 3), dtype=float32) tf.Tensor([-1.5006382], shape=(1,), dtype=float32)


In [9]:
actor_test.summary()

Model: "actor_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 batch_normalization_1 (Batc  multiple                 44        
 hNormalization)                                                 
                                                                 
 dense_4 (Dense)             multiple                  768       
                                                                 
 dense_5 (Dense)             multiple                  4160      
                                                                 
 dense_6 (Dense)             multiple                  195       
                                                                 
 dense_7 (Dense)             multiple                  195       
                                                                 
Total params: 5,362
Trainable params: 5,340
Non-trainable params: 22
________________________________________________________

### Critic Wrapper

#### Note:

Different from PPO, critic evaluate state-action pairs.

In [10]:
class Critic_Wrapper():
    def __init__(self, state_dim, action_dim):
        self.s_dim=state_dim
        self.a_dim=action_dim
        
    def get_critic(self):
        # State as input
        state_input = layers.Input(shape=(self.s_dim))
        state_out = layers.Dense(32, activation="relu")(state_input)

        # Action as input
        action_input = layers.Input(shape=(self.a_dim))
        action_out = layers.Dense(32, activation="relu")(action_input)

        # Concatenating
        concat = layers.Concatenate()([state_out, action_out])
        out = layers.Dense(64, activation="relu")(concat)
        outputs = tf.squeeze(layers.Dense(1)(out))

        # Outputs single value for give state-action
        model = tf.keras.Model([state_input, action_input], outputs)

        return model

In [11]:
critic_gen = Critic_Wrapper(num_states, num_actions)
critic_test = critic_gen.get_critic()

In [12]:
obs = env.reset()
obs

array([ 1.25423948e+00, -7.42031410e-04, -1.22600433e-03,  3.87884076e-03,
        1.99316606e-03,  3.57898386e-03, -2.55238089e-04, -1.33673836e-03,
       -3.03476574e-03, -6.64394176e-04, -2.63203961e-03])

In [13]:
tf_obs = tf.expand_dims(obs, 0)
a_test, log_a_test = actor_test(tf_obs)
tf_obs

<tf.Tensor: shape=(1, 11), dtype=float64, numpy=
array([[ 1.25423948e+00, -7.42031410e-04, -1.22600433e-03,
         3.87884076e-03,  1.99316606e-03,  3.57898386e-03,
        -2.55238089e-04, -1.33673836e-03, -3.03476574e-03,
        -6.64394176e-04, -2.63203961e-03]])>

In [14]:
v_test = critic_test([tf_obs, a_test])
v_test

<tf.Tensor: shape=(), dtype=float32, numpy=-0.24859521>

In [15]:
obs_new, _, _, _ = env.step(a_test[0])
tf_obs_new = tf.expand_dims(obs_new, 0)
statex2 = tf.convert_to_tensor([obs, obs_new])
print(statex2.shape)

(2, 11)


In [16]:
a_2, loga_2 = actor_test(statex2)

print(a_2.shape, loga_2.shape)

(2, 3) (2,)


In [17]:
v_2 = critic_test([statex2, a_2])
print(v_2.shape)

(2,)


In [18]:
critic_test.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 11)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 3)]          0           []                               
                                                                                                  
 dense_8 (Dense)                (None, 32)           384         ['input_1[0][0]']                
                                                                                                  
 dense_9 (Dense)                (None, 32)           128         ['input_2[0][0]']                
                                                                                              

### Replay Buffer

In [19]:
class Buffer:
    def __init__(self, obs_dim, a_dim, buffer_capacity=100000, batch_size=256):
        
        self.obs_dim = obs_dim
        self.a_dim = a_dim
        self.buffer_capacity = buffer_capacity
        self.batch_size = batch_size
        self.buffer_counter = 0

        self.state_buffer = np.zeros((self.buffer_capacity, self.obs_dim))
        self.action_buffer = np.zeros((self.buffer_capacity, self.a_dim))
        self.reward_buffer = np.zeros((self.buffer_capacity, 1))
        self.next_state_buffer = np.zeros((self.buffer_capacity, self.obs_dim))
        self.done_buffer = np.zeros((self.buffer_capacity, 1))

    def record(self, obs_tuple):
        index = self.buffer_counter % self.buffer_capacity

        self.state_buffer[index] = obs_tuple[0]
        self.action_buffer[index] = obs_tuple[1]
        self.reward_buffer[index] = obs_tuple[2]
        self.next_state_buffer[index] = obs_tuple[3]
        self.done_buffer[index] = obs_tuple[4]
        self.buffer_counter += 1
        
    def sample(self):
        record_range = min(self.buffer_counter, self.buffer_capacity)
        batch_indices = np.random.choice(record_range, self.batch_size)

        state_batch = tf.convert_to_tensor(self.state_buffer[batch_indices])
        action_batch = tf.convert_to_tensor(self.action_buffer[batch_indices])
        reward_batch = tf.squeeze(tf.convert_to_tensor(self.reward_buffer[batch_indices]))
        next_state_batch = tf.convert_to_tensor(self.next_state_buffer[batch_indices])
        done_batch = tf.squeeze(tf.convert_to_tensor(self.done_buffer[batch_indices]))
        
        return (state_batch,
               action_batch,
               reward_batch,
               next_state_batch,
               done_batch)


In [20]:
buffer1 = Buffer(num_states, num_actions, 100, 10)

In [21]:
prev_obs = env.reset()

for i in range(buffer1.buffer_capacity):
    a, _ = actor_test(tf.expand_dims(prev_obs, 0))
    obs, r, d, _ = env.step(a[0])
    
    buffer1.record((prev_obs, a[0], r, obs, d))
    
    prev_obs = obs
    

In [22]:
buffer1.sample()

(<tf.Tensor: shape=(10, 11), dtype=float64, numpy=
 array([[ 5.22422464e-01, -1.35114301e+00, -4.67345067e-01,
         -2.58434218e+00,  7.87965647e-01,  7.84194272e-02,
          1.68828093e-01, -3.16890953e-01, -7.31766063e-01,
          6.95071929e-01, -8.06056469e-02],
        [ 5.11060487e-01, -1.33210946e+00, -4.13802264e-01,
         -2.63394096e+00,  7.84158932e-01, -1.89792833e-02,
          1.00521521e-01,  2.85410802e-01,  5.22948816e-01,
          4.38351580e-01, -6.60558773e-01],
        [ 1.24796805e+00, -1.64244341e-02,  5.02232628e-03,
         -4.34726769e-02, -4.29737455e-02, -2.95574252e-01,
         -2.75388487e-01, -1.99150059e+00, -1.01805869e+00,
         -2.33266916e+00, -1.16679800e+00],
        [ 1.22434447e+00, -1.51458966e-01, -7.90829511e-02,
         -1.84881261e-01,  2.94548805e-02, -4.02805795e-01,
         -9.05418786e-01, -3.42803959e+00, -1.91260355e+00,
         -4.10511434e+00,  4.40498184e+00],
        [ 5.19822519e-01, -1.34002324e+00, -4.4611976

### Soft Actor Critic

In [23]:
class SAC:
    
    def __init__(self, env, observation_dimensions, action_dimensions, action_bound, buffer_capacity,
                 minibatch_size=256, gamma=0.99, tau=0.95, lr=3e-4):
        
        self.env = env
        
        self.a = Actor(action_dimensions, action_bound)
        self.c_gen = Critic_Wrapper(observation_dimensions, action_dimensions)
        self.c1 = self.c_gen.get_critic()
        self.c2 = self.c_gen.get_critic()
        self.tc1 = self.c_gen.get_critic()
        self.tc2 = self.c_gen.get_critic()
        
        self.tc1.set_weights(self.c1.get_weights())
        self.tc2.set_weights(self.c2.get_weights())

        self.te = -np.prod(action_dimensions)
        self.alpha = tf.Variable(0.0, dtype=tf.float32)
        
        self.a_opt = tf.keras.optimizers.Adam(learning_rate=lr)
        self.c1_opt = tf.keras.optimizers.Adam(learning_rate=lr)
        self.c2_opt = tf.keras.optimizers.Adam(learning_rate=lr)                                                  
        self.alpha_opt = tf.keras.optimizers.Adam(learning_rate=lr)   
        
        self.buffer = Buffer(observation_dimensions, action_dimensions, buffer_capacity, minibatch_size)
        
        self.gamma, self.tau = gamma, tau
        
    def train(self, max_env_step):
        t = 0
        a_losses = []
        c1_losses = []
        c2_losses = []
        alpha_losses = []
        while t < max_env_step:
            p_s = self.env.reset()

            while True:
                a, log_a = self.a(tf.expand_dims(p_s, 0))
                a=a[0]
                s, r, d, _ = self.env.step(a)
                end = 0 if d else 1
                
                self.buffer.record((p_s, a, r, s, end))
                data = self.buffer.sample()
                
                a_loss, c1_loss, c2_loss, alpha_loss = self.update(data)
                
                a_losses.append(a_loss.numpy())
                c1_losses.append(c1_loss.numpy())
                c2_losses.append(c2_loss.numpy())
                alpha_losses.append(alpha_loss.numpy())
                
                t = t+1
                
                if d:
                    break
                p_s = s
                
        print("Per {:04d} Steps".format(max_env_step), "Policy Avg. Loss: ", np.mean(a_losses), 
              ", Critic 1 Avg. Loss: ",  np.mean(c1_losses), 
              ", Critic 2 Avg. Loss: ",  np.mean(c2_losses), 
              ", Alpha 1 Avg. Loss: ",  np.mean(alpha_losses), flush=True)


    @tf.function
    def update(self, data):
        s_b, a_b, r_b, ns_b, d_b = data
        with tf.GradientTape() as tape_c1, tf.GradientTape() as tape_c2:
            q1 = self.c1([s_b, a_b])
            q2 = self.c2([s_b, a_b])
            na, nlog_a = self.a(ns_b, training=True)
            
            tq1 = self.tc1([ns_b, na])
            tq2 = self.tc2([ns_b, na])
            
            min_qt = tf.math.minimum(tq1,tq2)
            
            soft_qt = min_qt - (self.alpha*nlog_a)
            
            y = tf.stop_gradient(r_b+self.gamma*d_b*tf.cast(soft_qt, dtype=tf.float64))
            
            L_c1 = 0.5*tf.reduce_mean((y-tf.cast(q1, dtype=tf.float64))**2)
            L_c2 = 0.5*tf.reduce_mean((y-tf.cast(q2, dtype=tf.float64))**2)
        c1_grad = tape_c1.gradient(L_c1, self.c1.trainable_variables)
        c2_grad = tape_c2.gradient(L_c2, self.c2.trainable_variables)
        
        self.c1_opt.apply_gradients(zip(c1_grad, self.c1.trainable_variables))
        self.c2_opt.apply_gradients(zip(c2_grad, self.c2.trainable_variables))
        
        for (tc1w, c1w) in zip(self.tc1.variables, self.c1.variables):
            tc1w.assign(tc1w*self.tau + c1w*(1.0-self.tau))
        for (tc2w, c2w) in zip(self.tc2.variables, self.c2.variables):
            tc2w.assign(tc2w*self.tau + c2w*(1.0-self.tau))
            
        with tf.GradientTape() as tape_a, tf.GradientTape() as tape_alpha:
            a, log_a = self.a(s_b, training=True)
            qa1 = self.c1([s_b, a])
            qa2 = self.c2([s_b, a])
            
            soft_qa = tf.math.minimum(qa1,qa2)

            L_a = -tf.reduce_mean(soft_qa-self.alpha*log_a)
            L_alpha = -tf.reduce_mean(self.alpha*tf.stop_gradient(log_a + self.te))
        grad_a = tape_a.gradient(L_a, self.a.trainable_variables)
        grad_alpha = tape_alpha.gradient(L_alpha, [self.alpha])
        self.a_opt.apply_gradients(zip(grad_a, self.a.trainable_variables))
        self.alpha_opt.apply_gradients(zip(grad_alpha, [self.alpha]))
        
        return L_a, L_c1, L_c2, L_alpha
    
    def save_weights(self, dir_path):
        cp = tf.train.Checkpoint(step=self.alpha)
        self.a.save_weights(dir_path+"/a.ckpt")
        print("Saved actor weights", flush=True)
        self.c1.save_weights(dir_path+"/c1.ckpt")
        print("Saved critic 1 weights", flush=True)
        self.c2.save_weights(dir_path+"/c2.ckpt")
        print("Saved critic 2 weights", flush=True)
        cp.save(dir_path+"/alpha")
        print("Saved alpha weights", flush=True)

    def load_weights(self, dir_path):
        try:
            cp = tf.train.Checkpoint(step=self.alpha)
            self.a.load_weights(dir_path+"/a.ckpt")
            print("Loaded actor weights", flush=True)
            self.c1.load_weights(dir_path+"/c1.ckpt")
            print("Loaded critic 1 weights", flush=True)
            self.c2.load_weights(dir_path+"/c2.ckpt")
            print("Loaded critic 2 weights", flush=True)
            cp.restore(dir_path+"/alpha-1")
            print("Loaded alpha weights", flush=True)
            self.tc1.set_weights(self.c1.get_weights())
            self.tc2.set_weights(self.c2.get_weights())

        except ValueError:
            print("ERROR: Please make sure weights are saved as .ckpt", flush=True)
            
    def eval_rollout(self, problem, rbs=False, render=False):
        eps_r = 0
        
        if rbs:
            domain, task, controller = problem
            eval_env = Robosuite_Wrapper(domain, task, controller, render)
        else:
            eval_env = gym.make(problem)
            
        eval_obs = eval_env.reset()

        while True:
            if render:
                eval_env.render()

            tf_eval_obs = tf.expand_dims(tf.convert_to_tensor(eval_obs), 0)

            eval_a, eval_log_a = self.a(tf_eval_obs, eval_mode=True)

            eval_a = eval_a[0]

            eval_obs_new, eval_r, eval_d, _ = eval_env.step(eval_a)

            eps_r += eval_r

            if eval_d:
                break
                
            eval_obs = eval_obs_new
        
        if render:
            if not rbs:
                glfw.destroy_window(eval_env.viewer.window)

        eval_env.close()
        print("rollout episodic reward: ", eps_r, flush=True)
        
        return eps_r


In [24]:
sac1 = SAC(env, num_states, num_actions, upper_bound, 1000000)

In [25]:
sac1.train(1000)

Instructions for updating:
`scale_identity_multiplier` is deprecated; please combine it into `scale_diag` directly instead.
Per 1000 Steps Policy Avg. Loss:  -10.498349 , Critic 1 Avg. Loss:  2.32548693862299 , Critic 2 Avg. Loss:  2.272197491487963 , Alpha 1 Avg. Loss:  -0.20444104


In [26]:
sac1.save_weights("/home/tony/rl_models/MA2C/sac/devel/weights")

Saved actor weights
Saved critic 1 weights
Saved critic 2 weights
Saved alpha weights


In [27]:
print(sac1.alpha)

<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=0.13345031>


In [28]:
sac2 = SAC(env, num_states, num_actions, upper_bound, 1000000)

In [29]:
sac2.load_weights("/home/tony/rl_models/MA2C/sac/devel/weights")

Loaded actor weights
Loaded critic 1 weights
Loaded critic 2 weights
Loaded alpha weights


In [30]:
sac2.alpha

<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=0.13345031>

In [31]:
sac2.train(500)

Per 0500 Steps Policy Avg. Loss:  -20.888533 , Critic 1 Avg. Loss:  3.4495082708911866 , Critic 2 Avg. Loss:  3.2069531694989233 , Alpha 1 Avg. Loss:  0.012425124


In [32]:
sac2.eval_rollout(problem)

rollout episodic reward:  42.905397710801054


42.905397710801054

### Robosuite Adapter

In [33]:
import numpy as np
import robosuite as suite
from gym import spaces
from robosuite import load_controller_config

In [34]:
class Robosuite_Wrapper():

    def __init__(self, domain, task, controller, render=False):
        self.config = load_controller_config(default_controller=controller)
        self.env = suite.make(env_name=task, # try with other tasks like "Stack" and "Door"
                            robots=domain,  # try with other robots like "Sawyer" and "Jaco"
                            controller_configs=self.config,
                            has_renderer=render,
                            ignore_done=False,
                            has_offscreen_renderer=False,
                            use_camera_obs=False,
                            reward_shaping=True,
                            )
        self.obs_keys = [key for key, value in self.env.observation_spec().items()]
        
        obs_dim = []
        
        for x in self.obs_keys:
            if x == 'hinge_qpos' or x == 'handle_qpos':
                obs_dim.append(1)
            else:
                obs_dim.append(self.env.observation_spec()[x].shape[0])
                
        self.s_dim = int(np.sum(obs_dim,dtype=np.int32))
        self.a_dim = self.env.action_dim
        self.a_ub = self.env.action_spec[1][0]
        self.a_lb = self.env.action_spec[0][0]
        
    def env_specs(self):
        return self.s_dim, self.a_dim, self.a_ub, self.a_lb
    
    def step(self, a):
        s, r, d, i = self.env.step(a)

        s = np.concatenate([s[x] for x in self.obs_keys], axis = None)

        return s, r, d, i
    
    def reset(self):
        s = self.env.reset()
        
        s = np.concatenate([s[x] for x in self.obs_keys], axis = None)
        
        return s
    
    def render(self):
        self.env.render()
    
    def close(self):
        self.env.close()
        

In [35]:
rbs_env = Robosuite_Wrapper("Sawyer", "Door", "JOINT_VELOCITY", True)

Creating window glfw


  @overload(np.MachAr)


In [36]:
rbs_env.env_specs()

(92, 8, 1.0, -1.0)

In [37]:
rbs_env.step(np.ones(8))

(array([ 9.99876857e-01,  4.03822877e-01,  9.99952769e-01, -5.72626229e-01,
         9.99648717e-01,  8.36903826e-01, -5.86764353e-03,  1.56929919e-02,
        -9.14837190e-01,  9.71907541e-03,  8.19816566e-01,  2.65036449e-02,
         5.47349967e-01, -9.99982785e-01,  2.38449836e-02,  5.13842176e-02,
         3.16107393e-02,  4.29136238e-02,  2.13785884e-02,  1.22256101e-02,
         2.54866474e-02, -1.17421972e-01,  1.74780019e-01,  1.01195112e+00,
         9.95654126e-01,  9.11556826e-02, -1.60533910e-02, -1.02855003e-02,
         2.07727265e-02, -2.07696963e-02, -3.01367731e-02,  3.16518448e-02,
        -2.07569588e-01, -3.53069101e-01,  1.10000000e+00, -1.57391609e-01,
        -2.53158289e-01,  1.07500000e+00, -9.01476159e-02, -5.27849119e-01,
         8.80488834e-02, -3.99696368e-02, -4.27938307e-01,  6.30488834e-02,
         2.14833941e-12, -3.95547419e-07,  9.99876857e-01,  4.03822877e-01,
         9.99952769e-01, -5.72626229e-01,  9.99648717e-01,  8.36903826e-01,
        -5.8

In [38]:
rbs_env.render()