In [2]:
import gym
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import Model
import numpy as np
import os
import json
import random
import tensorflow_probability as tfp
from tensorflow.keras import regularizers
import glfw

### Env Setup

#### Toggle GPU

In [3]:
#os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" #If the line below doesn't work, uncomment this line (make sure to comment the line below); it should help.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [4]:
problem = "Hopper-v3"
env = gym.make(problem)

num_states = env.observation_space.shape[0]
num_actions = env.action_space.shape[0]
upper_bound = env.action_space.high[0]
lower_bound = env.action_space.low[0]

### Actor Model

#### Note:

Same as PPO actor. Gaussian policy

In [5]:
EPSILON = 1e-10

class Actor(Model):

    def __init__(self, action_dimensions, action_bound):
        super().__init__()
        self.action_dim, self.upper_bound = action_dimensions, action_bound
        self.sample_dist = tfp.distributions.MultivariateNormalDiag(loc=tf.zeros(self.action_dim),
                                                                    scale_diag=tf.ones(self.action_dim))
        self.input_batch_norm = layers.BatchNormalization()
        self.dense1_layer = layers.Dense(64, activation="relu")
        self.dense2_layer = layers.Dense(64, activation="relu")
        self.mean_layer = layers.Dense(self.action_dim)
        self.stdev_layer = layers.Dense(self.action_dim)

    def call(self, state, eval_mode=False):
        norm_state = self.input_batch_norm(state)
        a1 = self.dense1_layer(norm_state)
        a2 = self.dense2_layer(a1)
        mu = self.mean_layer(a2)

        log_sigma = self.stdev_layer(a2)
        sigma = tf.exp(log_sigma)
        sigma = tf.clip_by_value(sigma, EPSILON, 2.718)

        dist = tfp.distributions.MultivariateNormalDiag(loc=mu, scale_diag=sigma)
        
        if eval_mode:
            action_ = mu
        else:
            action_ = tf.math.add(mu, tf.math.multiply(sigma, tf.expand_dims(self.sample_dist.sample(), 0)))
 
        action = tf.tanh(action_)

        log_pi_ = dist.log_prob(action_)     
        log_pi = log_pi_ - tf.reduce_sum(tf.math.log(tf.clip_by_value(1 - action**2, EPSILON, 1.0)), axis=1)
        
        return action*self.upper_bound, log_pi


In [8]:
actor_test = Actor(num_actions, upper_bound)

In [9]:
obs = env.reset()
obs
tf_obs = tf.expand_dims(obs, 0)
tf_obs
a_test, log_a_test = actor_test(tf_obs)
print(a_test, log_a_test)

tf.Tensor([[ 0.39798087 -0.9506462   0.9480283 ]], shape=(1, 3), dtype=float32) tf.Tensor([0.368845], shape=(1,), dtype=float32)


In [10]:
actor_test.summary()

Model: "actor_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 batch_normalization_1 (Batc  multiple                 44        
 hNormalization)                                                 
                                                                 
 dense_4 (Dense)             multiple                  768       
                                                                 
 dense_5 (Dense)             multiple                  4160      
                                                                 
 dense_6 (Dense)             multiple                  195       
                                                                 
 dense_7 (Dense)             multiple                  195       
                                                                 
Total params: 5,362
Trainable params: 5,340
Non-trainable params: 22
________________________________________________________

### Critic Wrapper

#### Note:

Different from PPO, critic evaluate state-action pairs.

In [11]:
class Critic_Wrapper():
    def __init__(self, state_dim, action_dim):
        self.s_dim=state_dim
        self.a_dim=action_dim
        
    def get_critic(self):
        # State as input
        state_input = layers.Input(shape=(self.s_dim))
        state_out = layers.Dense(32, activation="relu")(state_input)

        # Action as input
        action_input = layers.Input(shape=(self.a_dim))
        action_out = layers.Dense(32, activation="relu")(action_input)

        # Concatenating
        concat = layers.Concatenate()([state_out, action_out])
        out = layers.Dense(64, activation="relu")(concat)
        outputs = tf.squeeze(layers.Dense(1)(out))

        # Outputs single value for give state-action
        model = tf.keras.Model([state_input, action_input], outputs)

        return model

In [12]:
critic_gen = Critic_Wrapper(num_states, num_actions)
critic_test = critic_gen.get_critic()

In [13]:
obs = env.reset()
obs

array([ 1.24880503e+00,  1.42239147e-03, -9.43414843e-04,  4.49359909e-03,
       -4.49856098e-03,  4.32982949e-03,  2.01481111e-03, -2.96223623e-03,
        3.50253660e-03,  2.19241399e-03, -4.95274510e-03])

In [14]:
tf_obs = tf.expand_dims(obs, 0)
a_test, log_a_test = actor_test(tf_obs)
tf_obs

<tf.Tensor: shape=(1, 11), dtype=float64, numpy=
array([[ 1.24880503e+00,  1.42239147e-03, -9.43414843e-04,
         4.49359909e-03, -4.49856098e-03,  4.32982949e-03,
         2.01481111e-03, -2.96223623e-03,  3.50253660e-03,
         2.19241399e-03, -4.95274510e-03]])>

In [15]:
v_test = critic_test([tf_obs, a_test])
v_test

<tf.Tensor: shape=(), dtype=float32, numpy=-0.037884098>

In [16]:
obs_new, _, _, _ = env.step(a_test[0])
tf_obs_new = tf.expand_dims(obs_new, 0)
statex2 = tf.convert_to_tensor([obs, obs_new])
print(statex2.shape)

(2, 11)


In [17]:
a_2, loga_2 = actor_test(statex2)

print(a_2.shape, loga_2.shape)

(2, 3) (2,)


In [18]:
v_2 = critic_test([statex2, a_2])
print(v_2.shape)

(2,)


In [19]:
critic_test.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 11)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 3)]          0           []                               
                                                                                                  
 dense_8 (Dense)                (None, 32)           384         ['input_1[0][0]']                
                                                                                                  
 dense_9 (Dense)                (None, 32)           128         ['input_2[0][0]']                
                                                                                              

### Replay Buffer

In [20]:
class Buffer:
    def __init__(self, obs_dim, a_dim, buffer_capacity=100000, batch_size=256):
        
        self.obs_dim = obs_dim
        self.a_dim = a_dim
        self.buffer_capacity = buffer_capacity
        self.batch_size = batch_size
        self.buffer_counter = 0

        self.state_buffer = np.zeros((self.buffer_capacity, self.obs_dim))
        self.action_buffer = np.zeros((self.buffer_capacity, self.a_dim))
        self.reward_buffer = np.zeros((self.buffer_capacity, 1))
        self.next_state_buffer = np.zeros((self.buffer_capacity, self.obs_dim))
        self.done_buffer = np.zeros((self.buffer_capacity, 1))

    def record(self, obs_tuple):
        index = self.buffer_counter % self.buffer_capacity

        self.state_buffer[index] = obs_tuple[0]
        self.action_buffer[index] = obs_tuple[1]
        self.reward_buffer[index] = obs_tuple[2]
        self.next_state_buffer[index] = obs_tuple[3]
        self.done_buffer[index] = obs_tuple[4]
        self.buffer_counter += 1
        
    def sample(self):
        record_range = min(self.buffer_counter, self.buffer_capacity)
        batch_indices = np.random.choice(record_range, self.batch_size)

        state_batch = tf.convert_to_tensor(self.state_buffer[batch_indices])
        action_batch = tf.convert_to_tensor(self.action_buffer[batch_indices])
        reward_batch = tf.squeeze(tf.convert_to_tensor(self.reward_buffer[batch_indices]))
        next_state_batch = tf.convert_to_tensor(self.next_state_buffer[batch_indices])
        done_batch = tf.squeeze(tf.convert_to_tensor(self.done_buffer[batch_indices]))
        
        return (state_batch,
               action_batch,
               reward_batch,
               next_state_batch,
               done_batch)


In [21]:
buffer1 = Buffer(num_states, num_actions, 100, 10)

In [22]:
prev_obs = env.reset()

for i in range(buffer1.buffer_capacity):
    a, _ = actor_test(tf.expand_dims(prev_obs, 0))
    obs, r, d, _ = env.step(a[0])
    
    buffer1.record((prev_obs, a[0], r, obs, d))
    
    prev_obs = obs
    

In [23]:
buffer1.sample()

(<tf.Tensor: shape=(10, 11), dtype=float64, numpy=
 array([[ 1.21723377e+00, -2.08340990e-01, -4.67962509e-02,
         -3.25600430e-01,  6.23583820e-03, -1.07274105e+00,
         -3.80004980e-01, -3.22802333e+00,  8.55268481e-01,
         -6.70776993e+00,  1.74540892e+00],
        [ 1.86238683e-01, -2.30998832e+00, -5.52612388e-01,
         -2.61000593e+00,  6.95616188e-01,  3.29272822e-01,
          3.50386963e-01, -4.10256297e+00, -2.83147178e+00,
          3.44946233e-01, -4.21573939e+00],
        [ 2.38517062e-01, -2.51215895e+00, -5.61198158e-01,
         -2.59561934e+00,  3.45594740e-01,  5.98429209e-01,
         -4.23727146e-01,  2.81824917e-01, -9.93262087e-01,
         -7.56256987e-01,  4.31650840e+00],
        [ 2.43869661e-01, -2.50656941e+00, -5.16542170e-01,
         -2.59154912e+00,  2.50113664e-01,  4.28416933e-01,
          3.92567546e-02, -1.42019885e-01, -1.87602533e+00,
          6.82211640e-01,  2.45340628e+00],
        [ 1.24424973e+00, -7.54478919e-03,  1.3442425

### Soft Actor Critic

In [38]:
class SAC:
    
    def __init__(self, env, observation_dimensions, action_dimensions, action_bound, buffer_capacity,
                 minibatch_size=256, gamma=0.99, tau=0.95, lr=3e-4):
        
        self.env = env
        tf.debugging.enable_check_numerics()
        self.a = Actor(action_dimensions, action_bound)
        self.c_gen = Critic_Wrapper(observation_dimensions, action_dimensions)
        self.c1 = self.c_gen.get_critic()
        self.c2 = self.c_gen.get_critic()
        self.tc1 = self.c_gen.get_critic()
        self.tc2 = self.c_gen.get_critic()
        
        self.tc1.set_weights(self.c1.get_weights())
        self.tc2.set_weights(self.c2.get_weights())

        self.te = -np.prod(action_dimensions)
        self.alpha = tf.Variable(0.0, dtype=tf.float32)
        
        self.a_opt = tf.keras.optimizers.Adam(learning_rate=lr)
        self.c1_opt = tf.keras.optimizers.Adam(learning_rate=lr)
        self.c2_opt = tf.keras.optimizers.Adam(learning_rate=lr)                                                  
        self.alpha_opt = tf.keras.optimizers.Adam(learning_rate=lr)   
        
        self.buffer = Buffer(observation_dimensions, action_dimensions, buffer_capacity, minibatch_size)
        
        self.gamma, self.tau = gamma, tau
        
    def train(self, max_env_step):
        t = 0
        a_losses = []
        c1_losses = []
        c2_losses = []
        alpha_losses = []
        while t < max_env_step:
            p_s = self.env.reset()

            while True:
                a, log_a = self.a(tf.expand_dims(p_s, 0))
                a=a[0]
                s, r, d, _ = self.env.step(a)
                end = 0 if d else 1
                
                self.buffer.record((p_s, a, r, s, end))
                data = self.buffer.sample()
                
                a_loss, c1_loss, c2_loss, alpha_loss = self.update(data)
                
                a_losses.append(a_loss.numpy())
                c1_losses.append(c1_loss.numpy())
                c2_losses.append(c2_loss.numpy())
                alpha_losses.append(alpha_loss.numpy())
                
                t = t+1
                
                if d:
                    break
                p_s = s
                
        print("Per {:04d} Steps".format(max_env_step), "Policy Avg. Loss: ", np.mean(a_losses), 
              ", Critic 1 Avg. Loss: ",  np.mean(c1_losses), 
              ", Critic 2 Avg. Loss: ",  np.mean(c2_losses), 
              ", Alpha 1 Avg. Loss: ",  np.mean(alpha_losses), flush=True)


    @tf.function
    def update(self, data):
        s_b, a_b, r_b, ns_b, d_b = data
        with tf.GradientTape() as tape_c1, tf.GradientTape() as tape_c2:
            q1 = self.c1([s_b, a_b])
            q2 = self.c2([s_b, a_b])
            na, nlog_a = self.a(ns_b, training=True)
            
            tq1 = self.tc1([ns_b, na])
            tq2 = self.tc2([ns_b, na])
            
            min_qt = tf.math.minimum(tq1,tq2)
            
            soft_qt = min_qt - (self.alpha*nlog_a)
            
            y = tf.stop_gradient(r_b+self.gamma*d_b*tf.cast(soft_qt, dtype=tf.float64))
            
            L_c1 = 0.5*tf.reduce_mean((y-tf.cast(q1, dtype=tf.float64))**2)
            L_c2 = 0.5*tf.reduce_mean((y-tf.cast(q2, dtype=tf.float64))**2)
        c1_grad = tape_c1.gradient(L_c1, self.c1.trainable_variables)
        c2_grad = tape_c2.gradient(L_c2, self.c2.trainable_variables)
        
        self.c1_opt.apply_gradients(zip(c1_grad, self.c1.trainable_variables))
        self.c2_opt.apply_gradients(zip(c2_grad, self.c2.trainable_variables))
        
        for (tc1w, c1w) in zip(self.tc1.variables, self.c1.variables):
            tc1w.assign(tc1w*self.tau + c1w*(1.0-self.tau))
        for (tc2w, c2w) in zip(self.tc2.variables, self.c2.variables):
            tc2w.assign(tc2w*self.tau + c2w*(1.0-self.tau))
            
        with tf.GradientTape() as tape_a, tf.GradientTape() as tape_alpha:
            a, log_a = self.a(s_b, training=True)
            qa1 = self.c1([s_b, a])
            qa2 = self.c2([s_b, a])
            
            soft_qa = tf.math.minimum(qa1,qa2)

            L_a = -tf.reduce_mean(soft_qa-self.alpha*log_a)
            L_alpha = -tf.reduce_mean(self.alpha*tf.stop_gradient(log_a + self.te))
        grad_a = tape_a.gradient(L_a, self.a.trainable_variables)
        grad_alpha = tape_alpha.gradient(L_alpha, [self.alpha])
        self.a_opt.apply_gradients(zip(grad_a, self.a.trainable_variables))
        self.alpha_opt.apply_gradients(zip(grad_alpha, [self.alpha]))
        
        return L_a, L_c1, L_c2, L_alpha
    
    def save_weights(self, dir_path):
        cp = tf.train.Checkpoint(step=self.alpha)
        self.a.save_weights(dir_path+"/a.ckpt")
        print("Saved actor weights", flush=True)
        self.c1.save_weights(dir_path+"/c1.ckpt")
        print("Saved critic 1 weights", flush=True)
        self.c2.save_weights(dir_path+"/c2.ckpt")
        print("Saved critic 2 weights", flush=True)
        cp.save(dir_path+"/alpha")
        print("Saved alpha weights", flush=True)

    def load_weights(self, dir_path):
        try:
            cp = tf.train.Checkpoint(step=self.alpha)
            self.a.load_weights(dir_path+"/a.ckpt")
            print("Loaded actor weights", flush=True)
            self.c1.load_weights(dir_path+"/c1.ckpt")
            print("Loaded critic 1 weights", flush=True)
            self.c2.load_weights(dir_path+"/c2.ckpt")
            print("Loaded critic 2 weights", flush=True)
            cp.restore(dir_path+"/alpha-1")
            print("Loaded alpha weights", flush=True)
            self.tc1.set_weights(self.c1.get_weights())
            self.tc2.set_weights(self.c2.get_weights())

        except ValueError:
            print("ERROR: Please make sure weights are saved as .ckpt", flush=True)
            
    def eval_rollout(self, problem, rbs=False, render=False):
        eps_r = 0
        
        if rbs:
            domain, task, controller = problem
            eval_env = Robosuite_Wrapper(domain, task, controller, render)
        else:
            eval_env = gym.make(problem)
            
        eval_obs = eval_env.reset()

        while True:
            if render:
                eval_env.render()

            tf_eval_obs = tf.expand_dims(tf.convert_to_tensor(eval_obs), 0)

            eval_a, eval_log_a = self.a(tf_eval_obs, eval_mode=True)

            eval_a = eval_a[0]

            eval_obs_new, eval_r, eval_d, _ = eval_env.step(eval_a)

            eps_r += eval_r

            if eval_d:
                break
                
            eval_obs = eval_obs_new
        
        if render:
            if not rbs:
                glfw.destroy_window(eval_env.viewer.window)

        eval_env.close()
        print("rollout episodic reward: ", eps_r, flush=True)
        
        return eps_r


In [39]:
sac1 = SAC(env, num_states, num_actions, upper_bound, 1000000)

INFO:tensorflow:Enabled check-numerics callback in thread MainThread


In [40]:
sac1.train(1000)

Per 1000 Steps Policy Avg. Loss:  -11.305841 , Critic 1 Avg. Loss:  2.204413843825311 , Critic 2 Avg. Loss:  2.323269229063349 , Alpha 1 Avg. Loss:  -0.31867275


In [41]:
sac1.save_weights("/home/tony/rl_models/MA2C/sac/devel/weights")

Saved actor weights
Saved critic 1 weights
Saved critic 2 weights
Saved alpha weights


In [42]:
print(sac1.alpha)

<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=0.12317008>


In [43]:
sac2 = SAC(env, num_states, num_actions, upper_bound, 1000000)

INFO:tensorflow:Enabled check-numerics callback in thread MainThread


In [44]:
sac2.load_weights("/home/tony/rl_models/MA2C/sac/devel/weights")

Loaded actor weights
Loaded critic 1 weights
Loaded critic 2 weights
Loaded alpha weights


In [45]:
sac2.alpha

<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=0.12317008>

In [32]:
sac2.train(500)

Per 0500 Steps Policy Avg. Loss:  -10.627731 , Critic 1 Avg. Loss:  5.961140287805368 , Critic 2 Avg. Loss:  4.4566712474632535 , Alpha 1 Avg. Loss:  0.00820907


In [33]:
sac2.eval_rollout(problem)

rollout episodic reward:  2.9068387334922288


2.9068387334922288

### Robosuite Adapter

In [35]:
import numpy as np
import robosuite as suite
from gym import spaces
from robosuite import load_controller_config

In [36]:
class Robosuite_Wrapper():

    def __init__(self, domain, task, controller, render=False):
        self.config = load_controller_config(default_controller=controller)
        self.env = suite.make(env_name=task, # try with other tasks like "Stack" and "Door"
                            robots=domain,  # try with other robots like "Sawyer" and "Jaco"
                            controller_configs=self.config,
                            has_renderer=render,
                            ignore_done=False,
                            has_offscreen_renderer=False,
                            use_camera_obs=False,
                            reward_shaping=True,
                            )
        self.obs_keys = [key for key, value in self.env.observation_spec().items()]
        
        obs_dim = []
        
        for x in self.obs_keys:
            if x == 'hinge_qpos' or x == 'handle_qpos':
                obs_dim.append(1)
            else:
                obs_dim.append(self.env.observation_spec()[x].shape[0])
                
        self.s_dim = int(np.sum(obs_dim,dtype=np.int32))
        self.a_dim = self.env.action_dim
        self.a_ub = self.env.action_spec[1][0]
        self.a_lb = self.env.action_spec[0][0]
        
    def env_specs(self):
        return self.s_dim, self.a_dim, self.a_ub, self.a_lb
    
    def step(self, a):
        s, r, d, i = self.env.step(a)

        s = np.concatenate([s[x] for x in self.obs_keys], axis = None)

        return s, r, d, i
    
    def reset(self):
        s = self.env.reset()
        
        s = np.concatenate([s[x] for x in self.obs_keys], axis = None)
        
        return s
    
    def render(self):
        self.env.render()
    
    def close(self):
        self.env.close()
        

In [37]:
rbs_env = Robosuite_Wrapper("Sawyer", "Door", "JOINT_VELOCITY", True)

Creating window glfw


  @overload(np.MachAr)


In [38]:
rbs_env.env_specs()

(92, 8, 1.0, -1.0)

In [39]:
rbs_env.step(np.ones(8))

(array([ 9.99942609e-01,  3.74334537e-01,  9.99672001e-01, -5.57340643e-01,
         9.98845567e-01,  8.36790461e-01, -9.26542966e-03, -1.07134734e-02,
        -9.27293726e-01, -2.56103524e-02,  8.30283932e-01, -4.80367958e-02,
         5.47523264e-01, -9.99957075e-01,  2.41115402e-02,  5.10183094e-02,
         3.14640923e-02,  4.34014696e-02,  2.07159479e-02,  1.26095977e-02,
         2.57603885e-02, -1.05494183e-01,  1.38008635e-01,  1.02279206e+00,
         9.95321889e-01,  9.45303762e-02,  6.30356712e-03,  1.89370250e-02,
         2.07728249e-02, -2.07695979e-02, -3.00875605e-02,  3.17010574e-02,
        -2.00798007e-01, -3.54255872e-01,  1.10000000e+00, -1.50693271e-01,
        -2.54308309e-01,  1.07500000e+00, -9.53038239e-02, -4.92264508e-01,
         7.72079444e-02, -4.51990886e-02, -3.92316944e-01,  5.22079444e-02,
         2.14833941e-12, -3.95547419e-07,  9.99942609e-01,  3.74334537e-01,
         9.99672001e-01, -5.57340643e-01,  9.98845567e-01,  8.36790461e-01,
        -9.2

In [40]:
rbs_env.render()