In [1]:
import gym
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import Model
import numpy as np
import matplotlib.pyplot as plt
import os
import json
import random
import tensorflow_probability as tfp
from tensorflow.keras import regularizers

  _nlv = LooseVersion(_np_version)
  other = LooseVersion(other)
  if LooseVersion(__version__) >= LooseVersion("1.17.0"):
  if LooseVersion(module.__version__) < minver:
  other = LooseVersion(other)
  if LooseVersion(PILLOW_VERSION) >= "3.4":
  other = LooseVersion(other)
  if (distutils.version.LooseVersion(tf.__version__) <


### Env Setup

In [2]:
problem = "Hopper-v3"
env = gym.make(problem)

num_states = env.observation_space.shape[0]
num_actions = env.action_space.shape[0]
upper_bound = env.action_space.high[0]
lower_bound = env.action_space.low[0]

  and should_run_async(code)


### Actor Model

#### Note:

Same as PPO actor. Gaussian policy

In [3]:
EPSILON = 1e-10

class Actor(Model):

    def __init__(self, action_dimensions, action_bound):
        super().__init__()
        self.action_dim, self.upper_bound = action_dimensions, action_bound
        self.sample_dist = tfp.distributions.MultivariateNormalDiag(loc=tf.zeros(self.action_dim),
                                                                    scale_diag=tf.ones(self.action_dim))
        self.dense1_layer = layers.Dense(64, activation="relu")
        self.dense2_layer = layers.Dense(64, activation="relu")
        self.mean_layer = layers.Dense(self.action_dim)
        self.stdev_layer = layers.Dense(self.action_dim)

    def call(self, state, eval_mode=False):

        a1 = self.dense1_layer(state)
        a2 = self.dense2_layer(a1)
        mu = self.mean_layer(a2)

        log_sigma = self.stdev_layer(a2)
        sigma = tf.exp(log_sigma)
        sigma = tf.clip_by_value(sigma, EPSILON, 2.718)

        dist = tfp.distributions.MultivariateNormalDiag(loc=mu, scale_diag=sigma)
        
        if eval_mode:
            action_ = mu
        else:
            action_ = tf.math.add(mu, tf.math.multiply(sigma, tf.expand_dims(self.sample_dist.sample(), 0)))
 
        action = tf.tanh(action_)

        log_pi_ = dist.log_prob(action_)     
        log_pi = log_pi_ - tf.reduce_sum(tf.math.log(tf.clip_by_value(1 - action**2, EPSILON, 1.0)), axis=1)
        
        return action*self.upper_bound, log_pi, dist.entropy()


In [4]:
actor_test = Actor(num_actions, upper_bound)

In [5]:
obs = env.reset()
obs
tf_obs = tf.expand_dims(obs, 0)
tf_obs
a_test, log_a_test, h_a_test = actor_test(tf_obs)
print(a_test.shape, log_a_test.shape, h_a_test.shape)

(1, 3) (1,) (1,)


In [6]:
actor_test.summary()

Model: "actor"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               multiple                  768       
                                                                 
 dense_1 (Dense)             multiple                  4160      
                                                                 
 dense_2 (Dense)             multiple                  195       
                                                                 
 dense_3 (Dense)             multiple                  195       
                                                                 
Total params: 5,318
Trainable params: 5,318
Non-trainable params: 0
_________________________________________________________________


### Critic Wrapper

#### Note:

Different from PPO, critic evaluate state-action pairs.

In [27]:
class Critic_Wrapper():
    def __init__(self, state_dim, action_dim):
        self.s_dim=state_dim
        self.a_dim=action_dim
        
    def get_critic(self):
        # State as input
        state_input = layers.Input(shape=(self.s_dim))
        state_out = layers.Dense(32, activation="relu")(state_input)

        # Action as input
        action_input = layers.Input(shape=(self.a_dim))
        action_out = layers.Dense(32, activation="relu")(action_input)

        # Concatenating
        concat = layers.Concatenate()([state_out, action_out])
        out = layers.Dense(64, activation="relu")(concat)
        outputs = tf.squeeze(layers.Dense(1)(out))

        # Outputs single value for give state-action
        model = tf.keras.Model([state_input, action_input], outputs)

        return model

In [28]:
critic_gen = Critic_Wrapper(num_states, num_actions)
critic_test = critic_gen.get_critic()

In [29]:
obs = env.reset()
obs

array([ 1.25253867e+00, -4.92599989e-03, -3.53270510e-03,  1.57248767e-03,
        4.03869469e-03,  3.76607568e-03, -2.81583066e-03, -4.27397296e-03,
        1.91855328e-03,  3.50469503e-03,  1.10798166e-04])

In [30]:
tf_obs = tf.expand_dims(obs, 0)
a_test, log_a_test, h_a_test = actor_test(tf_obs)
tf_obs

<tf.Tensor: shape=(1, 11), dtype=float64, numpy=
array([[ 1.25253867e+00, -4.92599989e-03, -3.53270510e-03,
         1.57248767e-03,  4.03869469e-03,  3.76607568e-03,
        -2.81583066e-03, -4.27397296e-03,  1.91855328e-03,
         3.50469503e-03,  1.10798166e-04]])>

In [34]:
v_test = critic_test([tf_obs, a_test])
v_test

<tf.Tensor: shape=(), dtype=float32, numpy=-0.3223806>

In [37]:
obs_new, _, _, _ = env.step(a_test[0])
tf_obs_new = tf.expand_dims(obs_new, 0)
statex2 = tf.convert_to_tensor([obs, obs_new])
print(statex2.shape)

(2, 11)


In [36]:
a_2, loga_2, ha_2 = actor_test(statex2)

print(a_2.shape, loga_2.shape, ha_2.shape)

(2, 3) (2,) (2,)


In [39]:
v_2 = critic_test([statex2, a_2])
print(v_2.shape)

(2,)


In [40]:
critic_test.summary()

Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_11 (InputLayer)          [(None, 11)]         0           []                               
                                                                                                  
 input_12 (InputLayer)          [(None, 3)]          0           []                               
                                                                                                  
 dense_28 (Dense)               (None, 32)           384         ['input_11[0][0]']               
                                                                                                  
 dense_29 (Dense)               (None, 32)           128         ['input_12[0][0]']               
                                                                                            

### Replay Buffer

In [16]:
class Buffer:
    def __init__(self, obs_dim, a_dim, buffer_capacity=100000, batch_size=256):
        
        self.obs_dim = obs_dim
        self.a_dim = a_dim
        self.buffer_capacity = buffer_capacity
        self.batch_size = batch_size
        self.buffer_counter = 0

        self.state_buffer = np.zeros((self.buffer_capacity, self.obs_dim))
        self.action_buffer = np.zeros((self.buffer_capacity, self.a_dim))
        self.reward_buffer = np.zeros((self.buffer_capacity, 1))
        self.next_state_buffer = np.zeros((self.buffer_capacity, self.obs_dim))
        self.done_buffer = np.zeros((self.buffer_capacity, 1))

    def record(self, obs_tuple):
        index = self.buffer_counter % self.buffer_capacity

        self.state_buffer[index] = obs_tuple[0]
        self.action_buffer[index] = obs_tuple[1]
        self.reward_buffer[index] = obs_tuple[2]
        self.next_state_buffer[index] = obs_tuple[3]
        self.done_buffer[index] = obs_tuple[4]
        self.buffer_counter += 1
        
    def sample(self):
        record_range = min(self.buffer_counter, self.buffer_capacity)
        batch_indices = np.random.choice(record_range, self.batch_size)

        state_batch = tf.convert_to_tensor(self.state_buffer[batch_indices])
        action_batch = tf.convert_to_tensor(self.action_buffer[batch_indices])
        reward_batch = tf.squeeze(tf.convert_to_tensor(self.reward_buffer[batch_indices]))
        next_state_batch = tf.convert_to_tensor(self.next_state_buffer[batch_indices])
        done_batch = tf.squeeze(tf.convert_to_tensor(self.done_buffer[batch_indices]))
        
        return (state_batch,
               action_batch,
               reward_batch,
               next_state_batch,
               done_batch)


In [17]:
buffer1 = Buffer(num_states, num_actions, 100, 10)

In [18]:
prev_obs = env.reset()

for i in range(buffer1.buffer_capacity):
    a, _, _ = actor_test(tf.expand_dims(prev_obs, 0))
    obs, r, d, _ = env.step(a[0])
    
    buffer1.record((prev_obs, a[0], r, obs, d))
    
    prev_obs = obs
    

In [19]:
buffer1.sample()

(<tf.Tensor: shape=(10, 11), dtype=float64, numpy=
 array([[ 1.04179102e+00, -1.07418133e+00, -6.95978301e-01,
          9.57524420e-05, -3.82127597e-01, -1.51648909e+00,
         -1.38013366e+00, -4.21671507e+00, -3.03130110e+00,
          5.76602765e-02, -1.55324893e+00],
        [ 1.20553191e+00, -7.74735677e-02,  5.61179534e-04,
          1.29514355e-02, -1.06950202e-01, -1.08236183e+00,
         -1.39994738e-01, -2.79268109e+00, -2.33679404e+00,
          2.01693559e-02,  4.75330127e-01],
        [ 9.92321761e-01, -1.22949322e+00, -8.12135304e-01,
          1.54451066e-03, -4.14168363e-01, -1.60080036e+00,
         -1.57646019e+00, -5.17573675e+00, -3.81405616e+00,
          1.37187263e-02, -6.46599813e-01],
        [ 1.20398459e+00, -3.96854423e-01, -2.60864533e-01,
          3.63089123e-03,  2.41754410e-02, -9.14625785e-01,
         -1.89093401e-01, -3.32807481e+00, -3.03446926e+00,
         -7.92573789e-02, -2.86001003e-01],
        [ 1.03001391e+00, -1.11072723e+00, -7.2379906

### Soft Actor Critic

In [41]:
class SAC:
    
    def __init__(self, env, observation_dimensions, action_dimensions, action_bound, buffer_capacity,
                 minibatch_size=256, gamma=0.99, tau=0.95, lr=3e-4):
        
        self.env = env
        
        self.a = Actor(action_dimensions, action_bound)
        self.c_gen = Critic_Wrapper(observation_dimensions, action_dimensions)
        self.c1 = self.c_gen.get_critic()
        self.c2 = self.c_gen.get_critic()
        self.tc1 = self.c_gen.get_critic()
        self.tc2 = self.c_gen.get_critic()
        
        self.tc1.set_weights(self.c1.get_weights())
        self.tc2.set_weights(self.c2.get_weights())

        self.te = -np.prod(num_actions)
        self.alpha = tf.Variable(0.0, dtype=tf.float32)
        
        self.a_opt = tf.keras.optimizers.Adam(learning_rate=lr)
        self.c1_opt = tf.keras.optimizers.Adam(learning_rate=lr)
        self.c2_opt = tf.keras.optimizers.Adam(learning_rate=lr)                                                  
        self.alpha_opt = tf.keras.optimizers.Adam(learning_rate=lr)   
        
        self.buffer = Buffer(observation_dimensions, action_dimensions, buffer_capacity, minibatch_size)
        
        self.gamma, self.tau = gamma, tau
        
    def update(self):
        s_b, a_b, r_b, ns_b, d_b = self.buffer.sample()
        with tf.GradientTape() as tape_c1, tf.GradientTape() as tape_c2:
            q1 = self.c1([s_b, a_b])
            q2 = self.c2([s_b, a_b])
            print(q1.shape)
            print(q2.shape)

        


        

In [42]:
sac1 = SAC(env, num_states, num_actions, upper_bound, 1000, 10)

In [43]:
buffer1 = Buffer(num_states, num_actions, 100, 10)

In [44]:
prev_obs = env.reset()

for i in range(buffer1.buffer_capacity):
    a, _, _ = actor_test(tf.expand_dims(prev_obs, 0))
    obs, r, d, _ = env.step(a[0])
    
    buffer1.record((prev_obs, a[0], r, obs, d))
    
    prev_obs = obs
    

In [45]:
sac1.buffer =  buffer1

In [46]:
sac1.update()

(10,)
(10,)
