### Packages

In [150]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import gym
import scipy.signal
import time
from tensorflow.keras import Model
import matplotlib.pyplot as plt
import random
import tensorflow_probability as tfp

### Env Setup

In [151]:
problem = "Hopper-v2"
env = gym.make(problem)

num_states = env.observation_space.shape[0]
num_actions = env.action_space.shape[0]
upper_bound = env.action_space.high[0]
lower_bound = env.action_space.low[0]

num_states, num_states, upper_bound, lower_bound

EPSILON = 1e-10

### Actor Model

In [163]:
class Actor(Model):

    def __init__(self, action_dimensions):
        super().__init__()
        self.action_dim = action_dimensions
        self.sample_dist = tfp.distributions.MultivariateNormalDiag(loc=tf.zeros(num_actions), scale_diag=tf.ones(num_actions))
        self.dense1_layer = layers.Dense(256, activation="relu")
        self.dense2_layer = layers.Dense(256, activation="relu")
        self.mean_layer = layers.Dense(self.action_dim)
        self.stdev_layer = layers.Dense(self.action_dim)

    def call(self, state, eval_mode=False):

        a1 = self.dense1_layer(state)
        a2 = self.dense2_layer(a1)
        mu = self.mean_layer(a2)

        log_sigma = self.stdev_layer(a2)
        sigma = tf.exp(log_sigma)

#         sigma = tf.clip_by_value(sigma, 0.01, 2.718)

#         print("Mu, Sigma: ",mu, sigma)

        dist = tfp.distributions.MultivariateNormalDiag(loc=mu, scale_diag=sigma)
        
#         print("DIST MU, SIGMA: ", dist.mean(), dist.stddev())

        if eval_mode:
            action_ = mu
        else:
            action_ = tf.math.add(mu, tf.math.multiply(sigma, tf.expand_dims(self.sample_dist.sample(), 0)))

#         print("Unbounded a: ",action_)
        
        action = tf.tanh(action_)
        
#         print("Bounded a: ", action)

        log_pi_ = dist.log_prob(action_)

#         print("Presquashed log a: ", log_pi_)
        
        log_pi = log_pi_ - tf.reduce_sum(tf.math.log(tf.clip_by_value(1 - action**2, EPSILON, 1.0)), axis=1)
        
#         print("Squashed log a: ",log_pi )

        return action*upper_bound, log_pi


#### Testing

In [164]:
actor_test = Actor(num_actions)

In [165]:
obs = env.reset()
obs

array([ 1.24755665e+00, -4.37420824e-03, -2.86328271e-03,  3.78593567e-03,
       -1.00007902e-03,  2.51344390e-03, -4.01047093e-03,  3.78887863e-03,
       -1.16118361e-03,  2.81530012e-03, -2.44420182e-03])

In [166]:
tf_obs = tf.expand_dims(obs, 0)
tf_obs
a_test, log_a_test = actor_test(tf_obs)


In [167]:
actor_test.summary()

Model: "actor_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_80 (Dense)            multiple                  3072      
                                                                 
 dense_81 (Dense)            multiple                  65792     
                                                                 
 dense_82 (Dense)            multiple                  771       
                                                                 
 dense_83 (Dense)            multiple                  771       
                                                                 
Total params: 70,406
Trainable params: 70,406
Non-trainable params: 0
_________________________________________________________________


### Critic Model

In [168]:
class Critic_wrapper():
    def __init__(self, state_dim, action_dim):
        self.s_dim=state_dim
        self.a_dim=action_dim
        
    def get_critic(self):
        # State as input
        state_input = layers.Input(shape=(self.s_dim))
        state_out = layers.Dense(128, activation="relu")(state_input)
        # state_out = layers.Dense(32, activation="relu")(state_out)

        # Action as input
        action_input = layers.Input(shape=(self.a_dim))
        action_out = layers.Dense(128, activation="relu")(action_input)

        # Concatenating
        concat = layers.Concatenate()([state_out, action_out])
        out = layers.Dense(256, activation="relu")(concat)
        outputs = layers.Dense(1, dtype='float64')(out)

        # Outputs single value for give state-action
        model = tf.keras.Model([state_input, action_input], outputs)

        return model


#### Testing

In [169]:
critic_gen = Critic_wrapper(num_states, num_actions)
critic_test = critic_gen.get_critic()

In [170]:
obs = env.reset()
obs

array([ 1.25294591e+00, -2.42876634e-03,  1.15680813e-04,  8.51958546e-04,
        2.32168135e-03, -4.44291986e-03, -3.23700541e-03, -1.07100765e-03,
        4.98346840e-03, -1.71841054e-03, -2.99219715e-03])

In [171]:
tf_obs = tf.expand_dims(obs, 0)
a_test, log_a_test = actor_test(tf_obs)
tf_obs

<tf.Tensor: shape=(1, 11), dtype=float64, numpy=
array([[ 1.25294591e+00, -2.42876634e-03,  1.15680813e-04,
         8.51958546e-04,  2.32168135e-03, -4.44291986e-03,
        -3.23700541e-03, -1.07100765e-03,  4.98346840e-03,
        -1.71841054e-03, -2.99219715e-03]])>

In [172]:
v_test = critic_test([tf_obs, a_test])
v_test

<tf.Tensor: shape=(1, 1), dtype=float64, numpy=array([[-0.0448832]])>

In [173]:
critic_test.summary()

Model: "model_11"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_23 (InputLayer)          [(None, 11)]         0           []                               
                                                                                                  
 input_24 (InputLayer)          [(None, 3)]          0           []                               
                                                                                                  
 dense_84 (Dense)               (None, 128)          1536        ['input_23[0][0]']               
                                                                                                  
 dense_85 (Dense)               (None, 128)          512         ['input_24[0][0]']               
                                                                                           

### Replay Buffer

In [286]:
class Buffer:

    def __init__(self, observation_dimensions, action_dimensions, size, minibatch_size, gamma=0.99, lam=0.95):

        self.observation_buffer = np.zeros(
            (size, observation_dimensions), dtype=np.float32
        )
        self.action_buffer = np.zeros((size, action_dimensions), dtype=np.float32)
        self.reward_buffer = np.zeros(size, dtype=np.float32)
        self.logprobability_buffer = np.zeros(size, dtype=np.float32)
        
        self.gamma, self.lam = gamma, lam
        self.batch_size = minibatch_size
        
        self.pointer = 0
        self.trajectory_start_indices = []
        self.trajectory_start_indices.append(0)

    def store(self, observation, action, reward, logprobability, done):

        self.observation_buffer[self.pointer] = observation
        self.action_buffer[self.pointer] = action
        self.reward_buffer[self.pointer] = reward
        self.logprobability_buffer[self.pointer] = logprobability
        self.pointer += 1
        if done:
            self.trajectory_start_indices.append(self.pointer)




    def get(self):
        # Get all data of the buffer and normalize the advantages
        rindex = np.random.choice(len(self.trajectory_start_indices)-1, self.batch_size)
#         print(rindex)
        isolated_obs = [self.observation_buffer[self.trajectory_start_indices[ri]:
                                               self.trajectory_start_indices[ri+1]-1] for ri in rindex]
        isolated_a = [self.action_buffer[self.trajectory_start_indices[ri]:
                                               self.trajectory_start_indices[ri+1]-1] for ri in rindex]
        isolated_r = [self.reward_buffer[self.trajectory_start_indices[ri]:
                                               self.trajectory_start_indices[ri+1]-1] for ri in rindex]
        isolated_log_a = [self.logprobability_buffer[self.trajectory_start_indices[ri]:
                                               self.trajectory_start_indices[ri+1]-1] for ri in rindex]
        
        return (
            isolated_obs,
            isolated_a,
            isolated_r,
            isolated_log_a,
        )
    def clear(self):
        self.pointer = 0
        self.trajectory_start_indices = []
        self.trajectory_start_indices.append(0)

#### Testing

In [299]:
buffer = Buffer(num_states, num_actions, 100, 2)
actor_test = Actor(num_actions)
critic_gen = Critic_wrapper(num_states, num_actions)
critic_test = critic_gen.get_critic()

In [300]:
obs = env.reset()
buffer.clear()
for x in range(100):
    tf_obs = tf.expand_dims(obs, 0)
    a, log_a = actor_test(tf_obs)
    a = a[0]
    obs_new, r, d, _ = env.step(a)
    
    buffer.store(obs, a, r, log_a, d)
    if d:
        obs = env.reset()
    else:
        obs = obs_new
    
print(buffer.trajectory_start_indices)

[0, 12, 29, 56, 65, 93]


In [318]:
s_b, a_b, r_b, l_b = buffer.get()
# np.array(s_b, dtype=object).shape, np.array(a_b, dtype=object).shape, np.array(r_b, dtype=object).shape, np.array(l_b, dtype=object).shape
# np.array(s_b, dtype=object), np.array(a_b, dtype=object), np.array(r_b, dtype=object), np.array(l_b, dtype=object)
for x in s_b:
    print("trajectory len: ", len(x))
    print("s_dim: ", len(x[0]))

trajectory len:  27
s_dim:  11
trajectory len:  16
s_dim:  11


### Discount Sample Returns

In [81]:
def discounted_cumulative_sums(x, discount):
    # Discounted cumulative sums of vectors for computing rewards-to-go and advantage estimates
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]

In [82]:
discount = 0.99
samples = []