In [129]:
import tensorflow as tf
import tensorflow_probability as tfp
tfd = tfp.distributions

import gym
import pybullet_envs


In [130]:
def action(model, obs, act_spc):
    est = tf.squeeze(model(tf.expand_dims(obs, 0)), axis=0)
    if act_spc.shape: # Box
        dist = tfd.MultivariateNormalDiag(est, tf.exp(model.log_std))
    else: # Discrete
        dist = tfd.Categorical(logits=est, dtype=act_spc.dtype)

    return dist.sample()


In [131]:
def test(epochs, env, model):
    for i in range(1, epochs+1):
        print('epoch', i)
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            act = action(model, obs, env.action_space)
            obs, rew, done, _ = env.step(act.numpy())
            episode_rew += rew
        print("Episode reward", episode_rew)

In [132]:
env_name = 'Walker2DBulletEnv-v0'
env = gym.make(env_name)
env = gym.wrappers.NormalizeReward(env)
env = gym.wrappers.TransformReward(env, lambda reward: tf.clip_by_value(reward, -10, 10))
obs_spc = env.observation_space
act_spc = env.action_space

In [133]:
model = tf.keras.models.Sequential([
            tf.keras.layers.Dense(64, activation='tanh', input_shape=obs_spc.shape),
            tf.keras.layers.Dense(64, activation='tanh'),
            tf.keras.layers.Dense(act_spc.shape[0] if act_spc.shape else act_spc.n)
        ])
if act_spc.shape:
    model.log_std = tf.Variable(tf.zeros(act_spc.shape))
model.summary()


Model: "sequential_17"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_51 (Dense)            (None, 64)                1472      
                                                                 
 dense_52 (Dense)            (None, 64)                4160      
                                                                 
 dense_53 (Dense)            (None, 6)                 390       
                                                                 
Total params: 6,028
Trainable params: 6,028
Non-trainable params: 0
_________________________________________________________________


In [136]:
test(10, env, model)

epoch 1
Episode reward tf.Tensor(2.183619675876214, shape=(), dtype=float64)
epoch 2
Episode reward tf.Tensor(3.2180128399767116, shape=(), dtype=float64)
epoch 3
Episode reward tf.Tensor(2.14742739778221, shape=(), dtype=float64)
epoch 4
Episode reward tf.Tensor(2.0113324524482077, shape=(), dtype=float64)
epoch 5
Episode reward tf.Tensor(2.8878095209334083, shape=(), dtype=float64)
epoch 6
Episode reward tf.Tensor(1.3064470389747167, shape=(), dtype=float64)
epoch 7
Episode reward tf.Tensor(2.0597418566878987, shape=(), dtype=float64)
epoch 8
Episode reward tf.Tensor(2.300824895951164, shape=(), dtype=float64)
epoch 9
Episode reward tf.Tensor(2.9550075958506925, shape=(), dtype=float64)
epoch 10
Episode reward tf.Tensor(2.098149857849518, shape=(), dtype=float64)
