<a href="https://colab.research.google.com/github/syrma/RLExp/blob/master/PPO_VECENV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tensorflow
!pip install pybullet
!pip install gym_vecenv

import tensorflow as tf
import gym
import pybullet_envs
import time
import math
import gym_vecenv
import tensorflow_probability as tfp
tfd = tfp.distributions


Collecting pybullet
[?25l  Downloading https://files.pythonhosted.org/packages/e6/9c/7b76db10cdaa69c840b211fe21ce6f31fb80b611b198fe18a64ddb8f374e/pybullet-3.1.0-cp37-cp37m-manylinux1_x86_64.whl (88.7MB)
[K     |████████████████████████████████| 88.7MB 47kB/s 
[?25hInstalling collected packages: pybullet
Successfully installed pybullet-3.1.0
Collecting gym_vecenv
  Downloading https://files.pythonhosted.org/packages/51/e2/3375a235249cdcad2ddcfca831982816b86c292e3be50ab56ea96c806ed0/gym_vecenv-1.0-py3-none-any.whl
Installing collected packages: gym-vecenv
Successfully installed gym-vecenv-1.0


In [None]:
@tf.function
def action(model, obs, env):
    """Sample an action from the current policy and return its log-probability.

    Args:
        model: policy network; called on `obs` to produce the distribution
            parameters. For non-discrete action spaces it must also expose a
            `log_std` attribute.
        obs: observation tensor fed directly to `model`.
        env: environment whose `action_space` selects the distribution family.

    Returns:
        A pair `(sampled_action, logprob)`, where `logprob` is the sum of
        `log_prob(sampled_action)` over every element of that tensor.
    """
    space = env.action_space
    policy_out = model(obs)

    # An empty `shape` tuple is falsy: treat the space as discrete and read
    # the network output as logits. Otherwise the output is the mean of a
    # diagonal Gaussian whose scale is exp(log_std).
    if not space.shape:
        policy = tfd.Categorical(logits=policy_out, dtype=space.dtype)
    else:
        policy = tfd.MultivariateNormalDiag(policy_out, tf.exp(model.log_std))

    sampled = policy.sample()
    # NOTE(review): reduce_sum has no axis, so it collapses everything into a
    # single scalar. If `obs` carries one row per vectorised environment, the
    # per-environment log-probabilities are lost here - confirm intended.
    return sampled, tf.reduce_sum(policy.log_prob(sampled))

In [None]:
    # ---- Hyper-parameters -------------------------------------------------
    size, epochs = 5000, 100        # steps collected per rollout, number of epochs
    γ, λ = .99, 0.97                # discount factor and GAE lambda
    num_env = 5                     # environments stepped together
    env_name = "CartPole-v0"
    opt = tf.optimizers.Adam(learning_rate=1e-2)

    # ---- Vectorised environment -------------------------------------------
    # Every factory builds its own gym environment when DummyVecEnv calls it.
    env = gym_vecenv.DummyVecEnv([lambda: gym.make(env_name) for _ in range(num_env)])

    # ---- Policy / actor network -------------------------------------------
    # Output width: first action dimension when the space has a shape,
    # otherwise the number of discrete actions.
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Dense(64, activation='tanh', input_shape=env.observation_space.shape))
    model.add(tf.keras.layers.Dense(64, activation='tanh'))
    model.add(tf.keras.layers.Dense(
        env.action_space.n if not env.action_space.shape else env.action_space.shape[0]))
    if env.action_space.shape:
        # Learnable log standard deviation, one entry per action dimension.
        model.log_std = tf.Variable(tf.fill(env.action_space.shape, -0.5))
    model.summary()

    # ---- Value / critic network -------------------------------------------
    value_model = tf.keras.models.Sequential()
    value_model.add(tf.keras.layers.Dense(64, activation='tanh', input_shape=env.observation_space.shape))
    value_model.add(tf.keras.layers.Dense(64, activation='tanh'))
    value_model.add(tf.keras.layers.Dense(1))
    value_model.compile(optimizer='adam', loss='MSE')
    value_model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 64)                320       
_________________________________________________________________
dense_1 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 130       
Total params: 4,610
Trainable params: 4,610
Non-trainable params: 0
_________________________________________________________________
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 64)                320       
_________________________________________________________________
dense_4 (Dense)              (None, 64)                4160      
________________________________

In [None]:
    # Collect `size` steps from the vectorised environment into TensorArrays.
    # Element dtypes for observations and actions come from the env spaces.
    obs_dtype = env.observation_space.dtype
    act_dtype = env.action_space.dtype

    obs_buf, act_buf = tf.TensorArray(obs_dtype, size), tf.TensorArray(act_dtype, size)
    rew_buf, prob_buf, done_buf = (tf.TensorArray(tf.float32, size) for _ in range(3))

    obs = tf.cast(env.reset(), obs_dtype)

    for step in range(size):
        act, prob = action(model, obs, env)
        next_obs, rew, done, _ = env.step(act.numpy())

        # Store the observation the action was taken from, not the next one.
        obs_buf = obs_buf.write(step, obs)
        act_buf = act_buf.write(step, act)
        rew_buf = rew_buf.write(step, rew)
        prob_buf = prob_buf.write(step, prob)
        done_buf = done_buf.write(step, done)

        obs = tf.cast(next_obs, obs_dtype)

    # Turn each TensorArray into one tensor with a leading axis of length `size`.
    obs_buf, act_buf, rew_buf, prob_buf, done_buf = [
        buf.stack() for buf in (obs_buf, act_buf, rew_buf, prob_buf, done_buf)
    ]

    # Value estimate of the observation after the final step, zeroed wherever
    # the final step was flagged done.
    last_val = (1 - done_buf[-1]) * tf.squeeze(value_model(obs))

    # TODO: episode returns and lengths (rets / lens) are not collected yet.



In [None]:
def _discounted_cumsum(values, discount, bootstrap=0.0):
  """Backward discounted sum: out[t] = values[t] + discount * out[t + 1].

  Args:
    values: 1-D tensor of per-step quantities (rewards or TD residuals).
    discount: scalar multiplier applied once per step of look-ahead.
    bootstrap: value standing in for out[len(values)], i.e. the estimate of
      whatever follows the last element (0 for a finished episode).

  Returns:
    Tensor shaped like `values`, where element t is discounted relative to
    step t itself rather than to the start of the slice.
  """
  # Evaluated as a reverse scan, so no division by discount**t is needed and
  # long episodes cannot underflow to 0/0.
  return tf.scan(
      lambda tail, value: value + discount * tail,
      values,
      initializer=tf.cast(bootstrap, values.dtype),
      reverse=True)


# One TensorArray per environment: value targets (rewards-to-go) and GAE advantages.
v_hats = [tf.TensorArray(tf.float32, size) for _ in range(num_env)]
gae = [tf.TensorArray(tf.float32, size) for _ in range(num_env)]

# TODO: vectorise the per-environment Python loop below.
# First buffer index of the episode currently open in each environment.
last_idx = [0] * num_env

for i in range(size):
  for j in range(num_env):
    # Only process a segment when it closes: at a `done` flag or at the very
    # last buffer step (where the episode may still be running).
    if i != size - 1 and not done_buf[i,j]:
      continue

    current_episode = slice(last_idx[j], i+1)
    ep_idx = range(last_idx[j], i+1)
    ep_rew = rew_buf[current_episode, j]

    # Value of the state after the segment: the critic's estimate when the
    # buffer ended (last_val is already 0 if that step was done), else 0.
    bootstrap = last_val[j] if i == size - 1 else tf.constant(0.0)

    # Rewards-to-go G_t = r_t + γ * G_{t+1}, used as the critic's targets.
    ep_v_hats = _discounted_cumsum(ep_rew, γ, bootstrap)
    v_hats[j] = v_hats[j].scatter(ep_idx, ep_v_hats)

    # TD residuals δ_t = r_t + γ * V(s_{t+1}) - V(s_t).
    Vs = tf.squeeze(value_model(obs_buf[current_episode, j]), axis=1)
    Vsp1 = tf.concat([Vs[1:], [bootstrap]], axis=0)
    deltas = ep_rew + γ * Vsp1 - Vs

    # Generalised advantage estimate A_t = δ_t + γλ * A_{t+1}.
    ep_gae = _discounted_cumsum(deltas, γ * λ)
    gae[j] = gae[j].scatter(ep_idx, ep_gae)

    last_idx[j] = i+1

v_hats = [v_hat.stack() for v_hat in v_hats]
gae = [g.stack() for g in gae]

In [None]:
# TODO: define run_env(env, size, model, value_model, γ, λ) - train_one_epoch calls it, but no visible cell defines it yet.

In [None]:
def train_one_epoch(env, batch_size, model, value_model, γ, λ):
    """Run one rollout, take one policy optimiser step, then fit the critic.

    Args:
        env: environment passed to `run_env`; its `action_space` decides
            whether `model.log_std` is optimised too.
        batch_size: forwarded to `run_env`.
        model: policy network whose trainable weights are optimised.
        value_model: critic network, fitted with `batch_size=32`.
        γ: forwarded to `run_env`.
        λ: forwarded to `run_env`.

    Relies on the notebook-level names `opt`, `run_env`, `batch` and `wandb`.
    """
    obs_spc, act_spc = env.observation_space, env.action_space

    t_rollout_begin = time.time()

    # NOTE(review): the result of run_env is discarded and `batch` is never
    # bound in this function; run_env itself is not defined in the visible
    # cells. Presumably this should read `batch = run_env(...)` - confirm.
    run_env(env, batch_size, model, value_model, γ, λ)

    t_train_begin = time.time()

    # Policy parameters: the network weights plus, when the action space has
    # a non-empty shape, the separate log-std variable.
    var_list = [*model.trainable_weights]
    if act_spc.shape:
        var_list += [model.log_std]

    opt.minimize(batch.loss, var_list=var_list)

    train_time = time.time() - t_train_begin
    run_time = t_train_begin - t_rollout_begin

    print('run time', run_time, 'train time', train_time)
    print('AvgEpRet:', tf.reduce_mean(batch.rets).numpy())

    # Regress the critic onto the batch's value targets, then queue metrics
    # without committing the wandb step.
    hist = value_model.fit(batch.obs_buf.numpy(), batch.V_hats.numpy(), batch_size=32)
    loss_v = tf.reduce_mean(hist.history['loss']).numpy()
    wandb.log(
        {
            'LossV': loss_v,
            'EpRet': wandb.Histogram(batch.rets),
            'AvgEpRet': tf.reduce_mean(batch.rets),
            'EpLen': tf.reduce_mean(batch.lens),
            'VVals': wandb.Histogram(batch.V_hats),
        },
        commit=False,
    )

In [None]:
def parse_cli_args(argv=None):
    """Parse the PPO command line, tolerating execution inside a Jupyter kernel.

    Args:
        argv: full argument vector including the program name at index 0;
            defaults to `sys.argv`.

    Returns:
        argparse.Namespace with `test`, `load_dir`, `env_name` and `save_dir`.
        `env_name` falls back to 'CartPole-v0' when not given.
    """
    # Imported locally: no visible notebook cell imports these modules.
    import argparse
    import os
    import sys

    argv = list(sys.argv if argv is None else argv)

    parser = argparse.ArgumentParser(description='Train or test PPO')
    parser.add_argument('test', nargs='?', help = 'Test a saved or a random model')
    parser.add_argument('--load_dir', help='Optional: directory of saved model to test or resume training')
    parser.add_argument('--env_name', default='CartPole-v0', help='Environment name to use with OpenAI Gym')
    parser.add_argument('--save_dir', help='Optional: directory where the model should be saved')

    # Under a notebook kernel, sys.argv belongs to ipykernel_launcher
    # ("-f <connection file>"). Parsing it aborts with "unrecognized
    # arguments: -f", so fall back to the defaults in that case.
    launched_by_kernel = bool(argv) and os.path.basename(argv[0]).startswith('ipykernel_launcher')
    return parser.parse_args([] if launched_by_kernel else argv[1:])


if __name__ == '__main__':

    args = parse_cli_args()
    env_name = args.env_name
    save_dir = args.save_dir
    load_dir = args.load_dir

    # Hyper-parameters.
    batch_size = 5000
    epochs = 100
    opt = tf.optimizers.Adam(learning_rate=1e-2)  # read as a global by train_one_epoch
    γ = .99
    λ = 0.97
    num_env = 5

    # Every call of the factory builds its own gym environment.
    env = gym_vecenv.DummyVecEnv([lambda: gym.make(env_name)] * num_env)
    obs_spc = env.observation_space
    act_spc = env.action_space

    # NOTE(review): `wandb` is used here but is not imported in any visible
    # cell - confirm it is imported elsewhere before running.
    wandb.init(project='ppo', entity='rlexp')
    wandb.config.env = env_name
    wandb.config.epochs = epochs
    wandb.config.batch_size = batch_size
    wandb.config.lam = λ
    wandb.config.gamma = γ

    # policy/actor model
    # Output width: first action dimension when the space has a shape,
    # otherwise the number of discrete actions.
    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(64, activation='tanh', input_shape=obs_spc.shape),
        tf.keras.layers.Dense(64, activation='tanh'),
        tf.keras.layers.Dense(act_spc.shape[0] if act_spc.shape else act_spc.n)
    ])
    if act_spc.shape:
        # Learnable log standard deviation, one entry per action dimension.
        model.log_std = tf.Variable(tf.fill(env.action_space.shape, -0.5))
    model.summary()

    # value/critic model
    value_model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(64, activation='tanh', input_shape=obs_spc.shape),
        tf.keras.layers.Dense(64, activation='tanh'),
        tf.keras.layers.Dense(1)
    ])
    value_model.compile('adam', loss='MSE')
    value_model.summary()

usage: ipykernel_launcher.py [-h] [--load_dir LOAD_DIR] [--env_name ENV_NAME]
                             [--save_dir SAVE_DIR]
                             [test]
ipykernel_launcher.py: error: unrecognized arguments: -f


SystemExit: ignored

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
