In [1]:
import numpy as np
import tensorflow as tf
from roller import Roller
from config import Params
from wrapper_env import env_wrapper
import gym

from nn_architecures import network_builder
from models import CategoricalModel, GaussianModel
from policy import Policy
from CREnv import CREnv


# physical_devices = tf.config.list_physical_devices('GPU') 
# tf.config.experimental.set_memory_growth(physical_devices[0], True)

params = Params()          # Get Configuration | HORIZON = Steps per epoch

tf.random.set_seed(params.env.seed)                                     # Set Random Seeds for np and tf
np.random.seed(params.env.seed)

# env = create_batched_env(params.env.num_envs, params.env)               # Create Environment in multiprocessing mode
# env = gym.make('CartPole-v0')
env = CREnv()
env = env_wrapper(env, params.env)

network = network_builder(params.trainer.nn_architecure) \
    (hidden_sizes=params.policy.hidden_sizes, env_info=env.env_info)    # Build Neural Network with Forward Pass

model = CategoricalModel if env.env_info.is_discrete else GaussianModel
model = model(network=network, env_info=env.env_info)                   # Build Model for Discrete or Continuous Spaces

roller = Roller(env, model, params.trainer.steps_per_epoch,
                params.trainer.gamma, params.trainer.lam)               # Define Roller for genrating rollouts for training

ppo = Policy(model=model)            # Define PPO Policy with combined loss
rollouts, infos = roller.rollout()
print(rollouts) 


USE STANDARD VARIABLES
Model Summary: simple_actor_critic
Model: "simple_actor_critic"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_actor_critic1 (Dense) (None, 32)                160       
_________________________________________________________________
simple_actor_critic2 (Dense) (None, 32)                1056      
_________________________________________________________________
simple_actor_critic_output ( (None, 4)                 132       
Total params: 1,348
Trainable params: 1,348
Non-trainable params: 0
_________________________________________________________________
Model: "simple_actor_critic"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_actor_critic1 (Dense) (None, 32)                160       
_________________________________________________________________
simple_actor_critic2 (De

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [1]:
# build model
import tensorflow as tf
import numpy as np
obs_shape = (4,)

class ActorCritic(tf.keras.Model):
    """Combined actor-critic network."""

    def __init__(self, num_actions, hidden_units):
        """Initialize."""
        super().__init__()

        self.n_actions = num_actions
        self.hidden_sizes = hidden_units
        
        self.actor = self.mlp(output_size=self.n_actions)
        self.critic = self.mlp(output_size=1)
        self.actor.build(input_shape=(None,)+(4,))
        self.critic.build(input_shape=(None,)+(4,))

    # build mlp
    def mlp(self, output_size=1, activation='relu', activation_output=None, kernel_init='glorot_uniform'):
        model = tf.keras.Sequential()
        for h in self.hidden_sizes:
            model.add(tf.keras.layers.Dense(units=h, activation=activation, kernel_initializer=kernel_init, bias_initializer='zeros'))
        model.add(tf.keras.layers.Dense(units=output_size, activation=activation_output))
        return model
    
    def call(self, x):
        return self.actor(x), self.critic(x)

In [2]:
def update(model, vec_obses, actions, advs, returns, logps):
    """
        Update Policy and the Value Network
        -----------------------------------
            Inputs: obs, act, advantages, returns, logp-t
            Returns: loss-pi, loss-entropy, approx-ent, kl, loss-v, loss-total
    """
    nsample = 200
    nbatch_train = nsample // 50
    train_iters = 5
    inds = np.arange(nsample)
    loss_type = 'total_loss'
    target_kl = 0.01
    
    np.random.shuffle(inds)
    
    for i in range(train_iters):
        means = []
        for start in range(0, nsample, nbatch_train):

            end = start + nbatch_train
            slices = inds[start:end]
#             print(slices)
            losses = train_one_step(model, vec_obses[slices],
                                    actions[slices], 
                                    advs[slices], 
                                    logps[slices], 
                                    returns[slices],
                                    loss_type)

            means.append([losses['pi_loss'], 
                          losses['v_loss'], 
                          losses['entropy_loss'], 
                          losses['approx_ent'], 
                          losses['approx_kl']])

        means = np.asarray(means)
        means = np.mean(means, axis= 0)
        
        mean_losses = {'pi_loss': means[0], 
                    'v_loss': means[1],
                    'entropy_loss': means[2], 
                    'approx_ent': means[3], 
                    'approx_kl': means[4]}
        
        if mean_losses['approx_kl'] > 1.5 * target_kl:
            print("Early stopping at step %d due to reaching max kl." %i)
            break
    return mean_losses

def loss(model, vec_obs, logp_old, act, adv, returns):
    
    action_dim = 2
    clip_ratio = 0.2
    ent_coef = 0.1
    v_coef = 0.5
    pi, value = model.call(vec_obs)

    logp_all = tf.nn.log_softmax(pi)
    one_hot = tf.one_hot(act, depth=action_dim)
    logp = tf.reduce_sum(one_hot * logp_all, axis= -1) 
    
    ratio = tf.exp(logp - logp_old)
    min_adv = tf.where(adv > 0, (1 + clip_ratio) * adv, (1 - clip_ratio) * adv)

    clipped_loss = -tf.reduce_mean(tf.math.minimum(ratio * adv, min_adv))
        
    # print(ratio, adv)
    a0 = pi - tf.reduce_max(logits, axis= -1, keepdims=True)
    exp_a0 = tf.exp(a0)
    z0 = tf.reduce_sum(exp_a0, axis= -1, keepdims=True)
    p0 = exp_a0 / z0
    entropy = tf.reduce_sum(p0 * (tf.math.log(z0) - a0), axis= -1) 
    entropy_loss = -tf.reduce_mean(entropy)

    pi_loss = clipped_loss + entropy_loss * ent_coef

    approx_kl = tf.reduce_mean(logp_old - logp)
    approx_ent = tf.reduce_mean(-logp)
    
    v_loss = 0.5 * tf.reduce_mean(tf.square(returns - value))
    # v_loss = tf.reduce_mean(tf.square(returns - values))
    total_loss = pi_loss + v_loss * v_coef

    return {    'pi_loss': pi_loss, 
                'entropy_loss': entropy_loss, 
                'approx_ent': approx_ent, 
                'approx_kl': approx_kl, 
                'v_loss': v_loss, 
                'total_loss': total_loss}

def train_one_step(model, vec_obs, act, adv, logp_old, returns, loss_type):
    
    clip_grads = 0.5
    lr = 0.001
    with tf.GradientTape() as tape:
        losses = loss(model, vec_obs, logp_old, act, adv, returns)

    if loss_type=='pi_loss':
        trainable_variables = model.actor.trainable_variables            # take all trainable variables into account
    elif loss_type=='v_loss':
        trainable_variables = model.critic.trainable_variables
    else:
        trainable_variables = model.trainable_variables

    grads = tape.gradient(losses[loss_type], trainable_variables)
    grads, grad_norm = tf.clip_by_global_norm(grads, clip_grads)               # clip gradients for slight updates
    
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
    optimizer.apply_gradients(zip(grads, trainable_variables))                 # Backprop gradients through network

    return losses

In [3]:
import numpy as np
import tensorflow as tf
import gym

# set params
seed = 10000

tf.random.set_seed(seed)
np.random.seed(seed)

env = gym.make('CartPole-v0')

n_actions = env.action_space.n  # 2
hidden_units = (32,32)

model = ActorCritic(n_actions, hidden_units)
model.actor.summary()
model.critic.summary()

gamma = 0.99
lam = 0.97

for epoch in range(200):
    # rollout
    obs = env.reset()

    num_steps = 200

    vec_obses = []
    rews = []
    dones = []
    actions = []
    values = []
    logps = []
    ep_rews = []
    ep_lens = []

    # print(np.expand_dims(obs, axis=0).shape)
    # print(model.call(np.expand_dims(obs, axis=0)))
    ep_len = 0
    ep_rew = 0
    done_t = False
    for step in range(num_steps):

        if done_t:
            obs = env.reset()
            ep_lens.append(ep_len)
            ep_rews.append(ep_rew)
            ep_len = 0
            ep_rew = 0

        logits, value = model.call(np.expand_dims(obs, axis=0))
        action_t = tf.squeeze(tf.random.categorical(logits, 1), axis=-1)
        logp_all = tf.nn.log_softmax(logits)
        one_hot = tf.one_hot(action_t, depth=n_actions)
        logp_t = tf.reduce_sum( one_hot * logp_all, axis= -1)

        vec_obses.append(obs)
        dones.append(done_t)
        actions.append(action_t)
        values.append(value)
        logps.append(logp_t)

        obs, rew, done_t, infos = env.step(action_t.numpy()[0])

        rews.append(rew)

        ep_len += 1
        ep_rew += rew
        
    print(np.mean(ep_lens))

    # ep_lens.append(ep_len)
    # ep_rews.append(ep_rew)

    """
        End of for loop
        ---------------
        Get last Values for BOOTSTRAPING
    """
    logits, value = model.call(np.expand_dims(obs, axis=0))
    action_t = tf.squeeze(tf.random.categorical(logits, 1), axis=-1)
    last_values = value

    vec_obses = np.array(vec_obses, dtype= np.float32)
    rews = np.array(rews, dtype=np.float32).flatten()
    dones = np.array(dones, dtype= np.bool)
    values = np.array(values, dtype=np.float32).flatten()
    logps = np.array(logps, dtype=np.float32).flatten()
    actions = np.array(actions, dtype=np.int32).flatten()

    """
        Discount / Bootstrap Values and calc Advantages
        -----------------------------------------------
    """
    returns = np.zeros_like(rews)
    advs = np.zeros_like(rews)
    last_gae_lam = 0

    for t in reversed(range(num_steps)):
        if t == num_steps - 1:
            next_non_terminal = 1.0 - done_t
            next_values = last_values
        else:
            next_non_terminal = 1.0 - dones[t + 1]
            next_values = values[t + 1]

        delta = rews[t] + gamma * next_values * next_non_terminal - values[t]
        advs[t] = last_gae_lam = delta + gamma * lam * next_non_terminal * last_gae_lam

    returns = advs + values
    advs = (advs - advs.mean()) / (advs.std())
#     print(returns)
    m_losses = update(model, vec_obses, actions, advs, returns, logps)
    print(m_losses)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 32)                160       
_________________________________________________________________
dense_1 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 66        
Total params: 1,282
Trainable params: 1,282
Non-trainable params: 0
_________________________________________________________________
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 32)                160       
_________________________________________________________________
dense_4 (Dense)              (None, 32)                1056      
________________________________

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


nan
{'pi_loss': -0.07557553, 'v_loss': 89.91515, 'entropy_loss': -0.4578965, 'approx_ent': 0.44915363, 'approx_kl': 0.0085265}
182.0
{'pi_loss': -0.047668207, 'v_loss': 105.62388, 'entropy_loss': -0.47353077, 'approx_ent': 0.44965374, 'approx_kl': 0.003544569}
nan
{'pi_loss': -0.045822755, 'v_loss': 129.1583, 'entropy_loss': -0.4524636, 'approx_ent': 0.43971333, 'approx_kl': -0.012743679}
nan
Early stopping at step 1 due to reaching max kl.
{'pi_loss': -0.03446734, 'v_loss': 138.43916, 'entropy_loss': -0.4701195, 'approx_ent': 0.5128239, 'approx_kl': 0.01547514}
165.0
{'pi_loss': 0.009048533, 'v_loss': 128.13828, 'entropy_loss': -0.36919507, 'approx_ent': 0.4052278, 'approx_kl': -0.020322919}
nan
Early stopping at step 3 due to reaching max kl.
{'pi_loss': -0.02972061, 'v_loss': 139.6433, 'entropy_loss': -0.40777463, 'approx_ent': 0.472226, 'approx_kl': 0.018590486}
119.0
Early stopping at step 1 due to reaching max kl.
{'pi_loss': -0.036350645, 'v_loss': 142.42091, 'entropy_loss': -0.

153.0
Early stopping at step 2 due to reaching max kl.
{'pi_loss': -0.036773633, 'v_loss': 83.35416, 'entropy_loss': -0.4216482, 'approx_ent': 0.42804414, 'approx_kl': 0.018318191}
nan
Early stopping at step 1 due to reaching max kl.
{'pi_loss': 0.009763537, 'v_loss': 101.32596, 'entropy_loss': -0.26492032, 'approx_ent': 0.22885679, 'approx_kl': 0.02873401}
63.0
{'pi_loss': -0.040155396, 'v_loss': 106.19511, 'entropy_loss': -0.3360642, 'approx_ent': 0.35992578, 'approx_kl': 0.009820402}
181.0
Early stopping at step 0 due to reaching max kl.
{'pi_loss': -0.011881041, 'v_loss': 118.299324, 'entropy_loss': -0.24466623, 'approx_ent': 0.29108226, 'approx_kl': 0.015258235}
53.0
{'pi_loss': -0.008105241, 'v_loss': 128.22037, 'entropy_loss': -0.33152694, 'approx_ent': 0.36016247, 'approx_kl': 0.0036915208}
65.5
Early stopping at step 1 due to reaching max kl.
{'pi_loss': -0.053974308, 'v_loss': 184.5058, 'entropy_loss': -0.34035072, 'approx_ent': 0.286903, 'approx_kl': 0.015848735}
83.5
{'pi_l

50.0
{'pi_loss': -0.029353423, 'v_loss': 114.30518, 'entropy_loss': -0.3070734, 'approx_ent': 0.31797042, 'approx_kl': 0.0100741815}
46.0
{'pi_loss': -0.014050753, 'v_loss': 96.41682, 'entropy_loss': -0.3758054, 'approx_ent': 0.3662269, 'approx_kl': 0.007953371}
94.5
Early stopping at step 1 due to reaching max kl.
{'pi_loss': 0.05116224, 'v_loss': 77.416336, 'entropy_loss': -0.22766054, 'approx_ent': 0.28449157, 'approx_kl': 0.025700722}
42.0
Early stopping at step 0 due to reaching max kl.
{'pi_loss': -0.073743746, 'v_loss': 85.996574, 'entropy_loss': -0.37066573, 'approx_ent': 0.32760483, 'approx_kl': 0.045096647}
62.0
Early stopping at step 0 due to reaching max kl.
{'pi_loss': -0.03739102, 'v_loss': 78.065544, 'entropy_loss': -0.30319777, 'approx_ent': 0.30952194, 'approx_kl': 0.020434808}
61.333333333333336
Early stopping at step 1 due to reaching max kl.
{'pi_loss': -0.028337132, 'v_loss': 88.75558, 'entropy_loss': -0.36287895, 'approx_ent': 0.38261548, 'approx_kl': 0.04549793}


135.0
Early stopping at step 0 due to reaching max kl.
{'pi_loss': -0.041040443, 'v_loss': 93.48047, 'entropy_loss': -0.32888716, 'approx_ent': 0.36445618, 'approx_kl': 0.029592719}
58.0
Early stopping at step 0 due to reaching max kl.
{'pi_loss': -0.033125784, 'v_loss': 205.2362, 'entropy_loss': -0.25919732, 'approx_ent': 0.3886633, 'approx_kl': 0.07225399}
150.0
Early stopping at step 4 due to reaching max kl.
{'pi_loss': -0.040965978, 'v_loss': 85.480865, 'entropy_loss': -0.31979764, 'approx_ent': 0.3865658, 'approx_kl': 0.026944647}
67.0
Early stopping at step 0 due to reaching max kl.
{'pi_loss': -0.036504317, 'v_loss': 151.04198, 'entropy_loss': -0.29212087, 'approx_ent': 0.30603063, 'approx_kl': 0.022130586}
51.0
{'pi_loss': -0.016319888, 'v_loss': 93.607414, 'entropy_loss': -0.3489653, 'approx_ent': 0.3818686, 'approx_kl': -0.0060985005}
142.0
Early stopping at step 3 due to reaching max kl.
{'pi_loss': -0.021982225, 'v_loss': 98.77708, 'entropy_loss': -0.30000085, 'approx_ent'

In [4]:
print(m_losses)

{'pi_loss': -0.021982225, 'v_loss': 98.77708, 'entropy_loss': -0.30000085, 'approx_ent': 0.26633313, 'approx_kl': 0.022213459}
