This notebook focus on Actor-Critc and A2C

In [1]:
import os
# disable tensorflow logging
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import statistics
import numpy as np
import tensorflow as tf
import gym
import tqdm
import collections


In [2]:
env = gym.make("CartPole-v1")
initial_state, _ = env.reset()
initial_state_shape = initial_state.shape
action_space = env.action_space.n
eps = np.finfo(np.float32).eps.item()
gamma = 1.0
lr = 0.01
step_length = 50


# define model

In [3]:
"""
The model will use basic Actor-Critc (A2C with baseline)
"""

def get_model():
    inputs = tf.keras.layers.Input(shape=initial_state_shape)
    hidden = tf.keras.layers.Dense(128, activation="relu")(inputs)
    actions = tf.keras.layers.Dense(action_space, activation=tf.keras.activations.softmax)(hidden)
    value = tf.keras.layers.Dense(1, activation=None)(hidden)
    outs = [actions, value]
    return tf.keras.Model(inputs, outs)

model = get_model()
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 4)]          0           []                               
                                                                                                  
 dense (Dense)                  (None, 128)          640         ['input_1[0][0]']                
                                                                                                  
 dense_1 (Dense)                (None, 2)            258         ['dense[0][0]']                  
                                                                                                  
 dense_2 (Dense)                (None, 1)            129         ['dense[0][0]']                  
                                                                                              

# define data collection

In [4]:
def _next_step(action):
    state, reward, done, _, _ = env.step(action)
    return (state.astype(np.float32), np.array(reward, np.float32), np.array(done, np.int32))

def tf_next_step(action):
    return tf.numpy_function(_next_step, [action], (tf.float32, tf.float32, tf.int32))


def run_step(start_state, model, step_length):
    rewards = tf.constant(0.0, tf.float32)
    values = tf.TensorArray(tf.float32, size=0, dynamic_size=True)
    actions = tf.TensorArray(tf.int32, size=0, dynamic_size=True)
    action_probs = tf.TensorArray(tf.float32, size=0, dynamic_size=True)
    state = start_state
    for t in tf.range(step_length):
        state = tf.expand_dims(state, 0)
        action_output, value = model(state)
        action = tf.random.categorical(tf.math.log(action_output), 1, dtype=tf.int32) 
        action = tf.squeeze(action)
        ## option to try math
#         action = tf.math.argmax(tf.squeeze(action_output), output_type=tf.int32)
        action_prob = action_output[0, action]
    
        
        state, reward, done = tf_next_step(action)
        state.set_shape(initial_state_shape)
        
        rewards += reward     
        values = values.write(t, tf.squeeze(value))
        actions = actions.write(t, action)
        action_probs = action_probs.write(t, tf.squeeze(action_prob))
        done = tf.cast(done, tf.bool)
        if done:
            break
            
    _, next_value = model(tf.expand_dims(state, 0))
    next_value = tf.squeeze(next_value)
    values = values.stack()
    actions = actions.stack()
    action_probs = action_probs.stack()
    return values, actions, action_probs, rewards, next_value, state, done


        
        
    
    

In [5]:
_state, _ = env.reset()
result = run_step(_state, model, 50)
result

  if not isinstance(terminated, (bool, np.bool8)):


(<tf.Tensor: shape=(26,), dtype=float32, numpy=
 array([-0.00321881, -0.0064002 , -0.00191115, -0.00574407, -0.01122334,
        -0.01608   , -0.00854622, -0.00299684,  0.00040344, -0.05456816,
         0.00120498, -0.04965396, -0.11863778, -0.18882552, -0.2588526 ,
        -0.329962  , -0.2636674 , -0.19884075, -0.13876355, -0.21346955,
        -0.1568311 , -0.23310043, -0.31304407, -0.25908795, -0.20702438,
        -0.15216939], dtype=float32)>,
 <tf.Tensor: shape=(26,), dtype=int32, numpy=
 array([0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1,
        0, 0, 0, 1], dtype=int32)>,
 <tf.Tensor: shape=(26,), dtype=float32, numpy=
 array([0.50302094, 0.48568046, 0.50213146, 0.51379186, 0.5261309 ,
        0.4612352 , 0.47313854, 0.48483455, 0.49768218, 0.48926473,
        0.49598694, 0.5098852 , 0.51921064, 0.5274372 , 0.53492934,
        0.45787916, 0.46908158, 0.47969836, 0.5104296 , 0.4811098 ,
        0.5090946 , 0.5174549 , 0.47426614, 0.4850101 , 0.49475515,
    

# define returns

In [6]:
def get_returns(total_rewards):
    """
    Input: total_rewards is a value
    Output: 
        discount_array: array of discount rate.
            index i to the (end timestamp + 1) discount rate
        returns: array of discounted returns
            index i means the returns between index i to the index(end timestamp + 1)
    """
    total_rewards = tf.cast(total_rewards, tf.int32)
    rewards_array = tf.ones(shape=(total_rewards,), dtype=tf.float32)
    discount_array = rewards_array * gamma
    discount_array = tf.math.cumprod(discount_array , reverse=True) 
    returns_array = tf.math.cumsum(discount_array / gamma, reverse=True)
    
    return discount_array, returns_array

rewards = result[3]
get_returns(rewards)

(<tf.Tensor: shape=(26,), dtype=float32, numpy=
 array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1.], dtype=float32)>,
 <tf.Tensor: shape=(26,), dtype=float32, numpy=
 array([26., 25., 24., 23., 22., 21., 20., 19., 18., 17., 16., 15., 14.,
        13., 12., 11., 10.,  9.,  8.,  7.,  6.,  5.,  4.,  3.,  2.,  1.],
       dtype=float32)>)

# Loss

In [7]:
loss_func = tf.keras.losses.MeanSquaredError(reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE)
# huber can make sure the actor and critic loss is at the same magnitude



def get_loss_discount_rate(values):
    """
    Used for TD learning. As the time step increase the loss will be increased. This method will try to reduce the increasing loss
    caused in incrasing time step.
    
    t1,t2,t3,...tn -> [1/n, 1/n-1, 1/n-2, ... 1]
    
    """
    discount = tf.ones_like(values, dtype=tf.float32)
    discount = 1.0 / tf.math.cumsum(discount, reverse=True)
    discount = tf.math.pow(discount, 1.0)
    return discount




def calculate_loss(returns, values, action_probs, value_next, discount_array):
    """
    Policy part ---------
    V(St) = E(Q) = Pi(St, At1; theta) * Q(St, At1) + ...
    G denote gradient W.R.T theta
    
    
    G(V(St)) = G[Pi(St, At1; theta) * Q(St, At1) + ...]
                approximate= Pi(St, A1)* G(logpi(St, At1; theta) * Q) + ...  # chain rule G(logpi) = 1/pi * G(pi)
                = E[ G(logpi * Q) ] # Pi(St, A) is the PDF, so this is the expectation
              [1]  approximate= G(logpi * Q)  # monte carlo approximation
              [2]  = G(logpi * (Q - baseline))  where baseline can be V. This is the A2C
                  Qt can be approximate by Yt
              Yt = gamma^T * Q(T) + r + gamma*r + gamma^2*r + ...
              
    Critic Part TD learning -----------
    Qt = discounted_ovserved + QT
            
    """
    loss_dis_rate = get_loss_discount_rate(values)
    
    action_probs = tf.squeeze(action_probs)
    logpi = tf.math.log( tf.clip_by_value(action_probs, eps, 1.0))
    values = tf.squeeze(values)
    Yt = returns + discount_array * value_next

    
    # this is the negative gradient instead of loss
    loss_actor = - logpi * (Yt - values)
#     loss_actor = - logpi * Yt
    loss_actor = tf.reduce_mean(loss_actor)
    # critic
    loss_critic = loss_func(tf.expand_dims(Yt*loss_dis_rate, 1), tf.expand_dims(values*loss_dis_rate,1))
#     loss_critic = loss_func(tf.expand_dims(Yt, 1), tf.expand_dims(values,1))
    loss = loss_actor + loss_critic
    return loss, loss_actor, loss_critic
    

In [8]:
values, actions, action_probs, rewards, next_value, state, done = result
discount_array, returns_array = get_returns(rewards)
calculate_loss(returns_array, values, action_probs, next_value, discount_array)

(<tf.Tensor: shape=(), dtype=float32, numpy=10.357225>,
 <tf.Tensor: shape=(), dtype=float32, numpy=9.37497>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.9822549>)

# train step

In [9]:
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

def run_train_step(model, optimizer, start_state, step_length):
    with tf.GradientTape(persistent=True) as tape:
        STEP_RES = run_step(start_state, model, step_length)
        values, actions, action_probs, rewards, next_value, state, done = STEP_RES
        discount_array, returns_array = get_returns(rewards)
        loss, loss_actor, loss_critic = calculate_loss(returns_array, values, action_probs, next_value, discount_array)
    gradient = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradient, model.trainable_variables))
    return STEP_RES, loss, loss_actor, loss_critic

In [10]:
_state, _ = env.reset()
run_train_step(model, optimizer, _state, 50)

((<tf.Tensor: shape=(21,), dtype=float32, numpy=
  array([ 0.00166062, -0.06392872, -0.00058175,  0.00368372, -0.00185627,
         -0.06943968, -0.14092867, -0.21284667, -0.14633742, -0.08269417,
         -0.01806045, -0.09361966, -0.16815835, -0.10802558, -0.04622431,
         -0.12375565, -0.20051135, -0.27872792, -0.2243101 , -0.30496684,
         -0.38762915], dtype=float32)>,
  <tf.Tensor: shape=(21,), dtype=int32, numpy=
  array([1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0],
        dtype=int32)>,
  <tf.Tensor: shape=(21,), dtype=float32, numpy=
  array([0.49703497, 0.48947832, 0.50239354, 0.48430508, 0.49884677,
         0.5110986 , 0.5201013 , 0.47168228, 0.4822488 , 0.4925978 ,
         0.4968004 , 0.5075302 , 0.48327962, 0.49349865, 0.4953855 ,
         0.50627714, 0.51570725, 0.47583163, 0.51381725, 0.5223433 ,
         0.46964854], dtype=float32)>,
  <tf.Tensor: shape=(), dtype=float32, numpy=21.0>,
  <tf.Tensor: shape=(), dtype=float32, numpy=-0.34011278

# RUN

In [11]:
min_epoch = 100
max_epoch = 10000
step_length = 1000
thred = 475
max_steps_per_epoch = 600
running_rewards = collections.deque(maxlen=min_epoch)
all_rewards = []
all_running_rewards = []
t = tqdm.trange(max_epoch)
for i in t:
    start_state, _ = env.reset()
    cur_step = 0
    epoch_reward = 0
    while cur_step < max_steps_per_epoch:
        STEP_RES, loss, loss_actor, loss_critic = run_train_step(model, optimizer, start_state, step_length)
        cur_step += step_length
        values, actions, action_probs, rewards, next_value, state, done = STEP_RES
        epoch_reward += int(tf.reduce_sum(rewards))
        if done:
            break
        
    running_rewards.append(epoch_reward)
    avg_reward = statistics.mean(running_rewards)
    all_rewards.append(epoch_reward)
    all_running_rewards.append(avg_reward)
    t.set_postfix(running_rewards=avg_reward, current_reward=epoch_reward, loss=float(loss))
    if avg_reward > thred and i > min_epoch:
        break
    
    


  1%|          | 118/10000 [01:55<2:41:22,  1.02it/s, current_reward=1000, loss=236, running_rewards=480]
