This notebook focuses on Actor-Critic and A2C.

In [1]:
import os
# disable tensorflow logging
# os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import statistics
import numpy as np
import tensorflow as tf
import gym
import tqdm
import collections


2023-03-23 15:54:58.617214: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-23 15:54:58.699087: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-03-23 15:54:58.699103: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-03-23 15:54:59.083418: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

In [2]:
class Env():
    """Wrapper around Gym's ``CartPole-v1`` that enforces its own step budget.

    The wrapper counts steps itself (starting at 1 after ``reset``) and marks
    the episode as done once more than ``max_step`` steps have been taken.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, max_step=1000):
        """Create the wrapped environment.

        Args:
            max_step: number of steps after which ``step`` forces ``done=True``.
        """
        self.env = gym.make("CartPole-v1")
        self.max_step = max_step
        # Step counter; reset to 1 on every ``reset`` and incremented by ``step``.
        self.cur_step = 1
        self.action_space = self.env.action_space

    def reset(self):
        """Reset the wrapped env and the step counter.

        Returns:
            ``(initial_state, info)`` as produced by the wrapped env's ``reset``.
        """
        self.cur_step = 1
        initial_state, info = self.env.reset()
        initial_state = self.add_step_into_state(initial_state)
        return initial_state, info

    def add_step_into_state(self, state):
        """Hook for augmenting the observation; currently returns it unchanged."""
        # Disabled experiment: append the normalised step index to the state.
        # state = np.concatenate([state, np.array([self.cur_step/self.max_step])])
        return state

    def step(self, action):
        """Advance the wrapped env by one step.

        Returns:
            ``(state, reward, done, truncated, info)``. ``done`` is forced to
            ``True`` once the wrapper's own step budget is exceeded.
        """
        self.cur_step += 1
        # Keep the last two values under distinct names: unpacking both into
        # ``_`` made the method return ``info`` twice and lose ``truncated``.
        state, reward, done, truncated, info = self.env.step(action)
        if self.cur_step > self.max_step:
            reward = 1.0
            done = True
            self.cur_step = self.max_step + 1
        elif done:
            reward = 1.0
        state = self.add_step_into_state(state)
        return state, reward, done, truncated, info

In [3]:
# --- Environment ------------------------------------------------------------
env = gym.make("CartPole-v1")
# env = Env()
initial_state, _ = env.reset()
initial_state_shape = initial_state.shape
action_space = env.action_space.n  # number of discrete actions

# --- Hyper-parameters -------------------------------------------------------
# Note: Q learning training is much harder than policy gradient.
eps = np.finfo(np.float32).eps.item()  # float32 machine epsilon, guards divisions
step_length = 50
use_dueling = False
gamma = 0.99
# The dueling network is trained with a smaller learning rate.
lr = 0.0003 if use_dueling else 0.001

# define model

In [4]:
"""
The model will use basic Q-learning
"""

def get_model():
    """Build the plain Q-network.

    The network takes an input of shape ``initial_state_shape``, applies two
    ReLU dense layers (64 then 128 units) and ends in a linear dense layer
    with ``action_space`` outputs.

    Returns:
        A ``tf.keras.Model`` mapping states to one output per action.
    """
    state_in = tf.keras.layers.Input(shape=initial_state_shape)
    features = state_in
    for units in (64, 128):
        features = tf.keras.layers.Dense(units, activation="relu")(features)
    q_values = tf.keras.layers.Dense(action_space, activation=None)(features)
    return tf.keras.Model(state_in, q_values)

def get_dueling_model():
    """Build the dueling variant of the Q-network.

    A shared trunk (dense 64 -> dense 128, ReLU) feeds two linear heads: an
    advantage head ``A`` with ``action_space`` outputs and a single-output
    head ``S``. The model output combines them as::

        Q = A + S - mean(A)

    NOTE(review): the mean is reduced over axis 1 without ``keepdims``, so the
    combination relies on the ``Add`` layer to align ranks — confirm.

    Returns:
        A ``tf.keras.Model`` mapping states to the combined output.
    """
    state_in = tf.keras.layers.Input(shape=initial_state_shape)
    trunk = tf.keras.layers.Dense(64, activation="relu")(state_in)
    trunk = tf.keras.layers.Dense(128, activation="relu")(trunk)

    advantage = tf.keras.layers.Dense(action_space, activation=None)(trunk)
    state_value = tf.keras.layers.Dense(1, activation=None)(trunk)
    advantage_mean = tf.math.reduce_mean(advantage, axis=1, name="mean")

    combine = tf.keras.layers.Add(name="outs")
    q_values = combine([advantage, state_value, -advantage_mean])
    return tf.keras.Model(state_in, q_values)
    
# Instantiate the architecture chosen by ``use_dueling`` and print its layers.
model = get_dueling_model() if use_dueling else get_model()
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 4)]               0         
                                                                 
 dense (Dense)               (None, 64)                320       
                                                                 
 dense_1 (Dense)             (None, 128)               8320      
                                                                 
 dense_2 (Dense)             (None, 2)                 258       
                                                                 
Total params: 8,898
Trainable params: 8,898
Non-trainable params: 0
_________________________________________________________________


2023-03-23 15:54:59.657521: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-23 15:54:59.657717: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-03-23 15:54:59.657753: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2023-03-23 15:54:59.657779: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2023-03-23 15:54:59.657804: W tensorflow/c

# define data collection

In [5]:
def _run_step_numpy(action):
    """Step the module-level ``env`` once and return fixed-dtype NumPy values.

    Args:
        action: action passed straight to ``env.step``.

    Returns:
        ``(state, reward, done)`` as float32 array, float32 scalar array and
        int32 scalar array. ``done`` is 1 when the episode ended for any
        reason (terminated or truncated), else 0.
    """
    state, reward, terminated, truncated, _ = env.step(action)
    # The truncation flag used to be discarded, so a time-limited episode was
    # never reported as finished and the env could be stepped past its end.
    episode_over = bool(terminated) or bool(truncated)
    return (
        state.astype(np.float32),
        np.array(reward, dtype=np.float32),
        np.array(episode_over, dtype=np.int32),
    )

def run_step_tf(action):
    """Call ``_run_step_numpy`` through ``tf.numpy_function``.

    Args:
        action: tensor holding the action to execute.

    Returns:
        Three tensors ``(state, reward, done)`` with dtypes float32, float32
        and int32 respectively.
    """
    output_dtypes = (tf.float32, tf.float32, tf.int32)
    return tf.numpy_function(_run_step_numpy, [action], output_dtypes)

def run_step(start_state, model, step_length):
    """Roll out the greedy policy of ``model`` for at most ``step_length`` steps.

    At every step the action with the largest model output is executed through
    ``run_step_tf``; the loop stops early as soon as the step reports done.

    Args:
        start_state: observation the rollout starts from.
        model: callable mapping a batch of states to per-action values.
        step_length: maximum number of environment steps to take.

    Returns:
        ``(values, actions, rewards, next_value, state, done)`` where
        ``values``/``actions``/``rewards`` are stacked per-step tensors,
        ``next_value`` is always the constant 0.0, and ``state``/``done`` are
        the observation and done flag after the last executed step.
    """
    values = tf.TensorArray(tf.float32, size=0, dynamic_size=True)
    actions = tf.TensorArray(tf.int32, size=0, dynamic_size=True)
    rewards = tf.TensorArray(tf.float32, size=0, dynamic_size=True)
    # ``done`` is defined up front so it exists with a known scalar shape
    # before the loop (presumably required when traced by tf.function).
    done = tf.constant(True, dtype=tf.bool)
    done_shape = done.shape

    state = start_state
    for t in tf.range(step_length):
        q_output = model(tf.expand_dims(state, 0))
        q_output = tf.squeeze(q_output)
        action = tf.math.argmax(q_output, output_type=tf.int32)

        value = q_output[action]
        state, reward, done = run_step_tf(action)
        # numpy_function outputs carry no static shape; restore it.
        state.set_shape(initial_state_shape)

        actions = actions.write(t, action)
        values = values.write(t, tf.squeeze(value))
        rewards = rewards.write(t, reward)
        done = tf.cast(done, dtype=tf.bool)
        done.set_shape(done_shape)
        if done:
            break
    next_value = tf.constant(0.0, dtype=tf.float32)
    if t == step_length - 1:
        # The rollout used its whole step budget: replace the final reward by
        # 1 / (1 - gamma), the discounted sum of a reward of 1 received forever
        # (``gamma`` is the module-level discount factor).
        vcomplete = 1 / (1 - gamma)
        vcomplete = tf.constant(vcomplete, dtype=tf.float32)
        # Write at the last step ``t``. The previous ``t-1`` overwrote the
        # penultimate reward and produced index -1 when ``step_length == 1``.
        rewards = rewards.write(t, vcomplete)

    actions = actions.stack()
    values = values.stack()
    rewards = rewards.stack()
    return values, actions, rewards, next_value, state, done

In [6]:
# Smoke-test the rollout eagerly on a fresh episode (at most 100 steps).
state_, _ = env.reset()
result = run_step(state_, model, 100)
# The fourth element returned by run_step is ``next_value`` (not a state).
values, actions, rewards, next_value, state, done = result

  if not isinstance(terminated, (bool, np.bool8)):


# define returns

In [7]:
def get_returns(rewards_array, gamma=0.99):
    """Compute standardized discounted returns for one rollout.

    Args:
        rewards_array: per-step rewards of the rollout; flattened to 1-D.
        gamma: discount factor.

    Returns:
        discount_array: 1-D tensor whose entry ``i`` is ``gamma ** (T - i)``
            for ``T`` rewards, i.e. the discount from step ``i`` to one step
            past the end of the rollout.
        returns: 1-D tensor whose entry ``i`` is the discounted sum of rewards
            from step ``i`` to the end, standardized to zero mean and unit
            standard deviation (the module-level ``eps`` guards the division).
    """
    # Flatten instead of squeezing: squeeze turns a one-step rollout into a
    # scalar, which can then neither be indexed nor measured along axis 0.
    rewards_array = tf.reshape(rewards_array, [-1])
    gamma = tf.constant(gamma, tf.float32)
    discounted_return = tf.constant(0.0, tf.float32)
    dshape = discounted_return.shape
    returns = tf.TensorArray(tf.float32, size=0, dynamic_size=True)
    # Accumulate from the last step backwards: G_i = r_i + gamma * G_{i+1}.
    for idx in tf.range(tf.shape(rewards_array)[0] - 1, -1, -1):
        discounted_return = gamma * discounted_return + rewards_array[idx]
        discounted_return.set_shape(dshape)
        returns = returns.write(idx, discounted_return)
    discount_array = tf.ones_like(rewards_array, dtype=tf.float32) * gamma
    discount_array = tf.math.cumprod(discount_array, reverse=True)
    returns = returns.stack()

    returns = (returns - tf.math.reduce_mean(returns)) / (eps + tf.math.reduce_std(returns))

    return discount_array, returns
 

In [8]:
# Inspect the discounts and standardized returns of the smoke-test rollout.
rewards = result[2]
get_returns(rewards_array=rewards)

(<tf.Tensor: shape=(23,), dtype=float32, numpy=
 array([0.79361445, 0.80163074, 0.809728  , 0.8179071 , 0.8261688 ,
        0.8345139 , 0.8429433 , 0.8514579 , 0.8600585 , 0.8687459 ,
        0.8775211 , 0.88638496, 0.89533836, 0.90438217, 0.91351736,
        0.9227448 , 0.9320654 , 0.9414802 , 0.95099014, 0.9605961 ,
        0.97029907, 0.98010004, 0.99      ], dtype=float32)>,
 <tf.Tensor: shape=(23,), dtype=float32, numpy=
 array([ 1.6004506 ,  1.4650329 ,  1.3282472 ,  1.1900798 ,  1.050517  ,
         0.90954405,  0.7671474 ,  0.6233125 ,  0.47802448,  0.33126906,
         0.18303116,  0.03329597, -0.11795165, -0.27072698, -0.42504552,
        -0.58092284, -0.7383747 , -0.897417  , -1.0580659 , -1.2203374 ,
        -1.3842481 , -1.5498143 , -1.7170529 ], dtype=float32)>)

# Loss

In [9]:
# Squared error summed (not averaged) over all steps of a rollout.
loss_func = tf.keras.losses.MeanSquaredError(reduction=tf.keras.losses.Reduction.SUM)


def calculate_loss(returns, values, value_next, discount_array):
    """Summed squared error between predicted values and return targets.

    Args:
        returns: per-step regression targets.
        values: per-step values predicted by the model.
        value_next: unused; kept for interface compatibility.
        discount_array: unused; kept for interface compatibility.

    Returns:
        Scalar loss tensor.

    Background notes (not implemented by this function):
        Policy part:
            V(S_t) = E[Q] = sum_a pi(S_t, a; theta) * Q(S_t, a).
            With G the gradient w.r.t. theta and G(log pi) = G(pi) / pi,
            G(V(S_t)) ~= E[G(log pi * Q)], approximated by Monte Carlo as
            G(log pi * Q), or G(log pi * (Q - baseline)) with baseline V (A2C).
            Q_t can be approximated by
            Y_t = r + gamma * r + gamma^2 * r + ... + gamma^T * Q(T).
        Critic part (TD learning):
            Q_t = discounted observed rewards + Q_T.
    """
    # Reshape to a column directly: squeeze followed by expand_dims(axis=1)
    # fails on a one-step rollout because squeeze yields a scalar.
    predictions = tf.reshape(values, [-1, 1])
    targets = tf.reshape(returns, [-1, 1])

    # Keras losses take (y_true, y_pred); the value is symmetric for MSE.
    return loss_func(targets, predictions)

In [10]:
# Evaluate the loss once on the smoke-test rollout.
values, actions, rewards, next_value, state, done = result
discount_array, returns_array = get_returns(rewards_array=rewards, gamma=gamma)
calculate_loss(
    returns=returns_array,
    values=values,
    value_next=next_value,
    discount_array=discount_array,
)

<tf.Tensor: shape=(), dtype=float32, numpy=20.00364>

# train step

In [11]:
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)


@tf.function
def run_train_step(model, optimizer, start_state, step_length):
    """Collect one rollout and apply a single gradient update to ``model``.

    Args:
        model: network used to act and to be updated.
        optimizer: optimizer applying the gradients.
        start_state: observation the rollout starts from.
        step_length: maximum number of environment steps in the rollout.

    Returns:
        ``(rollout, loss)`` where ``rollout`` is the tuple returned by
        ``run_step`` and ``loss`` is the scalar loss of this update.
    """
    with tf.GradientTape() as tape:
        # The rollout runs inside the tape so the predicted values are recorded.
        rollout = run_step(start_state, model, step_length)
        predicted, _, step_rewards, bootstrap, _, _ = rollout
        discounts, targets = get_returns(step_rewards, gamma)
        loss = calculate_loss(targets, predicted, bootstrap, discounts)

    weights = model.trainable_variables
    grads = tape.gradient(loss, weights)
    optimizer.apply_gradients(zip(grads, weights))
    return rollout, loss

In [12]:
# Run a single training update on a fresh episode as a sanity check.
_state, _ = env.reset()
run_train_step(model, optimizer, start_state=_state, step_length=50)

((<tf.Tensor: shape=(12,), dtype=float32, numpy=
  array([ 0.00847541, -0.02667668,  0.00580538, -0.02851547,  0.00177115,
         -0.0305927 , -0.06151223, -0.09285256, -0.12473955, -0.15730904,
         -0.19074258, -0.22514847], dtype=float32)>,
  <tf.Tensor: shape=(12,), dtype=int32, numpy=array([1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)>,
  <tf.Tensor: shape=(12,), dtype=float32, numpy=array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], dtype=float32)>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.0>,
  <tf.Tensor: shape=(4,), dtype=float32, numpy=array([ 0.07948017,  1.5441382 , -0.21186265, -2.5351312 ], dtype=float32)>,
  <tf.Tensor: shape=(), dtype=bool, numpy=True>),
 <tf.Tensor: shape=(), dtype=float32, numpy=10.400293>)

# RUN

In [13]:
# --- Training configuration ---------------------------------------------------
min_epoch = 100            # never stop before this many epochs
max_epoch = 10000          # hard cap on the number of epochs
step_length = 500          # environment steps per training segment
thred = 475                # running-average reward that counts as solved
running_rewards = collections.deque(maxlen=100)  # last 100 epoch rewards
max_steps_per_epoch = 500
all_rewards = []
all_running_rewards = []

t = tqdm.trange(max_epoch)
for i in t:
    start_state, _ = env.reset()
    cur_step = 0
    epoch_reward = 0
    while cur_step < max_steps_per_epoch:
        STEP_RES, loss = run_train_step(model, optimizer, start_state, step_length)
        cur_step += step_length
        values, actions, rewards, next_value, state, done = STEP_RES
        epoch_reward += int(tf.reduce_sum(rewards))
        if done:
            break
        # Continue the next segment from where this one stopped. Without this
        # every segment was started from the episode's initial observation,
        # which no longer matches the environment's actual state.
        start_state = state.numpy()
    running_rewards.append(epoch_reward)
    avg_reward = statistics.mean(running_rewards)
    all_rewards.append(epoch_reward)
    all_running_rewards.append(avg_reward)
    t.set_postfix(running_rewards=avg_reward, current_reward=epoch_reward, loss=float(loss))
    # Stop once the running average clears the threshold (after a warm-up).
    if avg_reward > thred and i > min_epoch:
        break
    
    


  3%|▎         | 342/10000 [00:14<06:47, 23.72it/s, current_reward=38, loss=28.8, running_rewards=68.6] 


KeyboardInterrupt: 

In [None]:
from matplotlib import pyplot as plt

In [None]:
# Roll out the trained model once (up to 1000 steps) for inspection.
_state, _ = env.reset()
result = run_step(start_state=_state, model=model, step_length=1000)
values, actions, rewards, next_value, state, done = result

In [None]:
# Predicted value of the chosen action at each step of the evaluation rollout.
plt.plot(range(values.shape[0]), values)

In [None]:
# Render an episode and save as a GIF file

from IPython import display as ipythondisplay
from PIL import Image

render_env = gym.make("CartPole-v1", render_mode='rgb_array')

def render_episode(env: gym.Env, model: tf.keras.Model, max_steps: int):
  """Play one greedy episode and capture frames for an animation.

  Args:
    env: environment whose ``render()`` returns an image array.
    model: network mapping a batch of states to per-action values.
    max_steps: maximum number of environment steps to play.

  Returns:
    ``(images, values)``: the list of captured ``PIL.Image`` frames (the
    initial screen plus one frame every 10 steps) and the list of model
    outputs for the action chosen at each step.
  """
  state, info = env.reset()
  state = tf.constant(state, dtype=tf.float32)
  screen = env.render()
  images = [Image.fromarray(screen)]
  values = []

  for i in range(1, max_steps + 1):
    state = tf.expand_dims(state, 0)
    value_output = np.squeeze(model(state))
    action = int(np.argmax(value_output))
    values.append(value_output[action])

    state, reward, done, truncated, info = env.step(action)
    state = tf.constant(state, dtype=tf.float32)

    # Render screen every 10 steps
    if i % 10 == 0:
      screen = env.render()
      images.append(Image.fromarray(screen))

    # Stop on truncation as well as termination so the environment is never
    # stepped after the episode has ended.
    if done or truncated:
      break

  return images, values


# Play one rendered episode and write the captured frames to an animated GIF.
images, values = render_episode(render_env, model, max_steps_per_epoch)
image_file = 'cartpole-v1.gif'
images[0].save(
    image_file,
    save_all=True,
    append_images=images[1:],
    loop=0,      # loop forever
    duration=1,  # play each frame for 1ms
)


In [None]:
# Display the saved GIF inline in the notebook.
from tensorflow_docs.vis import embed
embed.embed_file(image_file)

In [None]:
# Convert the per-step values from the rendered episode to one array and plot.
values = tf.stack(values).numpy()
plt.plot(range(len(values)), values)