In [1]:
import numpy as np
import tensorflow as tf
import gym
import tqdm
import collections, statistics

2023-03-17 18:02:01.389767: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-17 18:02:01.470855: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-03-17 18:02:01.470872: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-03-17 18:02:01.848540: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

In [2]:
# Fixed seed so weight initialisation, action sampling and the environment's
# start states are repeatable under "Restart Kernel -> Run All".
SEED = 42
tf.random.set_seed(SEED)

env = gym.make("CartPole-v1")
# Smallest float32 increment; used later to guard a division and a log against zero.
eps = np.finfo(np.float32).eps.item()
n_hidden_unit = 128  # width of the shared hidden layer
n_action_space = env.action_space.n
# Seeding this first reset seeds the environment's own RNG; later unseeded
# reset() calls keep drawing from that same generator.
initial_state, _ = env.reset(seed=SEED)
initial_state_shape = initial_state.shape
gamma = 0.99  # discount factor applied when turning rewards into returns
print(f"initial state shape: {initial_state_shape}")

initial state shape: (4,)


# Define the model

In [3]:
def get_model():
    """
    Build the actor-critic network: one shared hidden layer feeding two heads.

    Input:  a state of shape `initial_state_shape`.
    Output: a list of two tensors
        (1) action logits, `n_action_space` units, no activation
        (2) state value, 1 unit, no activation
    """
    state_input = tf.keras.layers.Input(shape=initial_state_shape)

    # Create the layers first, then wire them up below.
    shared_layer = tf.keras.layers.Dense(n_hidden_unit, activation="relu", name="common")
    actor_head = tf.keras.layers.Dense(n_action_space, activation=None, name="output_action_logits")
    critic_head = tf.keras.layers.Dense(1, activation=None, name="output_value")

    hidden = shared_layer(state_input)
    action_logits = actor_head(hidden)
    value = critic_head(hidden)
    return tf.keras.Model(state_input, [action_logits, value])
# Build the network once at notebook level; the training cells update this instance in place.
model = get_model()
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 4)]          0           []                               
                                                                                                  
 common (Dense)                 (None, 128)          640         ['input_1[0][0]']                
                                                                                                  
 output_action_logits (Dense)   (None, 2)            258         ['common[0][0]']                 
                                                                                                  
 output_value (Dense)           (None, 1)            129         ['common[0][0]']                 
                                                                                              

2023-03-17 18:02:02.389512: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-17 18:02:02.389673: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-03-17 18:02:02.389706: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2023-03-17 18:02:02.389732: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2023-03-17 18:02:02.389756: W tensorflow/c

# Design the training-data collection (very important)

In [4]:
def env_step(action: int):
    """
    Advance the global `env` by one step and return plain NumPy values.

    `env.step` cannot be called inside a TF graph, so this function is meant to
    be wrapped with `tf.numpy_function` (see `tf_env_step`).

    Args:
        action: index of the action to apply.

    Returns:
        (state, reward, done) where
        state  -- next observation cast to float32,
        reward -- float32 scalar array,
        done   -- int32 scalar array, 1 when the episode is over for any
                  reason (terminated OR truncated), else 0.
    """
    state, reward, terminated, truncated, _ = env.step(action)
    # A truncated episode (e.g. a time-limit wrapper cut it off) must also end
    # the rollout; otherwise the caller keeps stepping a finished environment.
    done = terminated or truncated
    return (
        state.astype(np.float32),
        np.array(reward, dtype=np.float32),
        np.array(done, dtype=np.int32),
    )

def tf_env_step(action):
    """
    Graph-compatible wrapper around `env_step`.

    Returns three tensors (state, reward, done) with dtypes
    float32, float32 and int32 respectively.
    """
    output_dtypes = [tf.float32, tf.float32, tf.int32]
    return tf.numpy_function(func=env_step, inp=[action], Tout=output_dtypes)

def run_epoch(initial_state, max_steps, model):
    """
    Play one episode with `model`, sampling actions from its logits.

    The rollout stops after `max_steps` steps or as soon as the environment
    reports `done`, whichever comes first.

    Returns:
        Three rank-1 tensors with one entry per executed step:
        probability of the action that was taken, estimated state value,
        and the reward received.
    """
    chosen_probs_ta = tf.TensorArray(tf.float32, size=0, dynamic_size=True)
    values_ta = tf.TensorArray(tf.float32, size=0, dynamic_size=True)
    rewards_ta = tf.TensorArray(tf.float32, size=0, dynamic_size=True)

    state = initial_state
    # Iterating over tf.range keeps the loop usable inside a tf graph.
    for step in tf.range(max_steps):
        state = tf.expand_dims(state, 0)  # add the batch dimension
        logits, value = model(state)

        # Sample one action; [0, 0] picks batch entry 0 and the single sample.
        action = tf.random.categorical(logits, num_samples=1)[0, 0]
        probs = tf.keras.activations.softmax(logits)

        state, reward, done = tf_env_step(action)
        # numpy_function outputs carry no static shape; restore it for the next iteration.
        state.set_shape(initial_state_shape)

        chosen_probs_ta = chosen_probs_ta.write(step, tf.squeeze(probs)[action])
        values_ta = values_ta.write(step, tf.squeeze(value))
        rewards_ta = rewards_ta.write(step, tf.squeeze(reward))

        if tf.cast(done, tf.bool):
            break
    return chosen_probs_ta.stack(), values_ta.stack(), rewards_ta.stack()
    
    

## test the output

In [5]:
# Smoke test: roll out one episode (at most 500 steps) with the untrained model,
# called directly rather than through a tf.function.
initial_state, _ = env.reset()
A = run_epoch(initial_state, 500, model)
# Tuple of (taken-action probabilities, value estimates, rewards), one entry per step.
A

  if not isinstance(terminated, (bool, np.bool8)):


(<tf.Tensor: shape=(18,), dtype=float32, numpy=
 array([0.4990992 , 0.50119716, 0.49918973, 0.5014183 , 0.4993501 ,
        0.49843   , 0.5020509 , 0.4981027 , 0.49774188, 0.50262576,
        0.49747005, 0.5027613 , 0.4974178 , 0.50269586, 0.4978886 ,
        0.49761343, 0.49751216, 0.49734417], dtype=float32)>,
 <tf.Tensor: shape=(18,), dtype=float32, numpy=
 array([-0.01557202, -0.06963612, -0.01391116, -0.06862514, -0.01309672,
        -0.06810703, -0.12750013, -0.06743591, -0.1277196 , -0.18818456,
        -0.12875366, -0.19022927, -0.13185446, -0.19431609, -0.13765489,
        -0.2005909 , -0.26414856, -0.32797354], dtype=float32)>,
 <tf.Tensor: shape=(18,), dtype=float32, numpy=
 array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1.], dtype=float32)>)

# Calculate returns from rewards

In [6]:
# Called from inside a tf.function, so it only uses graph-friendly constructs.
def calculate_returns(rewards, normalize=tf.constant(True)):
    """
    Turn per-step rewards into discounted returns (discount factor: global `gamma`).

    Args:
        rewards: rank-1 tensor of rewards, one per step.
        normalize: when true, shift/scale the returns to zero mean and unit
            standard deviation (`eps` keeps the division finite).

    Returns:
        Rank-1 tensor of returns, aligned with `rewards`.
    """
    # rewards.shape can contain None inside a graph; tf.shape is resolved at run time.
    n_steps = tf.shape(rewards)[0]

    running_return = tf.constant(0.0)
    scalar_shape = running_return.shape
    returns_ta = tf.TensorArray(np.float32, size=0, dynamic_size=True)

    # Walk backwards so each return is built from the one that follows it.
    for step in tf.range(n_steps - 1, -1, -1):
        running_return = rewards[step] + gamma * running_return
        running_return.set_shape(scalar_shape)
        returns_ta = returns_ta.write(step, running_return)
    returns = returns_ta.stack()

    if normalize:
        returns = (returns - tf.math.reduce_mean(returns)) / (tf.math.reduce_std(returns) + eps)
    return returns
        
        

## test the output

In [7]:
# Smoke test on the rewards collected in `A`. Normalisation is on by default,
# so a mean close to 0 and a std close to 1 are the expected result.
r = calculate_returns(A[-1])
# NOTE: the label says "raw", but `r` holds the normalised returns.
print(f"raw returns: {r}")
print(f"mean: {tf.math.reduce_mean(r)}")
print(f"std: {tf.math.reduce_std(r)}")

raw returns: [ 1.5946827   1.4170172   1.2375572   1.0562843   0.8731804   0.6882269
  0.5014053   0.3126966   0.12208184 -0.07045836 -0.26494336 -0.461393
 -0.65982693 -0.86026525 -1.0627283  -1.2672364  -1.4738101  -1.6824704 ]
mean: 2.6490953430879927e-08
std: 1.0


# Calculate the loss

In [8]:
# The loss is evaluated inside a tf.function.
# Reduction.SUM adds up the per-step critic errors instead of averaging them,
# which matches the summed actor term below.
huber = tf.losses.Huber(reduction=tf.losses.Reduction.SUM)


def _as_column(x):
    """Return `x` with a trailing axis of size 1 when its static rank is 1, else `x` unchanged."""
    # The rank is a Python-level (static) property, so an ordinary `if` is
    # enough; no tf.cond / lambda is needed and the static shape is preserved.
    if x.shape.ndims == 1:
        return tf.expand_dims(x, 1)
    return x


def compute_loss(action_probs, estimated_values, actual_values):
    """
    Combined actor-critic loss for one episode.

    Args:
        action_probs: probability of the action taken at each step.
        estimated_values: critic's value estimate at each step.
        actual_values: returns computed from the observed rewards.

    Returns:
        (loss, loss_actor, loss_huber) -- total loss and its two summands,
        each a scalar tensor.
    """
    # Rank-1 inputs are reshaped to (steps, 1). This is extremely important:
    # the Huber loss averages over the last axis, so without the extra axis
    # the critic term would be computed over the wrong dimension.
    action_probs = _as_column(action_probs)
    estimated_values = _as_column(estimated_values)
    actual_values = _as_column(actual_values)

    # NOTE(review): the advantage is not wrapped in tf.stop_gradient, so the
    # actor term also back-propagates into the value head -- confirm intended.
    advantage = actual_values - estimated_values
    # Clip away exact zeros before taking the log.
    log_action = tf.math.log(tf.clip_by_value(action_probs, eps, 1e10))
    loss_actor = -tf.math.reduce_sum(advantage * log_action)
    # Keras losses take (y_true, y_pred): the returns are the regression target.
    loss_huber = huber(actual_values, estimated_values)
    loss = loss_actor + loss_huber

    return loss, loss_actor, loss_huber
    

## test the output

In [9]:
# Smoke test on the rollout stored in `A`: returns (total loss, actor loss, critic loss).
loss = compute_loss(A[0], A[1], calculate_returns(A[-1]))
loss

(<tf.Tensor: shape=(), dtype=float32, numpy=9.025241>,
 <tf.Tensor: shape=(), dtype=float32, numpy=1.5992051>,
 <tf.Tensor: shape=(), dtype=float32, numpy=7.426036>)

# Define train step

In [10]:
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)


@tf.function
def train_step(initial_state, max_steps, model, optimizer):
    """
    Run one episode and apply a single gradient update to `model`.

    Steps:
      1. roll out an episode to collect probabilities, values and rewards
      2. compute returns and the actor-critic loss
      3. back-propagate and let `optimizer` update the weights

    Returns:
        (total_rewards, (loss, loss_actor, loss_huber)), all scalar tensors.
    """
    # The rollout and the loss are recorded on the tape so gradients can be taken afterwards.
    with tf.GradientTape() as tape:
        action_probs, estimated_values, rewards = run_epoch(initial_state, max_steps, model)
        returns = calculate_returns(rewards)
        loss, loss_actor, loss_huber = compute_loss(action_probs, estimated_values, returns)

    weights = model.trainable_variables
    grads = tape.gradient(loss, weights)
    optimizer.apply_gradients(zip(grads, weights))

    total_rewards = tf.math.reduce_sum(tf.squeeze(rewards))
    return total_rewards, (loss, loss_actor, loss_huber)
        
    

## test output

In [11]:
# Smoke test: one full train step (at most 500 env steps).
# Note that this already applies one gradient update to `model`.
initial_state, _ = env.reset()
train_step(initial_state, 500, model, optimizer)

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


(<tf.Tensor: shape=(), dtype=float32, numpy=14.0>,
 (<tf.Tensor: shape=(), dtype=float32, numpy=6.7945375>,
  <tf.Tensor: shape=(), dtype=float32, numpy=1.2778747>,
  <tf.Tensor: shape=(), dtype=float32, numpy=5.5166626>))

# Run

In [12]:
# --- Training schedule and stopping criterion ---
min_epochs_to_run = 100
max_epochs_to_run = 10000
max_steps_per_epoch = 500
reward_thred = 475

# --- Bookkeeping ---
rewards_history = []                 # every epoch's total reward
q = collections.deque(maxlen=100)    # sliding window for the running average
t = tqdm.trange(max_epochs_to_run)   # progress bar doubling as the epoch iterator

for i in t:
    # Start each epoch from a fresh episode.
    start_state, _ = env.reset()
    initial_state = tf.constant(start_state, dtype=tf.float32)

    epoch_reward_tensor, loss = train_step(initial_state, max_steps_per_epoch, model, optimizer)
    epoch_reward = int(epoch_reward_tensor)

    q.append(epoch_reward)
    rewards_history.append(epoch_reward)
    avg_rewards = statistics.mean(q)
    t.set_postfix(current_rewards=epoch_reward, avg_rewards=avg_rewards)

    # Stop once the minimum number of epochs has passed and the running
    # average clears the threshold.
    solved = i > min_epochs_to_run and avg_rewards > reward_thred
    if solved:
        break
    
    

  5%|▍         | 459/10000 [00:27<09:35, 16.59it/s, avg_rewards=323, current_rewards=222] 


KeyboardInterrupt: 

In [None]:
# Total reward of the most recent training epoch.
epoch_reward