In [1]:
import os
# disable tensorflow logging
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import statistics
import numpy as np
import tensorflow as tf
import gym
import tqdm
import collections


In [2]:
env = gym.make("CartPole-v1")
initial_state, _ = env.reset()
initial_state_shape = initial_state.shape
action_space = env.action_space.n
eps = np.finfo(np.float32).eps.item()
gamma = 0.999
lr = 0.005
step_length = 50
use_dueling = True

if use_dueling:
    lr = 0.01
    gamma = 0.999

# define model

In [3]:
"""
The model will use basic Q-learning
"""

def get_model():
    inputs = tf.keras.layers.Input(shape=initial_state_shape)
    hidden = tf.keras.layers.Dense(64, activation="relu")(inputs)
    hidden = tf.keras.layers.Dense(128, activation="relu")(hidden)
    outs = tf.keras.layers.Dense(action_space, activation=None)(hidden)
    return tf.keras.Model(inputs, outs)

def get_dueling_model():
    """
    A = Q - S
    Q = A + S - mean(A)
    """
    inputs = tf.keras.layers.Input(shape=initial_state_shape)
    hidden = tf.keras.layers.Dense(128, activation="relu")(inputs)
#     hidden = tf.keras.layers.Dense(128, activation="relu")(hidden)
    A = tf.keras.layers.Dense(action_space, activation=None)(hidden)
    S = tf.keras.layers.Dense(1, activation=None)(hidden)
    A_mean = tf.math.reduce_mean(A, axis=1, name="mean")
    outs = tf.keras.layers.Add(name="outs")([A, S, -A_mean])
    
    return tf.keras.Model(inputs, outs)
    
if use_dueling:
    model = get_dueling_model()
else:
    model = get_model()
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 4)]          0           []                               
                                                                                                  
 dense (Dense)                  (None, 128)          640         ['input_1[0][0]']                
                                                                                                  
 dense_1 (Dense)                (None, 2)            258         ['dense[0][0]']                  
                                                                                                  
 tf.math.reduce_mean (TFOpLambd  (None,)             0           ['dense_1[0][0]']                
 a)                                                                                           

# define data collection

In [4]:
def _next_step(action):
    state, reward, done, _, _ = env.step(action)
    return (state.astype(np.float32), np.array(reward, np.float32), np.array(done, np.int32))

def tf_next_step(action):
    return tf.numpy_function(_next_step, [action], (tf.float32, tf.float32, tf.int32))


def run_step(start_state, model, step_length):
    rewards = tf.constant(0.0, tf.float32)
    values = tf.TensorArray(tf.float32, size=0, dynamic_size=True)
    actions = tf.TensorArray(tf.int32, size=0, dynamic_size=True)
    state = start_state
    for t in tf.range(step_length):
        state = tf.expand_dims(state, 0)
        value_output = model(state)
#         print(value_output)
        action = tf.math.argmax(tf.squeeze(value_output), output_type=tf.int32)
        value = value_output[0,action]
        
        state, reward, done = tf_next_step(action)
        state.set_shape(initial_state_shape)
        
        rewards += reward     
        values = values.write(t, value)
        actions = actions.write(t, action)
        done = tf.cast(done, tf.bool)
        if done:
            break
            
    next_value = model(tf.expand_dims(state, 0))
    next_value = tf.math.reduce_max(next_value, axis=1)
    values = values.stack()
    actions = actions.stack()
    return values, actions, rewards, next_value, state, done


        
        
    
    

In [5]:
_state, _ = env.reset()
result = run_step(_state, model, 50)
result

  if not isinstance(terminated, (bool, np.bool8)):


(<tf.Tensor: shape=(10,), dtype=float32, numpy=
 array([ 0.01204011, -0.01781474, -0.04292154, -0.0691343 , -0.09661203,
        -0.12549505, -0.15586528, -0.1879367 , -0.22193643, -0.25777888],
       dtype=float32)>,
 <tf.Tensor: shape=(10,), dtype=int32, numpy=array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)>,
 <tf.Tensor: shape=(), dtype=float32, numpy=10.0>,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([-0.29548955], dtype=float32)>,
 <tf.Tensor: shape=(4,), dtype=float32, numpy=array([ 0.14747137,  1.9623908 , -0.2391352 , -3.0154977 ], dtype=float32)>,
 <tf.Tensor: shape=(), dtype=bool, numpy=True>)

# define returns

In [6]:
def get_returns(total_rewards):
    """
    Input: total_rewards is a value
    Output: 
        discount_array: array of discount rate.
            index i to the (end timestamp + 1) discount rate
        returns: array of discounted returns
            index i means the returns between index i to the index(end timestamp + 1)
    """
    total_rewards = tf.cast(total_rewards, tf.int32)
    rewards_array = tf.ones(shape=(total_rewards,), dtype=tf.float32)
    discount_array = rewards_array * gamma
    discount_array = tf.math.cumprod(discount_array , reverse=True) 
    returns_array = tf.math.cumsum(discount_array / gamma, reverse=True)
    
    return discount_array, returns_array

rewards = result[2]
get_returns(rewards)

(<tf.Tensor: shape=(10,), dtype=float32, numpy=
 array([0.990045  , 0.99103606, 0.99202806, 0.9930211 , 0.9940151 ,
        0.9950101 , 0.9960061 , 0.9970031 , 0.99800104, 0.999     ],
       dtype=float32)>,
 <tf.Tensor: shape=(10,), dtype=float32, numpy=
 array([9.955121 , 8.964085 , 7.9720564, 6.9790354, 5.98502  , 4.9900103,
        3.9940042, 2.9970012, 1.9990001, 1.       ], dtype=float32)>)

# Loss

In [7]:
loss_func = tf.keras.losses.MeanSquaredError(reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE)



def get_loss_discount_rate(values):
    """
    Used for TD learning. As the time step increase the loss will be increased. This method will try to reduce the increasing loss
    caused in incrasing time step.
    
    t1,t2,t3,...tn -> [1/n, 1/n-1, 1/n-2, ... 1]
    
    """
    discount = tf.ones_like(values, dtype=tf.float32)
    discount = 1.0 / tf.math.cumsum(discount, reverse=True)
    discount = tf.math.pow(discount, 1.0)
    return discount

# def get_loss_discount_rate(values, rate=0.99):
#     """
#     Used for TD learning. As the time step increase the loss will be increased. This method will try to reduce the increasing loss
#     caused in incrasing time step.
    
#     t1,t2,t3,...tn -> [1/n, 1/n-1, 1/n-2, ... 1]
    
#     """
#     discount = tf.ones_like(values, dtype=tf.float32)
#     discount = discount * rate
#     discount = tf.math.cumprod(discount, reverse=True)
#     discount = tf.math.pow(discount, 4.0)
#     return discount



def calculate_loss(returns, values, value_next, discount_array):
    """
    Q(St) = (Ut - UT) + alpha * Q(ST); where alpha = gamma ^ (T - t)
    
    Ut - UT is the returns
    """
#     Yt = returns +  value_next
    loss_dis_rate = get_loss_discount_rate(values)
    # pay attention to below, different time steps have different discount rate on value_next
    left = values - discount_array * value_next
    right = returns
    left = left * loss_dis_rate
    right = right * loss_dis_rate
    loss = loss_func(tf.expand_dims(left,1), tf.expand_dims(right,1))
    return loss


In [8]:
values, actions, rewards, next_value, state, done = result
discount_array, returns_array = get_returns(rewards)
calculate_loss(returns_array, values, next_value, discount_array)

<tf.Tensor: shape=(), dtype=float32, numpy=0.9297956>

In [9]:
get_loss_discount_rate(values)

<tf.Tensor: shape=(10,), dtype=float32, numpy=
array([0.1       , 0.11111111, 0.125     , 0.14285715, 0.16666667,
       0.2       , 0.25      , 0.33333334, 0.5       , 1.        ],
      dtype=float32)>

# train step

In [10]:
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

def run_train_step(model, optimizer, start_state, step_length):
    with tf.GradientTape() as tape:
        STEP_RES = run_step(start_state, model, step_length)
        values, actions, rewards, next_value, state, done = STEP_RES
        discount_array, returns_array = get_returns(rewards)
        loss = calculate_loss(returns_array, values, next_value, discount_array)
    gradient = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradient, model.trainable_variables))
    return STEP_RES, loss

In [11]:
_state, _ = env.reset()
run_train_step(model, optimizer, _state, 50)

((<tf.Tensor: shape=(8,), dtype=float32, numpy=
  array([ 0.01408245, -0.02872298, -0.05520951, -0.0825946 , -0.11144827,
         -0.14186016, -0.17387797, -0.2074891 ], dtype=float32)>,
  <tf.Tensor: shape=(8,), dtype=int32, numpy=array([1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)>,
  <tf.Tensor: shape=(), dtype=float32, numpy=8.0>,
  <tf.Tensor: shape=(1,), dtype=float32, numpy=array([-0.24282286], dtype=float32)>,
  <tf.Tensor: shape=(4,), dtype=float32, numpy=array([ 0.06571004,  1.5303363 , -0.22228448, -2.5553713 ], dtype=float32)>,
  <tf.Tensor: shape=(), dtype=bool, numpy=True>),
 <tf.Tensor: shape=(), dtype=float32, numpy=0.9325868>)

# RUN

In [12]:
min_epoch = 100
max_epoch = 10000
step_length = 200
thred = 475
max_steps_per_epoch = 600
running_rewards = collections.deque(maxlen=min_epoch)
all_rewards = []
all_running_rewards = []
t = tqdm.trange(max_epoch)
for i in t:
    start_state, _ = env.reset()
    cur_step = 0
    epoch_reward = 0
    while cur_step < max_steps_per_epoch:
        STEP_RES, loss = run_train_step(model, optimizer, start_state, step_length)
        cur_step += step_length
        values, actions, rewards, next_value, state, done = STEP_RES
        epoch_reward += int(tf.reduce_sum(rewards))
        if done:
            break
        
    running_rewards.append(epoch_reward)
    avg_reward = statistics.mean(running_rewards)
    all_rewards.append(epoch_reward)
    all_running_rewards.append(avg_reward)
    t.set_postfix(running_rewards=avg_reward, current_reward=epoch_reward, loss=float(loss))
    if avg_reward > thred and i > min_epoch:
        break
    
    


  1%|          | 106/10000 [00:36<57:30,  2.87it/s, current_reward=600, loss=0.906, running_rewards=113]   


KeyboardInterrupt: 