In [1]:
import os,sys

# disable tensorflow logging
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import statistics
import numpy as np
import tensorflow as tf
import gym
import tqdm
import collections

sys.path.insert(0, "/home/wei/data/code/tf-learning/")

from lib.sumTree import SumTree

In [2]:
# TODO add current time step in to state for NN input.

In [3]:
env = gym.make("CartPole-v1")
initial_state, _ = env.reset()
initial_state_shape = initial_state.shape
action_space = env.action_space.n
eps = np.finfo(np.float32).eps.item()
gamma = 0.99
lr = 0.005
step_length = 50
use_dueling = True
batch_size = 64

if use_dueling:
    lr = 0.01
    gamma = 0.99
    
replay_cache_size=5000
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

# define model

In [4]:
"""
The model will use basic Q-learning
"""

def get_model():
    inputs = tf.keras.layers.Input(shape=initial_state_shape)
    hidden = tf.keras.layers.Dense(64, activation="relu")(inputs)
    hidden = tf.keras.layers.Dense(128, activation="relu")(hidden)
    outs = tf.keras.layers.Dense(action_space, activation=None)(hidden)
    return tf.keras.Model(inputs, outs)

def get_dueling_model():
    """
    A = Q - S
    Q = A + S - mean(A)
    """
    inputs = tf.keras.layers.Input(shape=initial_state_shape)
    hidden = tf.keras.layers.Dense(128, activation="relu")(inputs)
#     hidden = tf.keras.layers.Dense(128, activation="relu")(hidden)
    A = tf.keras.layers.Dense(action_space, activation=None)(hidden)
    S = tf.keras.layers.Dense(1, activation=None)(hidden)
    A_mean = tf.math.reduce_mean(A, axis=1, name="mean")
    outs = tf.keras.layers.Add(name="outs")([A, S, -A_mean])
    
    return tf.keras.Model(inputs, outs)
    
if use_dueling:
    model = get_dueling_model()
else:
    model = get_model()
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 4)]          0           []                               
                                                                                                  
 dense (Dense)                  (None, 128)          640         ['input_1[0][0]']                
                                                                                                  
 dense_1 (Dense)                (None, 2)            258         ['dense[0][0]']                  
                                                                                                  
 tf.math.reduce_mean (TFOpLambd  (None,)             0           ['dense_1[0][0]']                
 a)                                                                                           

# define cache

In [5]:
replay_buffer = SumTree(replay_cache_size)

# define run step

</br> 1. run for T steps or till the end of the game -- Get (1) (state_t, action_t, value, return_t_T, state_T ) # of (T-t) timestamp
</br> 2. evaluate the loss for (state_t, action_t, return_t_T, state_T ), LOSS
</br> 3. add the data to replay buffer
</br> 4. random sample N from the replay buffer W.R.T the LOSS and compute gradient and update the model weights
</br> 5. re-calculate the sampled data loss and update the replay buffer
</br> 6. redo from 1

In [6]:
# everying should be tf instead of numpy
def _run_step(start_state, model, step_length):
    state = start_state
    states = []
    actions = []
    values = []
    returns = []
    for t in range(step_length):
        state = tf.expand_dims(state, 0)
        value_output = model(state)
        action = np.argmax(value_output.numpy().squeeze())
        value = value_output[0, action]
        
        states.append(state)
        actions.append(action)
        values.append(value)
        
        state, reward, done, _, _ = env.step(action)
        returns.append(reward)
        if done:
            break
    next_state = state
    next_value = model(tf.expand_dims(next_state, 0))
    next_value = tf.reduce_max(tf.squeeze(next_value))
    return states, actions, values, returns, next_state, next_value, done
    
def TD_organize(states, actions, values, returns, next_state, next_value, gamma=0.99):
    """ organize the output of run_step into
    (state_t, action_t, value_t, return_t to T, next_state, next_value, time step to next_state)
    """
    values = tf.convert_to_tensor(values)
    returns = tf.convert_to_tensor(returns)
    
    # 1. process returns
    decays = tf.math.cumprod(returns * gamma, reverse=True) / gamma
    dreturns = tf.math.cumsum(decays, reverse = True)
    decays = decays * gamma
    
    data = []
    next_state_tf = tf.expand_dims(next_state, 0)
    
    # 2. calculate loss
    timestep = tf.ones_like(returns, dtype=tf.float32)
    # or
#     timestep = tf.math.cumsum(returns, reverse = True)
    diff = calculate_diff(values, next_value, dreturns, decays, timestep)
    
    
    # 3.
    for t in range(len(states)-1,-1,-1):
        record = (states[t], actions[t], values[t], dreturns[t], decays[t], next_state_tf, next_value, timestep[t])
        data.append(record)
    return data, diff
        
    
def calculate_diff(value_t, value_T, accumulated_returns, decay, timestep=1.0):
    target = decay * value_T + accumulated_returns
    value_t = value_t / timestep
    target = target / timestep
    diff = (target - value_t) ** 2 # array
    return diff

def run_step(start_state, model, step_length, gamma=0.99):
    states, actions, values, returns, next_state, next_value, done = _run_step(start_state, model, step_length)
    
    
    data, diff = TD_organize(states, actions, values, returns, next_state, next_value, gamma)
    for data_record, diff_record in zip(data, diff):
        replay_buffer.add_new_data(data_record, diff_record)
    return next_state, done, len(returns)

    
    

## Test

### run step

In [7]:
state_, _ = env.reset()
states, actions, values, returns, next_state, next_value, done = _run_step(state_, model, 30)

  if not isinstance(terminated, (bool, np.bool8)):


### organize

In [8]:
data, diff = TD_organize(states, actions, values, returns, next_state, next_value)

### loss

### all

In [9]:
state_, _ = env.reset()
run_step(state_, model, 100, gamma=0.99)

(array([ 0.12146739,  1.5232143 , -0.22919363, -2.6031637 ], dtype=float32),
 True,
 8)

In [10]:
replay_buffer.tree_.weight

<tf.Tensor: shape=(), dtype=float32, numpy=197.263>

# Update Model

In [11]:


def sample(batch_size):
    tree_instances = replay_buffer.sample(batch_size)
    states = []
    next_states = []
    losses = []
    for ins in tree_instances:
        state, a, v, r, decay, next_state_tf, next_value, tstep = ins.data
        losses.append(ins.weight)
        states.append(state)
        next_states.append(next_state_tf)
    return tree_instances, states, next_states, losses
        
def update_model(state, model, optimizer, batch_size=64, step_length=100, gamma=0.99):
    with tf.GradientTape(persistent=True) as tape:
        next_state, done, returns_sum = run_step(state_, model, step_length, gamma)
        tree_instances, states, next_states, losses = sample(batch_size)
        loss_value = tf.math.reduce_mean(losses)
    try:
        gradient = tape.gradient(loss_value, model.trainable_variables)
        optimizer.apply_gradients(zip(gradient, model.trainable_variables))
    except Exception:
        return tree_instances, tree_instances, tree_instances, True
    update_loss(tree_instances, model)
    return next_state, done, returns_sum, False
    
def update_loss(tree_instances, model):
    states = []
    next_states = []
    accumulated_returns = []
    decays = []
    timesteps = []
    
    for ins in tree_instances:
        states.append(ins.data[0])
        next_states.append(ins.data[5])
        accumulated_returns.append(ins.data[3])
        decays.append(ins.data[4])
        timesteps.append(ins.data[-1])
    states = tf.concat(states, axis=0)
    next_states = tf.concat(next_states, axis=0)
    value_t = model(states)
    value_T = model(next_states)
    value_t = tf.math.reduce_max(value_t, axis=1)
    value_T = tf.math.reduce_max(value_T, axis=1)
    accumulated_returns = tf.stack(accumulated_returns, axis=0)
    decays = tf.stack(decays, axis=0)
    timesteps = tf.stack(timesteps, axis=0)
    diff = calculate_diff(value_t, value_T, accumulated_returns, decays, timestep=timesteps)
    for idx, ins in enumerate(tree_instances):
        old_data = ins.data
        s, a, v, d, dc, ns, nv, t = old_data
        new_data = (s,a, value_t[idx], d, dc, ns, value_T[idx], t)
        new_diff = diff[idx]
        replay_buffer.update_node_by_instance(ins, new_data, new_diff)

        
    

In [12]:
state_, _ = env.reset()
print(f"weights before update: {replay_buffer.tree_.weight}")
next_state, done, returns_sum, flag_exception = update_model(state_, model, optimizer, 64)
print(f"weights after update: {replay_buffer.tree_.weight}")

weights before update: 197.26300048828125
weights after update: 248.95452880859375


# Training

In [13]:
min_episode = 100
max_episode = 10000
max_steps_per_episode = 700
threds = 475
running_reward = 0.0
step_length = 200
t = tqdm.trange(max_episode)
# replay_buffer.build()
for i in t:
    state, _ = env.reset()
    done = False
    episode_rewards = 0.0
    while not done:
        state, done, returns_sum, flag_excption = update_model(state, model, optimizer, batch_size, step_length, gamma)
        if flag_excption:
            raise ValueError('!!!!!!')
        episode_rewards += returns_sum
    running_reward = running_reward * 0.99 + 0.1 * episode_rewards
    t.set_postfix(running_reward=running_reward, episode_reward=episode_rewards)
    

  0%|          | 11/10000 [00:01<22:27,  7.41it/s, episode_reward=10, running_reward=14.8]


ValueError: !!!!!!

In [15]:
losses = []
for ins in state:
    losses.append(ins.weight)

In [16]:
losses

[<tf.Tensor: shape=(), dtype=float32, numpy=3.4855995>,
 <tf.Tensor: shape=(), dtype=float32, numpy=88.39711>,
 <tf.Tensor: shape=(), dtype=float32, numpy=285.92062>,
 <tf.Tensor: shape=(), dtype=float32, numpy=342.65555>,
 <tf.Tensor: shape=(), dtype=float32, numpy=299.86472>,
 <tf.Tensor: shape=(), dtype=float32, numpy=80.85233>,
 <tf.Tensor: shape=(), dtype=float32, numpy=107.11661>,
 <tf.Tensor: shape=(), dtype=float32, numpy=245.54613>,
 <tf.Tensor: shape=(), dtype=float32, numpy=17.49826>,
 <tf.Tensor: shape=(), dtype=float32, numpy=151.62375>,
 <tf.Tensor: shape=(), dtype=float32, numpy=234.35164>,
 <tf.Tensor: shape=(), dtype=float32, numpy=114.31948>,
 <tf.Tensor: shape=(), dtype=float32, numpy=105.245155>,
 <tf.Tensor: shape=(), dtype=float32, numpy=107.902985>,
 <tf.Tensor: shape=(), dtype=float32, numpy=22.069914>,
 <tf.Tensor: shape=(), dtype=float32, numpy=165.12369>,
 <tf.Tensor: shape=(), dtype=float32, numpy=157.15266>,
 <tf.Tensor: shape=(), dtype=float32, numpy=251.8