In [1]:
import tensorflow as tf
print(tf.__version__)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


1.11.0


In [27]:
import tensorflow as tf 
import numpy as np
import gym

In [28]:
env = gym.envs.make("MountainCarContinuous-v0")

In [29]:
tf.reset_default_graph()

In [30]:
input_dims = 2
state_placeholder = tf.placeholder(tf.float32,[None,input_dims])

In [31]:
def value_function(state):
    n_hidden1 = 400  
    n_hidden2 = 400
    n_outputs = 1
    
    with tf.variable_scope("value_network"):
        init_xavier = tf.contrib.layers.xavier_initializer()
        
        hidden1 = tf.layers.dense(state, n_hidden1, tf.nn.elu, init_xavier)
        hidden2 = tf.layers.dense(hidden1, n_hidden2, tf.nn.elu, init_xavier) 
        V = tf.layers.dense(hidden2, n_outputs, None, init_xavier)
    return V

In [32]:
def policy_network(state):
    n_hidden1 = 40
    n_hidden2 = 40
    n_outputs = 1
    
    with tf.variable_scope("policy_network"):
        init_xavier = tf.contrib.layers.xavier_initializer()
        
        hidden1 = tf.layers.dense(state, n_hidden1, tf.nn.elu, init_xavier)
        hidden2 = tf.layers.dense(hidden1, n_hidden2, tf.nn.elu, init_xavier)
        mu = tf.layers.dense(hidden2, n_outputs, None, init_xavier)
        sigma = tf.layers.dense(hidden2, n_outputs, None, init_xavier)
        sigma = tf.nn.softplus(sigma) + 1e-5
        norm_dist = tf.contrib.distributions.Normal(mu, sigma)
        action_tf_var = tf.squeeze(norm_dist.sample(1), axis=0)
        action_tf_var = tf.clip_by_value(
            action_tf_var, env.action_space.low[0], 
            env.action_space.high[0])
    return action_tf_var, norm_dist


In [33]:
#sample from state space for state normalization
import sklearn
import sklearn.preprocessing
                                    
state_space_samples = np.array(
    [env.observation_space.sample() for x in range(10000)])
scaler = sklearn.preprocessing.StandardScaler()
scaler.fit(state_space_samples)

#function to normalize states
def scale_state(state):                 #requires input shape=(2,)
    scaled = scaler.transform([state])
    return scaled                       #returns shape =(1,2)   

In [34]:
lr_actor = 0.00002  #set learning rates
lr_critic = 0.001

# define required placeholders
action_placeholder = tf.placeholder(tf.float32)
delta_placeholder = tf.placeholder(tf.float32)
target_placeholder = tf.placeholder(tf.float32)

action_tf_var, norm_dist = policy_network(state_placeholder)
V = value_function(state_placeholder)

# define actor (policy) loss function
loss_actor = -tf.log(norm_dist.prob(action_placeholder) + 1e-5) * delta_placeholder
training_op_actor = tf.train.AdamOptimizer(
    lr_actor, name='actor_optimizer').minimize(loss_actor)

# define critic (state-value) loss function
loss_critic = tf.reduce_mean(tf.squared_difference(
                             tf.squeeze(V), target_placeholder))
training_op_critic = tf.train.AdamOptimizer(
        lr_critic, name='critic_optimizer').minimize(loss_critic)

In [36]:
#training loop
gamma = 0.99
num_episodes = 300

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    episode_history = []
    for episode in range(num_episodes):
        #receive initial state from E
        state = env.reset()   # state.shape -> (2,)
        reward_total = 0 
        steps = 0
        done = False
        while (not done):
                
            #Sample action according to current policy
            #action.shape = (1,1)
            action  = sess.run(action_tf_var, feed_dict={
                          state_placeholder: scale_state(state)})
            #Execute action and observe reward & next state from E
            # next_state shape=(2,)    
            #env.step() requires input shape = (1,)
            next_state, reward, done, _ = env.step(
                                    np.squeeze(action, axis=0)) 
            steps +=1
            reward_total += reward
            #V_of_next_state.shape=(1,1)
            V_of_next_state = sess.run(V, feed_dict = 
                    {state_placeholder: scale_state(next_state)})  
            #Set TD Target
            #target = r + gamma * V(next_state)     
            target = reward + gamma * np.squeeze(V_of_next_state) 
            
            # td_error = target - V(s)
            #needed to feed delta_placeholder in actor training
            td_error = target - np.squeeze(sess.run(V, feed_dict = 
                        {state_placeholder: scale_state(state)})) 
            
            #Update actor by minimizing loss (Actor training)
            _, loss_actor_val  = sess.run(
                [training_op_actor, loss_actor], 
                feed_dict={action_placeholder: np.squeeze(action), 
                state_placeholder: scale_state(state), 
                delta_placeholder: td_error})
            #Update critic by minimizinf loss  (Critic training)
            _, loss_critic_val  = sess.run(
                [training_op_critic, loss_critic], 
                feed_dict={state_placeholder: scale_state(state), 
                target_placeholder: target})
            
            state = next_state
            #end while
        episode_history.append(reward_total)
        print("Episode: {}, Number of Steps : {}, Cumulative reward: {:0.2f}".format(
            episode, steps, reward_total))
        
        if np.mean(episode_history[-100:]) > 90 and len(episode_history) >= 101:
            print("****************Solved***************")
            print("Mean cumulative reward over 100 episodes:{:0.2f}" .format(
                np.mean(episode_history[-100:])))

Episode: 0, Number of Steps : 848, Cumulative reward: 62.62
Episode: 1, Number of Steps : 891, Cumulative reward: 64.49
Episode: 2, Number of Steps : 806, Cumulative reward: 65.39
Episode: 3, Number of Steps : 999, Cumulative reward: -40.53
Episode: 4, Number of Steps : 999, Cumulative reward: -41.99
Episode: 5, Number of Steps : 620, Cumulative reward: 76.38
Episode: 6, Number of Steps : 794, Cumulative reward: 67.89
Episode: 7, Number of Steps : 578, Cumulative reward: 76.81
Episode: 8, Number of Steps : 447, Cumulative reward: 82.17
Episode: 9, Number of Steps : 873, Cumulative reward: 64.20
Episode: 10, Number of Steps : 401, Cumulative reward: 83.79
Episode: 11, Number of Steps : 744, Cumulative reward: 69.80
Episode: 12, Number of Steps : 592, Cumulative reward: 76.30
Episode: 13, Number of Steps : 809, Cumulative reward: 69.45
Episode: 14, Number of Steps : 413, Cumulative reward: 82.41
Episode: 15, Number of Steps : 544, Cumulative reward: 78.14
Episode: 16, Number of Steps : 4

Episode: 130, Number of Steps : 198, Cumulative reward: 90.59
****************Solved***************
Mean cumulative reward over 100 episodes:90.57
Episode: 131, Number of Steps : 115, Cumulative reward: 93.14
****************Solved***************
Mean cumulative reward over 100 episodes:90.65
Episode: 132, Number of Steps : 180, Cumulative reward: 90.02
****************Solved***************
Mean cumulative reward over 100 episodes:90.68
Episode: 133, Number of Steps : 112, Cumulative reward: 92.78
****************Solved***************
Mean cumulative reward over 100 episodes:90.74
Episode: 134, Number of Steps : 115, Cumulative reward: 92.62
****************Solved***************
Mean cumulative reward over 100 episodes:90.87
Episode: 135, Number of Steps : 178, Cumulative reward: 91.47
****************Solved***************
Mean cumulative reward over 100 episodes:90.88
Episode: 136, Number of Steps : 229, Cumulative reward: 87.86
****************Solved***************
Mean cumulative re

Episode: 186, Number of Steps : 156, Cumulative reward: 90.11
****************Solved***************
Mean cumulative reward over 100 episodes:91.49
Episode: 187, Number of Steps : 109, Cumulative reward: 93.13
****************Solved***************
Mean cumulative reward over 100 episodes:91.53
Episode: 188, Number of Steps : 104, Cumulative reward: 91.33
****************Solved***************
Mean cumulative reward over 100 episodes:91.50
Episode: 189, Number of Steps : 114, Cumulative reward: 93.76
****************Solved***************
Mean cumulative reward over 100 episodes:91.50
Episode: 190, Number of Steps : 110, Cumulative reward: 92.98
****************Solved***************
Mean cumulative reward over 100 episodes:91.53
Episode: 191, Number of Steps : 104, Cumulative reward: 91.95
****************Solved***************
Mean cumulative reward over 100 episodes:91.52
Episode: 192, Number of Steps : 213, Cumulative reward: 86.43
****************Solved***************
Mean cumulative re

Episode: 242, Number of Steps : 146, Cumulative reward: 92.02
****************Solved***************
Mean cumulative reward over 100 episodes:91.76
Episode: 243, Number of Steps : 192, Cumulative reward: 88.96
****************Solved***************
Mean cumulative reward over 100 episodes:91.76
Episode: 244, Number of Steps : 140, Cumulative reward: 91.28
****************Solved***************
Mean cumulative reward over 100 episodes:91.76
Episode: 245, Number of Steps : 140, Cumulative reward: 91.12
****************Solved***************
Mean cumulative reward over 100 episodes:91.75
Episode: 246, Number of Steps : 108, Cumulative reward: 91.69
****************Solved***************
Mean cumulative reward over 100 episodes:91.78
Episode: 247, Number of Steps : 163, Cumulative reward: 91.44
****************Solved***************
Mean cumulative reward over 100 episodes:91.77
Episode: 248, Number of Steps : 105, Cumulative reward: 92.58
****************Solved***************
Mean cumulative re

****************Solved***************
Mean cumulative reward over 100 episodes:91.99
Episode: 299, Number of Steps : 135, Cumulative reward: 91.40
****************Solved***************
Mean cumulative reward over 100 episodes:91.97
