### Mountain Car With Cross Entropy Method

The reward used in this mountain car problem is:<br />
$total\_reward = total\_reward + reward + scaled(position)^2$, where $scaled(position) = \frac{position - (-1.2)}{0.6 - (-1.2)}$ is the feature scaling of the position from range $[-1.2, 0.6]$ to range $[0, 1]$<br />
However, a good policy cannot be learned with this reward.  In this problem, every state's reward is -1.0, except for the state in which the car reaches the goal position.  This makes exploration important.  Within the limit of 200 steps per episode, this method is not able to explore enough states to reach the goal state (or states close to the goal state).

In [1]:
import gym
import numpy as np
import tensorflow as tf
import math
import random
import bisect

  from ._conv import register_converters as _register_converters


In [2]:
# goal: car reaches the flag on top of the mountain on the right side (position value: 0.5) within 200 time steps
# action: 0 (acceleration towards left), 1 (stay), 2 (acceleration towards right)
action_size = 3
# observation (state): [position (initial value: uniformly sample from range [-0.6, -0.4], minimum value: -1.2, maximum value: 0.6), velocity (initial value: 0.0, minimum value: -0.07, maximum value: 0.07)]
state_size = 2
# training hyper parameters (policy network and cross entropy method)
# number of units in the network's hidden layer
hidden_layer_size = 128
# number of (state, action) pairs in one training minibatch
batch_size = 25
# learning rate handed to the network's Adam optimizer
learning_rate = 0.01
# number of episodes sampled in each trial before the elite episodes are selected
max_episodes = 100
# upper bound on the number of steps taken in a single episode
max_steps = 200
# episodes are sorted by total reward; only those from this percentile upwards are kept as elites
percentile = 70
# number of sample / train / test iterations
max_num_of_trials = 20

In [3]:
# partial code below is derived from repository: https://github.com/schneider128k/reinforcement/
# neural network
class Net:
    """Softmax policy network trained by cross entropy on (state, action) pairs.

    The graph maps a batch of states through one hidden layer to one logit per
    action; a softmax over the logits is the policy.  Training minimizes the
    mean cross entropy between the logits and the one-hot encoded actions that
    were actually taken.

    NOTE: ``get_action`` and ``train`` run the graph through a module-level
    TensorFlow session named ``sess``; it must exist before either is called.

    Args:
        state_size: number of components of a state vector (input width).
        action_size: number of discrete actions (output width).
        hidden_layer_size: number of units in the hidden layer.
        learning_rate: learning rate handed to the Adam optimizer.
        name: variable scope in which the network is built.
    """

    def __init__(self, 
                 state_size = state_size, 
                 action_size = action_size, 
                 hidden_layer_size = hidden_layer_size,
                 learning_rate = learning_rate, 
                 name = 'net'):
        # Remember the number of actions this network was built for, so that
        # sampling in get_action always matches the width of the softmax output
        # instead of depending on the module-level default.
        self.action_size = action_size

        with tf.variable_scope(name):
            ### Prediction part
            
            # Input layer, state s is input
            self.states = tf.placeholder(
                tf.float32, 
                [None, state_size])
        
            # Hidden layer, ReLU activation
            self.hidden_layer = tf.contrib.layers.fully_connected(
                self.states, 
                hidden_layer_size)
            
            # Hidden layer, linear activation, logits
            self.logits = tf.contrib.layers.fully_connected(
                self.hidden_layer, 
                action_size,
                activation_fn = None)
            
            # Output layer, softmax activation yields probability distribution for actions
            self.probabilities = tf.nn.softmax(self.logits)
    
            ### Training part 
    
            # Action a
            self.actions = tf.placeholder(
                tf.int32, 
                [None])
            
            # One-hot encoded action a 
            #
            # encoded_action_vector = [1, 0, 0] if action a = 0
            # encoded_action_vector = [0, 1, 0] if action a = 1
            # encoded_action_vector = [0, 0, 1] if action a = 2
            self.one_hot_actions = tf.one_hot(
                self.actions, 
                action_size)

            # cross entropy
            self.cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(
                logits = self.logits, 
                labels = self.one_hot_actions)
            
            # cost
            self.cost = tf.reduce_mean(self.cross_entropy)
            
            # Optimizer
            self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.cost)
            
    # get action chosen according to current probabilistic policy
    def get_action(self, state):
        """Sample one action index from the current policy for a single state.

        Args:
            state: one state vector; it is wrapped into a batch of size one.

        Returns:
            An action index in ``range(self.action_size)``, drawn with the
            probabilities the network assigns to the actions.
        """
        feed_dict = { self.states : np.array([state]) } 
        probabilities = sess.run(self.probabilities, feed_dict = feed_dict)
        
        # probabilities has one row per state in the batch; only row 0 exists here
        return np.random.choice(self.action_size, p=probabilities[0])
    
    # train based on batch
    def train(self, batch):
        """Run one optimizer step on a batch of (state, action) pairs.

        Args:
            batch: non-empty sequence of ``(state, action)`` tuples.
        """
        # split the pairs into a states array and an actions array
        states, actions = zip(*batch)
        states = np.array(states)
        actions = np.array(actions)
        
        feed_dict = {
            self.states : states,
            self.actions : actions
        }
        
        sess.run(self.optimizer, feed_dict = feed_dict)

In [4]:
# reward function
def get_reward(position, velocity):
    """Return the shaping bonus for a car position.

    The position is rescaled linearly so that -1.2 maps to 0 and 0.6 maps
    to 1, and the rescaled value is squared.  ``velocity`` is accepted but
    does not enter the computation.
    """
    lowest, highest = -1.2, 0.6
    span = highest - lowest
    scaled = (position - lowest) / span
    return math.pow(scaled, 2)

In [5]:
# get mountain car environment
env = gym.make("MountainCar-v0")

# try/finally guarantees that the environment is closed even when training
# raises or is interrupted
try:
    # training
    tf.reset_default_graph()
    net = Net(name = 'net',
              hidden_layer_size = hidden_layer_size,
              learning_rate = learning_rate)
    # run
    # NOTE: the session has to be bound to the name `sess`, because
    # Net.get_action and Net.train look it up as a module-level variable
    with tf.Session() as sess:

        sess.run(tf.global_variables_initializer())

        # episodes are kept sorted by ascending total reward, so every episode
        # from this index onwards counts as an elite episode
        start_index = int(max_episodes * percentile / 100)

        for num_of_trials in range(1, max_num_of_trials + 1):
            total_reward_list = []
            trajectory_list = []

            # sample episodes with the current policy
            for _ in range(max_episodes):
                total_reward = 0.0
                trajectory = []
                state = env.reset()
                for _ in range(max_steps):
                    action = net.get_action(state)
                    next_state, reward, done, info = env.step(action)
                    # get car position and velocity
                    position, velocity = next_state
                    # update car reward
                    total_reward += reward + get_reward(position, velocity)
                    trajectory.append((state, action))
                    state = next_state
                    if done:
                        break

                # insert at the same index in both lists so that they stay
                # aligned and ordered by ascending total reward
                index = bisect.bisect(total_reward_list, total_reward)
                total_reward_list.insert(index, total_reward)
                trajectory_list.insert(index, trajectory)

            # keep the elite episodes, that is, throw out the bad ones 
            # train on state action pairs extracted from the elite episodes
            state_action_pairs = [
                state_action_pair
                for elite_trajectory in trajectory_list[start_index:]
                for state_action_pair in elite_trajectory
            ]
            # shuffle to avoid correlations between adjacent states
            random.shuffle(state_action_pairs)
            n = len(state_action_pairs)
            batches = [state_action_pairs[k:k + batch_size] for k in range(0, n, batch_size)]

            for batch in batches:
                net.train(batch)

            # test episode runs
            total_reward = 0
            # pre-set so that the report below still works if the loop body never runs
            time_step = 0
            observation = env.reset()
            for time_step in range(max_steps):
                env.render()
                action = net.get_action(observation)
                observation, reward, done, info = env.step(action)
                position, velocity = observation
                #print(f"P:{position}, V:{velocity}, D:{done}, T:{time_step}")
                total_reward += reward + get_reward(position, velocity)
                if done:
                    break
            print(f"Test:{num_of_trials:2d}, Reward:{total_reward:5.2f}, Timestep:{time_step + 1:3d}")
finally:
    # close mountain car environment
    env.close()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Test: 1, Reward:-169.58, Timestep:200
Test: 2, Reward:-170.84, Timestep:200
Test: 3, Reward:-171.38, Timestep:200
Test: 4, Reward:-169.29, Timestep:200
Test: 5, Reward:-166.83, Timestep:200
Test: 6, Reward:-162.09, Timestep:200
Test: 7, Reward:-166.74, Timestep:200
Test: 8, Reward:-166.43, Timestep:200
Test: 9, Reward:-162.59, Timestep:200
Test:10, Reward:-161.99, Timestep:200
Test:11, Reward:-164.78, Timestep:200
Test:12, Reward:-164.77, Timestep:200
Test:13, Reward:-163.69, Timestep:200
Test:14, Reward:-163.54, Timestep:200
Test:15, Reward:-164.20, Timestep:200
Test:16, Reward:-161.77, Timestep:200
Test:17, Reward:-160.99, Timestep:200
Test:18, Reward:-163.44, Timestep:200
Test:19, Reward:-164.83, Timestep:200
Test:20, Reward:-162.48, Timestep:200
