In [1]:
import gym

import keras as K
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.utils import to_categorical

import numpy as np

from random import shuffle, random, choice

Using TensorFlow backend.


In [2]:
env = gym.make('MountainCar-v0')

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


## Model


__Input__:

* Car Position
* Car Velocity

__Output__:

* Choice of action [0, 1, 2]

So here we're going to be using N layers:

* Fully Connected 2
* Hidden 32
* Fully Connected 3

In [3]:
def create_model():
    model = Sequential([
        Dense(32, input_dim=2),
        Activation('relu'),
        Dense(3),
        Activation('softmax')
    ])

    model.compile(optimizer='rmsprop', 
                  loss='categorical_crossentropy', 
                  metrics=['accuracy'])
    return model

In [4]:
def reshape(observation):
    return observation.reshape(1,2)

def get_action(model, state):
    s = reshape(state)
    result = model.predict(s)
    return np.argmax(result)

## Reward Function

My custom reward function rewards the agent for making forward progress, without penalizing it for moving backwards, as moving backwards is necessary for gaining the required momentum to move up the hill.

I unfortunately was unable to get the model to correctly learn a policy which was able to solve the problem. I think the model in my case may have been to simple.

In [5]:
def run_episode(model, n_steps=500, epsilon=0.2):
    experiences = []
    
    state = env.reset()
    total_reward = 0.0
    for t in range(n_steps):
        env.render()
        current_state = state
        
        if random() > epsilon:
            action = get_action(model, state)
        else:
            action = choice(range(3))
        state, r, is_done, _ = env.step(action)
        
        ## Custom reward calculation
        distance_right = state[0] + 0.5
        distance_reward = (max(distance_right, 0) * 10) ** 2
        custom_reward = r + distance_reward
        
        total_reward += custom_reward
        exp = (current_state, action) #, custom_reward, state)
        experiences.append(exp)
        if is_done: 
#             print("Finished with Reward: {}".format(total_reward))
            env.close()
            break
    return (total_reward, experiences)

In [6]:
def get_top_episodes(episodes, pct):
    sorted_batch = sorted(episodes, key=lambda x: x[0], reverse=True)
    top_n = int(len(sorted_batch) * pct)
    top_episodes = sorted_batch[:top_n]
    return top_episodes

In [7]:
def get_shuffled_xy(episodes):
    flattened = [
        experience
        for episode in top_episodes
        for experience in episode[1]
    ]
    
    shuffle(flattened)
    return get_states_actions(flattened)
    
def get_states_actions(episode):
    states, actions = zip(*episode)
    return np.array(states), to_categorical(actions, 3)

In [8]:
n_episodes = 200
top_pct = 0.1
n_steps = 500

training_epochs = 10

episode_batch = []
model = create_model()
for epoch in range(training_epochs):
    for ep_n in range(n_episodes):
        episode = run_episode(model, n_steps)
        episode_batch.append(episode)
    rewards = [x[0] for x in episode_batch]
    print("Avg Reward", sum(rewards) / len(rewards))
    top_episodes = get_top_episodes(episode_batch, top_pct)
    x_train, y_train = get_shuffled_xy(top_episodes)

    model.fit(x_train, y_train, epochs=100, batch_size=32, verbose=0)

Avg Reward -188.9136845986024
Avg Reward -189.2402057827802
Avg Reward -189.49895020765464
Avg Reward -189.6857238656394
Avg Reward -189.93705159623877
Avg Reward -189.14751161874483
Avg Reward -187.12480402563335
Avg Reward -188.0389604024254
Avg Reward -187.90269768764284
Avg Reward -187.65272190429567
