# Setup

## Imports

In [1]:
from tqdm import tqdm_notebook

In [2]:
import gym

## Define Useful Features

In [3]:
env = gym.make('MountainCar-v0')

In [4]:
n = env.observation_space.shape[0] + 1
max_reward = -110

In [5]:
params = np.random.randn(n) * 100

In [6]:
best_params = params
best_reward = -np.inf

In [7]:
def get_action(obs):
    return 0 if obs.dot(params[1:].T) < params[0] else 1

In [8]:
def custom_done(obs, r, max_reward=10000):
    return np.any(obs > env.observation_space.high) or np.any(obs < env.observation_space.low) or r >= max_reward

In [9]:
def sample(episodes=1, observe=True, max_reward=10000):
    epoch_reward = 0
    for episode in range(episodes):
        env.reset()
        done = False
        obs = env.observation_space.sample()
        episode_reward = 0
        while not done:
            if observe: 
                env.render()
            obs, r, done, _ = env.step(get_action(obs)) # take a random action
            episode_reward += r
            if observe and max_reward > 0:
                done = custom_done(obs, episode_reward, max_reward)
                
        epoch_reward += episode_reward
        if observe and max_reward > 0:
            print('Epsiode', episode + 1, '\tReward: ', int(episode_reward), end='\r', flush=True)
    if observe:
        env.render(close=True)
        return
    return epoch_reward

In [10]:
def train(epochs=0, episodes=100, show_improvements=True):
    global params, best_params, best_reward
    
    def train_epoch():
        global params, best_params, best_reward
        params = np.random.randn(n) * 100
        epoch_reward = sample(episodes, False)

        if epoch_reward > best_reward:
            best_reward = epoch_reward
            best_params = params
            sample(max_reward=-1)
            return True
        return False
            
    if epochs <= 0:
        while best_reward / episodes < max_reward:
            if train_epoch():
                print('Average reward:', int(best_reward / episodes), end='\r', flush=True)
    else:
        for _ in tqdm_notebook(range(epochs)):
            train_epoch()

    params = best_params
    print('Average reward:', int(best_reward / episodes), end='\r', flush=True)

# Train

In [11]:
train(100)


Average reward: -200

# Sample

In [12]:
sample(max_reward=-1)