# Setup

## Imports

In [1]:
from time import time

import numpy as np
from tqdm import tqdm_notebook

OpenAI Gym Modules

In [2]:
import gym

Keras Modules

In [3]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [4]:
import keras.backend as K

from keras.models import Sequential
from keras.layers import Dense, Activation

Using TensorFlow backend.
  return f(*args, **kwds)


TensorFlow Modules

In [5]:
import tensorflow as tf

## Define Useful Features

In [6]:
def get_parameters():
    """Return every weight of the global ``model`` as one flat 1-D array.

    The arrays returned by ``model.get_weights()`` are flattened one by one
    and joined end to end, in the order ``get_weights()`` yields them.
    """
    return np.hstack([tensor.ravel() for tensor in model.get_weights()])

In [7]:
def set_parameters(parameters):
    """Load a flat parameter vector into the global ``model``.

    ``parameters`` is cut into consecutive chunks, one per entry of the
    global ``parameter_shapes``; each chunk is reshaped to that entry and the
    resulting list is handed to ``model.set_weights``.

    Args:
        parameters: 1-D array supporting slicing and ``.reshape``.
    """
    reshaped = []
    start = 0
    for shape in parameter_shapes:
        stop = start + np.prod(shape)
        reshaped.append(parameters[start:stop].reshape(shape))
        start = stop
    model.set_weights(reshaped)

In [8]:
def get_action(obs, probabilistic=False):
    """Pick an action for a single observation using the global ``model``.

    The interpretation of the network output depends on the global ``a``:

    * ``a == 1``: the single output value is returned as-is.
    * ``a == 2``: the single output unit is read as the probability of
      action 1. The greedy choice is action 1 when that probability exceeds
      0.5, otherwise action 0.
    * otherwise: the output is treated as a distribution over ``a`` actions;
      the greedy choice is its argmax.

    Args:
        obs: one observation (without batch dimension).
        probabilistic: when True, sample the action from the predicted
            distribution instead of taking the greedy choice.

    Returns:
        The raw output value for ``a == 1``, otherwise an action index.
    """
    action = model.predict(np.expand_dims(obs, 0))[0]

    if a == 1:
        return action[0]

    if a == 2:
        # The default binary head is a sigmoid, whose output is always
        # positive; thresholding at 0 therefore always selected action 1.
        # NOTE(review): get_model builds a linear Dense(1) head when hidden
        # layers are used, so the value is clipped into [0, 1] here to keep it
        # a valid probability in that case as well.
        p_one = float(np.clip(action[0], 0.0, 1.0))
        if probabilistic:
            # Same direction as the greedy rule: larger output -> action 1.
            return np.random.choice(2, p=[1.0 - p_one, p_one])
        return int(p_one > 0.5)

    if probabilistic:
        return np.random.choice(a, p=action)

    return action.argmax()

In [9]:
def sample(episodes=1, observe=False):
    """Play ``episodes`` full episodes in the global ``env`` and sum the rewards.

    Actions are chosen with ``get_action`` (greedy mode). Uses the classic
    Gym API where ``env.step`` returns a 4-tuple ``(obs, reward, done, info)``.

    Args:
        episodes: number of episodes to play.
        observe: when True, ``env.render()`` is called before every step.

    Returns:
        The total reward accumulated over all episodes.
    """
    epoch_reward = 0
    for _ in range(episodes):
        # Start from the observation the environment actually reset to.
        # Previously the first action was chosen from a random draw of the
        # observation space, unrelated to the real initial state.
        obs = env.reset()
        done = False
        episode_reward = 0
        while not done:
            if observe:
                env.render()
            obs, r, done, _info = env.step(get_action(obs))
            episode_reward += r

        epoch_reward += episode_reward
    return epoch_reward

In [10]:
# Environment shared by every rollout in this notebook (read as a global by `sample`).
env = gym.make('CartPole-v1')

In [11]:
# n: length of the observation vector; used as the network's input dimension.
n = env.observation_space.shape[0]
# a: number of actions / action outputs. `a` first holds the action space's
# shape tuple; when that tuple is empty the space's `.n` attribute is used,
# otherwise the first entry of the shape.
a = env.action_space.shape
a = env.action_space.n if len(a) == 0 else a[0]

In [12]:
# Number of episodes played for each evaluation of a candidate parameter vector.
num_episodes = 100

# Create Model

In [13]:
def get_model(hidden_features=[], activation='relu', use_gpu=False):
    """Build a fresh fully-connected Keras policy network.

    The Keras session is cleared and the learning phase is set to inference
    before the layers are created. Input size comes from the global ``n`` and
    the number of actions from the global ``a``.

    Args:
        hidden_features: widths of the hidden Dense layers, in order. An
            empty sequence yields a single-layer model.
        activation: activation used by the hidden layers.
        use_gpu: currently unused (only referenced by the disabled session
            configuration below).

    Returns:
        A ``Sequential`` model. For ``a == 2`` the output layer has a single
        unit; for ``a > 2`` a softmax activation layer is appended.

    NOTE(review): for ``a == 2`` the single-layer model uses a sigmoid output,
    while the model with hidden layers ends in a linear ``Dense(1)``.
    """
    K.clear_session()
    # Explicit session configuration, currently disabled:
    #   config = tf.ConfigProto(intra_op_parallelism_threads=4,
    #                           inter_op_parallelism_threads=4, allow_soft_placement=True,
    #                           device_count = {'CPU' : 1, 'GPU' : int(use_gpu)})
    #   session = tf.Session(config=config)
    #   K.set_session(session)
    K.set_learning_phase(False)

    binary = a == 2
    # A two-action problem is modelled with one output unit instead of two.
    out_units = 1 if binary else a

    if len(hidden_features) == 0:
        # Single layer mapping observations straight to the output.
        head_activation = 'sigmoid' if binary else None
        stack = [Dense(out_units, activation=head_activation, input_dim=n, name='fc')]
    else:
        stack = [Dense(hidden_features[0], activation=activation, input_dim=n, name='fc1')]
        stack += [
            Dense(width, activation=activation, name='fc' + str(depth))
            for depth, width in enumerate(hidden_features[1:], start=2)
        ]
        stack.append(Dense(out_units, name='fc' + str(len(hidden_features) + 1)))

    if a > 2:
        stack.append(Activation('softmax', name='softmax'))
    return Sequential(stack)

In [14]:
# Build the default policy network (no hidden layers) and print its layer summary.
model = get_model()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
fc (Dense)                   (None, 1)                 5         
Total params: 5
Trainable params: 5
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Shape of each weight array, in `model.get_weights()` order; `set_parameters`
# uses these to cut a flat vector back into per-layer arrays.
parameter_shapes = [p.shape for p in model.get_weights()]
# Total number of scalar parameters across all weight arrays.
num_parameters = sum([np.prod(s) for s in parameter_shapes])

# Evolutionary Algorithms

## Define Base Features

In [16]:
def evaluate(parameters):
    """Score one candidate parameter vector.

    The vector is loaded into the model via ``set_parameters``, then
    ``num_episodes`` episodes are played with ``sample``. The wall-clock
    duration of the whole evaluation is printed.

    Args:
        parameters: flat parameter vector accepted by ``set_parameters``.

    Returns:
        The total reward returned by ``sample``.
    """
    began = time()
    set_parameters(parameters)
    total_reward = sample(num_episodes)
    elapsed = time() - began
    print('Time for evaluation:', elapsed, 's')
    return total_reward

## Random Search

Searches randomly and selects the best performing model

In [17]:
class RandomSearch():
    """Random search: draw parameter vectors at random and keep the best one.

    Each epoch draws one candidate from ``param_generator``, scores it with
    the global ``evaluate`` and remembers it if its reward beats the best
    seen so far.

    Attributes:
        param_generator: zero-argument callable returning a candidate.
        best_parameters: best candidate found so far (a random draw initially).
        best_reward: total reward of ``best_parameters`` (``-inf`` initially).
        epochs_run: number of candidates evaluated so far.
        show_progress: when True, one rendered episode is played after
            every improvement.
        progress: list of ``(epoch, reward)`` pairs, one per improvement.
    """

    def __init__(self, param_generator=None, show_progress=False):
        """Set up the search.

        Args:
            param_generator: callable producing a candidate parameter vector;
                defaults to a standard-normal vector of length
                ``num_parameters``.
            show_progress: render one episode whenever a new best is found.
        """
        if param_generator is None:
            param_generator = lambda: np.random.randn(num_parameters)
        self.param_generator = param_generator
        self.best_parameters = param_generator()
        self.best_reward = -np.inf
        self.epochs_run = 0
        self.show_progress = show_progress
        self.progress = []

    def search(self, num_epochs=-1, max_episode_reward=None, verbose=False, timeout=600):
        """Run the search and load the best parameters into the model.

        Args:
            num_epochs: number of candidates to evaluate. With ``-1`` the
                search runs until the average episode reward reaches
                ``max_episode_reward`` or the timeout expires.
            max_episode_reward: target average reward per episode; only used
                when ``num_epochs == -1``. ``None`` means "no target", i.e.
                the search stops on timeout only.
            verbose: print the average reward on every improvement.
            timeout: wall-clock budget in seconds, checked after each epoch.
        """
        start_time = time()
        if num_epochs == -1:
            # Comparing a float with None raises TypeError, so a missing
            # target is treated as unreachable and only the timeout stops us.
            target = np.inf if max_episode_reward is None else max_episode_reward
            while self.best_reward / num_episodes < target:
                self.__search(verbose)
                if time() - start_time > timeout:
                    break
        else:
            for _ in tqdm_notebook(range(num_epochs)):
                # Forward `verbose`; it used to be dropped in this branch.
                self.__search(verbose)
                if time() - start_time > timeout:
                    break
        # The model holds the last evaluated candidate; restore the best one.
        set_parameters(self.best_parameters)

    def __search(self, verbose=False):
        """Evaluate one random candidate and record it if it is a new best."""
        parameters = self.param_generator()
        reward = evaluate(parameters)
        self.epochs_run += 1

        if reward > self.best_reward:
            self.best_reward = reward
            self.best_parameters = parameters

            if verbose:
                print('Average reward:', int(reward / num_episodes))
            self.progress.append((self.epochs_run, reward))
            if self.show_progress:
                sample(observe=True)

In [18]:
# Random-search optimizer with the default standard-normal parameter generator; no rendering.
optimizer = RandomSearch(show_progress=False)

In [19]:
# Evaluate 10 random candidates, then load the best one found into `model`.
optimizer.search(num_epochs=10)

Time for evaluation: 0.013402223587036133 s
Time for evaluation: 0.008942365646362305 s
Time for evaluation: 0.011004209518432617 s
Time for evaluation: 0.010904073715209961 s
Time for evaluation: 0.009038686752319336 s
Time for evaluation: 0.008856534957885742 s
Time for evaluation: 0.009875059127807617 s
Time for evaluation: 0.008667707443237305 s
Time for evaluation: 0.01014852523803711 s
Time for evaluation: 0.009482145309448242 s



In [20]:
#plt.plot(np.array(optimizer.progress)[:, 0], np.array(optimizer.progress)[:, 1] / num_episodes)

In [21]:
#sample(observe=True)
#env.render(close=True)