# Evolution strategies on OpenAI gym environment

In [30]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

from time import time
from datetime import timedelta
from functools import partial

## Create environment and specification

In [3]:
from lagom.envs import make_gym_env
from lagom.envs import make_vec_env
from lagom.envs import EnvSpec
from lagom.envs.vec_env import SerialVecEnv


env = make_vec_env(vec_env_class=SerialVecEnv, 
                   make_env=make_gym_env, 
                   env_id='CartPole-v1', 
                   num_env=1, 
                   init_seed=0)
env_spec = EnvSpec(env)
env_spec

<EnvSpec, <SerialVecEnv: CartPole-v1, n: 1>>
	Observation space: Box(4,)
	Action space: Discrete(2)
	Control type: Discrete
	T: 500
	Max episode reward: 475.0
	Reward range: (-inf, inf)

## Create policy network

In [8]:
from lagom.core.networks import BaseNetwork
from lagom.core.networks import make_fc
from lagom.core.networks import ortho_init

from lagom.core.policies import CategoricalPolicy
from lagom.core.policies import GaussianPolicy


class Network(BaseNetwork):
    def make_params(self, config):
        self.layers = make_fc(input_dim=env_spec.observation_space.flat_dim, hidden_sizes=[32])
        self.last_feature_dim = 32

    def init_params(self, config):
        for layer in self.layers:
            ortho_init(layer, nonlinearity='relu', constant_bias=0.0)

    def forward(self, x):
        for layer in self.layers:
            x = F.relu(layer(x))

        return x

network = Network(config=None, env_spec=env_spec)
if env_spec.control_type == 'Discrete':
    policy = CategoricalPolicy(config=None, network=network, env_spec=env_spec)
elif env_spec.control_type == 'Continuous':
    policy = GaussianPolicy(config=None, network=network, env_spec=env_spec)
    
policy

CategoricalPolicy
	Network: Network(
  (layers): ModuleList(
    (0): Linear(in_features=4, out_features=32, bias=True)
  )
  (action_head): Linear(in_features=32, out_features=2, bias=True)
)

In [15]:
num_params = policy.network.num_params
num_params

226

## Create ES agent

In [7]:
from lagom.agents import BaseAgent


class Agent(BaseAgent):
    def __init__(self, policy, config):
        super().__init__(config)
        
        self.policy = policy
        
    def choose_action(self, obs):
        obs = torch.from_numpy(np.asarray(obs)).float()  # already batched due to VecEnv
        
        out_policy = self.policy(obs)
        
        return out_policy

## Create evaluation function

In [37]:
from lagom.envs import EnvSpec
from lagom.runner import TrajectoryRunner


def eval_f(parameters, env, N, T):
    parameters = torch.from_numpy(parameters).float()
    env_spec = EnvSpec(env)
    
    # Make network and load parameters
    network = Network(config=None, env_spec=env_spec)
    network.from_vec(parameters)
    # Make policy
    if env_spec.control_type == 'Discrete':
        policy = CategoricalPolicy(config=None, network=network, env_spec=env_spec)
    elif env_spec.control_type == 'Continuous':
        policy = GaussianPolicy(config=None, network=network, env_spec=env_spec)
    # Make agent
    agent = Agent(policy=policy, config=None)
    
    # Create runner
    runner = TrajectoryRunner(agent=agent, env=env, gamma=1.0)
    
    # Take rollouts
    D = runner(N=N, T=T)
    
    # Calculate mean return (no discount)
    mean_return = np.mean([sum(trajectory.all_r) for trajectory in D])
    
    # Negate return to be objective value, because ES does minimization
    f = -mean_return
    
    return f

## Create master-worker classes for ES

In [38]:
from lagom.core.es import CMAES
from lagom.core.es import OpenAIES

from lagom.core.es import BaseESWorker
from lagom.core.es import BaseGymESMaster


class ESWorker(BaseESWorker):
    def f(self, solution, seed):
        solution, make_env = solution
        
        # Create the environment and set seed sent by master
        env = make_env(init_seed=seed)
        
        # Evaluate the solution
        function_value = eval_f(parameters=solution, env=env, N=5, T=50)
        
        return function_value
    

class ESMaster(BaseGymESMaster):
    def make_es(self):
        es = CMAES(mu0=[0]*num_params,
                   std0=0.5, 
                   popsize=12)
        
        return es
        
    def _process_es_result(self, result):
        best_f_val = result['best_f_val']
        if self.generation == 0 or (self.generation+1) % 100 == 0:
            best_return = -best_f_val  # negate to get back reward
            print(f'Best episode reward evalauted at generation {self.generation+1}: {best_return}')
            
        # Save the parameters in final generation
        if (self.generation+1) == self.num_iteration:
            np.save('trained_param', result['best_param'])

## Train policy by ES

In [None]:
t = time()

make_env = partial(make_vec_env, 
                   vec_env_class=SerialVecEnv, 
                   make_env=make_gym_env, 
                   env_id='CartPole-v1', 
                   num_env=1)

es = ESMaster(make_env=make_env,
              num_iteration=1000, 
              worker_class=ESWorker, 
              num_worker=12, 
              init_seed=0, 
              daemonic_worker=None)
es()

print(f'\nTotal time: {timedelta(seconds=round(time() - t))}')

(6_w,12)-aCMA-ES (mu_w=3.7,w_1=40%) in dimension 226 (seed=1055468, Fri Sep 14 16:06:52 2018)
Best episode reward evalauted at generation 1: 36.0


## Evaluate the trained policy

In [28]:
# Load saved parameter
parameters = np.load('trained_param.npy')

# Make environment
env = make_env(seed=None, monitor=True, monitor_dir='logs/')
        
# Evaluate the solution
function_value = rollout(parameters=parameters, 
                         env=env, 
                         N=1, 
                         T=50)
function_value

1.986847748979926