In [2]:
import numpy as np
import gym
from gym.spaces import Discrete, Box

# ================================================================
# Policies
# ================================================================

class DeterministicDiscreteActionLinearPolicy(object):
    """Linear policy for discrete actions: picks the arg-max of ob.W + b."""

    def __init__(self, theta, ob_space, ac_space):
        """
        theta: flat parameter vector; the leading dim_ob * n_actions entries
            are the weight matrix, the trailing n_actions entries the bias
        ob_space: observation space; shape[0] is used as the observation dimension
        ac_space: action space; n is used as the number of actions
        """
        n_inputs = ob_space.shape[0]
        n_choices = ac_space.n
        assert len(theta) == (n_inputs + 1) * n_choices
        # Everything before `split` is the weight matrix, the rest is the bias.
        split = n_inputs * n_choices
        self.W = theta[:split].reshape((n_inputs, n_choices))
        self.b = theta[split:].reshape((1, n_choices))

    def act(self, ob):
        """
        Return the index of the highest-scoring action for observation `ob`.
        """
        scores = ob.dot(self.W) + self.b
        return scores.argmax()

class DeterministicContinuousActionLinearPolicy(object):
    """Linear policy for continuous actions: ob.W + b clipped to the action bounds."""

    def __init__(self, theta, ob_space, ac_space):
        """
        theta: flat parameter vector; the leading dim_ob * dim_ac entries are
            the weight matrix, the trailing dim_ac entries the bias
        ob_space: observation space; shape[0] is used as the observation dimension
        ac_space: action space; shape[0] is the action dimension, and its
            low / high attributes bound the returned action
        """
        n_inputs = ob_space.shape[0]
        n_outputs = ac_space.shape[0]
        assert len(theta) == (n_inputs + 1) * n_outputs
        # Kept so that act() can clip against the space's low/high bounds.
        self.ac_space = ac_space
        split = n_inputs * n_outputs
        self.W = theta[:split].reshape((n_inputs, n_outputs))
        self.b = theta[split:]

    def act(self, ob):
        """Return the linear action for observation `ob`, clipped to the action space bounds."""
        raw_action = ob.dot(self.W) + self.b
        bounds = self.ac_space
        return np.clip(raw_action, bounds.low, bounds.high)

def do_episode(policy, env, num_steps, render=False):
    """
    Run `policy` in `env` for at most `num_steps` steps and return the summed reward.

    policy: object exposing act(ob) -> action
    env: object exposing reset(), step(action) and render()
    num_steps: upper bound on the number of steps taken
    render: when true, env.render() is called on every third step

    The episode stops early as soon as env.step reports done.
    """
    episode_return = 0
    observation = env.reset()
    for step in range(num_steps):
        action = policy.act(observation)
        observation, reward, done, _ = env.step(action)
        episode_return += reward
        # Rendering happens after the step and before the termination check.
        if render and step % 3 == 0:
            env.render()
        if done:
            break
    return episode_return

# Placeholder binding; the task-settings cell rebinds `env` to a real environment.
env = None
def noisy_evaluation(theta):
    """
    Score the parameter vector `theta` by running a single episode.

    Builds the policy for `theta` with make_policy and runs it with do_episode
    on the module-level `env` for at most the module-level `num_steps` steps.
    Returns that episode's total reward.
    """
    return do_episode(make_policy(theta), env, num_steps)

def make_policy(theta):
    """
    Build the linear policy that matches the action space of the module-level `env`.

    theta: flat parameter vector handed to the policy constructor

    Returns a DeterministicDiscreteActionLinearPolicy for a Discrete action
    space and a DeterministicContinuousActionLinearPolicy for a Box action
    space. Raises NotImplementedError for any other action space.
    """
    action_space = env.action_space
    if isinstance(action_space, Discrete):
        policy_cls = DeterministicDiscreteActionLinearPolicy
    elif isinstance(action_space, Box):
        policy_cls = DeterministicContinuousActionLinearPolicy
    else:
        raise NotImplementedError
    return policy_cls(theta, env.observation_space, action_space)

In [3]:
# --- Task settings ---
env = gym.make('CartPole-v0')  # swap in another environment id as needed
num_steps = 500  # cap on the number of steps in one episode

# --- Algorithm settings ---
n_iter = 100       # how many CEM iterations to run
batch_size = 25    # parameter vectors sampled per iteration
elite_frac = 0.2   # top fraction of each batch kept as the elite set

# theta stores one weight per (observation dimension, output) pair plus one
# bias per output; the number of outputs depends on the action space type.
if isinstance(env.action_space, Discrete):
    dim_theta = env.action_space.n * (env.observation_space.shape[0] + 1)
elif isinstance(env.action_space, Box):
    dim_theta = env.action_space.shape[0] * (env.observation_space.shape[0] + 1)
else:
    raise NotImplementedError

# Initial search distribution: zero mean, unit standard deviation per parameter
theta_mean = np.zeros(dim_theta)
theta_std = np.ones_like(theta_mean)

[2016-07-07 11:14:53,265] Making new env: CartPole-v0


In [7]:
# Scratch check: without an `axis` argument the mean is taken over every
# element of the nested list, giving a single scalar.
s = [[2, 3, 4], [3, 5, 6]]
np.asarray(s).mean()

3.8333333333333335

In [8]:
# Now, for the algorithm
# Each CEM iteration samples a batch of parameter vectors, evaluates them,
# keeps the best-scoring ones, and refits the sampling distribution to them.
for iteration in range(n_iter):
    # Sample parameter vectors

    #=======YOUR CODES HERE==========
    thetas = [theta_mean + np.random.randn(dim_theta) * theta_std for _ in range(batch_size)]
    #================================

    rewards = [noisy_evaluation(theta) for theta in thetas]
    # Get elite parameters
    n_elite = int(batch_size * elite_frac)
    # argsort is ascending, so the last n_elite indices are the highest rewards.
    elite_inds = np.argsort(rewards)[batch_size - n_elite:batch_size]
    elite_thetas = [thetas[i] for i in elite_inds]
    # Update theta_mean, theta_std

    #=======YOUR CODES HERE==========
    theta_mean = np.mean(elite_thetas, axis=0)
    # axis=0 keeps one standard deviation per parameter, matching the shape
    # theta_std was initialised with. Omitting the axis would collapse it to a
    # single scalar pooled over all parameters.
    theta_std = np.std(elite_thetas, axis=0)
    #================================

    # Parenthesised single-argument print works on both Python 2 and Python 3.
    print("iteration %i. mean f: %8.3g. max f: %8.3g" % (iteration, np.mean(rewards), np.max(rewards)))
    # Render one episode with the updated mean parameters; the return value is unused.
    do_episode(make_policy(theta_mean), env, num_steps, render=True)

iteration 0. mean f:     17.4. max f:       87
iteration 1. mean f:     27.8. max f:      118
iteration 2. mean f:      104. max f:      500
iteration 3. mean f:     98.6. max f:      349
iteration 4. mean f:      138. max f:      500
iteration 5. mean f:      177. max f:      483
iteration 6. mean f:      238. max f:      500
iteration 7. mean f:      330. max f:      500
iteration 8. mean f:      324. max f:      500
iteration 9. mean f:      430. max f:      500
iteration 10. mean f:      461. max f:      500
iteration 11. mean f:      468. max f:      500
iteration 12. mean f:      482. max f:      500
iteration 13. mean f:      500. max f:      500
iteration 14. mean f:      500. max f:      500
iteration 15. mean f:      489. max f:      500
iteration 16. mean f:      500. max f:      500
iteration 17. mean f:      500. max f:      500
iteration 18. mean f:      500. max f:      500
iteration 19. mean f:      500. max f:      500
iteration 20. mean f:      499. max f:      500
it