In [1]:
import numpy as np
import tensorflow as tf
import gym
from tensorflow.contrib.layers import fully_connected as fclayer

def discount(x, gamma):
    """
    Compute the discounted cumulative sum of x along its first axis.

    Given vector x, computes a vector y such that
    y[i] = x[i] + gamma * x[i+1] + gamma^2 x[i+2] + ...

    x may be any array-like with at least one dimension; for inputs with
    more than one dimension the recursion runs over axis 0.
    Returns a float64 array with the same shape as x (empty input gives an
    empty result).
    Raises ValueError if x is a scalar (zero-dimensional).
    """
    x = np.asarray(x)
    # Validate before doing any work, and with a real exception rather than
    # an assert (asserts are stripped under -O).
    if x.ndim < 1:
        raise ValueError("discount() expects an array with at least one dimension")
    out = np.zeros(x.shape, 'float64')
    if len(x) == 0:
        # Nothing to accumulate; out[-1] below would raise IndexError.
        return out
    out[-1] = x[-1]
    # Walk backwards so each entry reuses the already-discounted tail.
    # range (not xrange) keeps this runnable on both Python 2 and 3.
    for i in reversed(range(len(x) - 1)):
        out[i] = x[i] + gamma * out[i + 1]
    # More efficient version:
    # scipy.signal.lfilter([1],[1,-gamma],x[::-1], axis=0)[::-1]
    return out

def categorical_sample(prob_n):
    """
    Draw one class index from a categorical distribution.

    prob_n holds the class probabilities. A single uniform draw from
    np.random.rand() is compared against the running total of the
    probabilities (np.cumsum, which flattens its input), and the index of
    the first running total that exceeds the draw is returned.
    """
    cumulative = np.cumsum(np.asarray(prob_n))
    threshold = np.random.rand()
    exceeds = cumulative > threshold
    # argmax over booleans yields the position of the first True.
    return np.argmax(exceeds)


def get_traj(agent, env, episode_max_length, render=False):
    """
    Run agent-environment loop for one whole episode (trajectory)
    Return dictionary of results

    agent must provide act(ob); env must provide reset(), step(action)
    returning a 4-tuple (ob, reward, done, info), and render().
    The episode stops when env reports done or after episode_max_length
    steps, whichever comes first.

    The returned dict has three equal-length arrays:
      "ob"     - the observation the agent saw when choosing each action
      "action" - the action chosen at each step
      "reward" - the reward received for each action
    """
    ob = env.reset()
    obs = []
    acts = []
    rews = []
    # range (not xrange) keeps this runnable on both Python 2 and 3.
    for _ in range(episode_max_length):
        a = agent.act(ob)
        # Record the observation the action was conditioned on *before*
        # stepping; storing the post-step observation would pair every
        # action with the wrong state in the policy-gradient update.
        obs.append(ob)
        acts.append(a)
        (ob, rew, done, _info) = env.step(a)
        rews.append(rew)
        if done:
            break
        if render:
            env.render()
    return {"reward" : np.array(rews),
            "ob" : np.array(obs),
            "action" : np.array(acts)
            }

class REINFORCEAgent(object):

    """
    REINFORCE with baselines
    Currently just works for discrete action space

    The policy is a two-hidden-layer tanh network with a softmax output
    over the discrete actions. learn() repeatedly collects a batch of
    episodes, computes discounted returns, subtracts a time-dependent
    baseline and takes one RMSProp step on the policy-gradient loss.
    """

    # Default algorithm parameters. You can modify them by passing in keyword args.
    # gamma is a discount factor and must not exceed 1: a value above 1
    # makes returns grow exponentially with the episode horizon.
    # nhid is the width of both hidden layers.
    DEFAULT_CONFIG = dict(episode_max_length=200, timesteps_per_batch=10000, n_iter=50,
        gamma=0.99, stepsize=0.05, nhid=64)

    def __init__(self, ob_space, action_space, **usercfg):
        """
        Initialize your agent's parameters

        ob_space must expose shape (its first entry is the observation
        size) and action_space must expose n (the number of actions).
        Keyword arguments override entries of DEFAULT_CONFIG.
        Builds the TensorFlow graph, opens an InteractiveSession and
        initializes all variables.
        """
        n0 = ob_space.shape[0]
        nA = action_space.n
        # Copy so per-instance overrides never leak into the class defaults.
        self.config = dict(self.DEFAULT_CONFIG)
        self.config.update(usercfg)
        nhid = self.config['nhid']

        self.sess = tf.InteractiveSession()

        # Batch inputs: observations, chosen actions, and advantages.
        self.ob_no = tf.placeholder(tf.float32, shape=(None, n0))
        self.a_n = tf.placeholder(tf.int32, shape=(None,))
        self.adv_n = tf.placeholder(tf.float32, shape=(None,))

        h1 = fclayer(inputs=self.ob_no, num_outputs=nhid, activation_fn=tf.nn.tanh,
                weights_initializer=tf.contrib.layers.xavier_initializer())

        h2 = fclayer(inputs=h1, num_outputs=nhid, activation_fn=tf.nn.tanh,
                weights_initializer=tf.contrib.layers.xavier_initializer())

        # Per-observation action probabilities.
        self.prob_na = fclayer(inputs=h2, num_outputs=nA, activation_fn=tf.nn.softmax,
                weights_initializer=tf.contrib.layers.xavier_initializer())

        # Select the probability of the action actually taken, then weight
        # its log by the advantage; minimizing the negated mean ascends
        # the policy-gradient objective.
        a_n_onehot = tf.one_hot(self.a_n, depth=nA)
        logp_n = tf.log(tf.reduce_sum(self.prob_na * a_n_onehot, reduction_indices=1))
        self.loss = -tf.reduce_mean(logp_n * self.adv_n)

        stepsize = self.config['stepsize']
        self.train_op = tf.train.RMSPropOptimizer(stepsize).minimize(self.loss)

        self.sess.run(tf.initialize_all_variables())

    def act(self, ob):
        """
        Sample an action for a single observation from the current policy.
        """
        # Run on this agent's own session instead of relying on whichever
        # session happens to be the default.
        prob = self.sess.run(self.prob_na, feed_dict={self.ob_no: [ob]})
        # The batch has one row; sample from that row's probabilities.
        action = categorical_sample(prob[0])
        return action

    @staticmethod
    def _compute_advantages(rets):
        """
        Subtract a time-dependent baseline from each episode's returns.

        rets is a list of 1-D return arrays, one per episode. Shorter
        episodes are zero-padded to the longest length, the baseline at
        each timestep is the mean over episodes, and each episode gets
        back its returns minus the matching slice of the baseline.
        """
        maxlen = max(len(ret) for ret in rets)
        padded_rets = [np.concatenate([ret, np.zeros(maxlen-len(ret))]) for ret in rets]
        baseline = np.mean(padded_rets, axis=0)
        return [ret - baseline[:len(ret)] for ret in rets]

    def learn(self, env):
        """
        Train the policy on env for config["n_iter"] iterations.

        Each iteration gathers episodes until at least
        config["timesteps_per_batch"] timesteps are collected, performs one
        policy-gradient update, prints batch statistics and then runs one
        rendered episode.
        """
        cfg = self.config

        # range (not xrange) keeps this runnable on both Python 2 and 3.
        for iteration in range(cfg["n_iter"]):
            # Collect trajectories until we get timesteps_per_batch total timesteps
            trajs = []
            timesteps_total = 0
            while timesteps_total < cfg["timesteps_per_batch"]:
                traj = get_traj(self, env, cfg["episode_max_length"])
                trajs.append(traj)
                timesteps_total += len(traj["reward"])

            all_ob = np.concatenate([traj["ob"] for traj in trajs])
            # Compute discounted sums of rewards
            rets = [discount(traj["reward"], cfg["gamma"]) for traj in trajs]
            # Compute advantage function using a time-dependent baseline
            advs = self._compute_advantages(rets)
            all_action = np.concatenate([traj["action"] for traj in trajs])
            all_adv = np.concatenate(advs)
            # Do policy gradient update step
            self.sess.run(self.train_op, feed_dict={
                    self.ob_no: all_ob,
                    self.a_n: all_action,
                    self.adv_n: all_adv
                })

            eprews = np.array([traj["reward"].sum() for traj in trajs]) # episode total rewards
            eplens = np.array([len(traj["reward"]) for traj in trajs]) # episode lengths
            # Print stats (single-argument print() behaves the same on Python 2 and 3)
            print("-----------------")
            print("Iteration: \t %i"%iteration)
            print("NumTrajs: \t %i"%len(eprews))
            print("NumTimesteps: \t %i"%np.sum(eplens))
            print("MaxRew: \t %s"%eprews.max())
            print("MeanRew: \t %s +- %s"%(eprews.mean(), eprews.std()/np.sqrt(len(eprews))))
            print("MeanLen: \t %s +- %s"%(eplens.mean(), eplens.std()/np.sqrt(len(eplens))))
            print("-----------------")
            # Show one episode with the updated policy.
            get_traj(self, env, cfg["episode_max_length"], render=True)

def main():
    """
    Create the Acrobot-v0 environment, build a REINFORCE agent whose
    episode length limit is taken from the environment spec, and train it.
    """
    environment = gym.make("Acrobot-v0")
    step_cap = environment.spec.timestep_limit
    learner = REINFORCEAgent(
        environment.observation_space,
        environment.action_space,
        episode_max_length=step_cap,
    )
    learner.learn(environment)


# Only start training when executed as a script, not when imported.
if __name__ == "__main__":
    main()

[2016-07-07 18:01:50,553] Making new env: Acrobot-v0


-----------------
Iteration: 	 0
NumTrajs: 	 50
NumTimesteps: 	 10000
MaxRew: 	 -200.0
MeanRew: 	 -200.0 +- 0.0
MeanLen: 	 200.0 +- 0.0
-----------------
-----------------
Iteration: 	 1
NumTrajs: 	 50
NumTimesteps: 	 10000
MaxRew: 	 -200.0
MeanRew: 	 -200.0 +- 0.0
MeanLen: 	 200.0 +- 0.0
-----------------
-----------------
Iteration: 	 2
NumTrajs: 	 50
NumTimesteps: 	 10000
MaxRew: 	 -200.0
MeanRew: 	 -200.0 +- 0.0
MeanLen: 	 200.0 +- 0.0
-----------------
-----------------
Iteration: 	 3
NumTrajs: 	 50
NumTimesteps: 	 10000
MaxRew: 	 -200.0
MeanRew: 	 -200.0 +- 0.0
MeanLen: 	 200.0 +- 0.0
-----------------
-----------------
Iteration: 	 4
NumTrajs: 	 50
NumTimesteps: 	 10000
MaxRew: 	 -200.0
MeanRew: 	 -200.0 +- 0.0
MeanLen: 	 200.0 +- 0.0
-----------------
-----------------
Iteration: 	 5
NumTrajs: 	 50
NumTimesteps: 	 10000
MaxRew: 	 -200.0
MeanRew: 	 -200.0 +- 0.0
MeanLen: 	 200.0 +- 0.0
-----------------
-----------------
Iteration: 	 6
NumTrajs: 	 50
NumTimesteps: 	 10000
MaxRew

KeyboardInterrupt: 