## Advantage actor-critic in AgentNet (5 pts)

Once we're done with REINFORCE, it's time to proceed with something more sophisticated.
The next one in line is advantage actor-critic, in which agent learns both policy and value function, using the latter to speed up learning.

Your main objective for this session is to... beat MountainCar-v0... with actor-critic.

Beating means making submission to [gym leaderboard](https://gym.openai.com/envs/MountainCar-v0).

``` MountainCar-v0 defines "solving" as getting average reward of -110.0 over 100 consecutive trials. ```


In [1]:
%env THEANO_FLAGS=device=gpu0
%env THEANO_FLAGS='floatX=float32'
import os
if type(os.environ.get("DISPLAY")) is not str or len(os.environ.get("DISPLAY"))==0:
    !bash ../xvfb start
    %env DISPLAY=:1
        
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline


env: THEANO_FLAGS=device=gpu0
env: THEANO_FLAGS='floatX=float32'


In [2]:
import gym

env = gym.make("BipedalWalker-v2").env
obs = env.reset()
state_size = len(obs)

#print(dir(env.action_space))
#print(env.action_space.high)
#n_actions = env.action_space.n
actions_low = env.action_space.low
actions_high = env.action_space.high
action_space = env.action_space
print(obs)

[  2.74741463e-03  -1.45178684e-05   1.12934511e-03  -1.59999001e-02
   9.19512808e-02  -1.49035698e-03   8.60273570e-01   2.50423606e-03
   1.00000000e+00   3.23557667e-02  -1.49025198e-03   8.53825212e-01
   1.05399243e-03   1.00000000e+00   4.40814018e-01   4.45820123e-01
   4.61422771e-01   4.89550203e-01   5.34102798e-01   6.02461040e-01
   7.09148884e-01   8.85931849e-01   1.00000000e+00   1.00000000e+00]


# Basic agent setup
Here we define a simple agent that maps game images into Qvalues using shallow neural network.


In [3]:
import lasagne
from lasagne.layers import InputLayer,DenseLayer,NonlinearityLayer,batch_norm,dropout
#image observation at current tick goes here, shape = (sample_i,x,y,color)
observation_layer = InputLayer((None,state_size))

nn = observation_layer


In [4]:
#a layer that predicts Qvalues
print(action_space.shape[0])
policy_layer = DenseLayer(nn, action_space.shape[0], nonlinearity=lasagne.nonlinearities.softmax)

V_layer = DenseLayer(nn, 1, nonlinearity=None)

4


In [5]:
from agentnet.resolver.base import BaseResolver
import theano.tensor as T
import theano.tensor.shared_randomstreams as random_streams

class ProbabilisticResolver(BaseResolver):


    def __init__(self, incoming, assume_normalized=False, seed=1234, output_dtype='int32',
                 name='ProbabilisticResolver'):
        self.assume_normalized = assume_normalized
        self.rng = random_streams.RandomStreams(seed)
        super(ProbabilisticResolver, self).__init__(incoming, name=name,output_dtype=output_dtype)

    def get_output_for(self, policy, greedy=False, **kwargs):
        #print(dir(policy))
        #print(policy.shape)
        if greedy:
            chosen_action_ids = policy
        else:
            batch_size, n_actions = policy.shape
            print(batch_size, n_actions)
            chosen_action_ids = (policy + self.rng.normal(size=policy.shape))
            #chosen_action_ids = T.min(chosen_action_ids, T.ones_like(chosen_action_ids))
            #chosen_action_ids = T.max(chosen_action_ids, -T.ones_like(chosen_action_ids))

        return chosen_action_ids


action_layer = ProbabilisticResolver(policy_layer,
                                     name="e-greedy action picker",
                                     assume_normalized=True)

##### Finally, agent
We declare that this network is and MDP agent with such and such inputs, states and outputs

In [6]:
from agentnet.agent import Agent
#all together
agent = Agent(observation_layers=observation_layer,
              policy_estimators=(policy_layer,V_layer),
              action_layers=action_layer)


In [7]:
#Since it's a single lasagne network, one can get it's weights, output, etc
weights = lasagne.layers.get_all_params((action_layer,V_layer),trainable=True)
weights

[W, b, W, b]

# Create and manage a pool of atari sessions to play with

* To make training more stable, we shall have an entire batch of game sessions each happening independent of others
* Why several parallel agents help training: http://arxiv.org/pdf/1602.01783v1.pdf
* Alternative approach: store more sessions: https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf

In [8]:
from gym.wrappers.time_limit import TimeLimit

class env_wrapper(object):
    def __init__(self, name, t_max):
        self.name = name
        self.t_max = t_max
    def __call__(self):
        env = gym.make(self.name).env
        env = TimeLimit(env, max_episode_steps=self.t_max)
        return env

In [13]:

"""
A thin wrapper for openAI gym environments that maintains a set of parallel games and has a method to generate
interaction sessions given agent one-step applier function.
"""

import numpy as np
from agentnet.utils.layers import get_layer_dtype
from agentnet.environment import SessionPoolEnvironment
from warnings import warn
import gym
from gym.wrappers import Monitor



def GamePool(*args, **kwargs):
    raise ValueError("Deprecated. Use EnvPool(agent,env_title,n_parallel_agents) instead")


deprecated_preprocess_obs = lambda obs: obs

# A whole lot of space invaders
class EnvPool(object):
    def __init__(self, agent, make_env=lambda: gym.make("SpaceInvaders-v0"), n_games=1, max_size=None,
                 preprocess_observation=deprecated_preprocess_obs, agent_step=None):
        """A pool that stores several
           - game states (gym environment)
           - prev observations - last agent observations
           - prev memory states - last agent hidden states
        and is capable of some auxilary actions like evaluating agent on one game session (See .evaluate()).
        :param agent: Agent which interacts with the environment.
        :type agent: agent.Agent
        :param make_env: Factory that produces environments OR a name of the gym environment.
                See gym.envs.registry.all()
        :type make_env: function or str
        :param n_games: Number of parallel games. One game by default.
        :type n_games: int
        :param max_size: Max pool size by default (if appending sessions). By default, pool is not constrained in size.
        :type max_size: int
        :param preprocess_observation: Function for preprocessing raw observations from gym env to agent format.
            By default it is identity function.
        :type preprocess_observation: function
        :param agent_step: Function with the same signature as agent.get_react_function().
        :type agent_step: theano.function
        """
        if not callable(make_env):
            env_name = make_env
            make_env = lambda: gym.make(env_name)

        ##Deprecation warning
        if preprocess_observation != deprecated_preprocess_obs:
            warn("preprocess_observation is deprecated (will be removed in 0.11). Use gym.core.Wrapper instead.")

        # Create atari games.
        self.make_env = make_env
        self.envs = [self.make_env() for _ in range(n_games)]
        self.preprocess_observation = preprocess_observation

        # Initial observations.
        self.prev_observations = [self.preprocess_observation(make_env.reset()) for make_env in self.envs]

        # Agent memory variables (if you use recurrent networks).
        self.prev_memory_states = [np.zeros((n_games,) + tuple(mem.output_shape[1:]),
                                            dtype=get_layer_dtype(mem))
                                   for mem in agent.agent_states]

        # Save agent.
        self.agent = agent
        self.agent_step = agent_step or agent.get_react_function()

        # Create experience replay environment.
        self.experience_replay = SessionPoolEnvironment(observations=agent.observation_layers,
                                                        actions=agent.action_layers,
                                                        agent_memories=agent.agent_states)
        self.max_size = max_size

        # Whether particular session has just been terminated and needs restarting.
        self.just_ended = [False] * len(self.envs)

    def interact(self, n_steps=100, verbose=False, add_last_observation=True):
        """Generate interaction sessions with ataries (openAI gym atari environments)
        Sessions will have length n_steps. Each time one of games is finished, it is immediately getting reset
        and this time is recorded in is_alive_log (See returned values).
        :param n_steps: Length of an interaction.
        :param verbose: If True, prints small debug message whenever a game gets reloaded after end.
        :param add_last_observation: If True, appends the final state with
                state=final_state,
                action=-1,
                reward=0,
                new_memory_states=prev_memory_states, effectively making n_steps-1 records.
        :returns: observation_log, action_log, reward_log, [memory_logs], is_alive_log, info_log
        :rtype: a bunch of tensors [batch, tick, size...],
                the only exception is info_log, which is a list of infos for [time][batch], None padded tick
        """

        def env_step(i, action):
            """Environment reaction.
            :returns: observation, reward, is_alive, info
            """

            if not self.just_ended[i]:
                new_observation, cur_reward, is_done, info = self.envs[i].step(action)
                if is_done:
                    # Game ends now, will finalize on next tick.
                    self.just_ended[i] = True
                new_observation = self.preprocess_observation(new_observation)

                # note: is_alive=True in any case because environment is still alive (last tick alive) in our notation.
                return new_observation, cur_reward, True, info
            else:
                # Reset environment, get new observation to be used on next tick.
                new_observation = self.preprocess_observation(self.envs[i].reset())

                # Reset memory for new episode.
                for m_i in range(len(new_memory_states)):
                    new_memory_states[m_i][i] = 0

                if verbose:
                    print("env %i reloaded" % i)

                self.just_ended[i] = False

                return new_observation, 0, False, {'end': True}

        history_log = []

        for i in range(n_steps - int(add_last_observation)):
            res = self.agent_step(self.prev_observations, *self.prev_memory_states)
            actions, new_memory_states = res[0], res[1:]

            new_observations, cur_rewards, is_alive, infos = zip(*map(env_step, range(len(self.envs)), actions))

            # Append data tuple for this tick.
            history_log.append((self.prev_observations, actions, cur_rewards, new_memory_states, is_alive, infos))

            self.prev_observations = new_observations
            self.prev_memory_states = new_memory_states

        if add_last_observation:
            fake_actions = np.array([env.action_space.sample() for env in self.envs])
            fake_rewards = np.zeros(shape=len(self.envs))
            fake_is_alive = np.ones(shape=len(self.envs))
            history_log.append((self.prev_observations, fake_actions, fake_rewards, self.prev_memory_states,
                                fake_is_alive, [None] * len(self.envs)))

        # cast to numpy arrays
        observation_log, action_log, reward_log, memories_log, is_alive_log, info_log = zip(*history_log)

        # tensor dimensions
        # [batch_i, time_i, observation_size...]
        observation_log = np.array(observation_log).swapaxes(0, 1)

        # [batch, time, units] for each memory tensor
        memories_log = list(map(lambda mem: np.array(mem).swapaxes(0, 1), zip(*memories_log)))

        # [batch_i,time_i]
        action_log = np.array(action_log).swapaxes(0, 1)

        # [batch_i, time_i]
        reward_log = np.array(reward_log).swapaxes(0, 1)

        # [batch_i, time_i]
        is_alive_log = np.array(is_alive_log).swapaxes(0, 1).astype('uint8')

        return observation_log, action_log, reward_log, memories_log, is_alive_log, info_log

    def update(self, n_steps=100, append=False, max_size=None, add_last_observation=True,
               preprocess=lambda observations, actions, rewards, is_alive, h0: (
                       observations, actions, rewards, is_alive, h0)):
        """Create new sessions and add them into the pool.
        :param n_steps: How many time steps in each session.
        :param append: If True, appends sessions to the pool and crops at max_size.
            Otherwise, old sessions will be thrown away entirely.
        :param max_size: If not None, substitutes default max_size (from __init__) for this update only.
        :param add_last_observation: See param `add_last_observation` in `.interact()` method.
        :param preprocess: Function that implements arbitrary processing of the sessions.
            Takes AND outputs (observation_tensor, action_tensor, reward_tensor, is_alive_tensor, preceding_memory_states).
            For param specs see `.interact()` output format.
        """

        preceding_memory_states = list(self.prev_memory_states)

        # Get interaction sessions.
        observation_tensor, action_tensor, reward_tensor, _, is_alive_tensor, _ = self.interact(n_steps=n_steps,
                                                                                                add_last_observation=add_last_observation)

        observation_tensor, action_tensor, reward_tensor, is_alive_tensor, preceding_memory_states = \
            preprocess(observation_tensor, action_tensor, reward_tensor, is_alive_tensor, preceding_memory_states)

        # Load them into experience replay environment.
        if not append:
            self.experience_replay.load_sessions(observation_tensor, action_tensor, reward_tensor,
                                                 is_alive_tensor, preceding_memory_states)
        else:
            self.experience_replay.append_sessions(observation_tensor, action_tensor, reward_tensor,
                                                   is_alive_tensor, preceding_memory_states,
                                                   max_pool_size=max_size or self.max_size)

    def evaluate(self, n_games=1, save_path="./records", use_monitor=True, record_video=True, verbose=True,
                 t_max=100000):
        """Plays an entire game start to end, records the logs(and possibly mp4 video), returns reward.
        :param save_path: where to save the report
        :param record_video: if True, records mp4 video
        :return: total reward (scalar)
        """
        env = self.make_env()

        if not use_monitor and record_video:
            raise warn("Cannot video without gym monitor. If you still want video, set use_monitor to True")

        if record_video :
            env = Monitor(env,save_path,force=True)
        elif use_monitor:
            env = Monitor(env, save_path, video_callable=lambda i: False, force=True)

        game_rewards = []
        for _ in range(n_games):
            # initial observation
            observation = env.reset()
            # initial memory
            prev_memories = [np.zeros((1,) + tuple(mem.output_shape[1:]),
                                      dtype=get_layer_dtype(mem))
                             for mem in self.agent.agent_states]

            t = 0
            total_reward = 0
            while True:

                res = self.agent_step(self.preprocess_observation(observation)[None, ...], *prev_memories)
                action, new_memories = res[0], res[1:]

                observation, reward, done, info = env.step(action[0])

                total_reward += reward
                prev_memories = new_memories

                if done or t >= t_max:
                    if verbose:
                        print("Episode finished after {} timesteps with reward={}".format(t + 1, total_reward))
                    break
                t += 1
            game_rewards.append(total_reward)

        env.close()
        del env
        return game_rewards

#create a small pool with 10 parallel agents
pool = EnvPool(agent,make_env=env_wrapper("BipedalWalker-v2", 10000), n_games=24,max_size=1000) 

#we assume that pool size 1000 is small enough to learn "almost on policy" :)

ImportError: No module named 'agentnet.experiments.utils'

In [10]:
%%time
#interact for 7 ticks
_,action_log,reward_log,_,_,_  = pool.interact(7)


#print(action_log[:3])
#print(reward_log[:3])

CPU times: user 52 ms, sys: 8 ms, total: 60 ms
Wall time: 60.1 ms


In [11]:
SEQ_LENGTH = 100
#load first sessions (this function calls interact and remembers sessions)
pool.update(SEQ_LENGTH)

TypeError: ('Wrong number of dimensions: expected 2, got 3 with shape (24, 100, 4).', 'Container name "session.actions_history.0"')

# Actor-critic loss

Here we define obective function for actor-critic (one-step) RL.

* We regularize policy with expected inverse action probabilities (discouraging very small probas) to make objective numerically stable


In [None]:
#get agent's Qvalues obtained via experience replay
replay = pool.experience_replay.sample_session_batch(100)

_,_,_,_,(policy_seq,V_seq) = agent.get_sessions(
    replay,
    session_length=SEQ_LENGTH,
    experience_replay=True,
)



In [None]:
from agentnet.learning import a2c                                                   


elwise_mse_loss = a2c.get_elementwise_objective(policy_seq,
                                                V_seq[:,:,0],
                                                replay.actions[0],
                                                replay.rewards,
                                                replay.is_alive,
                                                gamma_or_gammas=0.99,
                                                n_steps=1)

#compute mean over "alive" fragments
loss = elwise_mse_loss.sum() / replay.is_alive.sum()

loss += 0.0001*(1./(policy_seq)).sum(-1).mean()

In [None]:
from theano import tensor as T
# loss += <regularize agent with negative entropy. Higher entropy = smaller loss. Multiply by small coefficient>

In [None]:
# Compute weight updates
updates = lasagne.updates.rmsprop(loss, weights, learning_rate=10e-5)

In [None]:
import theano
train_step = theano.function([],loss,updates=updates)

# Demo run

In [None]:
#for MountainCar-v0 evaluation session is cropped to 200 ticks
untrained_reward = pool.evaluate(save_path="./records",record_video=True)

#video is in the ./records folder

# Training loop


In [None]:
#starting epoch
epoch_counter = 1

#full game rewards
rewards = {}

In [None]:
import tqdm
from IPython.display import clear_output

In [None]:
#the loop may take eons to finish.
#consider interrupting early.
loss = 0
for i in tqdm.tnrange(100000):    
    
    #train
    pool.update(SEQ_LENGTH,append=True)
    
    loss = loss*0.99 + train_step()*0.01
        
    
    
    if epoch_counter%100==0:
        #average reward per game tick in current experience replay pool
        pool_mean_reward = np.average(pool.experience_replay.rewards.get_value()[:,:-1],
                                      weights=1+pool.experience_replay.is_alive.get_value()[:,:-1])
        print("iter=%i\treward/step=%.5f\tloss ma=%.5f"%(epoch_counter,
                                                        pool_mean_reward,
                                                        loss))
        

    ##record current learning progress and show learning curves
    if epoch_counter%500 ==0:
        clear_output(True)

        n_games = 10
        rewards[epoch_counter] = pool.evaluate( record_video=False,n_games=n_games,
                                               verbose=False)
        iters,session_rewards=zip(*sorted(rewards.items(), key=lambda x: x[0])) #key=lambda (k,v):k))
        mean_rewards = [np.mean(x) for x in session_rewards]
        plt.figure(figsize=(12, 8))
        plt.plot(iters, mean_rewards)
        plt.show()
        if np.mean(rewards[epoch_counter]) > -105:
            break
        print("Current score(mean over %i) = %.3f"%(n_games,np.mean(rewards[epoch_counter])))
        
    
    epoch_counter  +=1

    
# Time to drink some coffee!

In [None]:
iters,session_rewards=zip(*sorted(rewards.items(),key=lambda (k,v):k))

In [None]:
plt.plot(iters,map(np.mean,session_rewards))

In [None]:
pool = EnvPool(agent,make_env="MountainCar-v0", 
               n_games=1,
               max_size=1000) 
pool.evaluate(record_video=False,
              n_games=10000, 
              verbose=False)

In [None]:
# gym.upload('/home/common/nexes/Practical_RL/week6/records',
#            api_key="")
