In [1]:
#experiment name and snapshot folder (used for model persistence)
experiment_setup_name = "tutorial.gym.atari.pendulum-v0.cnn"
snapshot_path = "."


#gym game title
GAME_TITLE = 'LunarLander-v0'

#how many parallel game instances can your machine tolerate
N_PARALLEL_GAMES = 10


#how long is one replay session from a batch

#since we have window-like memory (no recurrent layers), we can use relatively small session weights
replay_seq_len = 5

In [2]:
#this is my machine-specific config. replace if are not me.

#theano device selection
%env THEANO_FLAGS='device=cpu'


#snapshot path - where neural network snapshots are saved during the main training loop
!mkdir ./agentnet_snapshots/
snapshot_path = "./agentnet_snapshots/"


env: THEANO_FLAGS='device=cpu'
mkdir: cannot create directory `./agentnet_snapshots/': File exists


# This tutorial is a showcase on how to use advanced AgentNet techniques 


# [new]

* The notebook is mostly based on "Playing atari with Deep Reinforcement Learning (OpenAI Gym)" example
 * All changes against that example will be marked with #[new] sign, like one above
* We use a recurrent memory layer, implemented via Gated Recurrent Unit
* We use advantage actor-critic method to train agent (using policy + state values instead of q-values)
* We train agent to hurt humans in KungFu master game
* We also use a bit heavier a network to process inputs (~convolutional from basic example)
  * If you have no GPU and want agent to train faster than human child, replace it with what worked in basic example

## About OpenAI Gym

* Its a recently published platform that basicly allows you to train agents in a wide variety of environments with near-identical interface.
* This is twice as awesome since now we don't need to write a new wrapper for every game
* Go check it out!
  * Blog post - https://openai.com/blog/openai-gym-beta/
  * Github - https://github.com/openai/gym
  
  
### Installing it
 * If nothing changed on their side, to run this, you bacically need to follow their install instructions - 
 
```
git clone https://github.com/openai/gym.git
cd gym
pip install -e .[all]
```

In [3]:
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline


#theano imports

import theano
import theano.tensor as T
floatX = theano.config.floatX

import lasagne


In [4]:
%load_ext autoreload
%autoreload 2

# Experiment setup
* Here we basically just load the game

In [13]:
gym

'Zaxxon-ram'

In [14]:
import gym
atari = gym.make("BipedalWalker-v0")
atari.reset()
#plt.imshow(atari.render('rgb_array'))

[2016-05-16 04:57:13,201] Making new env: BipedalWalker-v0


AttributeError: 'module' object has no attribute 'RAND_LIMIT_swigconstant'

### Game Parameters
* observation dimensions, actions, etc

In [None]:
n_actions = atari.action_space.shape[0]
observation_shape = (None,)+atari.observation_space.shape


In [None]:
#del atari

# agent setup step by step
* An agent implementation may contain these parts:
 * Observation(s)
   * InputLayers where observed game states (here - images) are sent at each tick 
 * Memory layer(s)
   * A dictionary that maps "New memory layers" to "prev memory layers"
 * Policy layer (e.g. Q-values or probabilities)
   * in this case, a lasagne dense layer based on observation layer
 * Resolver - acton picker layer
   * chooses what action to take given Q-values
   * in this case, the resolver has epsilon-greedy policy
  




##### Agent observations

* Here you define where observations (game images) appear in the network
* You can use any lasagne architecture you want. We provide several examples

In [None]:
import lasagne

from lasagne.layers import InputLayer,DropoutLayer,DenseLayer, ExpressionLayer, Conv2DLayer,MaxPool2DLayer
from lasagne.layers import flatten, dimshuffle

In [None]:


#image observation
observation_layer = lasagne.layers.InputLayer(observation_shape,
                                                    name="images input")





#dense with dropout    
dnn = DenseLayer(observation_layer,num_units=50,name='dense0')
dnn = DropoutLayer(dnn,name = "dropout", p=0.05) #will get deterministic during evaluation
dnn = DenseLayer(dnn,num_units=30,name='dense1')

# [end of that part]

inp_nn = dnn

  
### Agent memory states
 * Here you can define arbitrary transitions between "previous state" variables and their next states
 * The rules are
   * previous states must be input layers
   * next states must have same shape as previous ones
   * otherwise it can be any lasagne network
   * AgentNet.memory has several useful layers
   
 * During training and evaluation, your states will be updated recurrently
   * next state at t=1 is given as previous state to t=2
 
 * Finally, you have to define a dictionary mapping new state -> previous state


Atari game environments are known to have __flickering__ effect where some sprites are shown only on odd frames and others on even ones - that was used to optimize performance at the time.

To compensate for this, we shall use the memory layer called __WindowAugmentation__ which basically maintains a K previous time steps of what it is fed with.


# [new]
We shall also use a GRUMemoryLayer to represent agent's recurrent memory state. This state is updated on every turn given window state.
Not that this is __not the same__ as `lasagne.layers.GRULayer` as GRUMemoryLayer only does a single time tick (lasagne version iterates over the whole sequence).



In [None]:
#a dictionary that maps next agent memory states to previous ones
from collections import OrderedDict
memory_dict = OrderedDict([])



##### Agent policy and action picking


# [new]

* Since we are using Actor-Critic method, we need to predict 2 values:
  * State value - basicly a Q-value of best action in a state
  * Agent policy - probabilities of taking actions
  

* To pick actions, we use a probablistic resolver
  * That one picks actions with given probabilities
  * We use a laplacian smoothing to pick actions in training (to bolster exploration) 
  * The resolver output is considered agent's next action and sent into the environment

In [None]:
def scaled_tanh(x):
    tanh = lasagne.nonlinearities.tanh(x)/2+0.5
    tanh *=  np.float32(atari.action_space.high - atari.action_space.low)
    tanh += np.float32(atari.action_space.low)
    return tanh

In [None]:
#policy
policy_layer= DenseLayer(inp_nn,
                         num_units = 1,
                         nonlinearity = scaled_tanh,
                         name = "mu")





from lasagne.layers import concat
#State values
state_value_layer = DenseLayer(concat([inp_nn,policy_layer]),
                               num_units = 1,
                               nonlinearity = lasagne.nonlinearities.linear,
                               name = "Vpredicted")


#resolver


resolver = policy_layer



##### Finally, agent
We declare that this network is and MDP agent with such and such inputs, states and outputs

# [new]
* Note that we can have any (incl. none) number of agent policy variables,
* so you can use that to track any layer output
* It is also possible to have multiple observations and actions, but that's not aplicable to Atari environment

In [None]:
from agentnet.agent import Agent


#all together
agent = Agent(observation_layer,
              memory_dict,
              [policy_layer,state_value_layer],
              resolver)


In [None]:
#Since it's a single lasagne network, one can get it's weights, output, etc
weights = lasagne.layers.get_all_params((resolver,state_value_layer),trainable=True)
weights

In [None]:
actor_weights = filter(lambda w: not w.name.startswith("Vpredicted"),weights)
critic_weights = filter(lambda w: not w.name.startswith("mu"),weights)
print 'actor:',actor_weights
print 'critic:',critic_weights

# Agent step function
* Compute action and next state given observation and prev state
* The code was written in a generic way and did not undergo any changes since previous turorial

In [None]:
applier_observation = T.matrix("input image",dtype=floatX)


# inputs to all agent memory states (usng lasagne defaults, may use any theano inputs)
applier_memories = OrderedDict([ (new_st,prev_st.input_var)
                                for new_st, prev_st in agent.state_variables.items()
                               ])


res =agent.get_agent_reaction(applier_memories,
                              applier_observation,
                              deterministic = True #disable dropout here. Only enable in experience replay
                             )


applier_actions,applier_new_states,applier_policy = res

applier_fun = theano.function([applier_observation]+applier_memories.values(),
        applier_actions+applier_new_states)


In [None]:
#a nice pythonic interface
def step(observation, prev_memories = 'zeros',batch_size = N_PARALLEL_GAMES):
    """ returns actions and new states given observation and prev state
    Prev state in default setup should be [prev window,]"""
    #default to zeros
    if prev_memories == 'zeros':
        prev_memories = [np.zeros((batch_size,)+tuple(mem.output_shape[1:]),
                                  dtype=floatX) 
                         for mem in agent.state_variables]
    
    res = applier_fun(np.array(observation,dtype=floatX),*prev_memories)
    action = res[0]
    memories = res[1:]
    return action,memories

# Create and manage a pool of atari sessions to play with

* To make training more stable, we shall have an entire batch of game sessions each happening independent of others
* We define a small container that stores
 * game emulators
 * last agent observations
 * agent memories at last time tick
* This allows us to instantly continue a session from where it stopped



* Why several parallel agents help training: http://arxiv.org/pdf/1602.01783v1.pdf

In [None]:
#A whole lot of space invaders

class GamePool:
    def __init__(self,game_title,n_games):
        """
        A pool that stores several
           - game states (gym environment)
           - prev_observations - last agent observations
           - prev memory states - last agent hidden states
           
       """
        
        
        self.ataries = [gym.make(game_title) for i in range(n_games)]

        self.prev_observations = [atari.reset() for atari in self.ataries]
    
        self.prev_memory_states = 'zeros'

pool = GamePool(GAME_TITLE, N_PARALLEL_GAMES)


In [None]:
# a function that creates and records environment interaction sessions
def interact(pool,n_steps = 100,verbose=False):
    """generate interaction sessions with ataries (openAI gym atari environments)
    Sessions will have length n_steps. 
    Each time one of games is finished, it is immediately getting reset"""
    history_log = []
        
    prev_observations = pool.prev_observations 
    
    prev_memory_states = pool.prev_memory_states
    
    
    
    for i in range(n_steps):
        
        actions,new_memory_states = step(prev_observations,prev_memory_states)

        
        new_observations, cur_rewards, is_done, infos = \
            zip(*map(
                     lambda atari, action: atari.step(action), 
                     pool.ataries,actions))
            
        new_observations = np.array(new_observations,dtype=floatX)
        
        for i in range(len(pool.ataries)):
            if is_done[i]:
                new_observations[i] = pool.ataries[i].reset()
                
                for m_i in range(len(new_memory_states)):
                    new_memory_states[m_i][i] = 0
                    
                if verbose:
                    print "atari",i,"reloaded"
        
        
        #append observation -> action -> reward tuple
        history_log.append((prev_observations,actions,cur_rewards,new_memory_states,is_done,infos))
        
        prev_observations = new_observations
        prev_memory_states = new_memory_states
                
    pool.prev_memory_states = prev_memory_states
    pool.prev_observations = prev_observations
    
    return zip(*history_log)
    

In [None]:
%%time
observation_log,action_log,reward_log,_,_,_  = interact(pool,50)


print np.array(reward_log)[:10].T
#print np.array(action_names)[np.array(action_log)[:3,:5]]

# experience replay pool

Since our network exists in a theano graph and OpenAI gym doesn't, we shall train out network via experience replay.

To do that in AgentNet, one can use a SessionPoolEnvironment.

It's simple: you record new sessions using `interact(...)`, and than immediately train on them.

1. Interact with Atari, get play sessions
2. Store them into session environment
3. Train on them
4. Repeat


In [None]:
#Create an environment with all default parameters
from agentnet.environment import SessionPoolEnvironment
env = SessionPoolEnvironment(observations = observation_layer,
                             actions=resolver,
                             agent_memories=[])

In [None]:
def update_pool(env, pool,n_steps=100):
    """ a function that creates new sessions and ads them into the pool
    throwing the old ones away entirely for simplicity"""

    
    observation_log,action_log,reward_log,_,is_done_log,_= interact(pool,n_steps=n_steps)
    
    
    #tensor dimensions
    
    # [batch_i, time_i, width, height, rgb]
    observation_tensor = np.array(observation_log).swapaxes(0,1)
    
    # [batch_i,time_i]
    action_tensor = np.array(action_log).swapaxes(0,1)
    
    # [batch_i, time_i]
    reward_tensor = np.array(reward_log).swapaxes(0,1)

    # [batch_i, time_i]
    is_alive_tensor = 1- np.array(is_done_log,dtype = 'int8').swapaxes(0,1)
    
    env.load_sessions(observation_tensor,action_tensor,reward_tensor,is_alive_tensor,[])
    
    

In [None]:
#load first  sessions
update_pool(env,pool,replay_seq_len)

A more sophisticated way of training is to store a large pool of sessions and train on random batches of them. 
* Why that is expected to be better - http://www.nature.com/nature/journal/v518/n7540/full/nature14236.html
* Or less proprietary - https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf

To do that, one might make use of
* ```env.load_sessions(...)``` - load new sessions
* ```env.get_session_updates(...)``` - does the same thing via theano updates (advanced)
* ```batch_env = env.sample_session_batch(batch_size, ...)``` - create an experience replay environment that contains batch_size random sessions from env (rerolled each time). Should be used in training instead of env.
* ```env.select_session_batch(indices)``` does the same thing deterministically.


# Interacting with environment
* An agent has a method that produces symbolic environment interaction sessions
* Such sessions are in sequences of observations, agent memory, actions, q-values,etc
  * one has to pre-define maximum session length.

* SessionPool also stores rewards (reinforcement learning objective)

### Training via experience replay

* We use agent we have created to replay environment interactions inside Theano
* to than train on the replayed sessions via theano gradient propagation
* this is essentially basic Lasagne code after the following cell

# [new]
* Note that we not unpack several variables (policy, values) instead of Q-values
* We than reshape V_seq from (batch, time, 1 unit) intp (batch_time)

In [None]:
#replay interaction sequences of length <= replay_seq_len

_,observation_seq,_,_,(policy_seq,V_seq) = agent.get_sessions(
    env,
    session_length=replay_seq_len,
    batch_size=env.batch_size,
)




#observation seq are the observation tensor we just loaded
#policy seq are agent action probabilities predicted within experience replay
# V_seq are agent state values

#The three "_"s are
#first - environment states - which is empty since we are using session pool as our environment
#second - a dictionary of all agent memory units (RNN, GRU, NTM) - empty as we use none of them
#last - "imagined" actions - actions agent would pick now if he was in that situation 
#                              - irrelevant since we are replaying and not actually playing the game now


#reshape V_seq from (batch, time, 1 unit) intp (batch_time)
V_seq = V_seq[:,:,0]


#the actions agent took in the original recorded game
action_seq = env.actions[0]

#get rewards for all actions
rewards_seq = env.rewards

#get indicator whether session is still active
is_alive_seq = env.is_alive


# Evaluating loss function

# [new]
* In this part we are using an n-step Advantage Actor-Critic (A2c)
* In this case, it's a 10-step a2c (see n_steps parameter)
* To use Q-learning or sarsa, you will have to predict Q-values instead of probabilities

* The basic interface is .get_elementwise_objective 
  * it returns loss function (here - actor-critic loss function)
  * $ log {\pi} \cdot (V_{percieved} - V_{predicted}) $
  * $\pi$ is agent policy, $V$'s are state values
  * Read more at http://www.arxiv.org/pdf/1602.01783v1.pdf
    
* If you want to do it the hard way instead, try .get_reference_state_values and compute errors on ya own
  

In [None]:
#get reference Qvalues according to Qlearning algorithm


from agentnet.learning import dpg_n_step

#gamma - delayed reward coefficient - what fraction of reward is retained if it is obtained one tick later
gamma = theano.shared(np.float32(0.99),name = 'q_learning_gamma')


#IMPORTANT!
# If you are training on a game that has rewards far outside some [-5,+5]
# it is a good idea to downscale them to avoid divergence
scaled_reward_seq = rewards_seq
#For KungFuMaster, however, not scaling rewards is at least working


elwise_actor_loss,elwise_critic_loss = dpg_n_step.get_elementwise_objective_components(policy_seq,
                                                       rewards_seq,
                                                       V_seq,
                                                       V_seq,
                                                       is_alive_seq,
                                                       n_steps=10, #using n-step on-policy actor-critic
                                                       gamma_or_gammas=gamma,)

#compute mean over "alive" fragments
actor_loss = elwise_actor_loss.sum() / is_alive_seq.sum()
critic_loss = elwise_critic_loss.sum() / is_alive_seq.sum()

### Regularization

# [new]
We regularize agent's policy with entropy
 * See here http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.54.3433&rep=rep1&type=pdf (ref by that article above)
 * Basically, we punish agent for being too certain on what to do

In [None]:
actor_grads = dict(zip(actor_weights,T.grad(actor_loss,actor_weights)))
critic_grads = dict(zip(critic_weights,T.grad(critic_loss,critic_weights)))


grads = dict(actor_grads)
for param,cgrad in critic_grads.items():
    if param in grads:
        grads[param] += cgrad
    else:
        grads[param] = cgrad
        
        
grads = [grads[w] for w in weights]

#### Compute weight updates

In [None]:

updates = lasagne.updates.adadelta(grads,
                                             weights,learning_rate=0.01)

#### Some auxilary evaluation metrics

In [None]:
mean_session_reward = rewards_seq.sum(axis=1).mean()
#...

# Compile train and evaluation functions

In [None]:
train_fun = theano.function([],[actor_loss+critic_loss,mean_session_reward],updates=updates)

evaluation_fun = theano.function([],[actor_loss+critic_loss,actor_loss,critic_loss,mean_session_reward])#!!!!fix losses

# Training loop

In [None]:
#tools for model persistence (in progress now. Requires unique names)
from agentnet.utils.persistence import save,load
import os

In [None]:
from agentnet.display import Metrics
score_log = Metrics()

In [None]:
#starting epoch
epoch_counter = 1
#moving average estimation
alpha = 0.1
ma_reward_current = -40.
ma_reward_greedy =-40.

In [None]:
n_epochs = 100000
batch_size= 10

for i in range(n_epochs):    
    
    
    #train
    update_pool(env,pool,replay_seq_len)
    loss,avg_reward = train_fun()
    
    
    
    
    
    ##record current learning progress and show learning curves
    if epoch_counter%5 ==0:

        ##update learning curves
        full_loss, q_loss, l2_penalty, avg_reward_current = evaluation_fun()
        ma_reward_current = (1-alpha)*ma_reward_current + alpha*avg_reward_current
        score_log["expected e-greedy reward"][epoch_counter] = ma_reward_current
        
        
        
        #greedy train
        update_pool(env,pool,replay_seq_len)

        avg_reward_greedy = evaluation_fun()[-1]
        ma_reward_greedy = (1-alpha)*ma_reward_greedy + alpha*avg_reward_greedy
        score_log["expected greedy reward"][epoch_counter] = ma_reward_greedy
        
        
        #back to epsilon-greedy
        update_pool(env,pool,replay_seq_len)
        if epoch_counter %500 ==0:
            print "epoch %i,loss %.5f, epsilon %.5f, rewards: ( e-greedy %.5f, greedy %.5f) "%(
                epoch_counter,full_loss,float('inf'),ma_reward_current,ma_reward_greedy)
            print "rec %.3f reg %.3f"%(q_loss,l2_penalty)

    if epoch_counter %500 ==0:
        print "Learning curves:"
        score_log.plot()


    
    #save snapshot
    if epoch_counter %10000 ==0:
        snap_name = "{}.epoch{}.pcl".format(os.path.join(snapshot_path,experiment_setup_name), epoch_counter)
        save(resolver,snap_name)
        print "saved", snap_name

        
        
    epoch_counter  +=1

    
# Time to drink some coffee!

# Evaluating results
 * Here we plot learning curves and sample testimonials
 * we took epoch 7k for final submission
 * done via `load(resolver,"./agentnet_snapshots/{your experiment_setup_name}.epoch7000.pcl")`

In [None]:
score_log.plot("final")

In [None]:
print "Random session examples"
!!! !!display_sessions(with_plots=True)


# Submission

In [None]:
epsilon.set_value(0)