In [None]:
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline


#theano imports
#the problem is too simple to be run on GPU. Seriously.
%env THEANO_FLAGS='device=cpu'
import theano
import theano.tensor as T
floatX = theano.config.floatX

import lasagne


In [None]:
#reload edited modules automatically before each cell is executed
%load_ext autoreload
%autoreload 2

# This tutorial explains the basic pipeline of an AgentNet experiment
* experiment setup
* designing agent
* interacting with environment
* computing losses
* training

# Experiment setup
* Here we load a simple experiment environment (description below)
* Designing one from scratch is explained in later tutorials

In [None]:
#load the ready-made boolean reasoning experiment and print its module docstring
import agentnet.experiments.boolean_reasoning as experiment
print(experiment.__doc__)

In [None]:
#Create an environment with all default parameters
#(the same object is later handed to the agent to produce interaction sessions)
env = experiment.BooleanReasoningEnvironment()

# agent setup
* An agent implementation contains three parts:
 * Memory layer(s)
   * in this case, a single one-step RNN
   * may be any amount of recurrent layers [GRU/LSTM, LTM, custom or none at all]
   * you have to create a dict {new recurrent state : previous state} (see next tabs)
   
 * Policy estimation layers
   * In this case, predicted Qvalues for all actions (via DenseLayer)
   * Whatever is required for agent to pick action
   * Can be any lasagne network
   
 * Resolver - action picker
   * in this case, the resolver has epsilon-greedy policy

In [None]:
from agentnet.resolver import EpsilonGreedyResolver
from agentnet.memory.rnn import RNNCell
from agentnet.agent import Agent

In [None]:
from lasagne.layers import InputLayer,DenseLayer

#size of the agent's memory: 64 hidden neurons
n_hid = 64

#None leaves the first (batch) dimension unspecified
observation_size = (None,) + tuple(env.observation_shapes)

#network inputs: current observation and the memory state from the previous tick
observation_layer = InputLayer(observation_size, name="obs_input")
prev_state_layer = InputLayer([None, n_hid], name="prev_state_input")

#memory: one-step cell that takes the previous state and the observation
#note that this isn't the same as lasagne recurrent units (see next text tab for detail)
rnn = RNNCell(prev_state_layer, observation_layer, name="rnn0")

#Qvalues estimator: one linear output unit per action
q_eval = DenseLayer(
    rnn,
    num_units=env.n_actions,
    nonlinearity=lasagne.nonlinearities.linear,
    name="QEvaluator"
)

#action picker: epsilon-greedy resolver with default epsilon [theano shared]
resolver = EpsilonGreedyResolver(q_eval, name="resolver")

#all together: observation input, {new state : previous state} memory dict,
#policy estimator and resolver
agent = Agent(
    observation_layer,
    {rnn: prev_state_layer},
    q_eval,
    resolver
)


### More about memory layers

In AgentNet, recurrent layers are defined as one-step layers that return new state given last state and inputs.

Using basic lasagne recurrences is still okay, but one should understand what they mean.

To create exactly what is above using lasagne layers only, one should use
```
from lasagne.layers import dimshuffle, RecurrentLayer

#reshape observation as 1-element sequence
observation_reshape = dimshuffle(observation_layer,(0,'x',1))

rnn = RecurrentLayer(observation_reshape,       #observation input
                     num_units=n_hid,           # amount of cells
                     hid_init=prev_state_layer,  #initialize with previous state
                     only_return_final=True,   #return final state, not sequence
                     unroll_scan=True,      #highly recommended for speedup
                     name='rnn')
```


Lasagne recurrence is also very useful to create recurrence inside recurrence.

Say, you have a text sequence as input on each step (e.g. conversation models).
Then you can read it with any recurrent layer and embed the result into your agent's state using other lasagne layers.


Alternatively, one can build an RNN layer as an ElemwiseSumLayer of two DenseLayers without nonlinearity for input and prev state (and then apply a NonlinearityLayer to the sum)

In [None]:
#Since it's a single lasagne network, one can get its weights, output, etc
#here: every trainable parameter of the layers that resolver is built upon
weights = lasagne.layers.get_all_params(resolver,trainable=True)
weights

# Interacting with environment
* an agent has a method that produces symbolic environment interaction sessions
* Such sessions are represented as tensors with dimensions matching pattern [batch_session_i, time_tick, ...]
* interactions result in sequences of observations, actions, q-values,etc
* one has to pre-define maximum session length.
 * in this case, environment implements an indicator of whether session has ended by current tick
* Since this environment also implements Objective methods, it can evaluate rewards for each [batch, time_tick]


In [None]:
#produce interaction sequences of length <= 10
#environment states come back as a one-element container, hence the (state_seq,) unpacking
(state_seq,),observation_seq,agent_state,action_seq,qvalues_seq = agent.get_sessions(
    env,
    session_length=10,
    batch_size=env.batch_size,
)


#agent memory states, looked up by the memory layer they belong to
hidden_seq = agent_state[rnn]

#get rewards for all actions
rewards_seq = env.get_reward_sequences(state_seq,action_seq)

#get indicator whether session is still active
is_alive_seq = env.get_whether_alive(observation_seq)


# Evaluating loss function
Here we use a simple Q-learning algorithm.

The function below 
* takes qvalues, actions, rewards and session indicators,
* computes reference Q-values as $Qref(S,a_{taken}) = r + \gamma \cdot \max _{a} (Q(S_{next},a))$
* returns elementwise squared error, $L = ( Qpred(S,a_{taken}) - Qref(S,a_{taken}))^2$, where $Qpred$ is the Q-value predicted by the agent


AgentNet has plenty of such algorithms 
* n-step Qlearning, SARSA, actor-critic, det. policy gradient, etc
* one can easily define (and contribute) their own algorithms using theano operations.

#### Define loss functions

In [None]:
#get reference Qvalues according to Qlearning algorithm


from agentnet.learning import qlearning

#compute elementwise squared error between reference Qvalues and predicted ones
#the discount factor gamma is passed explicitly (0.9)

squarred_Qerror = qlearning.get_elementwise_objective(
    qvalues_seq,
    action_seq,
    rewards_seq,
    is_alive_seq,
    gamma_or_gammas = 0.9)


#sum over axis 1 (time ticks in the [batch, tick] session layout), then average the result
loss = squarred_Qerror.sum(axis = 1).mean()

#### Compute weight updates

In [None]:
#adadelta update rules that minimize loss with respect to the trainable weights
updates = lasagne.updates.adadelta(loss,weights,learning_rate=0.1)

#### expected total reward per session

In [None]:
#rewards summed over axis 1 (time ticks) give per-session totals; then averaged
mean_session_reward = rewards_seq.sum(axis=1).mean()
#...

# Compile train and evaluation functions

In [None]:
#one training step: returns [loss, mean session reward] and applies the weight updates
train_fun = theano.function([],[loss,mean_session_reward],updates=updates)

#same outputs, but compiled without updates, so the weights stay untouched
evaluation_fun = theano.function([],[loss,mean_session_reward])

# Training loop

In [None]:
from agentnet.display import Metrics
#storage for the learning curves that the training loop fills in
score_log = Metrics()

In [None]:
#starting epoch
#kept in its own cell so that re-running the training cell continues the count
epoch_counter = 1

#moving average estimation
#alpha is the weight given to the newest measurement
alpha = 0.1
ma_reward_current = 0.
ma_reward_greedy = 0.

In [None]:
n_epochs = 5000
batch_size=10

#epoch_counter is defined outside of this cell, so re-running the cell continues training
for _ in range(n_epochs):

    #train on a freshly generated batch
    #results are stored under train_* names so that the symbolic `loss` variable
    #(used to build the updates and the compiled functions) is not overwritten
    env.generate_new_data_batch(batch_size)
    train_loss,train_reward = train_fun()

    ##update resolver's epsilon (chance of random action instead of optimal one)
    #decays exponentially from ~1 towards 0.05 as epoch_counter grows
    current_epsilon =  0.05 + 0.95*np.exp(-epoch_counter/2500.)
    resolver.epsilon.set_value(np.float32(current_epsilon))

    ##show current learning progress
    if epoch_counter%100 ==0:

        ##update learning curves
        full_loss, avg_reward_current = evaluation_fun()
        ma_reward_current = (1-alpha)*ma_reward_current + alpha*avg_reward_current
        score_log["expected epsilon-greedy reward"][epoch_counter] = ma_reward_current

        #greedy evaluation: epsilon = 0 means no random actions
        resolver.epsilon.set_value(np.float32(0.))
        try:
            avg_reward_greedy = evaluation_fun()[-1]
        finally:
            #back to epsilon-greedy, even if the evaluation was interrupted
            resolver.epsilon.set_value(np.float32(current_epsilon))

        ma_reward_greedy = (1-alpha)*ma_reward_greedy + alpha*avg_reward_greedy
        score_log["expected greedy reward"][epoch_counter] = ma_reward_greedy

        print("epoch %i, mse %.5f, epsilon %.5f, rewards: ( e-greedy %.5f, greedy %.5f) "%(
            epoch_counter,full_loss,current_epsilon,ma_reward_current,ma_reward_greedy))

    #visualize learning curves
    if epoch_counter %1000 ==0:
        print("Learning curves:")
        score_log.plot()

    epoch_counter  +=1


# Evaluate results

In [None]:
#plot the learning curves recorded in score_log during training
score_log.plot("final")

# session visualization tools


* this is a completely optional step of visualizing agent's sessions as chains of actions
* usually useful to get insight on what worked and what didn't
* in this case, we print strings following pattern
  * [action_name] ([predicted action qvalue]) -> reward [reference qvalue] | next iteration

* plot shows
    * time ticks over X, abstract values over Y
    * bold lines are Qvalues for actions
    * dots on bold lines represent what actions were taken at each moment of time
    * dashed lines are agent's hidden state neurons
    * blue vertical line - session end
    
    
__Warning! the visualization tools are underdeveloped and only allow simple operations.__

if you found yourself struggling to make it do what you want for 5 minutes, go write your own tool [and contribute it :)]


In [None]:
from agentnet.display.sessions import print_sessions
#compiled function returning hidden states, Qvalues, actions, rewards and
#is-alive indicators of the sessions (in this order)
get_printables = theano.function([], [
        hidden_seq,qvalues_seq, action_seq,rewards_seq,is_alive_seq
    ])

def display_sessions(with_plots = False):
    """Evaluate get_printables once and show the resulting sessions via print_sessions.

    with_plots is forwarded to print_sessions as plot_policy.
    """
    #hidden states are the first output of get_printables; they are not displayed here
    _hidden, q_log, act_log, rew_log, alive_log = get_printables()

    print_sessions(
        q_log, act_log, rew_log,
        is_alive_seq=alive_log,
        action_names=env.feature_names,
        plot_policy=with_plots
    )

In [None]:
#epsilon = 0: no random actions, so the displayed sessions follow the learned policy
resolver.epsilon.set_value(np.float32(0.))
print("Random session examples")
#generate a fresh data batch (argument 10) before displaying
env.generate_new_data_batch(10)
display_sessions(with_plots=True)
