In [1]:
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline


#theano imports
#the problem is too simple to be run on GPU. Seriously.
# NOTE(review): despite the remark above, the flag below selects a GPU device ("gpu2") -
# confirm which of the two is intended.
# The flag is placed in the environment before theano is imported.
%env THEANO_FLAGS='device=gpu2'
import lasagne
import theano
import theano.tensor as T
import sys
# raise the interpreter recursion limit to 10000
# (presumably needed for deep theano graphs / pickling - TODO confirm)
sys.setrecursionlimit(10000)
# float dtype configured in theano; used later as the dtype of the one-hot encoded tokens
floatX = theano.config.floatX



env: THEANO_FLAGS='device=gpu2'


Using gpu device 2: Tesla K40m (CNMeM is disabled, CuDNN 4004)


In [2]:
%load_ext autoreload
%autoreload 2

# [in development] this is just a minimalistic language model that uses stack-augmented memory

In [3]:
def generate_sequence(batch_size = 10,crop_length = 50 ):
    """
    Generate a batch of random token-id sequences of the toy language.

    Every sequence is a concatenation of groups of the form
    ``[0] + [1]*n + [2]*m + [3]*(n+m)``, where ``n`` and ``m`` are drawn
    independently from {1, 2, 3, 4} for each group. Groups are appended until
    at least ``crop_length`` tokens exist, then the sequence is cut to exactly
    ``crop_length`` tokens, so the last group may be incomplete.

    :param batch_size: number of sequences (rows) to generate
    :param crop_length: number of tokens in every sequence (columns)
    :returns: int32 numpy array of shape (batch_size, crop_length)
    """
    # Pre-allocating keeps the result two-dimensional even for an empty batch
    # (np.array([]) would collapse to shape (0,)).
    sequences = np.empty((batch_size, crop_length), dtype='int32')
    for row in range(batch_size):
        seq = []
        while len(seq) < crop_length:
            # randint's upper bound is exclusive: n and m are in 1..4
            n, m = np.random.randint(1, 5, 2)
            seq += [0] + [1]*n + [2]*m + [3]*(n+m)
        sequences[row] = seq[:crop_length]
    return sequences

# characters used to render token ids: index 0 -> '|', 1 -> 'a', 2 -> 'b', 3 -> 'c'
alphabet = np.array(list('|abc'))

In [4]:
%%time
# time the generation of one batch: 100 sequences of 50 tokens each
generate_sequence(100,50)

CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 4.58 ms


array([[0, 1, 2, ..., 2, 2, 2],
       [0, 1, 2, ..., 1, 1, 1],
       [0, 1, 1, ..., 3, 3, 3],
       ..., 
       [0, 1, 1, ..., 2, 2, 3],
       [0, 1, 2, ..., 0, 1, 2],
       [0, 1, 1, ..., 2, 3, 3]], dtype=int32)

# agent setup
* An agent implementation has to contain three parts:
 * Memory layer(s)
  * in this case, two stacked one-step GRU layers (`gru1` and `gru2`)
 * Q-values evaluation layers
  * in this case, a lasagne dense layer (with a softmax nonlinearity) built on top of the memory layer
 * Resolver - action picker layer
  * in this case, a `ProbablisticResolver` fed with the output of the policy layer

In [5]:
import lasagne
from agentnet.memory import GRUMemoryLayer
from agentnet.agent import Generator
from agentnet.resolver import ProbablisticResolver

In [6]:
# Symbolic int32 matrix.
# The first positional argument of T.matrix is the variable *name*, not its dtype:
# T.matrix('int32') builds a floatX matrix that is merely called "int32",
# so the dtype has to be passed explicitly.
X_sequence = T.matrix('X_sequence',dtype='int32')



In [7]:


#observation
# one value per batch element; the batch dimension is left unspecified (None)
output_shape = (None,)
observation_layer = lasagne.layers.InputLayer(output_shape,name="obs_input")

# vocabulary size is taken from the alphabet defined alongside generate_sequence
n_tokens = len(alphabet)

def to_one_hot(x):
    """Encode token ids `x` as one-hot rows of width n_tokens with dtype floatX."""
    return T.extra_ops.to_one_hot(x,n_tokens,dtype=floatX)

# wraps to_one_hot as a layer; the declared output shape appends an n_tokens axis to the input shape
obs_one_hot = lasagne.layers.ExpressionLayer(observation_layer,to_one_hot,
                                     output_shape=output_shape+(n_tokens,),
                                     name="token_one_hot")




#memory
# two GRU memory layers are stacked: gru1 reads the one-hot token, gru2 reads gru1

n_hid_1 = 256
# input layer that carries gru1's state from the previous step
prev_gru1_layer = lasagne.layers.InputLayer((None,n_hid_1),name="prev_gru1_state_input")

gru1 = GRUMemoryLayer(n_hid_1,
                     obs_one_hot,
                     prev_gru1_layer,
                     name="gru1")



n_hid_2 = 256

# input layer that carries gru2's state from the previous step
prev_gru2_layer = lasagne.layers.InputLayer((None,n_hid_2),name="prev_gru2_state_input")

gru2 = GRUMemoryLayer(n_hid_2,
                     gru1,        #note that it takes CURRENT gru1 output as input.
                                  #replacing that with prev_gru1_layer would imply taking the previous one.
                     prev_gru2_layer,
                     name="gru2")

from collections import OrderedDict

# maps every memory layer to the input layer that holds its previous state;
# an OrderedDict keeps the (gru1, gru2) order fixed
memory_dict = OrderedDict([
            (gru1,prev_gru1_layer),
            (gru2,prev_gru2_layer)
    ])


#policy

# shared scalar that multiplies the dense layer's activations before the softmax;
# starts at 1 (no rescaling) and can be changed later without rebuilding the graph
greed = theano.shared(np.float32(1),"prob_multiplier")


policy_layer = lasagne.layers.DenseLayer(gru2, #takes only gru2 (the top memory layer) as input;
                                                        #gru1 reaches the policy only through gru2
                                         num_units = n_tokens,
                                         nonlinearity=lambda x: lasagne.nonlinearities.softmax(x*greed),
                                         name="policy_original")

#resolver

# picks the next token id from the policy layer's output
# NOTE(review): assume_normalized=True presumably tells the resolver that its input
# already is a probability distribution (it comes from a softmax) - confirm against agentnet
resolver = ProbablisticResolver(policy_layer,assume_normalized=True,name="resolver")

# the resolver's output must have the same shape as the observation input
assert tuple(lasagne.layers.get_output_shape(resolver)) == tuple(output_shape)





#all together
# bundle the observation input, the memory mapping, the policy layer and the resolver
# into a single Generator; its get_sessions method is used below to unroll sequences
agent = Generator(
    observation_layer,
    memory_dict,
    policy_layer,
    resolver
    )


In [8]:
#Since it's a single lasagne network, one can get its weights, output, etc.
# collect every trainable parameter of the network that ends at the resolver layer
weights = lasagne.layers.get_all_params(resolver,trainable=True)
weights

[gru1.W_in_to_updategate,
 gru1.W_hid_to_updategate,
 gru1.b_updategate,
 gru1.W_in_to_resetgate,
 gru1.W_hid_to_resetgate,
 gru1.b_resetgate,
 gru1.W_in_to_hidden_update,
 gru1.W_hid_to_hidden_update,
 gru1.b_hidden_update,
 gru2.W_in_to_updategate,
 gru2.W_hid_to_updategate,
 gru2.b_updategate,
 gru2.W_in_to_resetgate,
 gru2.W_hid_to_resetgate,
 gru2.b_resetgate,
 gru2.W_in_to_hidden_update,
 gru2.W_hid_to_hidden_update,
 gru2.b_hidden_update,
 policy_original.W,
 policy_original.b]

## Agent setup in detail
* __Memory layers__
 * One-step recurrent layer
     * takes input and one's previous state
     * returns new memory state
   * Can be arbitrary lasagne layer
   * Several one-step recurrent units are implemented in __agentnet.memory__
   * Note that lasagne's default recurrent networks roll for several steps at once
     * in other words, __using lasagne recurrent units as memory means recurrence inside recurrence__
 * Using more than one memory layer is explained in further tutorials


* __Q-values evaluation layer__
 * Can be arbitrary lasagne network
 * returns predicted Q-values for each action
 * Usually depends on memory as an input


* __Resolver__ - action picker
 * Decides on what action is taken
 * Normally takes Q-values as input
 * Currently all experiments require integer output
 * Several resolver layers are implemented in __agentnet.resolver__

# Interacting with environment
* an agent has a method that produces symbolic environment interaction sessions
* Such sessions are represented as tensors with dimensions matching pattern [batch_session_i, time_tick, ...]
* interactions result in sequences of observations, actions, q-values,etc
* one has to pre-define maximum session length.
 * in this case, environment implements an indicator of whether session has ended by current tick
* Since this environment also implements Objective methods, it can evaluate rewards for each [batch, time_tick]



In [9]:
# number of time ticks in every session
seq_length = 50

# shared int32 buffer holding the reference sequences; initialised with a dummy
# [3, seq_length] array of zeros and overwritten via set_value before every training step
sequences_batch = theano.shared(np.zeros([3,seq_length],dtype="int32"),name="reference_sequences")

# symbolic, so it follows whatever batch is currently stored in the shared variable
batch_size = sequences_batch.shape[0]




# symbolic unroll of the agent over seq_length ticks, given the recorded sequences
history = agent.get_sessions(session_length=seq_length,
                             recorded_sequences=sequences_batch,
                             batch_size=batch_size,)

env_states,observation_seq,agent_states,action_seq,policy_seq = history


# per-layer memory state sequences, looked up by memory layer
gru1_seq = agent_states[gru1]
gru2_seq = hidden_seq = agent_states[gru2]

# Evaluating loss function
* In this case, we want to 
 * first get pairs of (predicted Qvalue, reference Qvalue) for all actions committed
 * second, define loss function
 * third, compute grad and update weights

#### Define loss functions

In [10]:
# drop the last tick of the predictions and flatten to rows of n_tokens probabilities
predicted_probas = policy_seq[:,:-1].reshape([-1,n_tokens])
# keep the probabilities at or above 1e-10 before they enter the cross-entropy
predicted_probas = T.maximum(predicted_probas,1e-10)

# targets are the reference tokens shifted by one (first token dropped), so the
# prediction made at tick t is scored against the reference token at tick t+1
model_loss = lasagne.objectives.categorical_crossentropy(predicted_probas,
                                                         sequences_batch[:,1:].ravel()).mean()

In [11]:
#regularize network weights

from lasagne.regularization import regularize_network_params, l2
# L2 penalty computed for the network that ends at the resolver layer, scaled by 1e-5
reg_l2 = regularize_network_params(resolver,l2)*10**-5

In [12]:
# training objective: cross-entropy term plus the scaled L2 penalty
loss = model_loss + reg_l2

#### Compute weight updates

In [13]:
# adadelta update rules for all trainable weights, minimising the total loss
updates = lasagne.updates.adadelta(loss,weights,learning_rate=0.1)

# generation

In [26]:
#actions generated in active mode
# NOTE(review): this call passes recorded_sequences exactly like the training graph does,
# which seems at odds with "active mode" - confirm whether recorded_sequences should be
# omitted here so that the agent is fed its own generated tokens.
# [-2] selects the action sequence (fourth of the five values returned by get_sessions)
generated_action_seq = agent.get_sessions(session_length=seq_length,
                             recorded_sequences=sequences_batch,
                             batch_size=batch_size,)[-2]


# Compile train and evaluation functions

In [27]:
# None of the functions takes arguments: the data is read from the shared variable sequences_batch.
# train_fun returns [loss] and applies the weight updates
train_fun = theano.function([],[loss],updates=updates)
# evaluation_fun returns [loss, model_loss, reg_l2] without changing any weights
evaluation_fun = theano.function([],[loss,model_loss,reg_l2])
# get_sequences returns the generated action (token id) sequences
get_sequences = theano.function([],generated_action_seq)

In [22]:
# text log; the training loop appends every progress line and example sequence it prints
log = ""

In [None]:
# Training loop: 50000 steps on freshly generated batches of 10 sequences,
# with a progress report every 100 steps.
# Written so that it runs unchanged on both Python 2 and Python 3.
loss_seq = []
for i in range(50000):
    #draw a new random batch and load it into the shared variable read by the compiled functions
    new_batch = generate_sequence(10,seq_length)
    sequences_batch.set_value(new_batch)
    
    #one optimisation step; train_fun returns a one-element list holding the loss
    loss_seq.append(train_fun())
    
    if i % 100==0:
        #full loss, cross-entropy term and regularizer, evaluated on the current batch
        #(a list comprehension replaces `[i]+map(...)`: on Python 3 map() returns an
        # iterator, which cannot be concatenated to a list)
        metrics = [float(value) for value in evaluation_fun()]
        quality = "iter:%i\tfull:%.5f\tllh:%.5f\treg:%.5f"%tuple([i]+metrics)
        #print(...) with a single argument prints the same text on Python 2 and Python 3
        print(quality)
        log+=quality+'\n'
        
        #render the first three generated sequences as space-separated characters
        examples = get_sequences()[:3]
        for tid_line in examples:
            line = ' '.join(alphabet[tid] for tid in tid_line)
            print(line)
            log += line+'\n'

    

iter:0	full:0.46341	llh:0.40408	reg:0.05933
a b c c a b c c c c c c c | c | | a a b a a b b c c c c c c c | | a b b c b b c c c c c c b a b c c
a b a b b c b b c c c c c c c | | a a a a a b c c c c c c c a a a b c c b c | a a b a b b c c c c c
a a b c b b c c | a a c b c c | a a a b b b c c c c c c c a a a b b c c c c c | a a b a c b c c c c
iter:100	full:0.43279	llh:0.37342	reg:0.05937
b a b b b b b b c c c c c c | a a a b b c c | c | a a a b b c b b c c c c c c | a a a a b b c c c b
b a a b b b c c c c | | a a b b b c c c c c c c c | a a a b b b b c b c c c c c c c | a a b b c c |
a a a a b b c c c c c c c c | a a b b c b c c c c | a a b b b c c a a b b c c c c c | a b b b c c c
iter:200	full:0.37810	llh:0.31870	reg:0.05940
a a b b c c c c c c | a a a b b b c c c c | a b a c c c | a a b b c b c c c c c c c a a b b b b c c
a a a b b b c c c c | a a b a b c b c c c c a b b b c c c c c c c c | a a b c c a a b b c c c c c c
a a a b b b c c c | | a a a b b b b c c c c c c c c c | a a a b 

In [None]:
# training curve: one entry per train_fun() call made by the training loop
plt.plot(loss_seq)

In [None]:
#show the last 1000 characters of the accumulated training log
#print(...) with a single argument behaves the same on Python 2 and Python 3
print(log[-1000:])

In [None]:
from agentnet.utils.persistence import save
# write the network that ends at the resolver layer to ./lm-dense.pcl
# NOTE(review): presumably this stores the network's parameter values - confirm against agentnet
save(resolver,"./lm-dense.pcl")

In [None]:
# displays the whole list: one entry per training step
loss_seq