This repository has been archived by the owner on Mar 31, 2019. It is now read-only.

Commit

organize utils.layers
justheuristic committed Aug 3, 2017
1 parent c5803ba commit 9c33040
Showing 9 changed files with 322 additions and 76 deletions.
28 changes: 14 additions & 14 deletions agentnet/memory/attention.py
@@ -9,7 +9,7 @@ class AttentionLayer(DictLayer):
A layer that implements basic Bahdanau-style attention. Implementation is inspired by tfnn@yandex.
In short, attention lets the network decide which fraction of the sequence/image it should view now
by using small one-layer block that predicts (input_element,controller) -> do i want to see input_element
by using small one-layer block that predicts (input_element,query) -> do i want to see input_element
for all input_elements. You can read more about it here - http://distill.pub/2016/augmented-rnns/ .
AttentionLayer also allows you to have separate keys and values:
@@ -33,8 +33,8 @@ class AttentionLayer(DictLayer):
:param input_sequence: sequence of inputs to be processed with attention
:type input_sequence: lasagne.layers.Layer with shape [batch,seq_length,units]
:param conteroller_state: single time-step state of decoder (usually lstm/gru/rnn hid)
:type controller_state: lasagne.layers.Layer with shape [batch,units]
:param query: single time-step state of decoder (usually lstm/gru/rnn hid)
:type query: lasagne.layers.Layer with shape [batch,units]
:param num_units: number of hidden units in attention intermediate activation
:type num_units: int
@@ -68,7 +68,7 @@ class AttentionLayer(DictLayer):

def __init__(self,
input_sequence,
controller_state,
query,
num_units,
mask_input = None,
key_sequence=None,
@@ -80,16 +80,16 @@ def __init__(self,
**kwargs
):
assert len(input_sequence.output_shape)==3,"input_sequence must be a 3-dimensional (batch,time,units)"
assert len(controller_state.output_shape)==2,"controller_state must be a 2-dimensional for single tick (batch,units)"
assert len(query.output_shape) == 2, "query must be a 2-dimensional for single tick (batch,units)"
assert mask_input is None or len(mask_input.output_shape)==2,"mask_input must be 2-dimensional (batch,time) or None"

batch_size,seq_len,enc_units = input_sequence.output_shape
dec_units = controller_state.output_shape[-1]
dec_units = query.output_shape[-1]

#if no key sequence is given, use input_sequence as key
key_sequence = key_sequence or input_sequence

incomings = [input_sequence,key_sequence,controller_state]
incomings = [input_sequence, key_sequence, query]
if mask_input is not None:
incomings.append(mask_input)

@@ -171,7 +171,7 @@ class DotAttentionLayer(DictLayer):
it computes logits with keys, then converts them to weights(probs) and averages _values_ with those weights.
In short, attention lets the network decide which fraction of the sequence/image it should view now
by using small one-layer block that predicts (input_element,controller) -> do i want to see input_element
by using small one-layer block that predicts (input_element,query) -> do i want to see input_element
for all input_elements. You can read more about it here - http://distill.pub/2016/augmented-rnns/ .
This layer outputs a dict with keys "attn" and "probs"
@@ -193,8 +193,8 @@ class DotAttentionLayer(DictLayer):
:type input_sequence: lasagne.layers.Layer with shape [batch,seq_length,units]
:param query: single time-step state of decoder that is used as query (usually custom layer or lstm/gru/rnn hid)
If it matches input_sequence one-step size, controller_state is used as is.
Otherwise, DotAttention is performed from DenseLayer(controller_state,input_units,nonlinearity=None).
If it matches input_sequence one-step size, query is used as is.
Otherwise, DotAttention is performed from DenseLayer(query,input_units,nonlinearity=None).
:type query: lasagne.layers.Layer with shape [batch,units]
:param key_sequence: a sequence of keys to compute dot_product with. By default, uses input_sequence instead.
@@ -203,7 +203,7 @@ class DotAttentionLayer(DictLayer):
:param mask_input: mask for input_sequence (like other lasagne masks). Default is no mask
:type mask_input: lasagne.layers.Layer with shape [batch,seq_length]
:param use_dense_layer: if True, forcibly creates intermediate dense layer on top of controller state.
:param use_dense_layer: if True, forcibly creates intermediate dense layer on top of query
:param probs_nonlinearity: nonlinearity that converts logits of shape [batch,seq_length] into attention weights of same shape
(you can provide softmax with tunable temperature or gumbel-softmax or anything of the sort)
@@ -221,15 +221,15 @@ def __init__(self,
**kwargs
):
assert len(input_sequence.output_shape)==3,"input_sequence must be a 3-dimensional (batch,time,units)"
assert len(query.output_shape) == 2, "controller_state must be a 2-dimensional for single tick (batch,units)"
assert len(query.output_shape) == 2, "query must be a 2-dimensional for single tick (batch,units)"
assert mask_input is None or len(mask_input.output_shape)==2,"mask_input must be 2-dimensional (batch,time) or None"

batch_size,seq_len,enc_units = input_sequence.output_shape
dec_units = query.output_shape[-1]

if (dec_units != enc_units) and not use_dense_layer:
warn("Input sequence and controller_state have different unit sizes. "
"Using DenseLayer from controller state instead of controller_state."
warn("Input sequence and query have different unit sizes. "
"Using DenseLayer from query instead of raw query."
"To suppress this warning, set use_dense_layer=True.")
use_dense_layer=True
if use_dense_layer:
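For readers following the rename, here is a minimal usage sketch of the new query argument. It is not part of the commit: the shapes, num_units and layer names are illustrative, and it assumes lasagne and agentnet are importable.

import lasagne
from agentnet.memory.attention import AttentionLayer, DotAttentionLayer

# encoder outputs: [batch, seq_length, enc_units]; decoder state (the "query"): [batch, dec_units]
encoder_seq = lasagne.layers.InputLayer((None, 25, 128))
decoder_hid = lasagne.layers.InputLayer((None, 256))

# Bahdanau-style additive attention with a small intermediate layer of num_units hidden units
attn = AttentionLayer(encoder_seq, query=decoder_hid, num_units=64)

# dot-product attention; dec_units != enc_units here, so a DenseLayer is inserted on top of the query (with a warning)
dot_attn = DotAttentionLayer(encoder_seq, query=decoder_hid)

# both are DictLayers; DotAttentionLayer documents output keys "attn" (attention readout) and "probs" (attention weights)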
1 change: 1 addition & 0 deletions agentnet/utils/__init__.py
@@ -9,3 +9,4 @@
from .shared import *
from .persistence import *
from .layers import *
from .format import *
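Because format is now star-imported here, its helpers become available directly from agentnet.utils. A one-line sketch of the effect, assuming format does not restrict its exports with __all__ (check_list and check_ordered_dict are the names referenced in clone.py below):

from agentnet.utils import check_list, check_ordered_dict   # previously imported from agentnet.utils.format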
16 changes: 12 additions & 4 deletions agentnet/utils/clone.py
@@ -5,11 +5,19 @@
- DPG-like methods where critic has to process both optimal and actual actions
"""
import lasagne
from .format import check_list,check_ordered_dict
from copy import deepcopy

import lasagne

from .format import check_list, check_ordered_dict
from ..utils.logging import warn
from ..utils.reapply import reapply


from .layers import reapply as _reapply
def reapply(*args,**kwargs):
warn("DEPRECATION: Reapply has been moved to agentnet.utils.layers.reapply (or agentnet.utils.reapply)."
"It will be removed from agentnet.utils.clone in the next release.")
return _reapply(*args,**kwargs)

def clone_network(original_network, bottom_layers=None,
share_params=False, share_inputs=True,name_prefix = None):
@@ -77,7 +85,7 @@ def clone_network(original_network, bottom_layers=None,
#add shared weights
if share_params:
warn("clone_network with share_params=True may be unreliable in some cases. "\
"If you want to simply apply the network elsewhere, use reapply")
"If you want to simply apply the network elsewhere, use agentnet.utils.layers.reapply")
all_weights = lasagne.layers.get_all_params(original_layers)
for weight_var in all_weights:
#if weight already in memo
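A short sketch of the migration this shim enables, using the module paths named in the deprecation warning; it is not part of the commit. The old import keeps working for now, but prints the warning and delegates to the relocated function:

# preferred after this commit
from agentnet.utils.layers import reapply

# still importable, but emits the deprecation warning and calls the same function
from agentnet.utils.clone import reapply as reapply_old

Per the warning in clone_network above, reapply is also the recommended alternative to clone_network(..., share_params=True) when you only need to apply an existing network elsewhere.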
11 changes: 11 additions & 0 deletions agentnet/utils/layers/__init__.py
@@ -0,0 +1,11 @@
"""
'Layers' introduces a number of auxiliary lasagne layers that are used throughout AgentNet and examples
"""


from .broadcast import BroadcastLayer,UnbroadcastLayer,UpcastLayer
from .dict import DictLayer,DictElementLayer
from .helpers import *
from .reapply import ReapplyLayer, reapply
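Taken together, the new subpackage re-exports these helper layers under a single path. A sketch of the resulting import surface, not part of the commit (names are taken from the imports above; whatever helpers exports via * becomes available here as well):

from agentnet.utils.layers import (
    BroadcastLayer, UnbroadcastLayer, UpcastLayer,   # axis broadcasting helpers
    DictLayer, DictElementLayer,                     # dict-valued layers
    ReapplyLayer, reapply,                           # re-applying an existing network graph
)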


212 changes: 212 additions & 0 deletions agentnet/utils/layers/broadcast.py
@@ -0,0 +1,212 @@
"""layers that simplify applying lasagne layes across custom axes"""


import numpy as np
import theano.tensor as T
from lasagne.layers import Layer, MergeLayer
from agentnet.utils.format import check_list


class BroadcastLayer(Layer):
"""
Merges certain axes of network into first (batch) axis to allow broadcasting over them.
:param incoming: layer to be broadcasted
:type incoming: Layer
:param broadcasted_axes: an axis (or axes) to be broadcasted
:type broadcasted_axes: int or tuple of int
:param force_broadcastable_batch: if True, raises an error whenever the batch (0'th) axis is not included in broadcasted_axes
"""

def __init__(self, incoming, broadcasted_axes, force_broadcastable_batch=True, **kwargs):

self.incoming_ndim = len(incoming.output_shape)

# axes that are to be broadcasted -- in ascending order
# ax%self.incoming_ndim is used to replace negative axes with N-ax+1 so that -1 becomes last axis
self.broadcasted_axes = sorted([ax % self.incoming_ndim for ax in check_list(broadcasted_axes)])

# sanity checks
assert max(self.broadcasted_axes) < self.incoming_ndim
assert len(self.broadcasted_axes) > 0
if force_broadcastable_batch and (0 not in self.broadcasted_axes):
raise ValueError("BroadcastLayer was asked NOT to broadcast over batch (0'th) axis.\n"
"If you know what you're doing, set force_broadcastable_batch=False.\n"
"Otherwise just add 0 to the broadcasted_axes")

# axes that are NOT broadcasted = all other axes, in their original order
self.non_broadcasted_axes = [ax for ax in range(self.incoming_ndim) if ax not in self.broadcasted_axes]

super(BroadcastLayer, self).__init__(incoming, **kwargs)

def get_output_for(self, input, **kwargs):
"""
dimshuffles and reshapes the input so that the broadcasted axes are merged into the leading (batch) axis (see layer description)
:param input: activation to be reshaped into broadcastable shape
:param kwargs: no effect
:return: symbolic expression for reshaped layer activation
"""

# save symbolic input shape for unbroadcaster
self.symbolic_input_shape = input.shape

# dimshuffle so that the new order is [ all_broadcasted_axes, all_non_broadcasted_axes]

input = input.dimshuffle(self.broadcasted_axes + self.non_broadcasted_axes)

# flatten broadcasted axes into a single axis
input = input.reshape((-1,) + tuple(input.shape[len(self.broadcasted_axes):]))

# now shape should be [ product(broadcasted_axes_shapes), non_broadcasted_axes ]

return input

def get_output_shape_for(self, input_shape):

broadcasted_shapes = [input_shape[ax] for ax in self.broadcasted_axes]

if None not in broadcasted_shapes:
new_batch_size = np.prod(broadcasted_shapes)
else:
new_batch_size = None

non_broadcasted_shapes = tuple(input_shape[ax] for ax in self.non_broadcasted_axes)

return (new_batch_size,) + non_broadcasted_shapes


class AwaitLayer(MergeLayer):
"""dummy layer that makes sure that output of original layer is only computed after layer_to_await"""

def __init__(self, incoming, layer_to_await, **kwargs):
super(AwaitLayer, self).__init__([incoming, layer_to_await], **kwargs)

def get_output_for(self, inputs, **kwargs):
return inputs[0]

def get_output_shape_for(self, input_shapes, **kwargs):
return input_shapes[0]


class UnbroadcastLayer(Layer):
"""
Does the inverse of BroadcastLayer
:param incoming: a layer to be unbroadcasted. (!) Must have same number of dimensions as before broadcasting
:type incoming: Layer
:param broadcast_layer: a broadcasting to be undone
:type broadcast_layer: BroadcastLayer
"""

def __init__(self, incoming, broadcast_layer, **kwargs):
self.broadcast_layer = broadcast_layer

# assert that dimensionality is same as before broadcast
assert len(incoming.output_shape) == len(self.broadcast_layer.output_shape)

incoming = AwaitLayer(incoming,
layer_to_await=broadcast_layer) # make sure incoming is not evaluated before broadcast_layer
super(UnbroadcastLayer, self).__init__(incoming, **kwargs)

def get_output_for(self, input, **kwargs):
"""
Un-broadcasts the broadcast layer (see class description)
:param input: input tensor
:param kwargs: no effect
:return: un-broadcasted tensor
"""

if not hasattr(self.broadcast_layer, "symbolic_input_shape"):
raise ValueError(
"UnbroadcastLayer.get_output_for must be called after respective BroadcastLayer.get_output_for")

# symbolic shape. dirty hack to handle "None" axes
pre_broadcast_shape = self.broadcast_layer.symbolic_input_shape

broadcasted_axes_shapes = tuple(pre_broadcast_shape[ax] for ax in self.broadcast_layer.broadcasted_axes)

# convert shape from [bc_ax0*bc_ax1*.., non_bc_ax0, non_bc_ax1,...] to [bc_ax0,bc_ax1,...,non_bc_ax0,non_bc_ax1,...]
unrolled_shape = broadcasted_axes_shapes + tuple(input.shape)[1:]
input = input.reshape(unrolled_shape)

# rearrange axes to their order before broadcasting
current_dim_order = self.broadcast_layer.broadcasted_axes + self.broadcast_layer.non_broadcasted_axes

dimshuffle_order = [current_dim_order.index(i) for i in range(len(current_dim_order))]

return input.dimshuffle(dimshuffle_order)

def get_output_shape_for(self, input_shape, **kwargs):

new_non_broadcast_shapes = input_shape[1:]

# this one is NOT symbolic. list() is used as a shallow copy op.
original_shape = list(self.broadcast_layer.input_shape)

# set new non-broadcasted axes shapes instead of old ones
for ax, new_ax_shape in zip(self.broadcast_layer.non_broadcasted_axes,
new_non_broadcast_shapes):
original_shape[ax] = new_ax_shape

# return updated shape
return tuple(original_shape)


class UpcastLayer(Layer):
"""
Repeats the layer along batch axis to allow elementwise operations with given broadcasted layer.
:param incoming: a layer to be upcasted.
:type incoming: Layer
:param broadcast_layer: a broadcasting to be matched
:type broadcast_layer: BroadcastLayer
"""

def __init__(self, incoming, broadcast_layer, **kwargs):
self.broadcast_layer = broadcast_layer
incoming = AwaitLayer(incoming,
layer_to_await=broadcast_layer) # make sure incoming is not evaluated before broadcast_layer
super(UpcastLayer, self).__init__(incoming, **kwargs)

def get_output_for(self, input, **kwargs):
"""
Upcasts the given layer (see class description)
:param input: input tensor
:param kwargs: no effect
:return: upcasted tensor
"""

if not hasattr(self.broadcast_layer, "symbolic_input_shape"):
raise ValueError("UpcastLayer.get_output_for must be called after respective BroadcastLayer.get_output_for")

# symbolic shape. dirty hack to handle "None" axes
pre_broadcast_shape = self.broadcast_layer.symbolic_input_shape

broadcasted_axes_shapes = tuple(pre_broadcast_shape[ax] for ax in self.broadcast_layer.broadcasted_axes)

n_repeats = T.prod(broadcasted_axes_shapes)
if 0 in self.broadcast_layer.broadcasted_axes:
n_repeats /= pre_broadcast_shape[0] # if batch size is already accounted for, ignore it.

return T.repeat(input, n_repeats, axis=0)

def get_output_shape_for(self, input_shape, **kwargs):

# this one is NOT symbolic. list() is used as a shallow copy op.
original_shape = list(self.broadcast_layer.input_shape)
broadcasted_dims = [original_shape[ax] for ax in self.broadcast_layer.broadcasted_axes]

if input_shape[0] is None or None in broadcasted_dims:
new_batch_size = None

else:
new_batch_size = input_shape[0] * np.prod(broadcasted_dims)
if 0 in self.broadcast_layer.broadcasted_axes:
assert input_shape[0] == original_shape[
0], "batch sizes of upcasted layer and broadcast_layer must be equal"
new_batch_size /= input_shape[0] # if batch size is already accounted for, ignore it.

input_shape = (new_batch_size,) + tuple(input_shape)[1:]

# return updated shape
return input_shape
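To make the Broadcast/Unbroadcast pattern above concrete, here is a minimal sketch that applies an ordinary DenseLayer independently at every time step of a [batch, time, units] tensor. It is not part of the commit; shapes and num_units are illustrative, and it assumes theano, lasagne and agentnet are importable.

import lasagne
from agentnet.utils.layers import BroadcastLayer, UnbroadcastLayer

seq = lasagne.layers.InputLayer((None, 20, 64))      # [batch, time, units]

# merge (batch, time) into a single leading axis -> [batch*time, units]
flat = BroadcastLayer(seq, broadcasted_axes=(0, 1))

# any ordinary 2d lasagne layer can now be applied per time step
dense = lasagne.layers.DenseLayer(flat, num_units=32)

# undo the merge -> [batch, time, 32]
out = UnbroadcastLayer(dense, broadcast_layer=flat)

print(out.output_shape)   # (None, 20, 32)

# UpcastLayer goes the other way: it repeats a [batch, ...] layer along the leading axis
# so it can be combined elementwise with the merged [batch*time, ...] tensor.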
