
Batch-normalized GRU, untied episodic module, He weight initialization

therne committed Jun 27, 2016
1 parent 1007968 commit ed70ead0f53f2cbcbe06e8d0cb6e92c78bc707b9
Showing with 63 additions and 17 deletions.
  1. +2 −2 main.py
  2. +13 −7 models/new/dmn_plus.py
  3. +2 −2 models/new/episode_module.py
  4. +13 −3 utils/attn_gru.py
  5. +33 −3 utils/nn.py
main.py
@@ -18,7 +18,7 @@
flags.DEFINE_bool('gpu', True, 'Use GPU? [True]')
flags.DEFINE_integer('batch_size', 128, 'Batch size during training and testing [128]')
flags.DEFINE_integer('num_epochs', 256, 'Number of epochs for training [256]')
-flags.DEFINE_float('learning_rate', 0.001, 'Learning rate [0.001]')
+flags.DEFINE_float('learning_rate', 0.002, 'Learning rate [0.002]')
flags.DEFINE_boolean('load', False, 'Start training from saved model? [False]')
flags.DEFINE_integer('acc_period', 10, 'Training accuracy display period [10]')
flags.DEFINE_integer('val_period', 40, 'Validation period (for display purpose) [40]')
@@ -34,7 +34,7 @@
# train hyperparameters
flags.DEFINE_float('weight_decay', 0.001, 'Weight decay - 0 to turn off L2 regularization [0.001]')
-flags.DEFINE_float('keep_prob', 0.9, 'Dropout rate [0.9]')
+flags.DEFINE_float('keep_prob', 1., 'Dropout rate - 1.0 to turn off [1.0]')
flags.DEFINE_bool('batch_norm', True, 'Use batch normalization? [True]')
# bAbi dataset params
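Note: with batch normalization enabled, dropout is switched off (keep_prob 1.0) and the learning rate is doubled to 0.002. Below is a minimal sketch, not part of this commit, of how tf.app.flags values like these are typically read downstream; the function and dictionary keys are illustrative only.

import tensorflow as tf

FLAGS = tf.app.flags.FLAGS   # populated by the DEFINE_* calls above

def training_config():
    # keep_prob == 1.0 disables dropout, leaving batch_norm and weight_decay
    # as the remaining regularizers.
    return {
        'lr': FLAGS.learning_rate,       # 0.002 after this change
        'keep_prob': FLAGS.keep_prob,    # 1.0 after this change
        'batch_norm': FLAGS.batch_norm,
    }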
models/new/dmn_plus.py
@@ -4,7 +4,7 @@
from models.base_model import BaseModel
from models.new.episode_module import EpisodeModule
-from utils.nn import weight, bias, batch_norm, dropout
+from utils.nn import weight, bias, dropout, batch_norm
class DMN(BaseModel):
@@ -23,15 +23,13 @@ def build(self):
fact_counts = tf.placeholder('int64', shape=[N], name='fc')
input_mask = tf.placeholder('float32', shape=[N, F, L, V], name='xm')
is_training = tf.placeholder(tf.bool)
self.att = tf.constant(0.)
# Prepare parameters
gru = rnn_cell.GRUCell(d)
l = self.positional_encoding()
embedding = weight('embedding', [A, V], init='uniform', range=3**(1/2))
-w_t = weight('w_t', [3 * d, d])
-b_t = bias('b_t', d)
with tf.name_scope('SentenceReader'):
input_list = tf.unpack(tf.transpose(input)) # L x [F, N]
input_embed = []
@@ -67,7 +65,7 @@ def build(self):
# Episodic Memory
with tf.variable_scope('Episodic'):
-episode = EpisodeModule(d, question_vec, facts)
+episode = EpisodeModule(d, question_vec, facts, is_training, params.batch_norm)
memory = tf.identity(question_vec)
for t in range(params.memory_step):
@@ -78,7 +76,15 @@ def build(self):
# ReLU update
c = episode.new(memory)
concated = tf.concat(1, [memory, c, question_vec])
-memory = tf.nn.relu(tf.matmul(concated, w_t) + b_t) # [N, d]
+w_t = weight('w_t', [3 * d, d])
+z = tf.matmul(concated, w_t)
+if params.batch_norm:
+z = batch_norm(z, is_training)
+else:
+b_t = bias('b_t', d)
+z = z + b_t
+memory = tf.nn.relu(z) # [N, d]
scope.reuse_variables()
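The update above re-creates w_t inside the memory loop (shared across steps via scope.reuse_variables()) and drops the explicit bias when batch norm is on, since BN's learned shift takes its place. The sketch below restates that step as a standalone function, using the weight/bias/batch_norm helpers this file already imports from utils.nn; the wrapper function and its argument names are just for illustration.

import tensorflow as tf
from utils.nn import weight, bias, batch_norm

def memory_update(memory, c, question_vec, d, is_training, use_bn):
    # Fuse previous memory, new episode c and the question, then map back to d dims.
    concated = tf.concat(1, [memory, c, question_vec])   # [N, 3d]
    z = tf.matmul(concated, weight('w_t', [3 * d, d]))   # [N, d]
    if use_bn:
        z = batch_norm(z, is_training)    # BN's beta replaces the additive bias b_t
    else:
        z = z + bias('b_t', d)
    return tf.nn.relu(z)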
@@ -89,7 +95,7 @@ def build(self):
with tf.name_scope('Answer'):
# Answer module : feed-forward version (for it is one word answer)
-w_a = weight('w_a', [d, A])
+w_a = weight('w_a', [d, A], init='xavier')
logits = tf.matmul(memory, w_a) # [N, A]
with tf.name_scope('Loss'):
models/new/episode_module.py
@@ -6,7 +6,7 @@
class EpisodeModule:
""" Inner GRU module in episodic memory that creates episode vector. """
-def __init__(self, num_hidden, question, facts):
+def __init__(self, num_hidden, question, facts, is_training, bn):
self.question = question
self.facts = tf.unpack(tf.transpose(facts, [1, 2, 0])) # F x [d, N]
@@ -19,7 +19,7 @@ def __init__(self, num_hidden, question, facts):
self.b1 = bias('b1', [num_hidden, 1])
self.w2 = weight('w2', [1, num_hidden])
self.b2 = bias('b2', [1, 1])
-self.gru = AttnGRU(num_hidden)
+self.gru = AttnGRU(num_hidden, is_training, bn)
@property
def init_state(self):
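The w1/b1/w2/b2 parameters form the two-layer attention scorer of DMN+, and the AttnGRU now receives the training flag and batch-norm switch. For reference, the sketch below shows the gate computation such weights implement, following the cited paper and written in the usual [N, d] orientation for readability; EpisodeModule itself works on transposed F x [d, N] fact slices, so the shapes here (w1 as [4d, hidden], w2 as [hidden, 1]) are illustrative assumptions, not the module's exact layout.

import tensorflow as tf

def attention_gates(facts, q, m, w1, b1, w2, b2):
    # facts: list of F tensors [N, d]; q, m: question and current memory [N, d].
    scores = []
    for f in facts:
        # DMN+ interaction features between fact, question and memory.
        z = tf.concat(1, [f * q, f * m, tf.abs(f - q), tf.abs(f - m)])  # [N, 4d]
        hidden = tf.tanh(tf.matmul(z, w1) + b1)                         # [N, hidden]
        scores.append(tf.matmul(hidden, w2) + b2)                       # [N, 1]
    # Normalize attention across facts, as in the DMN+ paper.
    return tf.split(1, len(facts), tf.nn.softmax(tf.concat(1, scores)))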
utils/attn_gru.py
@@ -1,13 +1,15 @@
import tensorflow as tf
from tensorflow.python.ops.nn import tanh
-from utils.nn import weight, bias
+from utils.nn import weight, bias, batch_norm
class AttnGRU:
"""Attention-based Gated Recurrent Unit cell (cf. https://arxiv.org/abs/1603.01417)."""
-def __init__(self, num_units):
+def __init__(self, num_units, is_training, bn):
self._num_units = num_units
+self.is_training = is_training
+self.batch_norm = bn
def __call__(self, inputs, state, attention, scope=None):
"""Gated recurrent unit (GRU) with nunits cells."""
@@ -26,4 +28,12 @@ def _linear(self, x, h, bias_default=0.0):
w = weight('W', [I, D])
u = weight('U', [D, D])
b = bias('b', D, bias_default)
-return tf.matmul(x, w) + tf.matmul(h, u) + b
+if self.batch_norm:
+with tf.variable_scope('Linear1'):
+x_w = batch_norm(tf.matmul(x, w), is_training=self.is_training)
+with tf.variable_scope('Linear2'):
+h_u = batch_norm(tf.matmul(h, u), is_training=self.is_training)
+return x_w + h_u + b
+else:
+return tf.matmul(x, w) + tf.matmul(h, u) + b
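_linear now normalizes the input-to-hidden and hidden-to-hidden products separately (each in its own variable scope, so each gets its own BN statistics) and adds the bias once after summing. The cell's __call__ body is not shown in this hunk; the sketch below outlines the attention-GRU update it supports, per the DMN+ paper the docstring cites. The gate names and the reset-gate bias of 1.0 are assumptions (1.0 follows the stock GRUCell convention), and `linear` stands in for self._linear.

import tensorflow as tf

def attn_gru_step(x, h, g, linear):
    # linear(x, h, bias_default=0.0) -> x.W + h.U + b, batch-normalized as above.
    with tf.variable_scope('r'):
        r = tf.sigmoid(linear(x, h, bias_default=1.0))   # reset gate
    with tf.variable_scope('c'):
        h_tilde = tf.tanh(linear(x, r * h))              # candidate state
    # The attention gate g replaces the usual GRU update gate.
    return g * h_tilde + (1 - g) * h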
utils/nn.py
@@ -1,21 +1,27 @@
import math
import tensorflow as tf
import numpy as np
-def weight(name, shape, init='xavier', range=None):
+def weight(name, shape, init='he', range=None):
""" Initializes weight.
:param name: Variable name
:param shape: Tensor shape
-:param init: Init mode. xavier / normal / uniform (default is 'xavier')
+:param init: Init mode. xavier / normal / uniform / he (default is 'he')
:param range:
:return: Variable
"""
initializer = tf.constant_initializer()
if init == 'xavier':
-fan_in, fan_out = shape
+fan_in, fan_out = _get_dims(shape)
range = math.sqrt(6.0 / (fan_in + fan_out))
initializer = tf.random_uniform_initializer(-range, range)
+elif init == 'he':
+fan_in, _ = _get_dims(shape)
+std = math.sqrt(2.0 / fan_in)
+initializer = tf.random_normal_initializer(stddev=std)
elif init == 'normal':
initializer = tf.random_normal_initializer(stddev=0.1)
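He initialization draws weights from a zero-mean normal with std sqrt(2 / fan_in), which keeps activation variance stable through ReLU layers. A quick worked example of the draw the added branch produces, in NumPy; the [768, 256] shape is a hypothetical example, not one used by this model.

import math
import numpy as np

def he_init_sample(shape):
    # Mirrors the initializer above: fan_in from _get_dims, std = sqrt(2 / fan_in).
    fan_in = shape[0] if len(shape) == 2 else np.prod(shape[:-1])
    std = math.sqrt(2.0 / fan_in)
    return np.random.normal(0.0, std, size=shape).astype(np.float32)

w = he_init_sample([768, 256])   # std = sqrt(2 / 768) ≈ 0.051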
@@ -29,6 +35,12 @@ def weight(name, shape, init='xavier', range=None):
return var
+def _get_dims(shape):
+fan_in = shape[0] if len(shape) == 2 else np.prod(shape[:-1])
+fan_out = shape[1] if len(shape) == 2 else shape[-1]
+return fan_in, fan_out
def bias(name, dim, initial_value=0.0):
""" Initializes bias parameter.
:param name: Variable name
@@ -77,3 +89,21 @@ def dropout(x, keep_prob, is_training):
:return: dropout applied tensor
"""
return tf.cond(is_training, lambda: tf.nn.dropout(x, keep_prob), lambda: x)
+def conv(x, filter, is_training):
+l = tf.nn.conv2d(x, filter, strides=[1, 1, 1, 1], padding='SAME')
+l = batch_norm(l, is_training)
+return tf.nn.relu(l)
+def flatten(x):
+return tf.reshape(x, [-1])
+def fully_connected(input, num_neurons, name, is_training):
+input_size = input.get_shape()[1]
+w = weight(name, [input_size, num_neurons], init='he')
+l = tf.matmul(input, w)
+l = batch_norm(l, is_training)
+return tf.nn.relu(l)
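The new conv and fully_connected helpers both follow the linear → batch_norm → ReLU pattern and omit an explicit bias, since batch norm's learned shift (beta) plays that role. The batch_norm(x, is_training) helper they call lives elsewhere in utils/nn.py and is not part of this diff; the sketch below shows what such a helper of this TensorFlow generation commonly looks like (batch statistics at training time, moving averages at inference), offered as an assumption rather than the repository's actual implementation. It expects to be called under a distinct variable scope, as _linear's 'Linear1'/'Linear2' scopes do.

import tensorflow as tf

def batch_norm(x, is_training, decay=0.9, eps=1e-5):
    # Normalize over all axes except the last (the feature/channel axis).
    dim = x.get_shape().as_list()[-1]
    beta = tf.get_variable('beta', [dim], initializer=tf.constant_initializer(0.0))
    gamma = tf.get_variable('gamma', [dim], initializer=tf.constant_initializer(1.0))
    pop_mean = tf.get_variable('pop_mean', [dim], trainable=False,
                               initializer=tf.constant_initializer(0.0))
    pop_var = tf.get_variable('pop_var', [dim], trainable=False,
                              initializer=tf.constant_initializer(1.0))
    axes = list(range(len(x.get_shape().as_list()) - 1))

    def train():
        mean, var = tf.nn.moments(x, axes)
        update_mean = tf.assign(pop_mean, decay * pop_mean + (1 - decay) * mean)
        update_var = tf.assign(pop_var, decay * pop_var + (1 - decay) * var)
        with tf.control_dependencies([update_mean, update_var]):
            return tf.nn.batch_normalization(x, mean, var, beta, gamma, eps)

    def test():
        return tf.nn.batch_normalization(x, pop_mean, pop_var, beta, gamma, eps)

    return tf.cond(is_training, train, test)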
