This repository has been archived by the owner on Nov 1, 2021. It is now read-only.

Modules for Autograd (#146)
* Added masked batch normalization, layer normalization, and soft attention utility functions.

* Moved new util functions to new module directory.

* Added build support for modules and fixed filename.

* Added unit tests and various fixes for new modules.

* Added torch-dokx style documentation to modules.
allanzelener authored and alexbw committed Aug 4, 2016
1 parent dae7f53 commit 9c20f7b
Showing 7 changed files with 351 additions and 2 deletions.
3 changes: 2 additions & 1 deletion CMakeLists.txt
@@ -8,7 +8,8 @@ FILE(GLOB luasrc src/*.lua)
ADD_TORCH_PACKAGE(autograd "" "${luasrc}")

INSTALL(DIRECTORY "src/model" DESTINATION "${Torch_INSTALL_LUA_PATH_SUBDIR}/autograd")
INSTALL(DIRECTORY "src/module" DESTINATION "${Torch_INSTALL_LUA_PATH_SUBDIR}/autograd")
INSTALL(DIRECTORY "src/loss" DESTINATION "${Torch_INSTALL_LUA_PATH_SUBDIR}/autograd")
INSTALL(DIRECTORY "src/auto" DESTINATION "${Torch_INSTALL_LUA_PATH_SUBDIR}/autograd")
INSTALL(DIRECTORY "src/optim" DESTINATION "${Torch_INSTALL_LUA_PATH_SUBDIR}/autograd")
INSTALL(DIRECTORY "src/runtime" DESTINATION "${Torch_INSTALL_LUA_PATH_SUBDIR}/autograd")
INSTALL(DIRECTORY "src/runtime" DESTINATION "${Torch_INSTALL_LUA_PATH_SUBDIR}/autograd")
1 change: 1 addition & 0 deletions src/init.lua
@@ -26,6 +26,7 @@ end

autograd.auto = require 'autograd.auto'
autograd.model = require 'autograd.model'
autograd.module = require 'autograd.module'
autograd.loss = require 'autograd.loss'
autograd.util = require 'autograd.util'
autograd.optim = require 'autograd.optim'
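
The single line added above wires the new src/module directory into the package namespace. A minimal sketch of what that makes available after installation (not part of the diff):

local autograd = require 'autograd'
print(autograd.module.LayerNormalization)        -- constructor defined in src/module/LayerNormalization.lua below
print(autograd.module.MaskedBatchNormalization)  -- constructor defined in src/module/MaskedBatchNormalization.lua
print(autograd.module.SoftAttention)             -- constructor defined in src/module/SoftAttention.lua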
41 changes: 41 additions & 0 deletions src/module/LayerNormalization.lua
@@ -0,0 +1,41 @@
local util = require 'autograd.util'
return function(opt, params)
  local opt = opt or {}
  local params = params or {}

  local nOutputs = opt.nOutputs or 10
  p = {gain = torch.ones(1, nOutputs),
       bias = torch.zeros(1, nOutputs)}
  table.insert(params, p)

  local function layer_norm(params, x, eps)
    --[[ Layer Normalization of Ba, Kiros, and Hinton (https://arxiv.org/abs/1607.06450)
    Normalizes activations x at a layer by their mean and std.
    Parameters:
    * `params` - Gain and bias parameters to adjust normalized output.
    * `x` - ([batch, nOutputs]) tensor to be normalized.
    * `eps` - Small constant to avoid divide by zero for small std.
    Returns:
    * `x_corrected` - ([batch,] nOutputs) layer normalized tensor.
    --]]
    local p = params[1] or params
    local eps = eps or 1e-5
    local x_in = x
    if torch.nDimension(x) == 1 then
      x_in = torch.view(x, 1, torch.size(x, 1))
    end
    local n = torch.size(x_in, 2)
    local mean = torch.expand(torch.mean(x_in, 2), torch.size(x_in))
    local x_centered = x_in - mean
    local std = torch.expand(torch.sqrt(torch.sum(torch.cmul(x_centered, x_centered) / n, 2)) + eps, torch.size(x_in))
    local x_normed = torch.cdiv(x_centered, std)
    local gain = torch.expand(p.gain, torch.size(x_in))
    local bias = torch.expand(p.bias, torch.size(x_in))
    local x_corrected = torch.view(torch.cmul(x_normed, gain) + bias, torch.size(x))
    return x_corrected
  end
  return layer_norm, params
end
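
A minimal usage sketch for the constructor above (not part of the diff; sizes and variable names are illustrative, and the call pattern mirrors the Modules_LayerNormalization unit test added below):

local autograd = require 'autograd'
local layerNorm, params = autograd.module.LayerNormalization({nOutputs = 100})

local x = torch.randn(5, 100)          -- batch x nOutputs; a 1D input of size 100 also works
local y = layerNorm(params, x)         -- layer-normalized tensor, same size as x

-- Differentiate a scalar loss with respect to the gain/bias parameters:
local loss = function(params, x) return torch.sum(layerNorm(params, x)) end
local dloss = autograd(loss)
local grads = dloss(params, x)         -- grads[1].gain and grads[1].bias match the parameter shapes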
63 changes: 63 additions & 0 deletions src/module/MaskedBatchNormalization.lua
@@ -0,0 +1,63 @@
local util = require 'autograd.util'
return function(opt, params)
  local opt = opt or {}
  local params = params or {}

  local nOutputs = opt.nOutputs or 10
  local momentum = opt.momentum or 0.1

  batchNormState = {momentum = momentum, train = 1,
                    running_mean = torch.zeros(1, nOutputs),
                    running_std = torch.ones(1, nOutputs)}

  -- initializing gain to < 1 is recommended for LSTM batch norm.
  p = {gain = torch.zeros(1, nOutputs):fill(0.1),
       bias = torch.zeros(1, nOutputs)}
  table.insert(params, p)

  local function masked_batch_norm(params, x, mask, state, eps)
    --[[ Masked batch normalization for minibatches with variable length sequences.
    Based on sequence batch norm from Batch Normalized Recurrent Neural Networks by Laurent et al.
    (http://arxiv.org/abs/1510.01378)
    Parameters:
    * `params` - Gain and bias parameters to adjust normalized output.
    * `x` - ([batch, [time,], nOutputs]) tensor to be normalized.
    * `mask` - Tensor with the same size as x that is 1 where x is valid and 0 otherwise.
    * `state` - Running mean and std estimates, momentum for estimates, and train flag.
    * `eps` - Small constant to avoid divide by zero for small std.
    Returns:
    * `x_corrected` - ([batch, [time,], nOutputs]) batch normalized tensor.
    --]]
    local p = params[1] or params
    local eps = eps or 1e-5
    local train = state.train or 1
    local momentum = (state.momentum or 0.1) * train -- kill state updates during evaluation
    local x_in = x
    local mask_in = mask
    if torch.nDimension(x) == 3 then -- collapse batch and time dimensions
      x_in = torch.view(x, -1, torch.size(x, 3))
      mask_in = torch.view(mask, -1, torch.size(mask, 3))
    elseif torch.nDimension(x) == 1 then -- expand batch dimension
      x_in = torch.view(x, 1, torch.size(x, 1))
      mask_in = torch.view(mask, 1, torch.size(mask, 1))
    end
    local n = torch.sum(mask)
    mask_in = torch.expand(mask_in, torch.size(x_in))
    local x_masked = torch.cmul(x_in, mask_in)
    local mean = torch.sum(x_masked / n, 1)
    state.running_mean = momentum * mean + (1 - momentum) * state.running_mean
    local x_centered = torch.cmul(x_masked - torch.expand(state.running_mean, torch.size(x_in)), mask_in)
    local var = torch.sum(torch.cmul(x_centered, x_centered) / n, 1) + eps
    local std = torch.sqrt(var)
    state.running_std = momentum * std + (1 - momentum) * state.running_std
    local x_normed = torch.cdiv(x_centered, torch.expand(state.running_std, torch.size(x_in)))
    local gain = torch.expand(p.gain, torch.size(x_in))
    local bias = torch.expand(p.bias, torch.size(x_in))
    local x_corrected = torch.view(torch.cmul(x_normed, gain) + bias, torch.size(x))
    return x_corrected
  end
  return masked_batch_norm, params, batchNormState
end
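
As above, a short usage sketch (not part of the diff; shapes are illustrative) showing how the returned state table separates training from evaluation:

local autograd = require 'autograd'
local batchNorm, params, state =
  autograd.module.MaskedBatchNormalization({nOutputs = 100, momentum = 0.1})

local x = torch.randn(5, 10, 100)             -- batch x time x nOutputs
local mask = torch.bernoulli(x.new(x:size())) -- random 0/1 mask as in the unit tests below; 1 marks valid steps
local y = batchNorm(params, x, mask, state)   -- state.train == 1, so the running estimates are updated

-- For evaluation, reuse the running estimates but freeze them (momentum * train == 0):
local evalState = {momentum = state.momentum, train = 0,
                   running_mean = state.running_mean,
                   running_std = state.running_std}
local yEval = batchNorm(params, x, mask, evalState)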
76 changes: 76 additions & 0 deletions src/module/SoftAttention.lua
@@ -0,0 +1,76 @@
local functionalize = require('autograd.nnwrapper').functionalize
local nn = functionalize('nn')
local LayerNorm = require 'autograd.module.LayerNormalization'

local softMax = nn.SoftMax()

return function(opt, params)
  local opt = opt or {}
  local params = params or {}

  local layerNormalization = opt.layerNormalization or false
  local hiddenFeatures = opt.hiddenFeatures or 10
  local subjectFeatures = opt.subjectFeatures or 15
  local subjectChoices = opt.subjectChoices or 20

  p = {W_att_subject = torch.zeros(1, 1, subjectFeatures),
       W_att_h = torch.zeros(hiddenFeatures, subjectChoices),
       b_att = torch.zeros(1, subjectChoices)}

  if layerNormalization then
    local focus_ln_params = LayerNorm({nOutputs = subjectChoices})

[Inline review comment by Atcold (Contributor), Dec 20, 2016]
LayerNorm returns layer_norm, params. So, in this case, focus_ln_params looks like it's getting layer_norm and not params.

    p.focus_ln_gain = focus_ln_params.gain
    p.focus_ln_bias = focus_ln_params.bias
    p.b_att = nil
  end
  table.insert(params, p)

  local soft_attention = function(params, subject, h)
    --[[ Soft attention over subject given hidden state.
    Deterministic soft attention of Show, Attend, and Tell by Xu et al. (http://arxiv.org/abs/1502.03044)
    Parameters:
    * `params` - Weights to combine subject and hidden features to score choices.
    * `subject` - ([batch,] subjectFeatures, subjectChoices) tensor.
    * `h` - ([batch,] hiddenFeatures) tensor.
    Returns:
    * `attention` - ([batch,] subjectFeatures) tensor that is the expectation of the attended subject vector.
    * `focus` - ([batch,] subjectChoices) tensor that is the probability of selecting any given subject choice.
    --]]
    local p = params[1] or params
    local subject_in = subject
    local h_in = h
    if torch.nDimension(subject) == 2 then
      subject_in = torch.view(subject, 1, torch.size(subject, 1), torch.size(subject, 2))
    end
    if torch.nDimension(h) == 1 then
      h_in = torch.view(h, 1, torch.size(h, 1))
    end
    local batchSize = torch.size(subject_in, 1)
    local subjectFeatures = torch.size(subject_in, 2)
    local subjectChoices = torch.size(subject_in, 3)
    -- Activations for each subject choice and hidden state.
    local W_subject = torch.expand(p.W_att_subject, batchSize, 1, subjectFeatures)
    local subject_logit = torch.squeeze(torch.bmm(W_subject, subject_in), 2)
    local hidden_logit = h_in * p.W_att_h
    -- Focus distribution over subject choices.
    local focus_logit = subject_logit + hidden_logit
    if layerNormalization then
      focus_logit = layer_norm({gain = p.focus_ln_gain, bias = p.focus_ln_bias}, focus_logit)

[Inline review comment by Atcold (Contributor), Dec 20, 2016]
layer_norm is never defined, and it should be the first argument returned by LayerNorm above... I'm missing something.

    else
      focus_logit = focus_logit + torch.expand(p.b_att, batchSize, subjectChoices)
    end
    local focus = softMax(focus_logit)
    -- Attend to choice in expectation.
    local expanded_focus = torch.expand(torch.view(focus, batchSize, 1, subjectChoices), torch.size(subject_in))
    local attention = torch.squeeze(torch.sum(torch.cmul(subject_in, expanded_focus), 3), 3)
    if torch.nDimension(subject) == 2 then
      attention = torch.squeeze(attention, 1)
      focus = torch.squeeze(focus, 1)
    end
    return attention, focus
  end
  return soft_attention, params
end
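
A usage sketch for the attention module (not part of the diff; sizes are illustrative), with layerNormalization left at its default of false so the code path flagged in the review comments above is not exercised. Addressing those comments would presumably mean capturing both return values of the constructor (e.g. local layer_norm, ln_params = LayerNorm({nOutputs = subjectChoices})) so that layer_norm is in scope inside soft_attention and the gain and bias can be read from ln_params[1].

local autograd = require 'autograd'
local attend, params = autograd.module.SoftAttention({
  hiddenFeatures = 50, subjectFeatures = 100, subjectChoices = 16
})
params[1].W_att_subject:normal(0, 0.01)
params[1].W_att_h:normal(0, 0.01)

local subject = torch.randn(10, 100, 16)    -- batch x subjectFeatures x subjectChoices
local h = torch.randn(10, 50)               -- batch x hiddenFeatures
local attention, focus = attend(params, subject, h)
-- attention: 10 x 100 expected subject vector; focus: 10 x 16 distribution over the choices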
8 changes: 8 additions & 0 deletions src/module/init.lua
@@ -0,0 +1,8 @@
-- autograd native modules
local module = {
  LayerNormalization = require 'autograd.module.LayerNormalization',
  MaskedBatchNormalization = require 'autograd.module.MaskedBatchNormalization',
  SoftAttention = require 'autograd.module.SoftAttention'
}

return module
161 changes: 160 additions & 1 deletion test/test.lua
@@ -1247,6 +1247,166 @@ local tests = {
tester:assert(gradcheck(loss, params, i), 'incorrect gradients')
end,

Modules_LayerNormalization = function()
local f,params = autograd.module.LayerNormalization({nOutputs = 100})

-- Loss:
local loss = function(params, input)
local normed = f(params, input)
local l = torch.sum(normed)
return l, normed
end

dloss = autograd(loss)

params[1].gain:fill(1)
params[1].bias:fill(0.1)

local i = torch.randn(100) -- test 1D input
local l, i_normed = loss(params, i)
local grads = dloss(params, i)

tester:asserteq(type(l), 'number', 'loss should be a scalar')
tester:asserteq(grads[1].gain:dim(), 2, 'gain has incorrect dim')
tester:asserteq(grads[1].bias:dim(), 2, 'bias has incorrect dim')
tester:asserteq(i_normed:dim(), i:dim(), 'normed input has incorrect dim')

i = torch.randn(5,100) -- batch x nOutputs
l, i_normed = loss(params, i)
grads = dloss(params, i)

tester:asserteq(type(l), 'number', 'loss should be a scalar')
tester:asserteq(grads[1].gain:dim(), 2, 'gain has incorrect dim')
tester:asserteq(grads[1].bias:dim(), 2, 'bias has incorrect dim')
tester:asserteq(i_normed:dim(), i:dim(), 'normed input has incorrect dim')

-- Gradcheck
tester:assert(gradcheck(loss, params, i), 'incorrect gradients')
end,

Modules_SoftAttention = function()
local f,params = autograd.module.SoftAttention({
hiddenFeatures = 50,
subjectFeatures = 100,
subjectChoices = 16
})

-- Loss:
local loss = function(params, input, hidden)
local at, ft = f(params, input, hidden)
local l = torch.sum(at)
return l, at, ft
end

local dloss = autograd(loss)

params[1].W_att_subject:normal(0, 0.01)
params[1].W_att_h:normal(0, 0.01)
params[1].b_att:zero()

local x = torch.randn(100, 16)
local h = torch.randn(50)
local l, a, f = loss(params, x, h)
local grads = dloss(params, x, h)

tester:asserteq(type(l), 'number', 'loss should be a scalar')
tester:asserteq(grads[1].W_att_subject:dim(), 3, 'W_att_subject grad has incorrect dim')
tester:asserteq(grads[1].W_att_h:dim(), 2, 'W_att_h grad has incorrect dim')
tester:asserteq(grads[1].b_att:dim(), 2, 'b_att grad has incorrect dim')
tester:asserteq(torch.size(a, 1), torch.size(x,1), 'attention has incorrect dim')
tester:asserteq(torch.size(f, 1), torch.size(x,2), 'focus has incorrect dim')

-- Gradcheck
tester:assert(gradcheck(loss, params, x, h), 'incorrect gradients')

x = torch.randn(10, 100, 16)
h = torch.randn(10, 50)
local l, a, f = loss(params, x, h)
local grads = dloss(params, x, h)
tester:asserteq(type(l), 'number', 'loss should be a scalar')
tester:asserteq(grads[1].W_att_subject:dim(), 3, 'W_att_subject grad has incorrect dim')
tester:asserteq(grads[1].W_att_h:dim(), 2, 'W_att_h grad has incorrect dim')
tester:asserteq(grads[1].b_att:dim(), 2, 'b_att grad has incorrect dim')
tester:asserteq(torch.size(a, 2), torch.size(x,2), 'attention has incorrect dim')
tester:asserteq(torch.size(f, 2), torch.size(x,3), 'focus has incorrect dim')
tester:asserteq(torch.size(a, 1), torch.size(x,1), 'attention has incorrect batch size')
tester:asserteq(torch.size(f, 1), torch.size(x,1), 'focus has incorrect batch size')

-- Gradcheck
tester:assert(gradcheck(loss, params, x, h), 'incorrect gradients')
end,

Modules_MaskedBatchNormalization = function()
local f, params, state = autograd.module.MaskedBatchNormalization({nOutputs = 100})
local threshold = 1e-5
local eval_state = {momentum = state.momentum, train = 0,
running_mean = state.running_mean,
running_std = state.running_std}

-- Loss:
local loss = function(params, input, mask, state)
local normed = f(params, input, mask, state)
local l = torch.sum(normed)
return l, normed
end

local dloss = autograd(loss)

params[1].gain:fill(0.1)
params[1].bias:fill(0.1)

local i = torch.randn(100) -- test 1D input
local mask = torch.bernoulli(i.new(i:size()))
local pre_mean = state.running_mean:clone()
local pre_std = state.running_std:clone()
local l, i_normed = loss(params, i, mask, state)
local grads = dloss(params, i, mask, state)

tester:asserteq(type(l), 'number', 'loss should be a scalar')
tester:asserteq(grads[1].gain:dim(), 2, 'gain has incorrect dim')
tester:asserteq(grads[1].bias:dim(), 2, 'bias has incorrect dim')
tester:asserteq(i_normed:dim(), i:dim(), 'normed input has incorrect dim')
tester:assert(not pre_mean:equal(state.running_mean), 'running mean did not change with train = 1')
tester:assert(not pre_std:equal(state.running_std), 'running std did not change with train = 1')

-- Gradcheck
tester:assert(gradcheck(loss, params, i, mask, eval_state), 'incorrect gradients')

i = torch.randn(5,100) -- batch x nOutputs
mask = torch.bernoulli(i.new(i:size()))
pre_mean = eval_state.running_mean:clone()
pre_std = eval_state.running_std:clone()
l, i_normed = loss(params, i, mask, eval_state)
grads = dloss(params, i, mask, eval_state)

tester:asserteq(type(l), 'number', 'loss should be a scalar')
tester:asserteq(grads[1].gain:dim(), 2, 'gain has incorrect dim')
tester:asserteq(grads[1].bias:dim(), 2, 'bias has incorrect dim')
tester:asserteq(i_normed:dim(), i:dim(), 'normed input has incorrect dim')
tester:assert(pre_mean:equal(eval_state.running_mean), 'running mean changed with train = 0')
tester:assert(pre_std:equal(eval_state.running_std), 'running std changed with train = 0')

-- Gradcheck
tester:assert(gradcheck(loss, params, i, mask, eval_state), 'incorrect gradients')

i = torch.randn(5,10,100) -- batch x time x nOutputs
mask = torch.bernoulli(i.new(i:size()))
pre_mean = state.running_mean:clone()
pre_std = state.running_std:clone()
l, i_normed = loss(params, i, mask, state)
grads = dloss(params, i, mask, state)

tester:asserteq(type(l), 'number', 'loss should be a scalar')
tester:asserteq(grads[1].gain:dim(), 2, 'gain has incorrect dim')
tester:asserteq(grads[1].bias:dim(), 2, 'bias has incorrect dim')
tester:asserteq(i_normed:dim(), i:dim(), 'normed input has incorrect dim')
tester:assert(not pre_mean:equal(state.running_mean), 'running mean did not change with train = 1')
tester:assert(not pre_std:equal(state.running_std), 'running std did not change with train = 1')

-- Gradcheck
tester:assert(gradcheck(loss, params, i, mask, eval_state), 'incorrect gradients')
end,

DebuggerDivZero = function()
-- Parameters:
local W = torch.Tensor(32,100):fill(.5)
@@ -1770,4 +1930,3 @@ tester:add(prefixTests("Optimized_", tests, { })):run(prefixTests("Optimized_",
autograd.optimize(false)
tester = torch.Tester()
tester:add(prefixTests("Direct_", tests, { GradGrad = true, AutoModule = true, DebuggerDivZero = true, StableGradients = true, ZeroGrad = true, SimpleGradGrad = true })):run(arg[1])
