diff --git a/GPU.lua b/GPU.lua
new file mode 100644
index 000000000..455a353ce
--- /dev/null
+++ b/GPU.lua
@@ -0,0 +1,274 @@
+------------------------------------------------------------------------
+--[[ GPU ]]--
+-- Decorates a module such that its parameters are
+-- hosted on a specified GPU device.
+-- The operations are also executed on that device.
+-- Arguments input and gradOutput are converted to the specified device
+-- before being fed to the decorated module.
+-- Returned output is on the specified outdevice (defaults to device).
+-- Returned gradInput is allocated on the same device as the input.
+-- The unit test is located in cunn.
+------------------------------------------------------------------------
+local GPU, parent = torch.class("nn.GPU", "nn.Container")
+
+-- Decorate `module` so that it is hosted and executed on GPU `device`;
+-- its output is returned on `outdevice` (defaults to `device`).
+function GPU:__init(module, device, outdevice)
+   assert(torch.type(device) == 'number')
+   assert(torch.isTypeOf(module, 'nn.Module'))
+   parent.__init(self)
+   self.device, self.outdevice = device, outdevice or device
+   self.modules[1] = module
+   -- if the module is already CUDA-typed, bring the decorator in sync
+   if module:type() == 'torch.CudaTensor' then
+      self:cuda()
+   end
+end
+
+-- Recursively moves every CUDA tensor found in `obj` (a module or any
+-- nested table) onto `device`. Intended to be called with `device` as
+-- the current cutorch device (see setDevice), since clone() allocates
+-- on the current device — that is what performs the transfer.
+function GPU.recursiveModuleDevice(obj, device)
+   if type(obj) == 'table' and not torch.isTypeOf(obj, 'nn.GPU') then
+      -- recurse into plain tables and modules; nested nn.GPU decorators
+      -- manage their own device, so leave them untouched
+      for k,v in pairs(obj) do
+         obj[k] = GPU.recursiveModuleDevice(v, device)
+      end
+   elseif torch.type(obj):match('torch.Cuda.*Tensor') then
+      if obj:getDevice() ~= device then
+         obj = obj:clone() -- this will reallocate it to device
+         local newdevice = obj:getDevice()
+         -- when nElement() == 0 newdevice is 0
+         assert(newdevice == device or newdevice == 0)
+      end
+   end
+   assert(obj ~= nil)
+   return obj
+end
+
+-- Set (or refresh) the device hosting the decorated module, moving all
+-- of its tensors there. With no argument, re-applies self.device.
+function GPU:setDevice(device)
+   if device then
+      self.device = device
+   end
+   local module = self.modules[1]
+   assert(module)
+   self.modules[1] = cutorch.withDevice(self.device, function()
+      return self.recursiveModuleDevice(module, self.device)
+   end)
+   return self
+end
+
+-- Returns `dst`, a copy of `src` (a tensor or nested table of tensors)
+-- in which every CUDA tensor lives on `device`. `dst` is reused as a
+-- buffer where possible to avoid reallocation. Should be called with
+-- `device` as the current cutorch device so `src.new()` allocates there.
+function GPU.recursiveSetDevice(dst, src, device)
+   if torch.type(src) == 'table' then
+      dst = torch.type(dst) == 'table' and dst or {}
+      for k,v in ipairs(src) do
+         dst[k] = GPU.recursiveSetDevice(dst[k], v, device)
+      end
+      -- drop stale trailing entries left over from a previous call
+      for k=#src+1,#dst do
+         dst[k] = nil
+      end
+   elseif torch.type(src):match('torch.Cuda.*Tensor') and src:getDevice() ~= device and src:getDevice() ~= 0 then
+      -- reuse dst only when it is already a CUDA tensor on the target
+      -- device; otherwise allocate a fresh tensor (on the current device)
+      if not (torch.type(dst):match('torch.Cuda.*Tensor') and dst:getDevice() == device) then
+         dst = src.new()
+      end
+      dst:resizeAs(src):copy(src)
+   else
+      -- already on the right device (or empty: getDevice() == 0), share it
+      dst = src
+   end
+   return dst
+end
+
+-- Returns `dst`, a copy of `src` where each tensor is placed on the same
+-- device as its commensurate element in `proto`. `proto` mirrors the
+-- structure of `src`; each leaf is either a tensor (whose device is
+-- used) or a device number.
+function GPU.recursiveSetDeviceAs(dst, src, proto)
+   local device
+   if torch.isTensor(proto) then
+      device = proto:getDevice()
+   elseif torch.type(proto) == 'number' then
+      device = proto
+   end
+   if torch.type(src) == 'table' then
+      dst = torch.type(dst) == 'table' and dst or {}
+      for k,v in ipairs(src) do
+         dst[k] = GPU.recursiveSetDeviceAs(dst[k], v, proto[k])
+      end
+      -- drop stale trailing entries left over from a previous call
+      for k=#src+1,#dst do
+         dst[k] = nil
+      end
+   elseif torch.type(src):match('torch.Cuda.*Tensor') and src:getDevice() ~= device and src:getDevice() ~= 0 then
+      -- reuse dst only when it already sits on the target device
+      if not (torch.type(dst):match('torch.Cuda.*Tensor') and dst:getDevice() == device) then
+         dst = src.new()
+      end
+      cutorch.withDevice(device, function() dst:resizeAs(src):copy(src) end)
+   else
+      dst = src
+   end
+   return dst
+end
+
+-- Forward pass. When CUDA-typed: copies `input` onto self.device, runs
+-- the decorated module there, then places the result on self.outdevice.
+-- For any other type, the decorated module is called directly.
+function GPU:updateOutput(input)
+   if self._type == 'torch.CudaTensor' then
+      local output = cutorch.withDevice(self.device, function()
+         -- self._input buffers the device-transferred input; it is
+         -- reused by updateGradInput and accGradParameters
+         self._input = self.recursiveSetDevice(self._input, input, self.device)
+         return self.modules[1]:updateOutput(self._input)
+      end)
+
+      if self.device ~= self.outdevice then
+         self.output = cutorch.withDevice(self.outdevice, function()
+            return self.recursiveSetDevice(self.output, output, self.outdevice)
+         end)
+      else
+         self.output = output
+      end
+   else
+      self.output = self.modules[1]:updateOutput(input)
+   end
+
+   return self.output
+end
+
+-- Backward pass w.r.t. input. When CUDA-typed: copies `gradOutput` onto
+-- self.device, runs updateGradInput there, then transfers each returned
+-- gradInput tensor to the device of its commensurate `input` tensor.
+function GPU:updateGradInput(input, gradOutput)
+   if self._type == 'torch.CudaTensor' then
+      local gradInput = cutorch.withDevice(self.device, function()
+         -- cache the device-transferred gradOutput for accGradParameters
+         self._gradOutput = self.recursiveSetDevice(self._gradOutput, gradOutput, self.device)
+         return self.modules[1]:updateGradInput(self._input, self._gradOutput)
+      end)
+
+      self.gradInput = self.recursiveSetDeviceAs(self.gradInput, gradInput, input)
+   else
+      self.gradInput = self.modules[1]:updateGradInput(input, gradOutput)
+   end
+
+   return self.gradInput
+end
+
+-- Accumulate parameter gradients on the hosting device, reusing the
+-- input/gradOutput buffers cached by updateOutput/updateGradInput.
+function GPU:accGradParameters(input, gradOutput, scale)
+   if self._type ~= 'torch.CudaTensor' then
+      self.modules[1]:accGradParameters(input, gradOutput, scale)
+      return
+   end
+   cutorch.withDevice(self.device, function()
+      self.modules[1]:accGradParameters(self._input, self._gradOutput, scale)
+   end)
+end
+
+-- Apply `callback` to this container and its children, on the hosting
+-- device when CUDA-typed.
+function GPU:apply(callback)
+   local function run()
+      parent.apply(self, callback)
+   end
+   if self._type == 'torch.CudaTensor' then
+      cutorch.withDevice(self.device, run)
+   else
+      run()
+   end
+end
+
+-- Type conversion. Converting to CUDA runs on self.device and is then
+-- followed by setDevice() to guarantee every tensor actually landed
+-- there. Converting away from CUDA first discards the device-resident
+-- buffers so no stray CUDA tensors survive the conversion.
+function GPU:type(type, typecache)
+   if type and type == 'torch.CudaTensor' then
+      cutorch.withDevice(self.device, function() parent.type(self, type, typecache) end)
+      self:setDevice()
+   else
+      self.output = nil
+      self.gradInput = nil
+      self._input = nil
+      self._gradOutput = nil
+      parent.type(self, type, typecache)
+   end
+   return self
+end
+
+-- Release intermediate buffers; clears children on the hosting device
+-- when CUDA-typed.
+function GPU:clearState()
+   for _, key in ipairs{'output', 'gradInput', '_input', '_gradOutput'} do
+      self[key] = nil
+   end
+   if self._type == 'torch.CudaTensor' then
+      cutorch.withDevice(self.device, function() parent.clearState(self) end)
+   else
+      parent.clearState(self)
+   end
+end
+
+-- Zero the accumulated gradients, on the hosting device when CUDA-typed.
+function GPU:zeroGradParameters()
+   local function zero()
+      parent.zeroGradParameters(self)
+   end
+   if self._type == 'torch.CudaTensor' then
+      cutorch.withDevice(self.device, zero)
+   else
+      zero()
+   end
+end
+
+-- Take a gradient-descent step with learning rate `lr`, on the hosting
+-- device when CUDA-typed.
+function GPU:updateParameters(lr)
+   if self._type ~= 'torch.CudaTensor' then
+      parent.updateParameters(self, lr)
+      return
+   end
+   cutorch.withDevice(self.device, function()
+      parent.updateParameters(self, lr)
+   end)
+end
+
+-- Switch to training mode, on the hosting device when CUDA-typed.
+function GPU:training()
+   if self._type ~= 'torch.CudaTensor' then
+      parent.training(self)
+      return
+   end
+   cutorch.withDevice(self.device, function() parent.training(self) end)
+end
+
+-- Switch to evaluation mode, on the hosting device when CUDA-typed.
+function GPU:evaluate()
+   local function eval()
+      parent.evaluate(self)
+   end
+   if self._type == 'torch.CudaTensor' then
+      cutorch.withDevice(self.device, eval)
+   else
+      eval()
+   end
+end
+
+-- Share the listed parameter buffers with `mlp`, on the hosting device
+-- when CUDA-typed.
+function GPU:share(mlp, ...)
+   local buffers = {...}
+   local function doshare()
+      parent.share(self, mlp, unpack(buffers))
+   end
+   if self._type == 'torch.CudaTensor' then
+      cutorch.withDevice(self.device, doshare)
+   else
+      doshare()
+   end
+   return self
+end
+
+function GPU:clone(...)
+ local args = {...}
+ if self._type == 'torch.CudaTensor' then
+ return cutorch.withDevice(self.device, function() parent.clone(self, unpack(args)) end)
+ else
+ return parent.clone(self, unpack(args))
+ end
+end
+
+-- Serialize: emit a small header (tensor type and device) followed by a
+-- shallow copy of all fields. read() uses the header to restore the
+-- object onto the correct device.
+function GPU:write(file)
+   local fields = {}
+   for key, value in pairs(self) do
+      fields[key] = value
+   end
+   file:writeObject({self._type, self.device})
+   file:writeObject(fields)
+end
+
+-- Deserialize: read the header written by write(); when the object was
+-- CUDA-typed, load the fields with the recorded device active so CUDA
+-- tensors are reconstructed on that device.
+function GPU:read(file)
+   local header = file:readObject()
+   local function load()
+      return file:readObject()
+   end
+   local fields
+   if header[1] == 'torch.CudaTensor' then
+      fields = cutorch.withDevice(header[2], load)
+   else
+      fields = load()
+   end
+   for key, value in pairs(fields) do
+      self[key] = value
+   end
+end
+
+-- Pretty-print as "nn.GPU(device) @ <decorated module>".
+function GPU:__tostring__()
+   local inner = self.modules[1]
+   local head = torch.type(self) .. '(' .. self.device ..') @ '
+   if inner.__tostring__ then
+      return head .. inner:__tostring__()
+   end
+   return head .. torch.type(inner)
+end
+
+-- Not supported by the GPU decorator.
+function GPU:accUpdateGradParameters(input, gradOutput, lr)
+   error("Not Implemented")
+end
+
+-- Not supported by the GPU decorator.
+function GPU:sharedAccUpdateGradParameters(input, gradOutput, lr)
+   error("Not Implemented")
+end
diff --git a/GPUParallelTable.lua b/GPUParallelTable.lua
new file mode 100644
index 000000000..b37a541d5
--- /dev/null
+++ b/GPUParallelTable.lua
@@ -0,0 +1,119 @@
+local GPUParallelTable, parent = torch.class('nn.GPUParallelTable', 'nn.ParallelTable')
+
+-- A ParallelTable whose branches each run on their own GPU device.
+function GPUParallelTable:__init()
+   parent.__init(self)
+   self.modules, self.devices, self.outdevices = {}, {}, {}
+   self.output, self.gradInput = {}, {}
+end
+
+-- Append `module`, to be executed on GPU `device`; its output is placed
+-- on `outdevice` (defaults to `device`).
+function GPUParallelTable:add(module, device, outdevice)
+   assert(torch.isTypeOf(module, 'nn.Module'))
+   assert(torch.type(device) == 'number')
+   local idx = #self.modules + 1
+   self.modules[idx] = module
+   self.devices[idx] = device
+   assert(#self.modules == #self.devices)
+   self.outdevices[idx] = outdevice or device
+   return self
+end
+
+-- Forward pass: module i runs on self.devices[i]; each input element is
+-- first transferred to its module's device, and each output element is
+-- placed on self.outdevices[i].
+function GPUParallelTable:updateOutput(input)
+   if self._type == 'torch.CudaTensor' then
+      -- send input to the appropriate device, if necessary (blocking, so serial)
+      self._input = nn.GPU.recursiveSetDeviceAs(self._input, input, self.devices)
+
+      -- then forward in parallel across devices (if module is non-blocking, happens concurrently, yay!)
+      local output = {}
+      for i,module in ipairs(self.modules) do
+         local device = self.devices[i]
+         output[i] = cutorch.withDevice(device, function()
+            return self:rethrowErrors(module, i, 'updateOutput', self._input[i])
+         end)
+      end
+
+      -- send output to the appropriate device, if necessary (blocking, so serial)
+      self.output = nn.GPU.recursiveSetDeviceAs(self.output, output, self.outdevices)
+   else
+      parent.updateOutput(self, input)
+   end
+
+   return self.output
+end
+
+-- Backward pass w.r.t. input: module i runs on self.devices[i]; each
+-- gradInput element is then transferred to the device of its
+-- commensurate element in `input` (matching nn.GPU:updateGradInput).
+function GPUParallelTable:updateGradInput(input, gradOutput)
+   if self._type == 'torch.CudaTensor' then
+      -- send gradOutput to the appropriate device, if necessary (blocking, so serial)
+      self._gradOutput = nn.GPU.recursiveSetDeviceAs(self._gradOutput, gradOutput, self.devices)
+
+      -- then updateGradInput in parallel across devices (if module is non-blocking, happens concurrently)
+      local gradInput = {}
+      for i,module in ipairs(self.modules) do
+         local device = self.devices[i]
+         gradInput[i] = cutorch.withDevice(device, function()
+            return self:rethrowErrors(module, i, 'updateGradInput', self._input[i], self._gradOutput[i])
+         end)
+      end
+
+      -- bug fix: was `self.input`, which is never assigned (the cached
+      -- copy is self._input); use the `input` argument so each gradInput
+      -- tensor lands on the same device as its input tensor
+      self.gradInput = nn.GPU.recursiveSetDeviceAs(self.gradInput, gradInput, input)
+   else
+      parent.updateGradInput(self, input, gradOutput)
+   end
+
+   return self.gradInput
+end
+
+-- Accumulate parameter gradients, each module on its own device, using
+-- the inputs/gradOutputs cached by the forward/backward passes.
+function GPUParallelTable:accGradParameters(input, gradOutput, scale)
+   scale = scale or 1
+
+   if self._type ~= 'torch.CudaTensor' then
+      parent.accGradParameters(self, input, gradOutput, scale)
+      return
+   end
+   for i,module in ipairs(self.modules) do
+      cutorch.withDevice(self.devices[i], function()
+         self:rethrowErrors(module, i, 'accGradParameters', self._input[i], self._gradOutput[i], scale)
+      end)
+   end
+end
+
+-- Accumulate-and-update parameters with learning rate `lr`, each module
+-- on its own device, using the cached inputs/gradOutputs.
+function GPUParallelTable:accUpdateGradParameters(input, gradOutput, lr)
+   lr = lr or 1
+
+   if self._type ~= 'torch.CudaTensor' then
+      parent.accUpdateGradParameters(self, input, gradOutput, lr)
+      return
+   end
+   for i,module in ipairs(self.modules) do
+      cutorch.withDevice(self.devices[i], function()
+         self:rethrowErrors(module, i, 'accUpdateGradParameters', self._input[i], self._gradOutput[i], lr)
+      end)
+   end
+end
+
+-- Type conversion. Converting to CUDA converts and relocates each
+-- module on its own device; other conversions defer to ParallelTable.
+function GPUParallelTable:type(type, typecache)
+   -- discard stale buffers in either direction
+   self.output = {}
+   self.gradInput = {}
+   self._input = {}
+   self._gradOutput = {}
+   if type and type == 'torch.CudaTensor' then
+      for i,module in ipairs(self.modules) do
+         local device = self.devices[i]
+         -- bug fix: was cutorch.withDevice(self.device, ...), but this
+         -- class has no `self.device` (only self.devices[i]); convert
+         -- and relocate on the module's own device
+         self.modules[i] = cutorch.withDevice(device, function()
+            module:type(type, typecache)
+            return nn.GPU.recursiveModuleDevice(module, device)
+         end)
+      end
+      self._type = type
+   else
+      parent.type(self, type, typecache)
+   end
+   return self
+end
+
+
+-- TODO: wrap the remaining Module methods (training, evaluate, clearState, share, etc.).
diff --git a/doc/simple.md b/doc/simple.md
index e29813c03..2cbef531d 100644
--- a/doc/simple.md
+++ b/doc/simple.md
@@ -51,6 +51,7 @@ Simple Modules are used for various tasks like adapting Tensor methods and provi
* [Padding](#nn.Padding) : adds padding to a dimension ;
* [L1Penalty](#nn.L1Penalty) : adds an L1 penalty to an input (for sparsity) ;
* [GradientReversal](#nn.GradientReversal) : reverses the gradient (to maximize an objective function) ;
+ * [GPU](#nn.GPU) : decorates a module so that it can be executed on a specific GPU device.
## Linear ##
@@ -1357,3 +1358,50 @@ One can also call:
module:setLambda(lambda)
```
to set the hyper-parameter `lambda` dynamically during training.
+
+
+## GPU ##
+
+```lua
+gpu = nn.GPU(module, device, [outdevice])
+require 'cunn'
+gpu:cuda()
+```
+
+Decorates an encapsulated `module` so that it can be executed on a specific GPU `device`.
+The decorated module's `parameters` are thus hosted on the specified GPU `device`.
+All operations on the `gpu` module are executed on that device.
+Calls to `forward`/`backward` will transfer arguments `input` and `gradOutput` to the specified `device`,
+which are then fed as arguments to the decorated `module`.
+Returned `output` is located on the specified `outdevice` (defaults to `device`).
+Returned `gradInput` is allocated on the same device as the `input`.
+
+When serialized/deserialized, the `gpu` module will be run on the same `device` that it was serialized with.
+To prevent this from happening, the module can be converted to float/double before serialization:
+
+```lua
+gpu:float()
+gpustr = torch.serialize(gpu)
+```
+
+The module is located in the __nn__ package instead of __cunn__ as this allows
+it to be used in CPU-only environments, which are common for production models.
+
+The module supports nested table `input` and `gradOutput` tensors originating from multiple devices.
+Each nested tensor in the returned `gradInput` will be transferred to the same device as its commensurate tensor in the `input`.
+
+The intended use-case is not for model-parallelism where the models are executed in parallel on multiple devices, but
+for sequential models where a single GPU doesn't have enough memory.
+
+Example using 4 GPUs:
+
+```lua
+mlp = nn.Sequential()
+ :add(nn.GPU(nn.Linear(10000,10000), 1))
+ :add(nn.GPU(nn.Linear(10000,10000), 2))
+ :add(nn.GPU(nn.Linear(10000,10000), 3))
+ :add(nn.GPU(nn.Linear(10000,10000), 4, cutorch.getDevice()))
+```
+
+Note how the last `GPU` instance will return an `output` tensor on the same device as the current device (`cutorch.getDevice`).
+
diff --git a/init.lua b/init.lua
index 516f29b19..16eb32af4 100644
--- a/init.lua
+++ b/init.lua
@@ -124,6 +124,9 @@ require('nn.VolumetricMaxUnpooling')
require('nn.VolumetricAveragePooling')
require('nn.VolumetricBatchNormalization')
+require('nn.GPU')
+require('nn.GPUParallelTable')
+
require('nn.ParallelTable')
require('nn.Identity')
require('nn.ConcatTable')
diff --git a/test.lua b/test.lua
index 4f0a3e89b..f78967438 100644
--- a/test.lua
+++ b/test.lua
@@ -6228,6 +6228,11 @@ function nntest.ErrorHandling()
)
end
+function nntest.GPU()
+   -- placeholder: the actual nn.GPU unit test lives in the cunn package
+end
+
mytester:add(nntest)
jac = nn.Jacobian