274 changes: 274 additions & 0 deletions GPU.lua
@@ -0,0 +1,274 @@
------------------------------------------------------------------------
--[[ GPU ]]--
-- Decorates a module such that its parameters are
-- hosted on a specified GPU device.
-- The operations are also executed on that device.
-- Arguments input and gradOutput are converted to the specified device
-- before being fed to the decorated module.
-- Returned output is on the specified outdevice (defaults to device).
-- Returned gradInput is allocated on the same device as the input.
-- The unit test is located in cunn.
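-- Example usage (a sketch; assumes cutorch is installed and at
-- least two devices are available):
--   require 'cunn'
--   local gpu = nn.GPU(nn.Linear(10, 10), 1, 2):cuda()
--   local input = torch.CudaTensor(4, 10):uniform()
--   local output = gpu:forward(input) -- computed on device 1, returned on device 2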
------------------------------------------------------------------------
local GPU, parent = torch.class("nn.GPU", "nn.Container")

function GPU:__init(module, device, outdevice)
parent.__init(self)
assert(torch.type(device) == 'number')
self.device = device
self.outdevice = outdevice or device

assert(torch.isTypeOf(module, 'nn.Module'))
self.modules[1] = module

if module:type() == 'torch.CudaTensor' then
self:cuda()
end
end

function GPU.recursiveModuleDevice(obj, device)
if type(obj) == 'table' and not torch.isTypeOf(obj, 'nn.GPU') then
for k,v in pairs(obj) do
obj[k] = GPU.recursiveModuleDevice(v, device)
end
elseif torch.type(obj):match('torch.Cuda.*Tensor') then
if obj:getDevice() ~= device then
obj = obj:clone() -- this will reallocate it to device
local newdevice = obj:getDevice()
-- when nElement() == 0 newdevice is 0
assert(newdevice == device or newdevice == 0)
end
end
assert(obj ~= nil)
return obj
end

-- set the device of the decorated module
function GPU:setDevice(device)
self.device = device or self.device

assert(self.modules[1])
self.modules[1] = cutorch.withDevice(self.device, function()
return self.recursiveModuleDevice(self.modules[1], self.device)
end)
return self
end
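-- Example (a sketch; assumes a second device exists):
--   local gpu = nn.GPU(nn.Linear(10, 10), 1):cuda()
--   gpu:setDevice(2) -- reallocates the wrapped module's tensors onto device 2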

-- returns a dst where each CudaTensor element of src is copied onto the given
-- device; the caller is expected to make that device current beforehand
-- (e.g. via cutorch.withDevice) so that new tensors are allocated there
function GPU.recursiveSetDevice(dst, src, device)
if torch.type(src) == 'table' then
dst = torch.type(dst) == 'table' and dst or {}
for k,v in ipairs(src) do
dst[k] = GPU.recursiveSetDevice(dst[k], v, device)
end
for k=#src+1,#dst do
dst[k] = nil
end
elseif torch.type(src):match('torch.Cuda.*Tensor') and src:getDevice() ~= device and src:getDevice() ~= 0 then
if not (torch.type(dst):match('torch.Cuda.*Tensor') and dst:getDevice() == device) then
dst = src.new()
end
dst:resizeAs(src):copy(src)
else
dst = src
end
return dst
end
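-- Example (a sketch; x and y are hypothetical CudaTensors):
--   local buf -- nil on the first call, reused afterwards
--   buf = cutorch.withDevice(2, function()
--      return nn.GPU.recursiveSetDevice(buf, {x, {y}}, 2)
--   end)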

-- makes sure dst is identical to src, but located on the same device as proto;
-- proto may be a tensor, a device number, or (when src is a table) a table of either
function GPU.recursiveSetDeviceAs(dst, src, proto)
local device
if torch.isTensor(proto) then
device = proto:getDevice()
elseif torch.type(proto) == 'number' then
device = proto
end
if torch.type(src) == 'table' then
dst = torch.type(dst) == 'table' and dst or {}
for k,v in ipairs(src) do
-- proto may be a plain device number; only index it when it is a table
dst[k] = GPU.recursiveSetDeviceAs(dst[k], v, torch.type(proto) == 'table' and proto[k] or proto)
end
for k=#src+1,#dst do
dst[k] = nil
end
elseif torch.type(src):match('torch.Cuda.*Tensor') and src:getDevice() ~= device and src:getDevice() ~= 0 then
if not (torch.type(dst):match('torch.Cuda.*Tensor') and dst:getDevice() == device) then
dst = src.new()
end
cutorch.withDevice(device, function() dst:resizeAs(src):copy(src) end)
else
dst = src
end
return dst
end
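-- Example (a sketch; a and b are hypothetical CudaTensors):
--   out = nn.GPU.recursiveSetDeviceAs(out, {a, b}, {1, 2}) -- a to device 1, b to device 2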

function GPU:updateOutput(input)
if self._type == 'torch.CudaTensor' then
local output = cutorch.withDevice(self.device, function()
self._input = self.recursiveSetDevice(self._input, input, self.device)
return self.modules[1]:updateOutput(self._input)
end)

if self.device ~= self.outdevice then
self.output = cutorch.withDevice(self.outdevice, function()
return self.recursiveSetDevice(self.output, output, self.outdevice)
end)
else
self.output = output
end
else
self.output = self.modules[1]:updateOutput(input)
end

return self.output
end

function GPU:updateGradInput(input, gradOutput)
if self._type == 'torch.CudaTensor' then
local gradInput = cutorch.withDevice(self.device, function()
self._gradOutput = self.recursiveSetDevice(self._gradOutput, gradOutput, self.device)
return self.modules[1]:updateGradInput(self._input, self._gradOutput)
end)

self.gradInput = self.recursiveSetDeviceAs(self.gradInput, gradInput, input)
else
self.gradInput = self.modules[1]:updateGradInput(input, gradOutput)
end

return self.gradInput
end

function GPU:accGradParameters(input, gradOutput, scale)
if self._type == 'torch.CudaTensor' then
cutorch.withDevice(self.device, function()
self.modules[1]:accGradParameters(self._input, self._gradOutput, scale)
end)
else
self.modules[1]:accGradParameters(input, gradOutput, scale)
end
end

function GPU:apply(callback)
if self._type == 'torch.CudaTensor' then
cutorch.withDevice(self.device, function() parent.apply(self, callback) end)
else
parent.apply(self, callback)
end
end

function GPU:type(type, typecache)
if type and type == 'torch.CudaTensor' then
cutorch.withDevice(self.device, function() parent.type(self, type, typecache) end)
self:setDevice()
else
self.output = nil
self.gradInput = nil
self._input = nil
self._gradOutput = nil
parent.type(self, type, typecache)
end
return self
end

function GPU:clearState()
self.output = nil
self.gradInput = nil
self._input = nil
self._gradOutput = nil
if self._type == 'torch.CudaTensor' then
cutorch.withDevice(self.device, function() parent.clearState(self) end)
else
parent.clearState(self)
end
end

function GPU:zeroGradParameters()
if self._type == 'torch.CudaTensor' then
cutorch.withDevice(self.device, function() parent.zeroGradParameters(self) end)
else
parent.zeroGradParameters(self)
end
end

function GPU:updateParameters(lr)
if self._type == 'torch.CudaTensor' then
cutorch.withDevice(self.device, function() parent.updateParameters(self, lr) end)
else
parent.updateParameters(self, lr)
end
end

function GPU:training()
if self._type == 'torch.CudaTensor' then
cutorch.withDevice(self.device, function() parent.training(self) end)
else
parent.training(self)
end
end

function GPU:evaluate()
if self._type == 'torch.CudaTensor' then
cutorch.withDevice(self.device, function() parent.evaluate(self) end)
else
parent.evaluate(self)
end
end

function GPU:share(mlp, ...)
local args = {...}
if self._type == 'torch.CudaTensor' then
cutorch.withDevice(self.device, function() parent.share(self, mlp, unpack(args)) end)
else
parent.share(self, mlp, unpack(args))
end
return self
end

function GPU:clone(...)
local args = {...}
if self._type == 'torch.CudaTensor' then
return cutorch.withDevice(self.device, function() parent.clone(self, unpack(args)) end)
else
return parent.clone(self, unpack(args))
end
end

function GPU:write(file)
-- Write all values in the object as a table.
local object = {}
for k, v in pairs(self) do
object[k] = v
end
local header = {self._type, self.device}
file:writeObject(header)
file:writeObject(object)
end

function GPU:read(file)
local header = file:readObject()
local object
if header[1] == 'torch.CudaTensor' then
object = cutorch.withDevice(header[2], function() return file:readObject() end)
else
object = file:readObject()
end

for k, v in pairs(object) do
self[k] = v
end
end
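-- Example (a sketch): serialization round-trips the device placement:
--   torch.save('gpu.t7', gpu)
--   local gpu2 = torch.load('gpu.t7') -- tensors are restored on the saved device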

function GPU:__tostring__()
if self.modules[1].__tostring__ then
return torch.type(self) .. '(' .. self.device .. ') @ ' .. self.modules[1]:__tostring__()
else
return torch.type(self) .. '(' .. self.device .. ') @ ' .. torch.type(self.modules[1])
end
end

function GPU:accUpdateGradParameters(input, gradOutput, lr)
error"Not Implemented"
end

function GPU:sharedAccUpdateGradParameters(input, gradOutput, lr)
error"Not Implemented"
end
119 changes: 119 additions & 0 deletions GPUParallelTable.lua
@@ -0,0 +1,119 @@
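------------------------------------------------------------------------
--[[ GPUParallelTable ]]--
-- Like nn.ParallelTable, but each branch is assigned its own device.
-- Example usage (a sketch; assumes cutorch and at least two devices):
--   local net = nn.GPUParallelTable()
--   net:add(nn.Linear(10, 10), 1) -- branch 1 runs on device 1
--   net:add(nn.Linear(10, 10), 2) -- branch 2 runs on device 2
--   net:cuda()
--   local output = net:forward{torch.CudaTensor(4, 10), torch.CudaTensor(4, 10)}
------------------------------------------------------------------------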
local GPUParallelTable, parent = torch.class('nn.GPUParallelTable', 'nn.ParallelTable')

function GPUParallelTable:__init()
parent.__init(self)
self.modules = {}
self.devices = {}
self.outdevices = {}
self.output = {}
self.gradInput = {}
end

function GPUParallelTable:add(module, device, outdevice)
assert(torch.isTypeOf(module, 'nn.Module'))
assert(torch.type(device) == 'number')
table.insert(self.modules, module)
table.insert(self.devices, device)
assert(#self.modules == #self.devices)
self.outdevices[#self.modules] = outdevice or device
return self
end

function GPUParallelTable:updateOutput(input)
if self._type == 'torch.CudaTensor' then
-- send input to the appropriate device, if necessary (blocking, so serial)
self._input = nn.GPU.recursiveSetDeviceAs(self._input, input, self.devices)

-- then forward in parallel across devices (if module is non-blocking, happens concurrently, yay!)
local output = {}
for i,module in ipairs(self.modules) do
local device = self.devices[i]
output[i] = cutorch.withDevice(device, function()
return self:rethrowErrors(module, i, 'updateOutput', self._input[i])
end)
end

-- send output to the appropriate device, if necessary (blocking, so serial)
self.output = nn.GPU.recursiveSetDeviceAs(self.output, output, self.outdevices)
else
parent.updateOutput(self, input)
end

return self.output
end

function GPUParallelTable:updateGradInput(input, gradOutput)
if self._type == 'torch.CudaTensor' then
-- send gradOutput to the appropriate device, if necessary (blocking, so serial)
self._gradOutput = nn.GPU.recursiveSetDeviceAs(self._gradOutput, gradOutput, self.devices)

-- then updateGradInput in parallel across devices (if module is non-blocking, happens concurrently)
local gradInput = {}
for i,module in ipairs(self.modules) do
local device = self.devices[i]
gradInput[i] = cutorch.withDevice(device, function()
return self:rethrowErrors(module, i, 'updateGradInput', self._input[i], self._gradOutput[i])
end)
end

-- send gradInput to the appropriate device, if necessary (blocking, so serial)
self.gradInput = nn.GPU.recursiveSetDeviceAs(self.gradInput, gradInput, input)
else
parent.updateGradInput(self, input, gradOutput)
end

return self.gradInput
end

function GPUParallelTable:accGradParameters(input, gradOutput, scale)
scale = scale or 1

if self._type == 'torch.CudaTensor' then
-- accGradParameters in parallel across devices (if module is non-blocking, happens concurrently)
for i,module in ipairs(self.modules) do
cutorch.withDevice(self.devices[i], function()
self:rethrowErrors(module, i, 'accGradParameters', self._input[i], self._gradOutput[i], scale)
end)
end
else
parent.accGradParameters(self, input, gradOutput, scale)
end
end

function GPUParallelTable:accUpdateGradParameters(input, gradOutput, lr)
lr = lr or 1

if self._type == 'torch.CudaTensor' then
-- accUpdateGradParameters in parallel across devices (if module is non-blocking, happens concurrently)
for i,module in ipairs(self.modules) do
cutorch.withDevice(self.devices[i], function()
self:rethrowErrors(module, i, 'accUpdateGradParameters', self._input[i], self._gradOutput[i], lr)
end)
end
else
parent.accUpdateGradParameters(self, input, gradOutput, lr)
end
end

function GPUParallelTable:type(type, typecache)
self.output = {}
self.gradInput = {}
self._input = {}
self._gradOutput = {}
if type and type == 'torch.CudaTensor' then
for i,module in ipairs(self.modules) do
local device = self.devices[i]
cutorch.withDevice(device, function() module:type(type, typecache) end)
self.modules[i] = cutorch.withDevice(device, function()
return nn.GPU.recursiveModuleDevice(module, device)
end)
end
self._type = type
else
parent.type(self, type, typecache)
end
return self
end


-- TODO: wrap the remaining nn.Module methods.