# MultiGPU training on CIFAR-10

In this notebook we will train a wide residual network (WRN, http://arxiv.org/abs/1605.07146) using multiple GPUs on a single machine. With minimal changes, this architecture achieves state-of-the-art results on CIFAR-10, CIFAR-100 and SVHN.

You will need cuDNN to run this notebook; it can be downloaded from https://developer.nvidia.com/cudnn. You will also need the NVIDIA NCCL library (https://github.com/NVIDIA/nccl) for efficient multi-GPU communication.
Once both are installed, the following rocks are required:

```
luarocks install cudnn
luarocks install optnet
luarocks install nccl
```

In [1]:
cudnn = require 'cudnn'
cunn = require 'cunn'
optnet = require 'optnet'
tablex = require 'pl.tablex'
optim = require 'optim'

In [8]:
-- Experiment configuration shared by every cell below.
-- The whole table is also deep-copied to seed the optimizer state, so the
-- optimizer-related field names are the ones the chosen optim method reads.
opt = {
    -- data
    dataset = "cifar10_original.t7",   -- file passed to torch.load
    num_classes = 10,                  -- outputs of the final Linear layer

    -- network shape
    depth = 16,                        -- createModel asserts depth = 6n+4
    width = 4,                         -- widening factor k (stages: 16k, 32k, 64k)

    -- hardware
    nGPU = 1,                          -- 1: plain network, otherwise DataParallelTable

    -- minibatching (used for both training batches and test splits)
    batchSize = 128,

    -- optimizer
    optimMethod = "sgd",               -- looked up as optim[opt.optimMethod]
    learningRate = 0.1,
    learningRateDecay = 0,
    weightDecay = 0.0005,
    momentum = 0.9,
    dampening = 0,
    nesterov = false,

    -- schedule; not read by any cell in this notebook
    -- NOTE(review): presumably consumed by an outer epoch loop -- confirm
    max_epoch = 300,
    epoch_step = "80",
    learningRateDecayRatio = 0.2,
}

In [17]:
--- Build a wide residual network (WRN) as an nn.Sequential.
-- @param opt table; reads `depth` (must be of the form 6n+4), `width`
--   (the widening factor k) and `num_classes` (outputs of the last layer)
-- @return the assembled network; every nn.SpatialConvolution has its weights
--   re-drawn from a zero-mean normal with std sqrt(2/(kW*kH*nOutputPlane))
--   and every existing conv bias / Linear bias set to zero
function createModel(opt)
   local depth, width = opt.depth, opt.width

   -- bias-free 3x3 convolution with padding 1
   local function conv3x3(nIn, nOut, stride)
      return nn.SpatialConvolution(nIn, nOut, 3, 3, stride, stride, 1, 1):noBias()
   end

   -- WRN basic block: BN-ReLU-conv-BN-ReLU-conv summed with a shortcut
   local function wide_basic(nInputPlane, nOutputPlane, stride)
      local keeps_width = (nInputPlane == nOutputPlane)
      local unit = nn.Sequential()
      local residual = nn.Sequential()

      -- With matching widths the leading BN-ReLU lives on the residual branch
      -- only (the shortcut sees the raw input); otherwise it precedes both
      -- branches, so the projection shortcut receives the activated input.
      local preact_host = unit
      if keeps_width then
         preact_host = residual
      end
      preact_host:add(nn.SpatialBatchNormalization(nInputPlane))
      preact_host:add(nn.ReLU(true))

      residual:add(conv3x3(nInputPlane, nOutputPlane, stride))
      residual:add(nn.SpatialBatchNormalization(nOutputPlane))
      residual:add(nn.ReLU(true))
      residual:add(conv3x3(nOutputPlane, nOutputPlane, 1))

      -- identity when shapes agree, strided 1x1 projection otherwise
      local skip
      if keeps_width then
         skip = nn.Identity()
      else
         skip = nn.SpatialConvolution(nInputPlane, nOutputPlane, 1, 1, stride, stride, 0, 0):noBias()
      end

      local branches = nn.ConcatTable()
      branches:add(residual)
      branches:add(skip)

      unit:add(branches)
      unit:add(nn.CAddTable(true))
      return unit
   end

   -- One stage: a (possibly strided, widening) block followed by
   -- count-1 stride-1 blocks of constant width
   local function layer(block, nInputPlane, nOutputPlane, count, stride)
      local stage = nn.Sequential()
      stage:add(block(nInputPlane, nOutputPlane, stride))
      for _ = 1, count - 1 do
         stage:add(block(nOutputPlane, nOutputPlane, 1))
      end
      return stage
   end

   assert((depth - 4) % 6 == 0, 'depth should be 6n+4')
   local blocks_per_stage = (depth - 4) / 6
   -- channel counts per stage; the width multiplier is the difference
   -- between WRN and ResNet
   local planes = {16, 16 * width, 32 * width, 64 * width}

   local model = nn.Sequential()
   -- one conv at the beginning (spatial size: 32x32)
   model:add(nn.SpatialConvolution(3, planes[1], 3, 3, 1, 1, 1, 1))
   -- Stage 1 (spatial size: 32x32)
   model:add(layer(wide_basic, planes[1], planes[2], blocks_per_stage, 1))
   -- Stage 2 (spatial size: 16x16)
   model:add(layer(wide_basic, planes[2], planes[3], blocks_per_stage, 2))
   -- Stage 3 (spatial size: 8x8)
   model:add(layer(wide_basic, planes[3], planes[4], blocks_per_stage, 2))
   model:add(nn.SpatialBatchNormalization(planes[4]))
   model:add(nn.ReLU(true))
   model:add(nn.SpatialAveragePooling(8, 8, 1, 1))
   model:add(nn.View(planes[4]):setNumInputDims(3))
   model:add(nn.Linear(planes[4], opt.num_classes))

   -- convolution init: normal weights scaled by kernel area * output planes
   for _, conv in pairs(model:findModules('nn.SpatialConvolution')) do
      local fan = conv.kW * conv.kH * conv.nOutputPlane
      conv.weight:normal(0, math.sqrt(2 / fan))
      if conv.bias then
         conv.bias:zero()
      end
   end

   -- fully connected init: only the bias is reset
   for _, fc in pairs(model:findModules('nn.Linear')) do
      fc.bias:zero()
   end

   return model
end

-- Base network, built from the options table and cast to float precision.
-- This is the network that gets replicated inside DataParallelTable.
net = createModel(opt)
net:float()

In [18]:
graphgen = require 'optnet.graphgen'

-- optnet builds the graph by propagating through the network, so it needs
-- an input to push through: a dummy batch holding one 3x32x32 float image
local dummy_batch = torch.randn(1, 3, 32, 32):float()

-- the graph is written to a temporary .svg file via graphviz' dot ...
local svg_path = paths.tmpname() .. '.svg'
local net_graph = graphgen(net, dummy_batch)
graph.graphvizFile(net_graph, 'dot', svg_path)

-- ... and displayed inline in the notebook
itorch.svg(svg_path)

In [19]:
-- Swap the nn layers of the base network for their cudnn counterparts and
-- move it to the GPU.
cudnn.convert(net, cudnn):cuda()

-- utilize optnet to reduce memory usage; it needs a sample input to
-- propagate through the network
local sample_input = torch.randn(8,3,32,32):cuda()
optnet.optimizeMemory(net, sample_input, {inplace = false, mode = 'training'})
cudnn.benchmark = true

-- Input pipeline in front of the network: copy the byte images to a CUDA
-- tensor and rescale them by a fixed 1/256.
-- nn.MulConstant is used for the rescale: nn.Mul takes no constructor
-- argument and is a learnable, randomly initialised scalar, so it would not
-- apply the intended constant factor.
model = nn.Sequential()
    :add(nn.Copy('torch.ByteTensor', 'torch.CudaTensor'))
    :add(nn.MulConstant(1/256))
    :cuda()
if opt.nGPU == 1 then
    model:add(net)
else
    -- device ids 1..nGPU that will each hold a replica of the base network
    local gpus = torch.range(1, opt.nGPU):totable()

    -- this will wrap our initial network into data parallel module
    -- that will take care around multi-GPU communications in an efficient
    -- manner, and will launch CUDA kernels in parallel to reduce overhead on
    -- kernel launches
    -- (the replicated module must be the base network `net`, not the
    -- enclosing `model` that the wrapper is being added to)
    model:add(nn.DataParallelTable(1, true, true)
         :add(net, gpus)
         :threads(function()
            -- each worker thread has its own Lua state, so cudnn has to be
            -- loaded and configured there as well
            local cudnn = require 'cudnn'
            cudnn.benchmark = true
         end))
end

In [12]:
-- Load the serialized dataset named in the options; later cells read
-- provider.trainData and provider.testData (each with .data and .labels).
local dataset_path = opt.dataset
provider = torch.load(dataset_path)

In [13]:
-- don't call this more than once
-- `parameters` and `gradParameters` are the tensors train() hands to the
-- optimizer, so they must remain the ones the model was set up with here.
parameters,gradParameters = model:getParameters()

In [14]:
-- Cross-entropy loss, converted to CUDA like the model.
local criterion = nn.CrossEntropyCriterion():cuda()

-- a-la autograd: one forward and one backward pass over a minibatch.
-- Runs the model and the criterion forward, backpropagates the criterion's
-- gradient through the model, and returns the minibatch loss.
local function f(inputs, targets)
   local outputs = model:forward(inputs)
   local loss = criterion:forward(outputs, targets)
   local grad_outputs = criterion:backward(outputs, targets)
   model:backward(inputs, grad_outputs)
   return loss
end

-- The optimizer state starts out as an independent deep copy of the options.
local optimState = tablex.deepcopy(opt)


--- Run one training epoch over a random permutation of the training set.
-- Puts the model in training mode, splits a fresh permutation of the
-- training indices into minibatches of opt.batchSize and performs one
-- optimizer step (optim[opt.optimMethod]) per minibatch.
-- @return the mean loss over the minibatches that were processed
function train()
  model:training()

  local targets = torch.CudaTensor(opt.batchSize)
  local indices = torch.randperm(provider.trainData.data:size(1)):long():split(opt.batchSize)
  -- drop the trailing minibatch only when it is smaller than the others, so
  -- that all minibatches have equal size (a full-sized last batch is kept)
  local last = indices[#indices]
  if last and last:size(1) ~= opt.batchSize then
    indices[#indices] = nil
  end

  local loss = 0

  for _, v in ipairs(indices) do
    local inputs = provider.trainData.data:index(1, v)
    targets:copy(provider.trainData.labels:index(1, v))

    optim[opt.optimMethod](function(x)
      if x ~= parameters then parameters:copy(x) end
      model:zeroGradParameters()
      local batch_loss = f(inputs, targets)
      loss = loss + batch_loss
      -- the optimizer expects the loss value and the gradient here,
      -- not the closure `f` that computes them
      return batch_loss, gradParameters
    end, parameters, optimState)
  end

  return loss / #indices
end

--- Evaluate the model on the test set.
-- Switches the model to evaluation mode, feeds the test data through it in
-- chunks of opt.batchSize and accumulates the predictions in a confusion
-- matrix.
-- @return the confusion matrix' totalValid value multiplied by 100
function test()
  model:evaluate()

  local test_set = provider.testData
  local confusion = optim.ConfusionMatrix(opt.num_classes)
  local image_chunks = test_set.data:split(opt.batchSize, 1)
  local label_chunks = test_set.labels:split(opt.batchSize, 1)

  for idx = 1, #image_chunks do
    local predictions = model:forward(image_chunks[idx])
    confusion:batchAdd(predictions, label_chunks[idx])
  end

  confusion:updateValids()
  return confusion.totalValid * 100
end

In [15]:
-- run a single training epoch; the value echoed below is train()'s return
-- value, i.e. the loss averaged over the processed minibatches
train()

1.3923781518753	


In [8]:
-- evaluate on the test set; the value echoed below is test()'s return
-- value (confusion.totalValid * 100)
test()

53.72	
