    NJU LAMDA
    Video classification contest
    fc -> sigm -> l2 loss
    Author: Hao Zhang
    Date: 2016.06.23
    File: logreg.pynb

In [58]:
require('nn')
require('optim')
require('cutorch')
require('cunn')
require('gnuplot')

-- Fix the RNG seed so the shuffle and the weight initialisation are repeatable.
torch.manualSeed(0)
-- Tensors created on the host default to single precision.
torch.setdefaulttensortype('torch.FloatTensor')
-- Tensor type that the data, model and criterion are converted to further down.
dtype = 'torch.CudaTensor'

-- Global experiment settings shared by all later cells.
opt = {
    bs = 128,        -- mini-batch size used when splitting the training set
    m = 6000,        -- total number of examples (length of the shuffle permutation)
    m_frame = 3059,  -- size of the 2nd data dimension
    n = 26,          -- size of the 3rd data dimension
    K = 5,           -- number of model outputs / label columns
    m_dev = 32,      -- number of examples in the small dev subset
    m_train = 5500,  -- number of training examples
    m_val = 500      -- number of validation examples
}

## 1. Prepare the Data

### Load the Data

In [59]:
-- Read the serialized inputs and targets from the working directory.
data = torch.load('data.txt')
labels = torch.load('labels.txt')

-- Show both shapes as a quick sanity check.
print(data:size())
print(labels:size())

 6000
 3059
   26
[torch.LongStorage of size 3]



 6000
    5
[torch.LongStorage of size 2]



### Shuffle the Training Dataset and Move to GPU

In [60]:
-- One random permutation of the example indices is applied to both tensors,
-- so every example stays paired with its label. The shuffled copies are then
-- converted to `dtype`.
local perm = torch.randperm(opt.m):long()

data = data:index(1, perm)
data = data:type(dtype)

labels = labels:index(1, perm)
labels = labels:type(dtype)

### Mean Subtraction and Std Normalization

In [61]:
-- The statistics are computed on the training rows only, but they are applied
-- in place to the whole tensor so the validation rows are scaled the same way.
local train_rows = {1, opt.m_train}

-- Centre: subtract the overall mean of the training slice.
local mu = data[{train_rows, {}, {}}]:mean()
data:csub(mu)

-- Scale: divide by the overall std of the (already centred) training slice.
local sigma = data[{train_rows, {}, {}}]:std()
data:div(sigma)

-- Keep the statistics on disk so the same transform can be re-applied later.
torch.save('stat.bin', {mean = mu, std = sigma})

### Split it into Train, Dev, and Val Dataset

In [62]:
-- Training set: the first opt.m_train examples.
local train_rows = {1, opt.m_train}
X = data[{train_rows, {}, {}}]
Y = labels[{train_rows}]

-- Mini-batches of at most opt.bs examples, cut along the first dimension.
X_chunks = X:split(opt.bs, 1)
Y_chunks = Y:split(opt.bs, 1)

-- Dev set: the first opt.m_dev examples, i.e. a small subset of the training set.
local dev_rows = {1, opt.m_dev}
X_dev = data[{dev_rows, {}, {}}]
Y_dev = labels[{dev_rows}]

-- Validation set: the opt.m_val examples right after the training range.
local val_rows = {opt.m_train + 1, opt.m_train + opt.m_val}
X_val = data[{val_rows, {}, {}}]
Y_val = labels[{val_rows}]

-- Print the shape of every split.
local report = {
    {'X:', X},
    {'Y:', Y},
    {'X_dev:', X_dev},
    {'Y_dev:', Y_dev},
    {'X_val:', X_val},
    {'Y_val:', Y_val},
}
for _, entry in ipairs(report) do
    print(entry[1])
    print(entry[2]:size())
end

X:	
 5500
 3059
   26
[torch.LongStorage of size 3]

Y:	
 5500
    5
[torch.LongStorage of size 2]

X_dev:	
   32
 3059
   26
[torch.LongStorage of size 3]

Y_dev:	
 32
  5
[torch.LongStorage of size 2]

X_val:	
  500
 3059
   26
[torch.LongStorage of size 3]

Y_val:	
 500
   5
[torch.LongStorage of size 2]



## 2. Define the Model and Loss Function

### Define the Model and Move to GPU

In [63]:
-- Each example is flattened to one vector of m_frame * n features.
local n_features = opt.m_frame * opt.n

-- Flatten -> fully connected layer -> sigmoid, one output per label column.
model = nn.Sequential()
    :add(nn.Reshape(n_features))
    :add(nn.Linear(n_features, opt.K))
    :add(nn.Sigmoid())

-- Convert the parameters to `dtype` and show the architecture.
model:type(dtype)
print(tostring(model))

nn.Sequential {
  [input -> (1) -> (2) -> (3) -> output]
  (1): nn.Reshape(79534)
  (2): nn.Linear(79534 -> 5)
  (3): nn.Sigmoid
}	


### Define the Loss Function and Move to GPU

In [64]:
-- Mean squared error between the sigmoid outputs and the targets,
-- converted to the same tensor type as the model.
crit = nn.MSECriterion():type(dtype)

## 3. Dev Test

### Double Check the Loss is Reasonable

In [43]:
-- Re-initialise the model parameters before measuring the starting loss.
model:reset()

-- Run the dev subset through the model and the criterion.
local S = model:forward(X_dev)
local J = crit:forward(S, Y_dev)
-- Print a hand-computed reference value next to the measured loss J.
-- NOTE(review): the reference divides by opt.n (size of the last *data*
-- dimension), while Y_dev has opt.K columns; it is not clear from this file
-- which baseline was intended -- confirm before relying on this check.
print('It should be near '.. torch.dot(Y_dev, Y_dev) / opt.m_dev / opt.n, J)

It should be near 0.053291834317721	0.038674008101225	


### Define loss_grad Function for Dev Test

In [44]:
-- Flat parameter vector and matching flat gradient vector of the model.
theta, dtheta = model:getParameters()

--- Objective evaluated on the fixed dev subset, in the form optim expects.
-- @param theta_new parameter vector at which to evaluate
-- @return the criterion value on (X_dev, Y_dev), and the gradient vector `dtheta`
function loss_grad_dev(theta_new)
    -- Copy only when the caller passes something other than `theta` itself.
    if theta ~= theta_new then
        theta:copy(theta_new)
    end

    -- Gradients accumulate across backward calls, so clear them first.
    dtheta:zero()

    local scores = model:forward(X_dev)
    local loss = crit:forward(scores, Y_dev)
    model:backward(X_dev, crit:backward(scores, Y_dev))

    return loss, dtheta
end



### Define the Function to Compute Error

In [65]:
--- Mean absolute error of the current model on a batch.
-- @param X inputs fed to `model:forward`
-- @param Y targets with the same shape as the model output
-- @param m number of examples used for normalisation
-- @return L1 distance between prediction and target, divided by m and by opt.K
function err(X, Y, m)
    local pred = model:forward(X)
    local l1 = torch.norm(pred - Y, 1)
    return l1 / m / opt.K
end

--- Mean absolute error on a dataset too large for a single forward pass.
-- The rows are evaluated in (up to) five chunks and the per-chunk errors are
-- combined, weighted by the number of rows in each chunk, so the result is the
-- same quantity `err` would return on the whole set.
--
-- Differences from the naive "split into m / 5 and average" version:
--   * `m` is clamped to the number of rows X really has, so a stale or
--     too-large `m` no longer skews the normalisation;
--   * the chunk size is rounded up, so `m` need not be divisible by 5;
--   * the loop runs over the chunks that actually exist instead of assuming
--     there are exactly five.
--
-- @param X inputs, examples along dimension 1
-- @param Y targets, examples along dimension 1
-- @param m number of leading examples to evaluate
-- @return mean absolute error over the evaluated examples
function err_big(X, Y, m)
    local n_chunks = 5
    local rows = math.min(math.floor(m), X:size(1))
    assert(rows >= 1, 'err_big: need at least one example')

    -- Only the first `rows` examples take part in the evaluation.
    local X_used = X:narrow(1, 1, rows)
    local Y_used = Y:narrow(1, 1, rows)

    local m_split = math.ceil(rows / n_chunks)
    local X_split = X_used:split(m_split, 1)
    local Y_split = Y_used:split(m_split, 1)

    -- err() returns a per-example mean; multiply by the chunk size to get
    -- back a sum that can be added across chunks of different length.
    local weighted_sum = 0
    for i = 1, #X_split do
        local m_i = X_split[i]:size(1)
        weighted_sum = weighted_sum + err(X_split[i], Y_split[i], m_i) * m_i
    end
    return weighted_sum / rows
end

### Define the Function to Plot

In [66]:
--- Draw the first training curve and the first validation curve in one PNG.
-- Note that both argument tables are modified: the first entry of each gets
-- its legend label overwritten, and the validation entry is stored as the
-- second entry of `train_plots`.
-- @param name output PNG file name
-- @param train_plots list of {label, values} entries; entry 1 is used
-- @param val_plots list of {label, values} entries; entry 1 is used
function plot1(name, train_plots, val_plots)
    local train_curve = train_plots[1]
    train_curve[1] = 'train'

    local val_curve = val_plots[1]
    val_curve[1] = 'val'

    -- Both curves now live in one list, so a single plot call draws them together.
    train_plots[2] = val_curve

    gnuplot.pngfigure(name)
    gnuplot.plot(train_plots)
    gnuplot.grid('on')
    gnuplot.plotflush()

    print('done')
end

--- Write the training curves and the validation curves to two separate PNGs.
-- @param name_train file name of the training figure
-- @param val_name file name of the validation figure
-- @param train_plots list of curves handed to gnuplot.plot for the first figure
-- @param val_plots list of curves handed to gnuplot.plot for the second figure
function plot2(name_train, val_name, train_plots, val_plots)
    -- Same four gnuplot calls for each figure, training first.
    local figures = {
        {name_train, train_plots},
        {val_name, val_plots},
    }
    for _, fig in ipairs(figures) do
        gnuplot.pngfigure(fig[1])
        gnuplot.plot(fig[2])
        gnuplot.grid('on')
        gnuplot.plotflush()
    end
    print('done')
end

### Overfitting Small Dataset

In [17]:
-- Start from freshly initialised weights so runs can be compared fairly.
model:reset()

-- Number of full-batch SGD steps on the dev subset.
local n_steps = 60
local dev_curve = torch.Tensor(n_steps)
local val_curve = torch.Tensor(n_steps)

-- Plain SGD: no momentum, no learning-rate decay, tiny weight decay.
hpara = {
    learningRate = 3e-3,
    weightDecay = 1e-6,
    momentum = 0,
    learningRateDecay = 0
}

for step = 1, n_steps do
    optim.sgd(loss_grad_dev, theta, hpara)

    -- Record the error on the dev subset and on the validation set after each step.
    dev_curve[step] = err(X_dev, Y_dev, opt.m_dev)
    val_curve[step] = err(X_val, Y_val, opt.m_val)
end

-- plot1 fills in the legend labels itself, so they start out empty.
plot1('overfit_small_debug.png', { {'', dev_curve} }, { {'', val_curve} })

done	


## 4. Optimization

### Define loss_grad Function

In [67]:
-- Flat parameter vector and matching flat gradient vector of the model.
theta, dtheta = model:getParameters()

-- Index of the mini-batch the next call to loss_grad will use.
count = 1

--- Mini-batch objective in the form optim expects.
-- Every call consumes the next chunk of X_chunks / Y_chunks and wraps around
-- to the first chunk after the last one.
-- @param theta_new parameter vector at which to evaluate
-- @return the criterion value on the current mini-batch, and the gradient vector `dtheta`
function loss_grad(theta_new)
    if theta ~= theta_new then
        theta:copy(theta_new)
    end
    dtheta:zero()

    -- Pick the current mini-batch, then advance the cursor cyclically.
    local batch_X, batch_Y = X_chunks[count], Y_chunks[count]
    count = (count == #X_chunks) and 1 or (count + 1)

    -- Forward pass, then back-propagate the criterion gradient through the model.
    local scores = model:forward(batch_X)
    local loss = crit:forward(scores, batch_Y)
    model:backward(batch_X, crit:backward(scores, batch_Y))

    return loss, dtheta
end

### Define Training Procedure

In [68]:
-- Number of optimizer steps in one epoch, i.e. the number of mini-batches the
-- training set is split into. This must be derived from opt.m_train (the rows
-- that are actually chunked for training), not from the full dataset size
-- opt.m; otherwise an "epoch" runs past the last chunk and the per-epoch
-- logging drifts out of sync with the data.
iter = math.ceil(opt.m_train / opt.bs)

--- Train the model from scratch and record one error value per epoch.
-- Appends the resulting curves to the global lists `train_plots` and
-- `val_plots`, which the caller must have initialised.
-- @param hpara configuration table passed to optim.sgd
-- @param epochs number of passes over the training set
-- @param i label for this run; converted to a string for the plot legend
function train(hpara, epochs, i)
    -- Reset the network from scratch to make a fair comparison.
    model:reset()
    -- Restart the mini-batch cursor so every run sees the chunks in the same order.
    count = 1
    local T = epochs * iter
    local train_err = torch.Tensor(epochs)
    local val_err = torch.Tensor(epochs)

    for t = 1, T do
        optim.sgd(loss_grad, theta, hpara)

        if t % iter == 0 then -- Record the errors after each full epoch.
            local epoch = t / iter
            -- X holds opt.m_train rows, so that is the count to normalise by.
            train_err[epoch] = err_big(X, Y, opt.m_train)
            val_err[epoch] = err(X_val, Y_val, opt.m_val)
        end
    end

    table.insert(train_plots, {'' .. i, train_err})
    table.insert(val_plots, {'' .. i, val_err})
end

### Start with Small $\lambda$ and Find $\alpha$ that Makes the Loss go Down.

In [70]:
-- Ten learning rates, logarithmically spaced between 1e-3 and 1e-1.
local alphas = torch.logspace(-3, -1, 10)

-- Fresh curve lists; train() appends one entry per run.
train_plots = {}
val_plots = {}

for i = 1, alphas:size(1) do
    local alpha = alphas[i]
    print(i, 'alpha:', alpha)

    -- Only the learning rate varies; the other settings stay fixed.
    hpara = {
        learningRate = alpha,
        learningRateDecay = 5e-7,
        momentum = 0.9,
        weightDecay = 1e-6
    }
    -- Three epochs per candidate.
    train(hpara, 3, i)
end

plot2('coarse_train.png', 'coarse_val.png', train_plots, val_plots)

1	alpha:	0.0010000000474975	


2	alpha:	0.0016681009437889	


3	alpha:	0.002782559255138	


4	alpha:	0.0046415897086263	


5	alpha:	0.0077426359057426	


6	alpha:	0.012915498577058	


7	alpha:	0.021544348448515	


8	alpha:	0.035938140004873	


9	alpha:	0.059948425740004	


10	alpha:	0.10000000149012	


done	


### Coarse to Fine Tuning the Best $\alpha$ and $\lambda$ with Random Search

In [78]:
-- Fresh curve lists; train() appends one entry per run.
train_plots = {}
val_plots = {}

-- One uniform sample from [0, 1).
local function draw()
    return torch.rand(1)[1]
end

for times = 1, 10 do
    -- Sampling order (alpha, lambda, tau) matters for reproducibility under a fixed seed.
    local alpha = (draw() * 5 + 5) * 10^(-4)   -- learning rate in [5e-4, 1e-3)
    local lambda = draw() * 7 + 3              -- weight decay in [3, 10)
    local tau = 10 ^ (draw() * -3 - 5)         -- lr decay, log-uniform in (1e-8, 1e-5]
    print(times, 'alpha:', alpha, 'lambda:', lambda, 'tau:', tau)

    hpara = {
        learningRate = alpha,
        learningRateDecay = tau,
        momentum = 0.9,
        weightDecay = lambda
    }
    -- Forty epochs per sampled configuration.
    train(hpara, 40, times)
end

plot2('fine_train.png', 'fine_val.png', train_plots, val_plots)

1	alpha:	0.00089058804512024	lambda:	8.895623087883	tau:	2.8716179816755e-08	


2	alpha:	0.00064225347340107	lambda:	7.6392441987991	tau:	3.5793290563736e-07	


3	alpha:	0.00081647369265556	lambda:	5.1519630551338	tau:	8.6773674822229e-08	


4	alpha:	0.00066141852736473	lambda:	7.3674860596657	tau:	1.3092239195384e-06	


5	alpha:	0.00087683382630348	lambda:	5.7031597495079	tau:	5.1255674150276e-07	


6	alpha:	0.00082990938425064	lambda:	6.4832113981247	tau:	1.018605475708e-06	


7	alpha:	0.00065149796009064	lambda:	5.7740335762501	tau:	1.2500347884642e-07	


8	alpha:	0.00067041157186031	lambda:	4.7581212222576	tau:	7.5396920609713e-08	


9	alpha:	0.00097438177466393	lambda:	7.4036501049995	tau:	5.5574452470173e-08	


10	alpha:	0.00097258642315865	lambda:	4.7743104696274	tau:	5.2613215724391e-07	


done	


### Train the Best Model

In [79]:
-- Fresh curve lists for the final run.
train_plots, val_plots = {}, {}

-- Hyper-parameters used for the final model.
hpara = {
    learningRate = 8.3e-4,
    learningRateDecay = 1.01e-6,
    momentum = 0.9,
    weightDecay = 6.5
}

-- 100 epochs; the empty label is replaced by plot1's own legend text.
train(hpara, 100, '')

plot1('final.png', train_plots, val_plots)

done	


In [80]:
-- Write the model object to 'model.bin' in the working directory.
torch.save('model.bin', model)


