In [2]:
require("optim")
require('hdf5')

<h3>Multinomial Logistic Regression - LBFGS Minibatch - L2 Norm</h3>

In [39]:
--- Forward pass of the multinomial logistic model with an L2 penalty.
-- @param W flat parameter tensor; it is reshaped to
--          Y:size(2) x (X:size(2)+1), where the last column is the bias.
-- @param X input matrix (rows x features).
-- @param Y target matrix (rows x classes); presumably one-hot -- confirm
--          against the caller.
-- @return loss  negative sum of Y .* log-probabilities plus half the
--               squared norm of the non-bias weights
-- @return p     per-row class probabilities (exp of the log-softmax)
-- @return W     the weight view without the bias column
function ml(W, X, Y)
    local n_classes, n_features = Y:size(2), X:size(2)
    local params = W:reshape(n_classes, n_features + 1)

    -- Last column holds the bias (kept as a 1 x classes row); the rest are weights.
    local bias = params:sub(1, n_classes, n_features + 1, n_features + 1):t()
    local weights = params:sub(1, n_classes, 1, n_features)

    -- Raw scores: X * weights^T, with the bias row repeated for every input row.
    local p = X * weights:t()
    p:add(torch.expand(bias, p:size(1), bias:size(2)))

    -- Log-sum-exp per input row, shifted by the row maximum before exp().
    local shifted = p:clone():t()
    local col_max = shifted:max(1)
    shifted:csub(torch.expand(col_max, shifted:size(1), col_max:size(2)))
    shifted:exp()
    local log_z = shifted:sum(1)
    log_z:log()
    log_z:add(col_max)
    log_z = log_z:t()

    -- Turn the scores into log-probabilities in place.
    p:csub(torch.expand(log_z, log_z:size(1), p:size(2)))

    local flat = weights:reshape(weights:size(1) * weights:size(2), 1)
    local nll = torch.sum(torch.cmul(Y, p)) * -1
    local loss = nll + 1.0 * 0.5 * torch.dot(flat, flat)

    p:exp()
    return loss, p, weights
end

--- Loss and gradient of the model in `ml`, evaluated on a random minibatch.
-- A fresh batch is drawn on every call, so repeated calls with the same W
-- return different values.
-- @param W     flat parameter tensor, forwarded to `ml`.
-- @param X     input matrix (rows x features).
-- @param Y     target matrix (rows x classes).
-- @param bsize optional batch size; defaults to 5000 and is clamped to the
--              number of rows in X.
-- @return loss value reported by `ml` for the batch (also printed)
-- @return grad gradient flattened to a column, bias entries last in each row
-- @return p    class probabilities for the batch rows
function mlg(W, X, Y, bsize)
    local n_rows = X:size(1)
    assert(n_rows > 0, "mlg: X has no rows to sample a minibatch from")
    -- A fixed batch of 5000 made sub() fail on smaller datasets; clamp it.
    bsize = math.min(bsize or 5000, n_rows)

    local idx = torch.randperm(n_rows):sub(1, bsize)
    local X_batch = torch.Tensor(bsize, X:size(2))
    local Y_batch = torch.Tensor(bsize, Y:size(2))

    for i = 1, bsize do
        X_batch[i] = X[idx[i]]
        Y_batch[i] = Y[idx[i]]
    end

    local loss, p, weights = ml(W, X_batch, Y_batch)
    local diff = torch.csub(p, Y_batch)

    -- Data term (p - Y)^T X plus the derivative of the 0.5 * ||weights||^2 penalty.
    local grad = diff:t() * X_batch
    grad:add(weights)

    -- Append the bias column: the column sums of diff, left unregularised.
    grad = grad:cat(torch.zeros(grad:size(1), 1), 2)
    grad:sub(1, grad:size(1), grad:size(2), grad:size(2)):add(diff:sum(1))

    print(loss)
    return loss, grad:reshape(grad:size(1) * grad:size(2), 1), p
end


--- Train the multinomial logistic model with optim.lbfgs on minibatches.
-- All intermediates are local; the loss history that used to leak into the
-- global `f_hist` is now returned as a third value instead.
-- @param X    input matrix (rows x features).
-- @param Y    target matrix (rows x classes).
-- @param rate passed to optim.lbfgs as `learningRate`.
-- @param iter passed to optim.lbfgs as `maxIter`.
-- @param lX   passed to optim.lbfgs as `tolX` (may be nil).
-- @return W      weights, classes x features
-- @return b      bias column, classes x 1
-- @return f_hist second value returned by optim.lbfgs
function fit(X, Y, rate, iter, lX)
    local n_classes, n_features = Y:size(2), X:size(2)

    -- One flat column holding every weight row followed by its bias entry.
    local W = torch.zeros(n_classes * (n_features + 1), 1)

    local func = function(w)
        local loss, grad = mlg(w, X, Y)
        return loss, grad
    end

    local state = {learningRate = rate, maxIter = iter, tolX = lX}
    local f_hist
    W, f_hist = optim.lbfgs(func, W, state)

    -- Split the optimised vector back into weights and the trailing bias column.
    W = W:reshape(n_classes, n_features + 1)
    local b = W:sub(1, n_classes, n_features + 1, n_features + 1)
    W = W:sub(1, n_classes, 1, n_features)
    return W, b, f_hist
end

--- Compute linear class scores X * W^T + b for every row of X.
-- @param X input matrix (rows x features).
-- @param W weight matrix (classes x features).
-- @param b bias column (classes x 1); it is transposed and repeated per row.
-- @return score matrix (rows x classes)
function predict(X, W, b)
    local scores = X * W:t()
    local bias_row = b:t()
    local tiled_bias = torch.expand(bias_row, X:size(1), bias_row:size(2))
    return scores:add(tiled_bias)
end

--- Fraction of rows whose first-column entry matches between two tensors.
-- @param ypred predicted labels; only column 1 of each row is read.
-- @param ytrue reference labels; only column 1 of each row is read.
-- @return number of matching rows divided by ypred:size(1)
function predict_score(ypred, ytrue)
    local total = ypred:size(1)
    local hits = 0
    for row = 1, total do
        local guess, truth = ypred[row][1], ytrue[row][1]
        if guess == truth then
            hits = hits + 1
        end
    end
    return hits / ypred:size(1)
end

<h3>Create Document Word Matrix and One Hot Encoding</h3>

In [5]:
--- Build a dense document-by-word count matrix from rows of word indices.
-- Feature weight: counts (each occurrence of a word index adds 1).
-- @param vocab        number of columns in the result (vocabulary size).
-- @param max_sent_len number of leading columns of sparseMatrix to read;
--                     nil or anything larger than the actual column count
--                     is clamped to the column count.
-- @param sparseMatrix 2-D tensor of word indices; entries equal to 0 are
--                     skipped (presumably padding -- confirm against the data).
-- @return tensor of size sparseMatrix:size(1) x vocab holding the counts
function createDocWordMatrix(vocab, max_sent_len, sparseMatrix)
    local n_docs = sparseMatrix:size(1)
    local n_cols = sparseMatrix:size(2)
    -- Never read past the real row width, whatever the caller asked for.
    local width = math.min(max_sent_len or n_cols, n_cols)

    -- Local on purpose: the result used to leak into a global `docword`.
    local docword = torch.zeros(n_docs, vocab)
    for i = 1, n_docs do
        local tokens, counts = sparseMatrix[i], docword[i]
        for j = 1, width do
            local idx = tokens[j]
            if idx ~= 0 then
                counts[idx] = counts[idx] + 1
            end
        end
    end
    return docword
end
 
--- One-hot encode a vector of class indices.
-- @param classes number of columns in the result.
-- @param target  tensor indexed as target[i]; each entry is used directly as
--                the column to set (assumes a 1-D tensor of 1-based class
--                ids -- confirm against the data).
-- @return tensor of size target:size(1) x classes with a single 1 per row
function onehotencode(classes, target)
    local n = target:size(1)
    -- Local on purpose: the result used to leak into a global `onehot`.
    local onehot = torch.zeros(n, classes)
    for i = 1, n do
        onehot[i][target[i]] = 1
    end
    return onehot
end


In [6]:
-- Read every split plus the two scalar metadata entries from SST1.hdf5.
-- The results stay global so that later notebook cells can use them.
f = hdf5.open("SST1.hdf5", "r")

-- Fetch one named dataset from the open file in full.
local function read_all(name)
    return f:read(name):all()
end

X_train = read_all("train_input")
Y_train = read_all("train_output")
X_valid = read_all("valid_input")
Y_valid = read_all("valid_output")
X_test = read_all("test_input")

-- Metadata is stored as tensors; take the first element as a plain number.
nclasses = read_all("nclasses"):long()[1]
nfeatures = read_all("nfeatures"):long()[1]

f:close()

In [7]:
-- Convert the index matrices to word-count features and the labels to one-hot rows.
local sent_len = 53  -- number of columns read from each row of indices

X_train = createDocWordMatrix(nfeatures, sent_len, X_train)
Y_train = onehotencode(nclasses, Y_train)

-- NOTE: X_test / Y_test are built from the *validation* split here, which
-- replaces the "test_input" tensor previously stored in X_test.
X_test = createDocWordMatrix(nfeatures, sent_len, X_valid)
Y_test = onehotencode(nclasses, Y_valid)

In [45]:
-- Train on the featurised training data and report the wall-clock duration.
-- os.time() has one-second resolution, so the printed value is whole seconds.
start_time = os.time()
-- rate = 0.1, iter = 100; the fifth argument (lX) is omitted, so it is nil.
W, b = fit(X_train, Y_train, 0.1, 100)
end_time = os.time()
print(end_time - start_time)

8047.1895621711	


8034.2150007982	


7786.4488870014	


6754.1685375427	


6719.858801218	


6673.8443615626	


6625.6743854115	


6502.9222548766	


6512.9323891926	


6427.7984747649	


6399.7436447472	


6374.5803966976	


6324.2485244929	


6194.2188614458	


6324.5130549656	


6314.5935067485	


6322.7240424031	


6227.0022025586	


6227.1570480744	


6206.1907395885	


6191.0529431972	


6091.9605025345	


6180.3573618335	


6229.3154092704	


6261.4026533607	


6142.3291894578	


6147.2063133995	


6139.5653968708	


6152.4426592705	


6224.3444695085	


6243.9924217454	


6205.8550058916	


6205.9371941422	


6116.0417361475	


6210.3460709715	


6265.5834837266	


6147.402547356	


6154.1188046439	


6132.6467434086	


6095.1326698292	


6211.9629188845	


6265.9054878674	


6014.8985868335	


6126.4269583814	


6089.7283562214	


6162.2470356336	


6080.8452258715	


6211.3244501146	


6133.5062990389	


6169.8657492213	


31108.91154856	


27962.696009961	


7008.9386459226	


6097.5256671331	


6119.5321893925	


6103.1542757935	


6056.0652155795	


6079.5397368546	


6038.7168955052	


5923.8081358096	


6018.6621790674	


5980.8905911602	


5965.3685893487	


5978.1128821277	


5887.0213068807	


5973.842837281	


5983.4816179291	


5986.130443867	


6039.057287334	


5833.1167346367	


6022.8903323289	


5956.9730061427	


6039.5853013717	


5976.968191069	


5891.3734982801	


5938.2326951705	


5880.7491873169	


5936.0461986104	


5883.3569177921	


5950.2532316054	


5932.2011324286	


5918.768913342	


5874.6530388013	


5841.0127809212	


5854.9239398558	


5914.8912463653	


5821.4723092382	


5885.4596694655	


5888.7588929599	


5893.4886596748	


5733.5287217821	


5807.8013364884	


5867.819837934	


5873.3662717232	


5836.2665710443	


5763.3528937926	


5812.9357167324	


5931.5591954396	


5806.9128820495	


5810.4558742062	


142	


In [42]:
-- Score the held-out rows, take the highest-scoring column per row as the
-- predicted class, and compare with the position of the 1 in each one-hot row.
local scores = predict(X_test, W, b)
_, Y_pred = torch.max(scores, 2)
_, Y_true = torch.max(Y_test, 2)

acc_score = predict_score(Y_pred, Y_true)
print(acc_score)