In [1]:
require("optim")
require('hdf5')

<h3>Multinomial Logistic Regression - LBFGS Minibatch - L2 Norm</h3>

In [16]:
-- Global counter of objective evaluations; mlg() increments and prints it on every call.
neval = 0

-- Forward pass of multinomial logistic regression with an L2 penalty.
--   W: flattened parameters; reshaped to Y:size(2) x (X:size(2)+1), where the
--      last column holds the intercepts
--   X: input matrix, one example per row
--   Y: target matrix with one column per class (multiplied elementwise with
--      the log-probabilities; presumably one-hot -- confirm with caller)
-- Returns: loss, the matrix of class probabilities, and the coefficient block
-- (the parameters without the intercept column).
function ml(W, X, Y)
    local params = W:reshape(Y:size(2), X:size(2) + 1)
    local nrows, ncols = params:size(1), params:size(2)

    -- last column -> intercepts, transposed into a single row
    local bias = params:sub(1, nrows, ncols, ncols):t()

    -- remaining columns -> coefficients
    local coef = params:sub(1, nrows, 1, ncols - 1)

    -- scores = X * coef^T, then the intercept row repeated for every example
    local logp = X * coef:t()
    logp:add(bias:expand(bias, logp:size(1), bias:size(2)))

    -- log-sum-exp on a transposed copy (classes x examples): the column max
    -- is subtracted before exponentiating and added back after the log
    local scratch = logp:clone():t()
    local colmax = scratch:max(1)
    scratch:csub(torch.expand(colmax, scratch:size(1), colmax:size(2)))
    local lse = scratch:exp():sum(1):log():add(colmax):t()

    -- subtract the normaliser from every class column -> log-probabilities
    lse:expand(lse, lse:size(1), logp:size(2))
    logp:csub(lse)

    -- negative log-likelihood plus 0.5 * ||coef||^2 (intercepts not penalised)
    local flat = coef:reshape(coef:size(1) * coef:size(2), 1)
    local nll = -torch.sum(torch.cmul(Y, logp))
    local loss = nll + 0.5 * torch.dot(flat, flat)

    -- log-probabilities -> probabilities, in place
    logp:exp()

    return loss, logp, coef
end

-- Minibatch loss and gradient for the L2-regularised multinomial logistic
-- regression defined by ml().
--   W:     flattened parameter vector (see ml())
--   X, Y:  full input and target matrices, one example per row
--   bsize: optional minibatch size; defaults to 1000 and is capped at the
--          number of rows in X
-- Returns: loss on the sampled minibatch, and the gradient flattened to a
-- column vector with the same layout as W.
-- Side effects: increments the global `neval` and prints it with the loss.
function mlg(W, X, Y, bsize)
    -- Honour the caller's batch size (previously always overwritten with
    -- 1000) and never ask for more rows than exist.
    bsize = math.min(bsize or 1000, X:size(1))

    --random ordering of ints [1,nexamples] and take first bsize
    local idx = torch.randperm(X:size(1)):sub(1, bsize)

    --training minibatches
    local X_batch = torch.Tensor(bsize, X:size(2))
    local Y_batch = torch.Tensor(bsize, Y:size(2))

    for i = 1, bsize do
        X_batch[i] = X[idx[i]]
        Y_batch[i] = Y[idx[i]]
    end

    -- loss, class probabilities, and the coefficient block (no intercepts)
    local loss, p, coef = ml(W, X_batch, Y_batch)
    local diff = torch.csub(p, Y_batch)

    -- coefficient gradient: (p - Y)^T X plus the L2 term
    local grad = diff:t() * X_batch
    grad:add(coef)

    -- append the intercept gradient (column sums of p - Y) as the last column
    grad = grad:cat(torch.zeros(grad:size(1), 1), 2)
    grad:sub(1, grad:size(1), grad:size(2), grad:size(2)):add(diff:sum(1))

    neval = neval + 1
    print(neval, loss)
    return loss, grad:reshape(grad:size(1) * grad:size(2), 1)
end


-- Fit the multinomial logistic regression with optim.lbfgs.
--   X, Y: input and target matrices, one example per row
--   rate: passed to lbfgs as learningRate
--   iter: passed to lbfgs as maxIter
--   lX:   passed to lbfgs as tolX (may be nil)
-- Returns: the coefficient matrix, the intercept column, and (new, optional
-- third value) the function-value history reported by optim.lbfgs.
-- NOTE(review): mlg() draws a fresh random minibatch on every call, so the
-- objective L-BFGS sees changes between evaluations -- confirm this is intended.
function fit(X, Y, rate, iter, lX)
    --Weight matrix must be passed in as vector
    local W = torch.zeros(Y:size(2) * (X:size(2) + 1), 1)

    -- objective closure; returns (loss, gradient) without touching globals
    local function func(w)
        return mlg(w, X, Y)
    end

    --optimization parameters
    local state = {learningRate = rate, maxIter = iter, tolX = lX}

    --LBFGS with no line search, therefore specify learning rate
    local f_hist
    W, f_hist = optim.lbfgs(func, W, state)

    W = W:reshape(Y:size(2), X:size(2) + 1)

    --intercept
    local b = W:sub(1, W:size(1), W:size(2), W:size(2))

    --coefficients
    W = W:sub(1, W:size(1), 1, W:size(2) - 1)

    return W, b, f_hist
end

-- Linear class scores for every row of X: X * W^T plus the intercepts b.
--   X: input matrix, one example per row
--   W: coefficient matrix, one row per class
--   b: intercept column (transposed here into a row)
-- Returns the score matrix; no softmax is applied.
function predict(X, W, b)
    local scores = X * W:t()
    local bias_row = b:t()
    -- repeat the intercept row once per example so it can be added directly
    bias_row:expand(bias_row, X:size(1), bias_row:size(2))
    return scores:add(bias_row)
end

-- Fraction of rows whose first column agrees between ypred and ytrue.
--   ypred, ytrue: 2-D containers indexed as [row][1]; ypred:size(1) gives
--                 the number of rows compared
-- Returns matches / rows.
function predict_score(ypred, ytrue)
    local total = ypred:size(1)
    local hits = 0
    local row = 1
    while row <= total do
        if ypred[row][1] == ytrue[row][1] then
            hits = hits + 1
        end
        row = row + 1
    end
    return hits / total
end

<h3>Create Document Word Matrix and One Hot Encoding</h3>

In [7]:
--feature weight: counts
-- Build a document-by-word count matrix from rows of word indices.
--   vocab:        number of columns in the result (vocabulary size)
--   max_sent_len: maximum number of positions to read from each row
--   sparseMatrix: 2-D container of word indices, one document per row;
--                 entries equal to 0 are skipped
-- Returns a sparseMatrix:size(1) x vocab matrix of zeros in which entry
-- [i][w] counts how often index w occurs in row i.
function createDocWordMatrix(vocab, max_sent_len, sparseMatrix)
    local ndocs = sparseMatrix:size(1)
    -- never read past the columns that actually exist
    local ncols = math.min(max_sent_len, sparseMatrix:size(2))
    -- local, so the result no longer leaks into a global named `docword`
    local docword = torch.zeros(ndocs, vocab)
    for i = 1, ndocs do
        for j = 1, ncols do
            local idx = sparseMatrix[i][j]
            if idx ~= 0 then
                docword[i][idx] = 1 + docword[i][idx]
            end
        end
    end
    return docword
end
 
-- One-hot encode a list of class labels.
--   classes: number of columns in the result
--   target:  container of labels; target:size(1) gives the count and
--            target[i] is used directly as a column index (assumes it
--            yields a plain number in 1..classes -- TODO confirm)
-- Returns a target:size(1) x classes matrix of zeros with a single 1 per row.
function onehotencode(classes, target)
    local n = target:size(1)
    -- local, so the result no longer leaks into a global named `onehot`
    local onehot = torch.zeros(n, classes)
    for i = 1, n do
        onehot[i][target[i]] = 1
    end
    return onehot
end

-- Write predictions as a two-column CSV with header "ID,Category".
--   fname: output path (overwritten if it exists)
--   pred:  2-D container; pred:size(1) rows are written, using the row
--          number as ID and pred[i][1] as Category
-- Raises an error carrying io.open's message if the file cannot be opened.
function write2file(fname, pred)
    -- local handle: the original assigned to the global `f`, which the
    -- notebook also uses for its hdf5 file handle
    local out = assert(io.open(fname, "w"))
    out:write("ID,Category\n")
    for i = 1, pred:size(1) do
        out:write(tostring(i) .. "," .. tostring(pred[i][1]) .. "\n")
    end
    out:close()
end

In [12]:
-- Open SST1.hdf5 read-only and load the datasets needed below.
-- Only test_input is read in this state of the cell; the train/valid reads
-- and the nclasses/nfeatures scalars are commented out.
-- NOTE(review): later cells use X_train, Y_train, nfeatures and Y_test, so
-- they presumably rely on values left in the session by an earlier run with
-- these lines enabled -- confirm before running the file top to bottom.
f = hdf5.open("SST1.hdf5", "r")

--X_train = f:read("train_input"):all()
--Y_train = f:read("train_output"):all()
--X_valid = f:read("valid_input"):all()
--Y_valid = f:read("valid_output"):all()
X_test = f:read("test_input"):all()
--nclasses = f:read('nclasses'):all():long()[1]
--nfeatures = f:read('nfeatures'):all():long()[1]

f:close()

In [13]:
-- Convert X_test from rows of word indices into a word-count matrix, reading
-- up to 53 positions per row. The train/label conversions are commented out.
-- NOTE(review): nfeatures is not assigned anywhere in the active code of this
-- file (its read is commented out) -- confirm it is defined before this runs.
--X_train =createDocWordMatrix(nfeatures, 53, X_train)
--Y_train = onehotencode(nclasses, Y_train)
X_test = createDocWordMatrix(nfeatures, 53, X_test)
--Y_test = onehotencode(nclasses, Y_valid)

In [17]:
-- Train with learningRate 0.1 and maxIter 100, then print the elapsed time
-- in whole seconds as measured by os.time().
start_time = os.time()
W, b = fit(X_train, Y_train, 0.1, 100)
end_time = os.time()
print(end_time - start_time)

1	1609.4379124341	


2	1607.2033720465	


3	1602.8105017748	


4	1592.6863631158	




5	1584.5786903493	


6	1563.2366732467	


7	1346.9585354891	


8	1345.3083369596	


9	1326.876010918	


10	1276.8779199979	


11	1225.7305496861	


12	1290.9593722085	


13	1279.5207245123	


14	1271.4026449479	


15	1294.094045808	


16	1268.2024104429	


17	1270.15280979	


18	1218.6230895379	


19	1225.9927015649	


20	1255.9484227329	


21	1276.6301416091	


22	1248.7244542371	


23	1314.692794746	


24	1281.3666347559	


25	1301.632795193	


26	1289.1405333903	


27	1275.2171771723	


28	1279.2757598215	


29	1243.6636596819	


30	1227.780051562	


31	1243.7900649843	


32	1234.5281145301	


33	1197.4630584482	


34	1281.6741509297	


35	1301.2676784501	


36	1242.0211502833	


37	1238.2747527623	


38	1175.3759340169	


39	1205.6889102892	


40	1216.2690989087	


41	1254.6288945137	


42	1275.1325114215	


43	1270.0629313879	


44	1226.756347707	


45	1224.8655949262	


46	1206.5233093759	


47	1220.47755852	


48	1197.7218694503	


49	1214.8310943082	


50	1228.014843027	


51	1245.5949804946	


52	1221.3242146546	


53	1206.3080806664	


54	1201.6186944524	


55	1199.4425110644	


56	1196.3196626218	


57	1202.9457419369	


58	1190.7192905136	


59	1216.6555074147	


60	1214.8592644303	


61	1225.6683291668	


62	1189.5841144664	


63	1221.8312019394	


64	1227.3368877606	


65	1174.2385961624	


66	1213.5203428416	


67	1193.1405692245	


68	1175.3644360008	


69	5301.4650676738	


70	4704.1792701725	


71	1362.7198217078	


72	1202.7681987951	


73	1233.2255384915	


74	1275.4695530721	


75	1155014.7454203	


76	983725.416469	


77	665943.37148217	


78	345606.48153623	


79	184720.10070482	


80	152558.77188418	


81	125968.1573731	


82	96889.918646057	


83	79478.062070856	


84	68191.893969856	


85	52381.604595955	


86	46695.324314022	


87	39682.294669855	


88	38731.909065143	


89	35404.084106843	


90	28346.943702674	


91	27945.244074101	


92	18385.817276536	


93	18110.291721675	


94	88462.231097811	


95	69654.84271923	


96	12918.641165694	


97	12180.038457255	


98	11766.538825795	


99	9027.9305596895	


100	8995.2636450021	


28	


In [14]:
-- Score X_test, take the index of the largest score in each row as the
-- predicted class, do the same for Y_test, and print the fraction that agree.
-- NOTE(review): Y_test only appears in a commented-out line, where it is built
-- from Y_valid -- confirm its rows correspond to the rows of X_test.
Y_pred = predict(X_test, W, b)
_, Y_pred = torch.max(Y_pred, 2)
_,Y_true = torch.max(Y_test, 2)
acc_score = predict_score(Y_pred, Y_true)
print(acc_score)

In [15]:
-- Save the predicted class of each row to MLR_8.csv (ID,Category format).
write2file("MLR_8.csv", Y_pred)




In [8]:
-- Sanity check: print the sum of the first row of X_train.
print(X_train[1]:sum())

1	
