In [2]:
require 'hdf5'
require 'math'

--Open the HDF5 file read-only; every dataset below is read from it in full.
f = hdf5.open("data.hdf5", "r")

--Reads the complete contents of the dataset called `name` from the open file.
local function load_all(name)
    return f:read(name):all()
end

--Training windows and their labels
X_train = load_all("windows_train")
Y_train = load_all("train_Y")

--Validation windows, plus the validation sentences stored without spaces
X_valid = load_all("windows_valid")
X_valid_nospaces = load_all("valid_kaggle_without_spaces")

--Validation labels per window, and the per-sentence answers
Y_valid = load_all("valid_reduced_Y")
Y_valid_spaces = load_all("valid_answers")

--Scalar metadata: first element of each dataset after conversion to long
nclasses = load_all('nclasses'):long()[1]
nfeatures = load_all('nfeatures'):long()[1]

f:close()

In [3]:
--Builds a string key from a tensor by walking its first dimension.
--Each element is converted with tostring and followed by a single space,
--so an input with no elements yields the empty string.
function createHash(longTensor)
    local pieces = {}
    for idx = 1, longTensor:size(1) do
        pieces[idx] = tostring(longTensor[idx]) .. " "
    end
    return table.concat(pieces)
end

--Builds a string key from the first row of a 2-D tensor.
--Walks the second dimension, converting longTensor[1][i] with tostring and
--appending a single space after every element.
function createHash2(longTensor)
    local pieces = {}
    for col = 1, longTensor:size(2) do
        pieces[col] = tostring(longTensor[1][col]) .. " "
    end
    return table.concat(pieces)
end

--Overwrites the leading entries of a window with the padding index while a
--new sequence is still filling up, and tracks the position counter.
--    r   : current position counter (1 means a sequence has just started)
--    n   : window size
--    win : window, modified IN PLACE
--Returns the updated counter and the same (possibly modified) window.
--The counter goes back to 1 whenever the last window entry equals 30.
--NOTE(review): 30 looks like the index of the </s>/padding symbol - confirm.
function add_padding(r, n, win)
    local PAD = 30
    if r < n then
        --The first n-r slots have no real context yet
        local n_pad = n - r
        local slot = 1
        while slot <= n_pad do
            win[slot] = PAD
            slot = slot + 1
        end
        r = r + 1
    end
    --A pad symbol in the last slot restarts the counter
    return (win[n] == PAD) and 1 or r, win
end

--Builds the count tables of the count-based model from training windows.
--    windows_train : 2-D tensor, one context window per row
--    space_train   : label per window; a label of 2 is counted as "space"
--    n_gram        : window size handed to add_padding
--Returns two tables keyed by the createHash string of the padded window:
--    space_table[key] : number of times the window was labelled 2
--    count_table[key] : number of times the window was seen at all
--NOTE: each row is passed through add_padding, which writes into the window
--it receives (presumably a view, so windows_train itself is modified - confirm).
function count_train(windows_train, space_train, n_gram)
    local count_table = {}
    local space_table = {}
    local restart = 1
    for i=1,windows_train:size(1) do
        --Checks if a </s> is necessary (only for CBM!)
        --`padded` is declared local so it no longer leaks into the global scope
        local padded
        restart, padded = add_padding(restart, n_gram, windows_train[i])
        local key = createHash(padded)
        if not count_table[key] then
            --First time this context is seen: start both counters
            count_table[key] = 0
            space_table[key] = 0
        end
        count_table[key] = count_table[key] + 1
        if space_train[i] == 2 then
            space_table[key] = space_table[key] + 1
        end
    end
    return space_table, count_table
end

--Returns the share of `num` in the sum of the two scores: num / (num + den).
function normalize(num, den)
    local total = num + den
    return num / total
end

--Returns the additively smoothed estimate
--    (count + alpha) / (total + alpha * vocab_size)
function laplace_smooth(count, total, alpha, vocab_size)
    local numerator = count + alpha
    local denominator = total + alpha * vocab_size
    return numerator / denominator
end

--Returns the perplexity of the count-based model on validation windows.
--    space_table   : key -> number of times the context was followed by a space
--    total_table   : key -> number of times the context was seen
--    windows_valid : 2-D tensor, one context window per row
--    space_valid   : label per window; 1 means the next char is NOT a space,
--                    any other value is treated as "space"
--    n_gram        : window size handed to add_padding
--    alpha         : additive smoothing parameter
--    vocab_size    : vocabulary size used in the smoothing denominator
--The result is exp(-(sum of log probabilities) / number of windows).
--NOTE: rows are passed through add_padding, which writes into the window it
--receives (presumably a view, so windows_valid itself is modified - confirm).
function count_perp(space_table, total_table, windows_valid, space_valid, n_gram, alpha, vocab_size)
    local restart = 1
    local log_prob_sum = 0
    local n_windows = windows_valid:size(1)
    for i=1,n_windows do
        --Checks if a </s> is necessary (only for CBM!)
        --`padded` is declared local so it no longer leaks into the global scope
        local padded
        restart, padded = add_padding(restart, n_gram, windows_valid[i])
        local key = createHash(padded)
        local total = total_table[key]
        if total then
            local n_space = space_table[key]
            local p_space = laplace_smooth(n_space, total, alpha, vocab_size)
            local p_nospace = laplace_smooth(total - n_space, total, alpha, vocab_size)

            if space_valid[i] == 1 then
                --Next char is NOT space
                log_prob_sum = log_prob_sum + math.log(normalize(p_nospace, p_space))
            else
                --Next char IS space
                log_prob_sum = log_prob_sum + math.log(normalize(p_space, p_nospace))
            end
        else
            --Probability for unseen counts
            --NOTE(review): alpha/vocab_size exceeds 1 when alpha > vocab_size,
            --and differs from the 1/n used by count_greedy_predict - confirm intent.
            log_prob_sum = log_prob_sum + math.log(alpha/vocab_size)
        end
    end
    return math.exp(-log_prob_sum/n_windows)
end

--Sweeps the smoothing parameter over 0.1, 0.2, ..., 10.0 and returns the
--resulting perplexities as ONE string, each score followed by a comma
--(so the string ends with a trailing comma).
--    space_table, total_table : count tables passed through to count_perp
--    x_valid, y_valid         : validation windows and labels
--    w                        : window size
--    a                        : ignored; the swept value is always used instead
--                               (kept only so existing calls stay valid)
--    n                        : vocabulary size passed through to count_perp
function count_based_CV(space_table, total_table, x_valid, y_valid, w, a, n)
    local scores = {}
    for i=1,100 do
        --Named `alpha` so that it no longer shadows the parameter `a`
        local alpha = i/10
        local perplexity = count_perp(space_table, total_table, x_valid, y_valid, w, alpha, n)
        scores[i] = tostring(perplexity) .. ","
    end
    return table.concat(scores)
end

--Returns x_valid_nospaces with w-1 extra columns, all holding the value 30,
--concatenated in front of it along the second dimension.
function pre_pad(x_valid_nospaces, w)
    local n_rows = x_valid_nospaces:size(1)
    --Build the block of 30s, then convert it to a LongTensor before joining
    local left_pad = torch.Tensor(n_rows, w - 1):fill(30):long()
    return torch.cat(left_pad, x_valid_nospaces, 2)
end

--Returns the mean squared error between the number of spaces inserted by the
--GREEDY count-based model and the expected number of spaces per sentence.
--    space_table : hash table of ngram -> n_spaces
--    total_table : hash table of ngram -> counts
--    x_valid     : sentences with NO spaces, one per row
--    y_valid     : number of spaces in each sentence
--    w           : window size
--    n           : nfeatures (smoothing vocabulary size)
--    a           : additive alpha
--    p           : threshold parameter; a space is inserted when P(space) > p
--Each sentence is copied before it is edited, so x_valid is left untouched.
--The value 30 is written for an inserted space and also ends the scan of a
--sentence when it is found right after the current window.
function count_greedy_predict(space_table, total_table, x_valid, y_valid, w, n, a, p)
    local n_rows = x_valid:size(1)
    local n_cols = x_valid:size(2)
    local sq_error_sum = 0
    for i=1,n_rows do
        local spaces = 0
        local sentence = x_valid:sub(i,i,1,n_cols):clone()
        --Column j+w is read and written below, so j must stop at n_cols-w.
        --(The previous bound n_cols-w+1 indexed one column past the end
        --whenever no 30 was met earlier in the row.)
        for j=1,n_cols-w do
            if sentence[1][j+w] == 30 then
                break
            end
            local key = createHash2(sentence:sub(1,1,j,j+w-1))
            local total = total_table[key]
            local p_space
            if total then
                local n_space = space_table[key]
                local smoothed_space = laplace_smooth(n_space, total, a, n)
                local smoothed_nospace = laplace_smooth(total - n_space, total, a, n)
                p_space = normalize(smoothed_space, smoothed_nospace)
            else
                --Context never seen in training
                p_space = 1/n
            end
            if p_space > p then
                spaces = spaces + 1
                --Shift everything from column j+w one step to the right,
                --walking backwards so nothing is overwritten early; the last
                --column falls off the end.
                for idx=n_cols-1,j+w,-1 do
                    sentence[1][idx+1] = sentence[1][idx]
                end
                sentence[1][j+w] = 30
            end
        end
        sq_error_sum = sq_error_sum + (spaces - y_valid[i])^2
    end
    return sq_error_sum/n_rows
end

In [4]:
--Window size is the number of columns of the training windows
win_size = X_train:size(2)
--Build the per-context space counts and total counts from the training data
ngram_space, ngram_total = count_train(X_train, Y_train, win_size)
--Prepend win_size-1 columns of 30 to the space-free validation sentences.
--NOTE: X_valid_nospaces is overwritten with its padded version, so running
--this cell a second time pads it again.
X_valid_nospaces = pre_pad(X_valid_nospaces, win_size)

In [5]:
--Smoothing parameter used for this evaluation
additive = 1
perplexity = count_perp(ngram_space, ngram_total, X_valid, Y_valid, win_size, additive, nfeatures)

--Collect the report lines first, then print them in order
local report = {
    "COUNT BASED MODEL",
    "================================",
    "Window size: " .. tostring(win_size),
    "Laplace smoothing parameter: " .. tostring(additive),
    "Perplexity: " .. tostring(perplexity),
}
for _, line in ipairs(report) do
    print(line)
end

COUNT BASED MODEL	
Window size: 2	
Laplace smoothing parameter: 1	
Perplexity: 1.2736517323433	


In [253]:
--Decision threshold on P(space) and smoothing parameter for the greedy run
threshold = 0.37
additive = 1
--Mean squared error of the predicted number of spaces per sentence
local greedy_mse = count_greedy_predict(ngram_space, ngram_total, X_valid_nospaces, Y_valid_spaces, win_size, nfeatures, additive, threshold)
print(greedy_mse)

In [None]:
function count_dp_predict(space_table, total_table, x_valid, y_valid, w, n, a, p)
    