In [130]:
require 'hdf5'
require 'math'

--Open the HDF5 file read-only; every dataset below is read from this handle
--and stored in a notebook-wide global.
f = hdf5.open("data.hdf5", "r")

--Reads the dataset called `name` from the open file in full.
local function read_all(name)
    return f:read(name):all()
end

X_train = read_all("windows_train")
Y_train = read_all("train_Y")

X_valid = read_all("windows_valid")
X_valid_nospaces = read_all("valid_kaggle_without_spaces")

Y_valid = read_all("valid_reduced_Y")
Y_valid_spaces = read_all("valid_answers")

--These two datasets are converted to long and only their first entry is kept.
nclasses = read_all('nclasses'):long()[1]
nfeatures = read_all('nfeatures'):long()[1]

f:close()

In [131]:
--Builds a string key from a tensor by walking its first dimension: every
--entry is passed through tostring and followed by one space, so a non-empty
--key always ends with a trailing space. An empty input yields "".
function createHash(longTensor)
    local pieces = {}
    local length = longTensor:size(1)
    for pos = 1, length do
        pieces[pos] = tostring(longTensor[pos]) .. " "
    end
    return table.concat(pieces)
end

--Builds a string key from the first row of a 2-D tensor: every entry along
--the second dimension is passed through tostring and followed by one space
--(same "a b c " layout that createHash produces for a 1-D input).
function createHash2(longTensor)
    local pieces = {}
    local width = longTensor:size(2)
    for col = 1, width do
        pieces[col] = tostring(longTensor[1][col]) .. " "
    end
    return table.concat(pieces)
end

--Pads the start of a window and tracks how far we are into the current run.
--  r   : running counter kept by the caller between consecutive windows
--  n   : window length
--  win : the window; it is modified in place
--While r is still below n, the first n-r entries of win are overwritten with
--30 and r advances by one. Whenever the last entry of the window is 30 the
--counter is reset to 1 so the following window gets padded again.
--Returns the updated counter and the (same) window object.
function add_padding(r, n, win)
    local missing = n - r
    if missing > 0 then
        local slot = 1
        while slot <= missing do
            win[slot] = 30
            slot = slot + 1
        end
        r = r + 1
    end
    if win[n] == 30 then
        return 1, win
    end
    return r, win
end

--Builds the count tables for the count-based model.
--  windows_train : windows indexed 1..windows_train:size(1)
--  space_train   : label for each window; label 2 is tallied in space_table
--  n_gram        : window length handed to add_padding
--Each window is passed through add_padding, turned into a string key with
--createHash, and counted.
--Returns space_table, count_table (in that order), both keyed by the string
--key: how many times the context carried label 2, and how many times it was
--seen at all.
function count_train(windows_train, space_train, n_gram)
    local count_table = {}
    local space_table = {}
    local restart = 1
    for i=1,windows_train:size(1) do
        --Checks if a </s> is necessary (only for CBM!)
        --`padded` is declared local so it no longer leaks into the globals.
        local padded
        restart, padded = add_padding(restart, n_gram, windows_train[i])
        local key = createHash(padded)
        if count_table[key] then
            count_table[key] = count_table[key] + 1
        else
            --First sighting of this context: start both tallies.
            count_table[key] = 1
            space_table[key] = 0
        end
        if space_train[i] == 2 then
            space_table[key] = space_table[key] + 1
        end
    end
    return space_table, count_table
end

--Normalizes a pair of scores: returns the share of `num` in `num + den`.
function normalize(num, den)
    local total = num + den
    return num / total
end

--Additive (Laplace) smoothing: (count + alpha) / (total + alpha * vocab_size).
function laplace_smooth(count, total, alpha, vocab_size)
    local numerator = count + alpha
    local denominator = total + alpha * vocab_size
    return numerator / denominator
end

--Returns perplexity for CBM
--  space_table, total_table : count tables keyed by createHash strings
--  windows_valid            : windows indexed 1..windows_valid:size(1)
--  space_valid              : label per window; 1 means the next char is NOT
--                             a space, any other value is treated as a space
--  n_gram                   : window length handed to add_padding
--  alpha, vocab_size        : smoothing parameters for laplace_smooth
--The result is exp(-(sum of log-probabilities) / number of windows).
function count_perp(space_table, total_table, windows_valid, space_valid, n_gram, alpha, vocab_size)
    local restart = 1
    local perp = 0
    local n_windows = windows_valid:size(1)
    for i=1,n_windows do
        --Checks if a </s> is necessary (only for CBM!)
        --`padded` is declared local so it no longer leaks into the globals.
        local padded
        restart, padded = add_padding(restart, n_gram, windows_valid[i])
        local key = createHash(padded)
        local total = total_table[key]
        if total then
            local spaces = space_table[key]
            local p_space = laplace_smooth(spaces, total, alpha, vocab_size)
            local p_nospace = laplace_smooth(total - spaces, total, alpha, vocab_size)

            --Next char is NOT space
            if space_valid[i] == 1 then
                perp = perp + math.log(normalize(p_nospace, p_space))
            --Next char IS space
            else
                perp = perp + math.log(normalize(p_space, p_nospace))
            end
        else
            --Probability for unseen counts
            --NOTE(review): the seen branch would give 0.5 for a context with
            --zero counts, while this uses alpha/vocab_size (which exceeds 1
            --when alpha > vocab_size). Kept as-is; confirm it is intended.
            perp = perp + math.log(alpha/vocab_size)
        end
    end
    return math.exp(-perp/n_windows)
end

--Sweeps the smoothing parameter over 0.1, 0.2, ..., 10.0 and evaluates
--count_perp for each value.
--Returns one string holding the 100 perplexities in sweep order, each followed
--by a comma (so the string ends with a trailing comma).
--The `a` argument is accepted but not used: the swept value replaces it.
function count_based_CV(space_table, total_table, x_valid, y_valid, w, a, n)
    local scores = {}
    for step = 1, 100 do
        local alpha = step / 10
        local perplexity = count_perp(space_table, total_table, x_valid, y_valid, w, alpha, n)
        scores[step] = tostring(perplexity) .. ","
    end
    return table.concat(scores)
end

--Left-pads every row of x_valid_nospaces with w-1 columns holding the value
--30: a long tensor of that shape is built and concatenated in front of the
--input along dimension 2. Returns the concatenated tensor.
function pre_pad(x_valid_nospaces, w)
    local n_rows = x_valid_nospaces:size(1)
    local left_pad = torch.Tensor(n_rows, w - 1):fill(30):long()
    return torch.cat(left_pad, x_valid_nospaces, 2)
end

--Inserts the value 30 into x_valid (in place) after a window when the count
--model scores "space" above 0.5 for that window, then prints the row.
--  space_table, total_table : count tables looked up with createHash2 keys
--  x_valid : 2-D tensor indexed as x_valid[i][j]; it is modified in place
--            (presumably already left-padded via pre_pad -- TODO confirm)
--  y_valid : not used by this function
--  w       : window width
--  n       : vocabulary size handed to laplace_smooth
--  a       : smoothing alpha handed to laplace_smooth
--Returns nothing.
--NOTE(review): both loops end in an unconditional `break`, so as written only
--the window at i=1, j=1 is examined and only row 1 is printed. This looks like
--debugging scaffolding -- confirm before relying on the function.
function count_space_predict(space_table, total_table, x_valid, y_valid, w, n, a)
    for i=1,x_valid:size(1) do
        for j=1,x_valid:size(2)-w+1 do
            --1 x w slice of row i starting at column j
            local context = x_valid:sub(i,i,j,j+w-1)
            local key = createHash2(context)
            --Contexts that are missing from total_table are skipped
            if total_table[key] then
                local p_space = laplace_smooth(space_table[key], total_table[key], a, n)
                local p_nospace = laplace_smooth(total_table[key] - space_table[key], total_table[key], a, n)
                --Redeclares p_space: from here on it is the normalized value
                local p_space = normalize(p_space, p_nospace)
                
                if p_space > 0.5 then
                    --Loop through backwards from the second to last entry up to position of next space
                    --Shifts columns j+w .. size-1 one step right; the old last entry is overwritten
                    for k=1,x_valid:size(2)-j-w do
                        local idx = x_valid:size(2) - k
                        x_valid[i][idx+1] = x_valid[i][idx]
                        --x_valid:sub(i,i,idx+1,idx+1) =  x_valid:sub(i,i,idx,idx)
                    end
                    --Put 30 into the slot freed right after the window
                    x_valid[i][j+w] = 30
                end
            
            end
        --Runs after the first window only, because of the break on the next line
        print(x_valid[i])
        break
        end
    --Stops after the first row
    break
    end
end

In [107]:
--Window width is the size of X_train's second dimension.
win_size = X_train:size(2)
--Build the count tables from the training data; count_train returns the
--label-2 ("space") tallies first and the total tallies second.
ngram_space, ngram_total = count_train(X_train, Y_train, win_size)

In [108]:
--Score the count-based model on the validation windows with the chosen
--smoothing value, then print a short report.
additive = 1
perplexity = count_perp(ngram_space, ngram_total, X_valid, Y_valid, win_size, additive, nfeatures)

--Label/value pairs printed one per line, in this order.
local report = {
    { "Window size: ", win_size },
    { "Laplace smoothing parameter: ", additive },
    { "Perplexity: ", perplexity },
}

print("COUNT BASED MODEL")
print("================================")
for _, entry in ipairs(report) do
    print(entry[1] .. tostring(entry[2]))
end

COUNT BASED MODEL	
Window size: 3	
Laplace smoothing parameter: 1	
Perplexity: 1.1981899697832	


In [132]:
--Left-pad every row with win_size-1 entries of 30.
--NOTE: the result overwrites X_valid_nospaces, so re-running this cell pads again.
X_valid_nospaces = pre_pad(X_valid_nospaces, win_size)

In [133]:
--Run the space predictor on the padded validation rows. Argument order is
--(space counts, total counts, x, y, window, vocab size, alpha); it modifies
--X_valid_nospaces in place and prints a row.
count_space_predict(ngram_space, ngram_total, X_valid_nospaces, Y_valid, win_size, nfeatures, additive)

 30
 30
  2
 14
 15
 18
 20
 12
  4
 19
 18
 12
  1
 24
 22
  1
 15
 21
 21
 14
 12
 14
 23
  4
 21
  9
  4
  8
 19
 21
  4
 13
  4
 17
  9
 14
 15
  4
 18
  1
 13
  8
 21
 21
 13
  4
  2
 13
 14
 18
  4
 19
 21
 14
 21
  9
  4
 21
 23
 18
  4
 21
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30
 30


In [157]:
--Redefinition of createHash2 for this cell: joins the entries of the first row
--of a 2-D tensor (walking dimension 2) into a string, each entry converted
--with tostring and followed by one space.
function createHash2(longTensor)
    local parts = {}
    local n_cols = longTensor:size(2)
    local col = 1
    while col <= n_cols do
        parts[col] = tostring(longTensor[1][col]) .. " "
        col = col + 1
    end
    return table.concat(parts)
end
--Experimental cell: walk a window along a row of X_valid_nospaces and, when
--the count model scores "space" above 0.5, splice a 30 into the working copy
--`sen`. Prints progress as it goes.
--NOTE(review): the outer loop ends in an unconditional `break`, so only i=1
--is processed.
vocab_size = nfeatures
for i=1,X_valid_nospaces:size(1) do
    
    --Working copy: a 1 x N slice holding row i
    local sen = X_valid_nospaces:sub(i,i,1,X_valid_nospaces:size(2))
    
    local flag = true
    local idx = 1
    
    while flag do
        
        --NOTE(review): `sen` has a single row, yet it is indexed with row i
        --below; that only lines up when i == 1 -- verify.
        local win = sen:sub(i,i,idx, idx+win_size-1)
        local after = sen:sub(i,i,idx+win_size, sen:size(2))
        
        local key = createHash2(win)
        --Contexts that are missing from ngram_total are skipped
        if ngram_total[key] then
            local p_space = laplace_smooth(ngram_space[key], ngram_total[key], 1, vocab_size)
            local p_nospace = laplace_smooth(ngram_total[key] - ngram_space[key], ngram_total[key], 1, vocab_size)
            --Redeclares p_space: from here on it is the normalized value
            local p_space = normalize(p_space, p_nospace)
            if p_space > 0.5 then
                print("space")
                --NOTE(review): `win` is a 2-D slice while the appended tensor
                --is built from a 1-element storage; check that torch.cat
                --accepts this combination.
                local temp = torch.cat(win, torch.LongTensor(torch.LongStorage{30}))
                --The rebuilt `sen` is window + 30 + remainder, so every column
                --before idx is no longer part of it.
                sen = torch.cat(temp, after)
            end
        end

        idx = idx + 1
        
        print(sen:sub(i,i,idx,idx+win_size))
        --print(sen:sub(i,i,idx, idx+win_size))
        --Stop once the entry just past the next window is 30.
        --NOTE(review): nothing else bounds idx, so this relies on a 30 being found.
        if sen:sub(i,i,idx+win_size,idx+win_size)[1][1] == 30 then
            
            flag = false
        end

    end
    break
        
        

end

In [None]:
        
        
    --NOTE(review): leftover fragment. The `end` below closes a construct that
    --is not present in this cell, and `i` and `sen` are never defined here, so
    --the cell cannot run on its own. `sen` is used as a string in this version
    --(keys and "30 " are appended to it).
    end
    
    for j=1,X_valid_nospaces:size(2) do
        --Stop at the first window whose last entry is 30
        if X_valid_nospaces:sub(i,i,j+win_size-1,j+win_size-1)[1][1] == 30 then
            break
        end
        local win = X_valid_nospaces:sub(i,i,j,j+win_size-1)
        local key = createHash2(win)
        --print(key)
        sen = sen .. key
        if ngram_total[key] then
            local p_space = laplace_smooth(ngram_space[key], ngram_total[key], 1, vocab_size)
            local p_nospace = laplace_smooth(ngram_total[key] - ngram_space[key], ngram_total[key], 1, vocab_size)
            p_space = (normalize(p_space, p_nospace))
            --Append a "30 " token when the model scores "space" above 0.5
            if p_space > 0.5 then
                sen = sen .. "30 "
            end
        end
    end
    print(sen)
    break

In [68]:
--Scratch check of numeric-for semantics: with start 3, limit 1 and the default
--step of 1 the body never runs, so this prints nothing.
for j=3,1 do
    print(j)
end