In [118]:
require 'hdf5'
require 'math'

-- Open the dataset file read-only; every value below is pulled from it and
-- stored in a notebook-level global so later cells can use it.
f = hdf5.open("data.hdf5", "r")

-- Reads the full contents of one named dataset from the open file.
local function read_all(name)
    return f:read(name):all()
end

-- Training windows and their labels.
X_train = read_all("windows_train")
Y_train = read_all("train_Y")

-- Validation windows, the validation data without spaces, and both label sets.
X_valid = read_all("windows_valid")
X_valid_nospaces = read_all("valid_kaggle_without_spaces")
Y_valid = read_all("valid_reduced_Y")
Y_valid_spaces = read_all("valid_answers")

-- Scalar metadata is stored as one-element datasets; take the first entry.
nclasses = read_all('nclasses'):long()[1]
nfeatures = read_all('nfeatures'):long()[1]

f:close()

In [119]:
--- Serializes a 1-D tensor into a string usable as a table key.
-- Each element is rendered with tostring and followed by a single space, so
-- the elements 1, 2, 3 produce "1 2 3 " (including the trailing space).
-- @param longTensor value supporting :size(1) and integer indexing
-- @return string the space-separated key ("" when size(1) is 0)
function createHash(longTensor)
    -- Collect the pieces locally; the accumulator must not be a global,
    -- otherwise every call overwrites a notebook-level variable.
    local parts = {}
    for i = 1, longTensor:size(1) do
        parts[i] = tostring(longTensor[i]) .. " "
    end
    return table.concat(parts)
end

--- Overwrites the leading entries of a window with the padding value 30.
-- While the counter r is below n, the first n - r entries of win are set to
-- 30 and r advances by one. Whenever the last entry win[n] equals 30 the
-- counter is reset to 1. The window is modified in place.
-- @param r current counter value
-- @param n window length
-- @param win indexable window; written to directly
-- @return the updated counter, followed by the same win object
function add_padding(r, n, win)
    local pad_count = n - r
    if pad_count > 0 then
        local pos = 1
        while pos <= pad_count do
            win[pos] = 30
            pos = pos + 1
        end
        r = r + 1
    end
    -- A 30 in the final slot restarts the counter.
    if win[n] == 30 then
        return 1, win
    end
    return r, win
end

--- Builds the count tables for the count-based model (CBM).
-- Every row of windows_train is passed through add_padding, turned into a
-- string key with createHash, and counted. A second table counts how many of
-- those occurrences carry the label 2 in space_train (presumably "next
-- character is a space" -- confirm against the data preparation).
-- Note: add_padding writes into the row it is given, so rows of
-- windows_train may be modified if indexing returns a view.
-- @param windows_train indexable collection of windows supporting :size(1)
-- @param space_train labels, indexed in step with windows_train
-- @param n_gram window length forwarded to add_padding
-- @return space_table (key -> label-2 count), count_table (key -> total count)
function count_train(windows_train, space_train, n_gram)
    local count_table = {}
    local space_table = {}
    local restart = 1
    for i = 1, windows_train:size(1) do
        -- Checks if a </s> is necessary (only for CBM!)
        -- `padded` is declared local so it does not leak into global scope.
        local padded
        restart, padded = add_padding(restart, n_gram, windows_train[i])
        local key = createHash(padded)
        -- First sighting of this context: initialise both counters.
        if not count_table[key] then
            count_table[key] = 0
            space_table[key] = 0
        end
        count_table[key] = count_table[key] + 1
        if space_train[i] == 2 then
            space_table[key] = space_table[key] + 1
        end
    end
    return space_table, count_table
end

--- Normalizes the first of two scores against their sum.
-- @param num score of interest
-- @param den competing score
-- @return num divided by (num + den)
function normalize(num, den)
    local total = num + den
    return num / total
end

--- Additive (Laplace) smoothing of a count.
-- @param count observed count
-- @param total total observations
-- @param alpha smoothing constant added to the count
-- @param vocab_size number of outcomes; scales alpha in the denominator
-- @return (count + alpha) / (total + alpha * vocab_size)
function laplace_smooth(count, total, alpha, vocab_size)
    local numerator = count + alpha
    local denominator = total + alpha * vocab_size
    return numerator / denominator
end

--- Computes the perplexity of the count-based model (CBM) on a validation set.
-- For each row of windows_valid the padded window is hashed to a key. If the
-- key was seen in training, the smoothed "space" and "no space" scores are
-- normalized against each other and the log of the one matching the label is
-- accumulated (label 1 = next char is NOT a space, otherwise it IS a space).
-- Unseen keys contribute log(alpha / vocab_size).
-- Note: add_padding writes into the row it is given, so rows of
-- windows_valid may be modified if indexing returns a view.
-- @param space_table key -> count of occurrences followed by a space
-- @param total_table key -> total count of occurrences
-- @param windows_valid indexable collection of windows supporting :size(1)
-- @param space_valid labels, indexed in step with windows_valid
-- @param n_gram window length forwarded to add_padding
-- @param alpha smoothing constant
-- @param vocab_size vocabulary size used for smoothing
-- @return exp(-(sum of log probabilities) / number of rows)
function count_perp(space_table, total_table, windows_valid, space_valid, n_gram, alpha, vocab_size)
    local restart = 1
    local log_sum = 0
    local n_rows = windows_valid:size(1)
    for i = 1, n_rows do
        -- Checks if a </s> is necessary (only for CBM!)
        -- `padded` is declared local so it does not leak into global scope.
        local padded
        restart, padded = add_padding(restart, n_gram, windows_valid[i])
        local key = createHash(padded)
        local total = total_table[key]
        if total then
            local spaces = space_table[key]
            local p_space = laplace_smooth(spaces, total, alpha, vocab_size)
            local p_nospace = laplace_smooth(total - spaces, total, alpha, vocab_size)

            if space_valid[i] == 1 then
                -- Next char is NOT space
                log_sum = log_sum + math.log(normalize(p_nospace, p_space))
            else
                -- Next char IS space
                log_sum = log_sum + math.log(normalize(p_space, p_nospace))
            end
        else
            -- Probability for unseen counts
            -- NOTE(review): alpha / vocab_size exceeds 1 once alpha > vocab_size,
            -- and differs from the normalized smoothed value (0.5) that the seen
            -- branch would yield for zero counts -- confirm this is intended.
            log_sum = log_sum + math.log(alpha / vocab_size)
        end
    end
    return math.exp(-log_sum / n_rows)
end


In [120]:
-- The window length is the second dimension of the training windows.
win_size = X_train:size(2)
-- Build the per-context count tables; both are kept as globals because the
-- evaluation cells below read them.
ngram_space, ngram_total = count_train(X_train, Y_train, win_size)

In [121]:
-- Score the validation set with a smoothing constant of 1; both values stay
-- global so they remain inspectable from other cells.
additive = 1
perplexity = count_perp(ngram_space, ngram_total, X_valid, Y_valid, win_size, additive, nfeatures)

-- Banner first, then one "label: value" line per reported quantity.
print("COUNT BASED MODEL")
print("================================")
local summary = {
    {"Window size: ", win_size},
    {"Laplace smoothing parameter: ", additive},
    {"Perplexity: ", perplexity},
}
for _, entry in ipairs(summary) do
    print(entry[1] .. tostring(entry[2]))
end

COUNT BASED MODEL	
Window size: 3	
Laplace smoothing parameter: 1	
Perplexity: 1.1981899697832	


In [79]:
-- Sweep the smoothing constant over 0.1, 0.2, ..., 10.0 and collect the
-- resulting perplexities as one comma-terminated string in `list`.
-- `additive` and `perplexity` are updated globally on every step.
local scores = {}
for step = 1, 100 do
    additive = step / 10
    perplexity = count_perp(ngram_space, ngram_total, X_valid, Y_valid, win_size, additive, nfeatures)
    scores[step] = tostring(perplexity) .. ","
end
list = table.concat(scores)

In [102]:
-- Prepend win_size - 1 columns filled with 30 to every row of
-- X_valid_nospaces. Re-running this cell adds the columns again.
local n_rows = X_valid_nospaces:size(1)
padding = torch.LongTensor(n_rows, win_size - 1):fill(30)
X_valid_nospaces = torch.cat(padding, X_valid_nospaces, 2)

In [187]:
--- Serializes the first row of a 2-D tensor into a string key.
-- Each of the size(2) entries of row 1 is rendered with tostring and followed
-- by a single space, so the row 4, 5, 6 produces "4 5 6 ".
-- @param longTensor value supporting :size(2) and [1][i] indexing
-- @return string the space-separated key
function createHash2(longTensor)
    local pieces = {}
    local width = longTensor:size(2)
    for col = 1, width do
        pieces[#pieces + 1] = tostring(longTensor[1][col]) .. " "
    end
    return table.concat(pieces)
end
-- Work-in-progress cell: as written it processes only the first row and only
-- the first window, then prints the modified row (both loops exit through an
-- unconditional break).
vocab_size = nfeatures
for i=1,X_valid_nospaces:size(1) do
    
    -- Row i as a 1 x N slice.
    local sen = X_valid_nospaces:sub(i,i,1,X_valid_nospaces:size(2))
    
    -- `flag` is never set to false; the loop below ends via `break`.
    local flag = true
    local idx = 1
    
    
    while flag do
        -- Window of win_size entries starting at idx, and everything after it.
        local win = (sen:sub(1,1,idx, idx+win_size-1))
        local after = sen:sub(1,1,idx+win_size, sen:size(2))
        
        -- Computed but not used anywhere in this cell.
        local key = createHash2(win)
        
        
        -- Rebuild the row as: window, a single 30, then the remainder.
        -- NOTE(review): entries before idx are dropped, which only matters
        -- once idx > 1 -- confirm before removing the breaks.
        sen = torch.cat(win, torch.LongTensor(torch.LongStorage{30}))
        sen = torch.cat(sen, after)
        idx = idx + 1
        print(sen)
        break
    end
    break
        
        
    

end

Columns 1 to 20
  2  14  15  30  18  20  12   4  19  18  12   1  24  22   1  15  21  21  14  12

Columns 21 to 40
 14  23   4  21   9   4   8  19  21   4  13   4  17   9  14  15   4  18   1  13

Columns 41 to 60
  8  21  21  13   4   2  13  14  18   4  19  21  14  21   9   4  21  23  18   4

Columns 61 to 80
 21  30  30  30  30  30  30  30  30  30  30  30  30  30  30  30  30  30  30  30

Columns 81 to 100
 30  30  30  30  30  30  30  30  30  30  30  30  30  30  30  30  30  30  30  30

Columns 101 to 120
 30  30  30  30  30  30  30  30  30  30  30  30  30  30  30  30  30  30  30  30

Columns 121 to 140
 30  30  30  30  30  30  30  30  30  30  30  30  30  30  30  30  30  30  30  30

Columns 141 to 160
 30  30  30  30  30  30  30  30  30  30  30  30  30  30  30  30  30  30  30  30

Columns 161 to 180
 30  30  30  30  30  30  30  30  30  30  30  30  30  30  30  30  30  30  30  30

Columns 181 to 200
 30  30  30  30  30  30  30  30  30  30  30  30  30  30  30  30  30  30  30  30

Columns 20

In [157]:
-- Scratch cell: assigns a throwaway string to the global `s`.
s = "asdf"

In [164]:
-- Scratch check of torch.cat on two one-element tensors (result shown below).
torch.cat(torch.Tensor{1}, torch.Tensor{2})

 1
 2
[torch.DoubleTensor of size 2]



In [None]:
        
        
    end
    
    for j=1,X_valid_nospaces:size(2) do
        if X_valid_nospaces:sub(i,i,j+win_size-1,j+win_size-1)[1][1] == 30 then
            break
        end
        local win = X_valid_nospaces:sub(i,i,j,j+win_size-1)
        local key = createHash2(win)
        --print(key)
        sen = sen .. key
        if ngram_total[key] then
            local p_space = laplace_smooth(ngram_space[key], ngram_total[key], 1, vocab_size)
            local p_nospace = laplace_smooth(ngram_total[key] - ngram_space[key], ngram_total[key], 1, vocab_size)
            p_space = (normalize(p_space, p_nospace))
            if p_space > 0.5 then
                sen = sen .. "30 "
            end
        end
    end
    print(sen)
    break

 1
[torch.LongTensor of size 1]

