In [2]:
clear all;

global ipynb = 'imdb-prepare';

source('clearest-nn.m');
source('utils-logging.m');
source('utils-training.m');

log2file(tmp('log'));


ans = THE CLEAREST NEURAL NETWORK FRAMEWORK BY UNDWAD


In [3]:
% LOAD DATASET

file = fopen('datasets/IMDB Dataset.csv');
% header = textscan(file,'%q %q', 'Delimiter', ',');
data = textscan(file,'%q %q', repeat=50000, 'Delimiter', ',');
fclose(file);

printsize('data');


data = cell [1 2]


In [4]:
% SAVE|LOAD LABELS

path = tmp('labels.mat')

% delete(path);

exists = logical(exist(path))
if exists
    load('-binary', path, 'Y');
else
    Y = data{1,2}; 
    for i = 1:length(Y)
        y = Y{i,1};
        if length(y) != 8
            y
        end
    end
    Y = cell2mat(Y);
    Y = Y == "positive";
    Y = all(Y, dim=2);
    Y = double(Y');
    save('-binary', path, 'Y');
end

printsize('Y');
printvar('Y(1:10)');


path = tmp/imdb-prepare.labels.mat
exists = 1
Y = matrix [1 50000]
Y(1:10) = [1 1 1 0 1 1 1 0 0 1]


In [5]:
% CONVERT DATASET

path = tmp('reviews.mat')

% delete(path);

exists = logical(exist(path))
if exists
    load('-binary', path, 'R');
else
    t      = tic();
    w2n    = struct();
    R      = data{1,1}; 
    num_r  = rows(R);
    printvar('num_r');    
    printlog('\n');
    for ri = 1:num_r
        r  = R{ri,1};
        r  = strsplit(r, {'.','?','!','<br />'});
        ns = columns(r);
        for si = 1:ns
            if toc(t) > 5
                printlog('\rri: %d, si: %d%s', ri, si, padright('',' ',30));
                t = tic();
            end
            s = r{1,si};
            s = strsplit(s, {' ','\f','\n','\r','\t','\v',',',';',':','(',')','"',"'"});
            r{1,si} = s;
        end
        R{ri,1} = r;
    end
    printlog('\n');    
    save('-binary', path, 'R');
end

printvar('R{2,1}{1,1}');


path = tmp/imdb-prepare.reviews.mat
exists = 1
R{2,1}{1,1} = 
{
  [1,1] = A
  [1,2] = wonderful
  [1,3] = little
  [1,4] = production
}



In [6]:
% PREPARE WORD COUNTERS

path = tmp('w2n.mat')

% delete(path);

exists = logical(exist(path))
if exists
    load('-binary', path, 'w2n');
    num_w = numfields(w2n);
else
    t      = tic();
    w2n    = struct();
    num_r  = rows(R);
    num_w  = 0;
    max_n  = 0;
    printvar('num_r');    
    printlog('\n');
    for ri = 1:num_r/2
        r  = R{ri,1};
        ns = columns(r);
        for si = 1:ns
            if toc(t) > 5
                printlog('\rri: %d, si: %d%s', ri, si, padright('',' ',30));
                t = tic();
            end
            s = r{1,si};
            nw = columns(s);
            for wi = 1:nw
                w = s{1,wi};
                if length(w) > 0
                    if !isfield(w2n, w)
                        w2n.(w) = 0;
                        num_w  += 1; 
                    end
                    n = w2n.(w) + 1;
                    if n > max_n
                        max_n = n;
                    end
                    w2n.(w) = n;
                end
            end
        end
    end
    printlog('\n');    
    save('-binary', path, 'w2n');
end

printvar('num_w');
printvar('w2n.("the")');
printvar('w2n.("good")');
printvar('w2n.("bad")');


path = tmp/imdb-prepare.w2n.mat
exists = 1
num_w = 122553
w2n.("the") = 287010
w2n.("good") = 14062
w2n.("bad") = 8505


In [7]:
% PREPARE INDEX-TO-WORD

path = tmp('i2w.mat')

% delete(path);

exists = logical(exist(path))
if exists
    load('-binary', path, 'i2w');
else
    t   = tic();
    i2w = {};
    printlog('\n');    
    for [n,w] = w2n
        i2w{end+1} = w;
    end
    printlog('\n');    
    printvar('i2w{1}');
    printvar('i2w{end}');
    save('-binary', path, 'i2w');
end

printvar('i2w(1)');
printvar('i2w(end)');


path = tmp/imdb-prepare.i2w.mat
exists = 1
i2w(1) = 
{
  [1,1] = One
}

i2w(end) = 
{
  [1,1] = underlays
}



In [8]:
% PREPARE INDEX-TO-FREQUENCY

path = tmp('i2f.mat')

% delete(path);

exists = logical(exist(path))
if exists
    load('-binary', path, 'i2f');
else
    t   = tic();
    i2f = {};
    printlog('\n');    
    for [n,w] = w2n
        i2f{end+1} = n / num_w;
    end
    printlog('\n');    
    save('-binary', path, 'i2f');
end

printvar('i2f{1}');
printvar('i2f{end}');


path = tmp/imdb-prepare.i2f.mat
exists = 1
i2f{1} = 0.019020
i2f{end} = 0.000008


In [9]:
% PREPARE WORD-TO-INDEX

path = tmp('w2i.mat')

% delete(path);

exists = logical(exist(path))
if exists
    load('-binary', path, 'w2i');
else
    t   = tic();
    w2i = struct();
    printlog('\n');
    for i = 1:num_w
        w = i2w{i};
        w2i.(w) = i;
    end
    printlog('\n');    
    save('-binary', path, 'w2i');
end

printvar('w2i.("the")');
printvar('w2i.("good")');
printvar('w2i.("bad")');


path = tmp/imdb-prepare.w2i.mat
exists = 1
w2i.("the") = 3
w2i.("good") = 488
w2i.("bad") = 639


In [None]:
% PREPARE INDEXED REVIEWS

path = tmp('indexed.mat')

% delete(path);

exists = logical(exist(path))
if exists
    load('-binary', path, 'R');
else
    t     = tic();
    P     = zeros(2,0);
    num_r = rows(R);
    printvar('num_r');    
    printlog('\n');
    for ri = 1:num_r
        r  = R{ri,1};
        ns = columns(r);
        for si = 1:ns
            if toc(t) > 5
                printlog('\rri: %d, si: %d%s', ri, si, padright('',' ',30));
                t = tic();
            end
            s = r{1,si};
            nw = columns(s);
            for wi = 1:nw
                w       = s{1,wi};
                if isfield(w2i, w)
                    s{1,wi} = w2i.(w);
                else
                    s{1,wi} = 0;
                end
            end
            r{1,si} = s;
        end
        R{ri,1} = r;
    end
    printlog('\n');    
    save('-binary', path, 'R');
end

printvar('count(indexed)');
printvar('R{2,1}{1,1}');


path = tmp/imdb-prepare.indexed.mat
exists = 0
num_r = 50000

ri: 4, si: 5                               

In [None]:
% PREPARE WORD-PAIRS

function x = subsampling(x, eps=0.001)
    x = (sqrt(x / eps) + 1) * (eps / x);
end

function yes = should_keep(fraq)
    yes = rand() <= subsampling(fraq);
end

function pairs = nearby_pairs(w2i, i2f, sent, wnd=1)
    pairs = zeros(2, 0);
    n     = count(sent);
    for i = 1:n
        w  = sent{i};
        if isfield(w2i,w)
            wi = w2i.(w);
            if should_keep(i2f{wi})
                delta = randi(wnd);
                first = max(1, i-delta);
                last  = min(n, i+delta);
                for j = first:last
                    w  = sent{j};
                    if j != i && isfield(w2i,w)
                        wj = w2i.(w);
                        if should_keep(i2f{wj})
                            pairs(:, end+1) = [wi; wj];
                        end
                    end
                end
            end
        end
    end
end

path = tmp('pairs.mat')

% delete(path);

exists = logical(exist(path))
if exists
    load('-binary', path, 'P');
else
    t     = tic();
    P     = zeros(2,0);
    num_r = rows(R);
    printvar('num_r');    
    printlog('\n');
    for ri = 1:num_r
        if ri <= num_r/2
            r  = R{ri,1};
            ns = columns(r);
            for si = 1:ns
                if toc(t) > 5
                    printlog('\rri: %d, si: %d%s', ri, si, padright('',' ',30));
                    t = tic();
                end
                s = r{1,si};
                p = nearby_pairs(w2i, i2f, s, wnd=3);
                P = [P p];
            end
        end
    end
    printlog('\n');    
    save('-binary', path, 'P');
end

printvar('count(pairs)');
printvar('P(:,1)');
printvar('P(:,end)');


In [None]:
% vocab   = deserialize("./trained/review_vocab.jls")
% samples = deserialize("./tmp/embedding_samples.jls")

% len = 10_000_000

% word_idx_table = begin
%     chances = last.(vocab) .^ (3/4)
%     chances = chances ./ sum(chances)
%     counts  = trunc.(Int, chances .* len)
%     [ fill(i, n) for (i, n) in enumerate(counts) ] |> flatten |> collect |> shuffle
% end

% rand_word_idxs(n, idxs = []) = begin
%     idx = rand(word_idx_table)
%     if idx in idxs 
%         rand_word_idxs(n, idxs)
%     elseif n > 1
%         rand_word_idxs(n-1, [ idxs..., idx ])
%     else
%         [ idxs..., idx ]
%     end
% end

% samples

In [None]:
%     ñ           = num_negative_samples
%     targets     = [1, zeros(ñ)...]

%         for iter in 1:typemax(Int)
%             entropyʹ = 0
%             for (si, (xi, zi)) in enumerate(samples)
%                 # select targets
%                 zis = rand_word_idxs(ñ, [zi])
%                 # propagate
%                 ys = Y[:, xi]
%                 Zʹ = Z[zis, :]
%                 zs = Zʹ * ys
%                 zs = sigmoid.(zs)
%                 # calcuate ΔZ
%                 errs = zs .- targets         
%                 ΔZ   = errs * ᵀ(ys)
%                 # calcuate ΔY
%                 errs = ᵀ(Zʹ) * errs
%                 ΔY   = errs * 1
%                 # backpropagate
%                 Z[zis, :] .= Zʹ .- rate * ΔZ
%                 Y[:, xi]  .= ys .- rate * ΔY
%                 # calculate loss
%                 Δentropyʹ = mean([ binary_crossentropy(t, z) for (t, z) in zip(targets, zs) ])
%                 entropyʹ  = entropyʹ + Δentropyʹ
%                 progress  = trunc((si / count) * 100; digits = 2)
%                 try_report("$report_msg $progress%")
%             end
%         end

