In [1]:
# Start from a clean workspace.
clear all;

# Notebook tag kept in a global; later cells pass it to printend().
global ipynb = 'tttt';

# Load the framework, logging/training helpers and the game definition.
source('clearest-nn.m');
source('utils-logging.m');
source('utils-training.m');
source('game-tttt.m');

# Route log output to a file (tmp() presumably builds a temp path — confirm).
log2file(tmp('log'));

# Uncomment to fix the random seed for reproducible runs.
% rand('state', 1);

##########################################

# Disabled smoke checks of the game helpers.
% [winner,s] = play1(@randompolicy, @randompolicy);
% winner
% size(game2oh18(s)')
% size(game2oh27(s)')
% s    = game2mat(s)
% play(100, @randompolicy);


ans = THE CLEAREST NEURAL NETWORK FRAMEWORK BY UNDWAD
ans = TIC-TAC-TOE TACTICS GAME


In [None]:
# Shared training state: value model, exploration/discount settings and the
# policy handles used by the functions below.
global vmodel epsilon tau gamma agentpolicy rivalpolicy;

function X = nextstates2onehots(s, aaa=actions(s))
    # Encode the state reached by each candidate action.
    #
    # s   : current game state.
    # aaa : actions to try; defaults to actions(s).
    # X   : result of mapping game2oh18(game(s,a)) over aaa
    #       (callers in this file index it by column, one per action).
    encode = @(move) game2oh18(game(s, move));
    X      = map(encode, aaa);
end

function a = stochasticpolicy(s)
    # Pick an action by sampling with softmaxpick over the model's
    # outputs for every successor state.
    #
    # s : current game state.
    # a : the sampled action, taken from actions(s).
    global vmodel;
    moves      = actions(s);
    [~,values] = forward(vmodel, nextstates2onehots(s, moves));
    [~,~,pick] = softmaxpick(values');
    a          = moves(pick);
end

function a = deterministicpolicy(s)
    # Pick the action whose successor state gets the largest model output.
    #
    # s : current game state.
    # a : the action from actions(s) at the index returned by max().
    global vmodel;
    moves      = actions(s);
    [~,values] = forward(vmodel, nextstates2onehots(s, moves));
    [~,best]   = max(values);
    a          = moves(best);
end

function [a,x,p] = taustochasticpolicy(s)
    # Sample an action with softmaxpick after dividing the model outputs
    # by the global temperature tau.
    #
    # s : current game state.
    # a : the sampled action.
    # x : column of the successor encoding that belongs to a.
    # p : second output of softmaxpick (used by episode() as the
    #     chosen-action probability).
    global vmodel tau;
    moves      = actions(s);
    feats      = nextstates2onehots(s, moves);
    [~,values] = forward(vmodel, feats);
    scaled     = values / tau;
    [~,p,pick] = softmaxpick(scaled');
    x          = feats(:,pick);
    a          = moves(pick);
end

function [a,x,p] = epsilongreedypolicy(s)
    # Epsilon-greedy action selection over the value model's outputs.
    #
    # With probability epsilon a uniformly random action index is drawn;
    # otherwise the action with the largest model output is taken.
    #
    # s : current game state.
    # a : the chosen action.
    # x : column of the successor encoding that belongs to a.
    # p : epsilon for a random draw, 1-epsilon for a greedy choice.
    global vmodel epsilon;
    aaa = actions(s);   # terminated with ';' so the list is not echoed per call
    X   = nextstates2onehots(s, aaa);
    if rand() < epsilon
        # explore: the model is not evaluated on this branch
        i     = randi([1 count(aaa)]);
        p     = epsilon;
    else
        # exploit
        [~,Q] = forward(vmodel, X);
        [~,i] = max(Q);
        p     = 1-epsilon;
    end
    a = aaa(i);
    x = X(:,i);
end

function [err,ok] = learnvalueapprox(X, Y)
    # Run one training step of the global value model on (X, Y) and keep
    # the updated parameters only if the mean cost went down.
    #
    # X   : inputs passed to forward().
    # Y   : targets passed to cost() and gradient().
    # err : mean cost measured BEFORE the update.
    # ok  : true when the candidate model's mean cost is lower than err.
    global vmodel;
    # Note: vmodel itself is reassigned by forward/backward/optimize_gradient
    # below regardless of `ok`; only the result of update() is conditional.
    [vmodel,Z]     = forward(vmodel, X);
    E              = cost(vmodel, Z, Y);
    dE             = gradient(vmodel, Z, Y);
    [vmodel,ggg,~] = backward(vmodel, Z, dE);
    [vmodel,ggg]   = optimize_gradient(vmodel, ggg, 1);
    # Candidate model; not committed yet.
    new_vmodel     = update(vmodel, ggg);
    err            = mean(E);
    # Re-evaluate the same batch with the candidate parameters.
    [~,Z]          = forward(new_vmodel, X);
    E              = cost(new_vmodel, Z, Y);
    delta          = mean(E) - err;
    ok             = delta < 0;
    if ok
        vmodel = new_vmodel;
    end
end

function s = start(agent)
    # Create a fresh game and, when the learning agent is player 2, let
    # the rival policy make the opening move.
    #
    # agent : the learning player's number.
    # s     : game state in which the agent is expected to move next.
    global rivalpolicy;
    s          = game();
    rivalfirst = (agent == 2);
    if rivalfirst
        s = game(s, rivalpolicy(s));
    end
end

function [s, reward, winner] = move(agent, s, a)
    # Apply the agent's action and, if the game goes on, the rival's reply.
    #
    # agent  : the learning player's number.
    # s      : state before the move (in) / state after both moves (out).
    # a      : the agent's action.
    # reward : 1 agent wins, -1 rival wins, 0 otherwise.
    # winner : winning player, 0 for a draw, [] while the game continues.
    global rivalpolicy;

    s = game(s, a);                # learning player moves
    if iswin(s, a)                 # learning player wins
        reward = 1;
        winner = agent;
        return;
    end
    if isover(s)                   # draw
        reward = 0;
        winner = 0;
        return;
    end
    if player(s) == agent          # turn did not pass on: invalid move
        error('invalid move');
    end

    a = rivalpolicy(s);
    s = game(s, a);                # other player moves

    # defaults cover both "draw" and "continue"; only winner differs
    reward = 0;
    winner = [];
    if iswin(s, a)                 # other player wins
        reward = -1;
        winner = player(s, a);
    elseif isover(s)               # draw
        winner = 0;
    end
end

function [winner,err,ok] = episode(agent)
    # Play one game with the global agentpolicy against the rival, then
    # train the value model on the visited states.
    #
    # agent  : the learning player's number.
    # winner : draw: 0, players: 1,2.
    # err,ok : outputs of learnvalueapprox() for this episode's batch.
    global agentpolicy gamma;
    winner = [];     # draw: 0, players: 1,2
    X = [];          # states as onehot vectors, sized on the first move
    Y = zeros(1,9);  # targets (rewards)
    P = zeros(1,9);  # chosen action probs
    n = 0;           # number of moves
    s = start(agent);
    do
        [a,x,p]           = agentpolicy(s);
        [s,reward,winner] = move(agent, s, a);
        n                += 1;
        if n == 1
            # Take the feature dimension from the encoding itself instead
            # of hard-coding 18 rows, so the buffer always matches the
            # encoder (and therefore the model input width).
            X = zeros(numel(x), 9);
        end
        X(:,n)            = x;
        Y(1,n)            = reward;
        P(1,n)            = p;
    until !isempty(winner);
    # Propagate rewards backwards, weighting each step by gamma and by the
    # probability recorded for the following move.
    for i = n-1:-1:1
        Y(1,i) += gamma * P(1,i+1) * Y(1,i+1);
    end
    [err,ok] = learnvalueapprox(X(:,1:n), Y(:,1:n));
end

function ratio = iteration(p, n, eps=1e-8)
    # Train for n episodes as player p, log progress and plot the history.
    #
    # p     : the learning player's number (index into the wins counter).
    # n     : number of episodes.
    # eps   : accepted but not used in the body.
    # ratio : wins(p) / wins(other(p)) after the last episode, or 0 while
    #         either player still has no win.
    global vmodel;
    errhist   = zeros(1,n);   # per-episode objective
    ratiohist = zeros(1,n);   # per-episode win ratio
    accepted  = 0;            # episodes whose update was kept
    wins      = [0 0];
    for t = 1:n
        [winner,err,ok] = episode(p);
        if winner > 0
            wins(winner) = wins(winner) + 1;
        end

        # ratio is only defined once both players have at least one win
        ratio = 0;
        if nnz(wins) == 2
            ratio = wins(p) / wins(other(p));
        end

        errhist(t)   = err;
        ratiohist(t) = ratio;
        accepted     = accepted + 1*ok;

        # direction of change relative to the previous episode
        edir = 0;
        rdir = 0;
        if t > 1
            edir = sign(err   - errhist(t-1));
            rdir = sign(ratio - ratiohist(t-1));
        end

        gradnorm = getunit(vmodel.optimizers,'gradient_clipping').norm;
        updratio = getunit(vmodel.optimizers,'stats').ratio;
        showlog(1, 100, 'player %d, episode %d of %d (ok %d), gradnorm %f, updratio %f, wins %s, error %f %s, ratio %f %s',
                         p, t, n, accepted, gradnorm, updratio, mat2str(wins), err, dir2arrow(edir), ratio, dir2arrow(rdir));
    end

    episodes = 1:n;
    figure('Position', [0 0 1000 400]);
    hold on;
    plot(episodes, errhist, 'r');
    plot(episodes, ratiohist, 'g');
    legend('objective', 'win ratio')
    title('training history');
end

# Hyper-parameters and policies read (as globals) by the functions above.
tau         = 5;
gamma       = 1;
epsilon     = 0.1;
rivalpolicy = @randompolicy;
agentpolicy = @taustochasticpolicy;
# Value network with a single output.
# NOTE(review): the input width is 180 although the state encoder is named
# game2oh18 — confirm the encoding size matches.
vmodel      = model(180, {'dense', 300}, 'relu', {'dense', 100}, 'relu', {'dense', 1});
vmodel      = optimization(vmodel, {'adam', 0.01}, {'gradient_clipping', 0.9}, 'stats');
vmodel      = objective(vmodel, 'mse');

printvar('vmodel.num_p');
% printmodel('vmodel');

# Baseline games before training.
play(100, @randompolicy);
play(100, @randompolicy, @stochasticpolicy);
play(100, @randompolicy, @deterministicpolicy);

# Train as player 2 for 1000 episodes.
printstart();
ratio = iteration(2, 1000);
printend();

# Same match-ups after training.
play(100, @randompolicy, @stochasticpolicy);
play(100, @randompolicy, @deterministicpolicy);


vmodel.num_p = 84501
playing 100 times randompolicy vs randompolicy
game 100, winner 0, wins [44 34], draws 22                                      
playing 100 times randompolicy vs stochasticpolicy
game 13, winner 2, wins [5 6], draws 2                                          

In [None]:
# Longer training run (10000 episodes as player 2) with tau = 5.
tau = 5;

printstart();
ratio = iteration(2, 10000);
printend(ipynb);

# Evaluate the trained model against the random policy.
play(100, @randompolicy, @stochasticpolicy);
play(100, @randompolicy, @deterministicpolicy);

In [None]:
# Continue training (10000 episodes as player 2) with a lower tau of 2.
tau = 2;

printstart();
ratio = iteration(2, 10000);
printend(ipynb);

# Evaluate the trained model against the random policy.
play(100, @randompolicy, @stochasticpolicy);
play(100, @randompolicy, @deterministicpolicy);

In [None]:
# Re-tune the optimizer settings (presumably adam with rate 0.001 — confirm
# against TUNE's definition).
vmodel = TUNE(vmodel, 'adam', 0.001);
# Save the variable `turns` in binary format.
# NOTE(review): `turns` is declared and filled in a later cell; that cell
# must have been run before this save.
save('-binary',tmp('turns.mat'),'turns');

In [None]:
# Global table of visited states: one field per state key, filled by enum().
global turns;

turns = struct();

function k = state2key(s)
    # Build a string key from a state: every element of s, taken in
    # column-major order, becomes the character with code element+48
    # (so 0, 1, 2 become '0', '1', '2').
    codes = s(:)' + 48;
    k     = char(codes);
end

function enum(s = game())
    # Recursively record every state reachable from s in the global
    # `turns` struct, keyed by state2key(s), with player(s) as the value.
    #
    # s : state to expand; defaults to a new game.
    global turns;
    key = state2key(s);
    showlog(1, 50, '%d', numfields(turns));

    # already recorded: nothing to expand
    if isfield(turns,key)
        return;
    end

    turns.(key) = player(s);
    for a = actions(s)
        [nxt,outcome] = game(s,a);
        # only descend while the game reports no winner/draw yet
        if isempty(outcome)
            enum(nxt);
        end
    end
end

# Enumerate all states from a new game and report how many were recorded.
printstart();
enum();
printend(sprintf('total states %d', numfields(turns)));    
