## Reinforcement Learning series: Tic-tac-toe-five game bot


In [1]:
### IMPORTS ###

ipynb = 'ttt-5'

import sys, os, json, glob
import tensorflow    as tf
import keras.backend as K

from ipywidgets           import widgets, HBox, VBox, Layout
from IPython.core.display import display, HTML, Javascript as JS
from pandas               import DataFrame
from pathlib              import Path
from pprint               import pprint
from operator             import iconcat
from functools            import reduce, partial
from itertools            import groupby
from collections          import deque
from numpy                import *
from numpy.random         import *
from os                   import listdir
from os.path              import isfile,isdir
from uuid                 import uuid4 as guid

from tensorflow.keras.layers       import Input, Dense, Conv2D, Flatten, BatchNormalization, Activation, Add
from tensorflow.keras.losses       import mse, categorical_crossentropy
from tensorflow.keras.optimizers   import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.activations  import softmax
from tensorflow.keras.models       import Model
from tensorflow.keras.utils        import Progbar, to_categorical

from matplotlib.pyplot    import *
from time                 import *

# %matplotlib inline

STDOUT = sys.stdout

from ipynbutils import *

# Scratch directory for this notebook's artifacts; created on first run if missing.
DIR = f'tmp/{ipynb}'
Path(DIR).mkdir(parents=True, exist_ok=True)
print('DIR =', DIR)

# Number of games played per testgames() evaluation call further below.
TESTS = 100


Using TensorFlow backend.


DIR = tmp/ttt-5


## Problem formulation: Treating after-states as action-value function inputs
...

In [2]:
### ENVIRONMENT ###

# Board configuration.
NUM       = 5                            # stones in a row needed to win
ROWS,COLS = 9,9                          # board dimensions
SIZE      = ROWS*COLS                    # number of cells == number of actions
CHANNELS  = 2                            # one binary plane per player
NAME      = f'ttt-{NUM}-{ROWS}x{COLS}'   # label used in log/progress messages

print(NAME)

# Two-bit code per cell value, indexed by the cell value itself (see onehot()).
encoding = array([
    [0,0], # 0
    [0,1], # 1
    [1,0], # 2
]) 

def onehot(s): 
    """Encode a flat board as one flat vector of per-cell two-bit codes."""
    cells = s.astype(int)
    codes = encoding[cells]
    return concatenate(codes)

def digits(s):
    """Return the board as a string of digits (one per cell), usable as a dict key."""
    cells = int64(s).ravel()
    return ''.join(map(str, cells))

def other(p): 
    """Return the opposing player id: 1 -> 2, 2 -> 1 (any even value maps to 1)."""
    return 1 + (p % 2)

def player(s):
    """Return the id (1 or 2) of the player to move, derived from the stone count."""
    stones = count_nonzero(s)
    return other(stones)

def action2xy(a):
    """Convert a flat action index into its (row, column) board coordinates."""
    row,col = unravel_index(a, (ROWS,COLS))
    return row,col

def xy2action(x,y):
    """Convert (row, column) board coordinates into the flat action index."""
    return ravel_multi_index((x,y), (ROWS,COLS))

def enum_lines(a,n=NUM):
    """List every n-cell winning window through cell `a`, with `a` itself removed.

    For each of the four directions, the on-board cells within n-1 steps of `a`
    are collected in order, and every run of n consecutive collected cells
    yields one entry holding the other cells of that run.
    """
    row0,col0 = action2xy(a)
    windows   = []
    for step_r,step_c in ((1,0),(0,1),(1,1),(1,-1)):
        coords = [ (row0 + k*step_r, col0 + k*step_c) for k in range(1-n, n) ]
        ray    = [ xy2action(r,c) for r,c in coords
                   if 0 <= r < ROWS and 0 <= c < COLS ]
        for first in range(len(ray)-n+1):
            windows.append([ cell for cell in ray[first:first+n] if cell != a ])
    return windows

# Precompute, for every cell, the winning windows through it (used by canwin/anycanwin).
LINES = [ enum_lines(a) for a in range(SIZE) ]
# Show the windows of one randomly chosen cell (drawn before any seed() call in this cell).
a = choice(SIZE)
print(a, LINES[a])

def canwin(s,a,p):
    """Return True if some winning window through cell `a` is otherwise filled by player `p`."""
    for cells in LINES[int(a)]:
        owned = s[cells] == p
        if owned.all():
            return True
    return False

def iswin(s,a):
    """Return True if the stone already placed at cell `a` completes a line for its owner."""
    owner = s[a]
    return canwin(s, a, owner)

def hasbegun(s):
    """Return True once at least one stone is on the board."""
    stones = count_nonzero(s)
    return stones > 0

def isover(s):
    """Return True when every cell of the board is occupied."""
    total  = len(s)
    stones = count_nonzero(s)
    return stones == total

def game(s=None,a=None,overwrite=False):
    """Create, load, or advance a game.

    - game()      -> (empty board, None)
    - game(s)     -> (float array built from the sequence `s`, None)
    - game(s, a)  -> board after the player to move takes cell `a`, plus the
                     winner: the mover's id on a win, 0 on a full board, else None.

    With overwrite=True the move is written into `s` itself instead of a copy.
    """
    if s is None:
        return zeros(SIZE),None
    if a is None:
        return array(list(s)).astype(float),None
    a = int(a)
    assert(s[a] == 0)
    board    = s if overwrite else copy(s)
    mover    = player(board)
    board[a] = mover
    if iswin(board,a):
        return board,mover
    if isover(board):
        return board,0
    return board,None

def actions(s):
    """Return the flat indices of all empty cells of a flat board as an integer array.

    Always returns an ndarray, and an empty one when the board is full, so
    callers can rely on array methods such as `.astype` and on array indexing.
    """
    return flatnonzero(asarray(s) == 0)

def actionmask(aa):
    """Return a length-SIZE vector with 1 at every index listed in `aa` and 0 elsewhere."""
    mask = zeros(SIZE)
    mask[aa.astype(int)] = 1
    return mask

def selectaction(s,pi):
    """Sample one action from policy `pi` at state `s`.

    `pi(s)` must return (actions, probabilities); the result is the sampled
    action together with the probability it was drawn with.
    """
    moves,probs = pi(s)
    pick        = choice(range(len(probs)), p=probs)
    return moves[pick],probs[pick]

def getreward(agent, winner):
    """Return +1 if `agent` won, -1 if its opponent won, and 0 otherwise (draw or no winner)."""
    rival = other(agent)
    if winner == agent:
        return 1
    return -1 if winner == rival else 0

def getoutcome(ss,aa,pp,rr,discount=0,normalize=False):
    """Turn per-step rewards into per-player discounted returns.

    Walks the samples backwards, keeping a separate running return for each
    player: a step's return is its reward plus `discount` times the return of
    the same player's next step. The running returns are reset whenever an
    empty board is reached, so several concatenated episodes can be processed.

    Parameters
    ----------
    ss : states before each move (one flat board per row)
    aa, pp : actions and probabilities (unused here; kept for a uniform signature)
    rr : per-step rewards
    discount : factor applied between consecutive moves of the same player
    normalize : if True, standardise the returns to zero mean / unit variance

    Returns
    -------
    Float array of returns with the same shape as `rr`.
    """
    carried = {1:0,2:0}
    # Always accumulate in float so fractional discounts are not truncated
    # when `rr` happens to be an integer array.
    yy = zeros_like(rr, dtype=float)
    for i in reversed(range(len(rr))):
        actor          = player(ss[i])
        yy[i]          = rr[i] + carried[actor]
        carried[actor] = yy[i] * discount
        if not hasbegun(ss[i]):
            # Empty board: first sample of an episode, so nothing carries over
            # into the (earlier-stored) previous episode.
            carried = {1:0,2:0}
    if normalize:
        centered = yy - mean(yy)
        spread   = std(yy)
        # All-equal returns have zero spread; only center them instead of
        # dividing by zero and producing NaNs.
        yy = centered / spread if spread > 0 else centered
    return yy

def episode(policy, start=None):
    """Play one game to completion and record its trajectory.

    Parameters
    ----------
    policy : dict mapping player id (1, 2) to a policy function `pi(s)`
             returning (actions, probabilities)
    start  : optional flat board to start from; it is copied, so the caller's
             array is left untouched

    Returns
    -------
    (winner, ss, aa, pp): the winner (player id, or 0 for a draw), the states
    seen before each move, the actions taken, and the probabilities they were
    sampled with.
    """
    n  = 0
    ss = zeros((SIZE,SIZE))
    aa = zeros((SIZE))
    pp = zeros((SIZE))
    if start is None:
        s,winner = game()
    else:
        # Moves below are written in place (overwrite=True); work on a private
        # copy so repeated episodes from the same `start` really start there.
        s,winner = copy(start),None
    while winner is None:
        actor    = player(s)
        a,p      = selectaction(s, policy[actor])
        ss[n,:]  = s   # row assignment copies the board before it is mutated
        aa[n]    = a
        pp[n]    = p
        n       += 1
        s,winner = game(s, a, overwrite=True)
    return winner,ss[0:n,:],aa[0:n],pp[0:n]

def winratio(wins, agent=1):
    """Return the agent's score divided by its rival's score.

    `wins` is indexed as [draws, wins of player 1, wins of player 2]; each draw
    counts half a point for both sides. A zero rival score is replaced by 1.
    """
    half_draws = wins[0] * 0.5
    mine       = wins[agent] + half_draws
    theirs     = wins[other(agent)] + half_draws
    return mine / (theirs or 1)

def testgames(policy, iters=1000):
    """Play `iters` games between the two policies and return the final win ratio.

    The ratio is computed by winratio() from the point of view of the first
    key of `policy`; both player ids must be present as keys.
    """
    ratio    = 0
    agent,*_ = policy
    rival    = other(agent)
    tally    = [0,0,0]   # [draws, wins of player 1, wins of player 2]
    progbar  = Progbar(target=iters, stateful_metrics=['draws','wins1','wins2','win ratio'])
    print(f'testing policy {agent}:{policy[agent].__name__} vs {rival}:{policy[rival].__name__}')
    for played in range(1, iters+1):
        winner         = episode(policy)[0]
        tally[winner] += 1
        ratio          = winratio(tally,agent)
        # NOTE(review): counts are scaled by a fixed 10_000 rather than by the
        # number of games played — confirm this display scaling is intended.
        progbar.update(played, values=[
            ('draws', tally[0]/10_000),
            ('wins1', tally[1]/10_000),
            ('wins2', tally[2]/10_000),
            ('win ratio', ratio),
        ])
    return ratio

def samplegames(policy, iters=100, start=None, progress=None):
    """Play `iters` episodes and return their concatenated samples.

    Parameters
    ----------
    policy   : dict mapping player id to policy function (see episode())
    iters    : number of episodes to play
    start    : optional board every episode starts from
    progress : if truthy, show a progress bar and write progress messages
               to '{DIR}/play.log'

    Returns
    -------
    (states, actions, probabilities, rewards), one row/entry per move. Only
    the last move of each side in an episode gets a non-zero reward.
    """
    m        = 0
    sss      = zeros((SIZE*iters,SIZE))
    aaa      = zeros((SIZE*iters))
    ppp      = zeros((SIZE*iters))
    rrr      = zeros((SIZE*iters))
    progbar  = Progbar(target=iters, stateful_metrics=['total samples']) if progress else None
    progress = PROGRESS(path=f'{DIR}/play.log') if progbar else None
    for i in range(iters):
        winner,ss,aa,pp = episode(policy, start=start)
        rr              = zeros((len(ss)))
        # Final move: reward from the point of view of the player who made it.
        rr[-1]          = getreward(player(ss[-1]), winner)
        # The move before it belongs to the other side. An episode started
        # from a nearly finished board may consist of a single move only.
        if len(rr) > 1:
            rr[-2]      = getreward(player(ss[-2]), winner)
        d               = len(rr)
        sss[m:m+d,:]    = ss
        aaa[m:m+d]      = aa
        ppp[m:m+d]      = pp
        rrr[m:m+d]      = rr
        m              += d
        if progbar is not None:
            values= [
                ('samples per episode', d),
                ('total samples', m)]
            progbar.update(i+1, values=values)
            progress(f'{NAME}, iter {i} of {iters}, {dict(values)}')
    return sss[0:m,:], aaa[0:m], ppp[0:m], rrr[0:m]

def argsmax(values):
    """Return the sorted indices of every element equal to the maximum of `values`."""
    top  = max(values)
    hits = argwhere(values == top)
    return unique(hits.ravel())
            
def uniformprob(aa,ii):
    """Return a float array shaped like `aa` with probability 1/len(ii) at indices `ii`, 0 elsewhere."""
    probs     = zeros_like(aa).astype(float)
    share     = array(1/len(ii))
    probs[ii] = share
    return probs

def argmaxprob(qq):
    """Return a flat distribution that is uniform over the maximal entries of `qq` and 0 elsewhere."""
    best = argsmax(qq)
    return ravel(uniformprob(qq,best))

def softmaxprob(zz):
    """Return the softmax of `zz`; the maximum is subtracted before exponentiating."""
    shifted = exp(zz - max(zz))
    return shifted / sum(shifted)

def randompi(s):
    """Uniform random policy: return the free cells and an equal probability for each."""
    moves = actions(s)
    count = len(moves)
    return moves,full(count, 1/count)

def anycanwin(s,a):
    """Return True if playing cell `a` would complete a line for either player."""
    for cells in LINES[int(a)]:
        stones = s[cells]
        for who in (1,2):
            if (stones == who).all():
                return True
    return False

def lookaheadpi(pi):
    """Wrap policy `pi` with a one-move look-ahead.

    The wrapped policy plays an immediately winning move if one exists,
    otherwise blocks an immediate win of the opponent, and otherwise defers
    to `pi`. A forced move is returned as a one-hot distribution over the
    free cells.
    """
    @rename(f'lookaheadpi({pi.__name__})')
    def lookaheadpi(s):
        actor = player(s)
        rival = other(actor)
        aa    = actions(s)
        # The first player owns NUM-1 stones once 2*NUM-3 stones are on the
        # board, which is the earliest moment a block can be required;
        # before that no scan is needed.
        if count_nonzero(s) >= 2*NUM - 3:
            # Prefer winning outright over blocking.
            for who in (actor, rival):
                for i,a in enumerate(aa):
                    if canwin(s,a,who):
                        return aa,uniformprob(aa,[i])
        return pi(s)
    return lookaheadpi

def maxpi(pi):
    """Wrap policy `pi` so all probability mass goes, evenly, to its most probable action(s)."""
    @rename(f'maxpi({pi.__name__})')
    def maxpi(s):
        moves,probs = pi(s)
        return moves,argmaxprob(probs)
    return maxpi

# Smoke test: play one seeded random-vs-random game and print its trajectory.
seed(42)
winner,ss,aa,pp = episode({ 1:randompi, 2:randompi })
rr = zeros_like(pp)
# Only the last move of each side is rewarded, from that mover's point of view.
rr[-1] = getreward(player(ss[-1]), winner)
rr[-2] = getreward(player(ss[-2]), winner)
yy = getoutcome(ss,aa,pp,rr,discount=0.9)
rr = around(rr.ravel(),2) 
yy = around(yy.ravel(),2) 
print(f'winner = {winner}')
print(f'states = {ss}')
print(f'actions = {aa}')
print(f'probs = {pp}')
print(f'rewards = {rr}')
print(f'outcome = {yy}')

# Sample a few random games with the progress bar enabled.
ss,aa,pp,rr = samplegames({ 1:randompi, 2:randompi }, iters=10, progress=True)

# Evaluate random vs random; as the cell's last expression the win ratio is displayed.
testgames({ 1:randompi, 2:randompi },iters=TESTS)


ttt-5-9x9
56 [[20, 29, 38, 47], [29, 38, 47, 65], [38, 47, 65, 74], [54, 55, 57, 58], [55, 57, 58, 59], [57, 58, 59, 60], [36, 46, 66, 76], [24, 32, 40, 48], [32, 40, 48, 64], [40, 48, 64, 72]]
winner = 1
states = [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 1. 1. ... 2. 1. 1.]
 [0. 1. 1. ... 2. 1. 1.]
 [0. 1. 1. ... 2. 1. 1.]]
actions = [30. 77. 58. 47. 12. 11.  4. 70. 48. 56.  1. 78. 66. 18. 16. 17. 26. 42.
 36. 25. 51. 10. 27. 33. 39. 64. 15. 44. 52.  3. 53. 14.  6. 76. 79. 67.
 28.  8. 59. 38. 13. 43.  2. 73. 24. 57. 31. 46. 50. 20. 80. 65. 74. 71.
 49.]
probs = [0.01234568 0.0125     0.01265823 0.01282051 0.01298701 0.01315789
 0.01333333 0.01351351 0.01369863 0.01388889 0.01408451 0.01428571
 0.01449275 0.01470588 0.01492537 0.01515152 0.01538462 0.015625
 0.01587302 0.01612903 0.01639344 0.01666667 0.01694915 0.01724138
 0.01754386 0.01785714 0.01818182 0.01851852 0.01886792 0.01923077
 0.01960784 0.02       0.02040816 0.02083333 0.0212766

1.564102564102564

## Jupyter Widgets: Simple game frontend to try agents right here in the notebook
...

In [3]:
### FRONTEND ###

def play(policy):
    """Render a clickable board and let a human play against `policy`.

    `policy` maps a player id (1 or 2) to a policy function; the bot takes
    the side of key 1 if present, otherwise key 2, and the human plays the
    other side by clicking cells. When a game ends the result is printed,
    the board is disabled, and a fresh board is started.
    """
    state   = None
    board   = None
    agent   = None
    rival   = None
    
    def moveagent(s, a):
        """Let the bot pick and play its move from `s`; the incoming `a` is ignored."""
        a,p      = selectaction(s,policy[agent])
        s,winner = game(s, a)
        return s,winner

    def display_board(onclick):
        """Create one button per cell, lay them out as a grid, and return the flat button list."""
        board = []
        for i in range(SIZE):
            btn = widgets.Button(
                description  = '',
                disabled     = False,
                button_style = '', # 'success', 'info', 'warning', 'danger' or ''
                tooltip      = 'Click me',
                icon         = '',
                layout       = Layout(width='40px', height='40px')
            )
            # Remember which cell this button stands for; the handler reads it back.
            btn.action = i
            btn.on_click(lambda btn: onclick(btn.action))
            board.append(btn)
        boxes = []
        for x in range(ROWS):
            cells = []
            for y in range(COLS):
                cells.append(board[xy2action(x,y)])
            boxes.append(HBox(cells))
        display(VBox(boxes))                
        return board
        
    def update_board(board, state):
        """Write ' ', 'x' or 'o' onto every button according to `state`."""
        chars = [' ', 'x', 'o']
        state = state.astype(int)
        for i in range(SIZE):
            board[i].description = chars[state[i]]

    def gameturn(s=None, a=None):
        """Apply move `a` (or start a new game), let the bot reply, and handle game end."""
        s,winner = game(s,a)
#         print(reshape(s, (ROWS,COLS)))
        update_board(board, s)
        # If the game goes on and it is now the bot's turn, it moves immediately.
        if winner is None and player(s) == agent:
            s,winner = moveagent(s,a)
            update_board(board, s)
        if winner is not None:
            msgs = ['DRAW','X WINS','O WINS']
            print(msgs[winner])
            for i in range(SIZE):
                board[i].disabled = True
            # Start a new game on a new board below the finished one.
            play(policy)
        return s,winner

    def onclick(a): 
        """Button handler: accept the click only on the human's turn and on a free cell."""
        nonlocal state
        if (player(state) != agent) and (a in actions(state)):
            state,winner = gameturn(state,a)

    assert(1 in policy or 2 in policy)    
    if 1 in policy:
        agent,rival = 1,2
    elif 2 in policy:
        agent,rival = 2,1
    print(f'play against {policy[agent].__name__}')
    
    board        = display_board(onclick=onclick)
    # Start the game; if the bot is player 1 it makes the opening move here.
    state,winner = gameturn()

    
# Start an interactive game: the bot plays 'x' (player 1) with look-ahead over random play.
play(policy={1:lookaheadpi(randompi)})
 

play against lookaheadpi(randompi)


VBox(children=(HBox(children=(Button(layout=Layout(height='40px', width='40px'), style=ButtonStyle(), tooltip=…

In [4]:
### DATA ###

def rot0(M):
    """Identity: return the board unchanged."""
    return M

def rot180(M):
    """Rotate the board by two quarter turns."""
    return rot90(M, k=2)

def rot270(M):
    """Rotate the board by three quarter turns."""
    return rot90(M, k=3)

def fliptrbl(M):
    """Mirror the board across its main diagonal (transpose)."""
    return swapaxes(M, 0, 1)

def fliptlbr(M):
    """Mirror the board across its anti-diagonal."""
    return transpose(rot90(M, k=2))

# The eight symmetries of the square board; function names double as map keys.
rotations = [
    rot0,
    rot90,
    rot180,
    rot270,
]
flips = [
    fliplr,
    flipud,
    fliptlbr,
    fliptrbl,
]

def map_rotations_and_flips():
    """Build index permutations for all board symmetries.

    Returns (statemap, actionmap), both keyed by transform name:
    statemap[name] lists, per target cell, the source cell whose value lands
    there; actionmap[name] is the inverse permutation, mapping an original
    cell index to its index on the transformed board.
    """
    cells    = array(arange(SIZE))
    grid     = reshape(cells,(ROWS,COLS))
    statemap,actionmap = {},{}
    for transform in rotations+flips:
        forward          = ravel(transform(grid))
        inverse          = zeros_like(forward)
        inverse[forward] = arange(SIZE)   # invert the permutation in one shot
        statemap [transform.__name__] = forward
        actionmap[transform.__name__] = inverse
    return statemap,actionmap
    
# Build the symmetry lookup tables once and print them for inspection.
statemap,actionmap = map_rotations_and_flips()
for f in rotations+flips:
    print(f.__name__, statemap[f.__name__], actionmap[f.__name__])
print()
    
def rotate_and_flip(x):
    """Apply all eight symmetries to a flat board or to a single action index.

    A scalar is treated as an action and mapped through `actionmap`; anything
    else is treated as a flat board and permuted through `statemap`. The
    result has one entry per symmetry, in map order.
    """
    if not isscalar(x):
        return array([ x[perm] for perm in statemap.values() ])
    cell = int(x)
    return array([ inverse[cell] for inverse in actionmap.values() ])

def onehots2D(ss):
    """Convert flat boards into an (n, ROWS, COLS, CHANNELS) stack of binary planes.

    Channel 0 marks the cells of player 2, channel 1 the cells of player 1.
    """
    count  = len(ss)
    boards = reshape(ss, (count,ROWS,COLS))
    planes = zeros((count,ROWS,COLS,CHANNELS))
    planes[...,0] = (boards == 2).astype(int)
    planes[...,1] = (boards == 1).astype(int)
    return planes
        
def pushsample(data, state, action, reward):
    """Append one (state, action, reward) sample to `data` under all eight symmetries.

    States are stored as binary planes, actions as one-hot vectors of length
    SIZE, and the reward is repeated once per symmetry.
    """
    boards  = rotate_and_flip(state)
    moves   = rotate_and_flip(action)
    planes  = onehots2D(boards)
    targets = to_categorical(moves,SIZE)
    rewards = ones_like(moves)*reward
    for key,rows in (('states',planes),('actions',targets),('rewards',rewards)):
        data[key].extend(rows)
    
def loadsamples(path, verbose=True):    
    """Load training samples from one file, or from every 'data-*.npy' file in a directory.

    Parameters
    ----------
    path    : a '.npy' file holding a saved dict with keys 'states', 'actions'
              and 'rewards', or a directory containing such 'data-*.npy' files
    verbose : if True, report each file via MTIME and print the loaded shapes

    Returns
    -------
    (states, actions, rewards) as arrays; for a directory, the concatenation
    over its files in sorted file-name order.
    """
    if isdir(path):
        sss,aaa,rrr = [],[],[]
        # Search the directory that was passed in (not the notebook-wide DIR),
        # in sorted order so the result is reproducible.
        for file in sorted(glob.glob(f'{path}/data-*.npy')):
            ss,aa,rr = loadsamples(file, verbose=verbose)
            sss.extend(ss)
            aaa.extend(aa)
            rrr.extend(rr)
        return array(sss),array(aaa),array(rrr)
    if verbose: MTIME(path)
    # The file stores a pickled dict, which NumPy only loads with
    # allow_pickle=True. SECURITY: unpickling executes arbitrary code — only
    # load sample files this notebook produced itself.
    record = load(path, allow_pickle=True).item()
    ss     = array(record.get('states'))
    aa     = array(record.get('actions'))
    rr     = array(record.get('rewards'))
    if verbose: print(f'states:  {ss.shape}, actions: {aa.shape}, rewards: {rr.shape}')
    return ss,aa,rr
    
# Demo: play one random game, pick one of its moves at random, and show the
# eight symmetric samples that pushsample() generates from it.
data        = { 'states': [], 'actions': [], 'rewards': [] }
ss,aa,pp,rr = samplegames({ 1:randompi, 2:randompi }, iters=1)
i           = choice(len(ss))
pushsample(data,ss[i],aa[i],rr[i])
print(data)


rot0 [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80] [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80]
rot90 [ 8 17 26 35 44 53 62 71 80  7 16 25 34 43 52 61 70 79  6 15 24 33 42 51
 60 69 78  5 14 23 32 41 50 59 68 77  4 13 22 31 40 49 58 67 76  3 12 21
 30 39 48 57 66 75  2 11 20 29 38 47 56 65 74  1 10 19 28 37 46 55 64 73
  0  9 18 27 36 45 54 63 72] [72 63 54 45 36 27 18  9  0 73 64 55 46 37 28 19 10  1 74 65 56 47 38 29
 20 11  2 75 66 57 48 39 30 21 12  3 76 67 58 49 40 31 22 13  4 77 68 59
 50 41 32 23 14  5 78 69 60 51 42 33 24 15  6 79 70 61 52 43 34 25 16  7
 80 71 62 53 44 35 26 17  

In [13]:
### MODEL ###

def ConvLayer(filters=256, kernel_size=3, activation='relu'):
    """Return a function applying bias-free Conv2D -> BatchNormalization -> Activation.

    New layers are created on every call of the returned function.
    """
    def proc(x):
        features   = Conv2D(filters=filters, kernel_size=kernel_size, use_bias=False)(x)
        normalized = BatchNormalization()(features)
        return Activation(activation)(normalized)
    return proc

def ResLayer(filters=256, kernel_size=3, activation='relu'):
    """Return a function applying a residual block.

    The block is two 'same'-padded, bias-free Conv2D + BatchNormalization
    stages with an activation in between; the block input is added back
    before the final activation.
    """
    def proc(x):
        def conv_bn(t):
            t = Conv2D(filters=filters, kernel_size=kernel_size, use_bias=False, padding='same')(t)
            return BatchNormalization()(t)
        branch = conv_bn(x)
        branch = Activation(activation)(branch)
        branch = conv_bn(branch)
        merged = Add()([branch, x])   # skip connection
        return Activation(activation)(merged)
    return proc

def ValueHead():
    """Return a function mapping trunk features to one tanh-activated value.

    Stages: 1x1 ConvLayer with a single filter, Flatten, Dense(256, relu),
    Dense(1, tanh) named 'output_value'.
    """
    def proc(x):
        squeezed = ConvLayer(filters=1, kernel_size=1, activation='relu')(x)
        flat     = Flatten()(squeezed)
        hidden   = Dense(256, activation='relu')(flat) 
        return Dense(1, activation='tanh', name='output_value')(hidden) 
    return proc

def PolicyHead():
    """Return a function mapping trunk features to a softmax over all SIZE cells.

    Stages: 1x1 ConvLayer with two filters, Flatten, Dense(SIZE, softmax)
    named 'output_policy'.
    """
    def proc(x):
        squeezed = ConvLayer(filters=2, kernel_size=1, activation='relu')(x)
        flat     = Flatten()(squeezed)
        return Dense(SIZE, activation='softmax', name='output_policy')(flat)
    return proc

class BabyAlphaZeroModel(Model):
    """Two-headed convolutional network (value + policy) for the board game.

    The trunk is one ConvLayer followed by `residuals` ResLayers; a ValueHead
    and a PolicyHead branch off it. The model is compiled with MSE on the
    value output and categorical cross-entropy on the policy output.
    Calling an instance with a flat board evaluates that position (see
    __call__), so an instance can be used directly as a policy function.
    """
    def __init__(self, residuals=5):
        x = Input((ROWS,COLS,CHANNELS,), name='input_state')
        y = ConvLayer()(x)
        for _ in range(residuals):
            y = ResLayer()(y)
        q = ValueHead()(y)
        p = PolicyHead()(y)
        super(BabyAlphaZeroModel, self).__init__(inputs=x, outputs=[q,p], name='baby-alpha-zero-model')
        # Policy wrappers and log messages identify policies by __name__.
        self.__name__ = self.name
        self.compile(
            loss      = ['mse','categorical_crossentropy'], 
            optimizer = Adam(lr=0.001, clipnorm=1.0))

    def clone(self):
        """Return a new model (default architecture) carrying a copy of this model's weights."""
        clone = BabyAlphaZeroModel()
        clone.set_weights(self.get_weights())
        return clone

    def __call__(self, s, policy=True):
        """Evaluate the flat board `s`.

        With policy=True returns (legal actions, probabilities over them,
        renormalized to sum to 1) so the result can be sampled from directly.
        With policy=False returns (value, probabilities over all SIZE cells).
        """
        aa   = actions(s)
        xx   = onehots2D(array([s]))
        q,pp = self.predict(xx)
        q    = q.item()
        # NOTE(review): the policy head already ends in a softmax, so this
        # second softmax flattens the distribution — confirm it is intended.
        pp   = softmaxprob(ravel(pp))
        if not policy:
            return q,pp
        # Restricting to legal moves drops probability mass; renormalize,
        # because sampling with choice(p=...) requires the values to sum to 1.
        legal = pp[aa]
        return aa,legal/legal.sum()

    def __str__(self):
        return self.__name__

    def save(self, path):
        """Save the weights to `path` (NumPy format), keeping any previous file as `path`.bak."""
        if isfile(path):
            MTIME(path)
            # os.replace also overwrites an existing backup on every platform.
            os.replace(path, path+'.bak')
            MTIME(path+'.bak')
        weights = self.get_weights()
        # The weight arrays differ in shape; pack them explicitly into a 1-D
        # object array instead of relying on implicit ragged-array creation.
        packed  = empty(len(weights), dtype=object)
        for i,w in enumerate(weights):
            packed[i] = w
        save(path, packed)
        MTIME(path)
        self.__name__ = Path(path).stem

    def load(self, path):
        """Load weights previously written by save().

        SECURITY: the file is a pickled object array; only load trusted files.
        """
        self.set_weights(list(load(path, allow_pickle=True)))
        MTIME(path)
        self.__name__ = Path(path).stem

    def train(self, ss, aa, rr,
        batch_size = 32,
        epochs     = 1,
        rates      = {0:0.1},
        skip_zeros = False,
        shuffle    = True):
        """Train on (state, action, reward) samples and plot the loss history.

        Parameters
        ----------
        ss, aa, rr : state planes, one-hot actions and rewards, aligned by row
        batch_size : target number of samples per batch
        epochs     : number of passes over the data
        rates      : dict epoch -> learning rate, applied from that epoch on
                     (read only, never modified)
        skip_zeros : if True, drop samples whose reward is 0
        shuffle    : if True, shuffle the samples once before batching

        Returns
        -------
        (mean total loss, mean value loss, mean policy loss) over all batches.
        """
        if skip_zeros:
            # flatnonzero gives a 1-D index; argwhere would add a spurious axis.
            keep     = flatnonzero(rr != 0)
            ss,aa,rr = ss[keep],aa[keep],rr[keep] 
        if shuffle:
            ss,aa,rr = shuffleall(ss,aa,rr)
        batches  = int(ceil(len(ss) / batch_size))
        sss      = array_split(ss,batches)
        aaa      = array_split(aa,batches)
        rrr      = array_split(rr,batches)
        iters    = int(epochs*batches)
        sumerrs  = zeros(iters)
        valerrs  = zeros(iters)
        polerrs  = zeros(iters)
        progbar  = Progbar(target=iters, stateful_metrics=['rate','epoch'])
        progress = PROGRESS(path=f'{DIR}/train.log')
        # Report the optimizer's current rate until `rates` overrides it, so
        # `rate` is defined even when epoch 0 has no entry.
        rate     = float(K.get_value(self.optimizer.lr))
        i        = 0
        for epoch in range(epochs):
            if epoch in rates:
                rate = rates[epoch]
                K.set_value(self.optimizer.lr, rate)
            for bs,ba,br in zip(sss,aaa,rrr):
                # The policy loss is weighted per sample by the reward.
                errs = self.train_on_batch(
                            x             = bs, 
                            y             = [br,ba], 
                            sample_weight = {1:br}, 
                            reset_metrics = False)
                sumerrs[i],valerrs[i],polerrs[i] = errs
                values    = [
                    ('epoch',  epoch),
                    ('rate',   rate),
                    ('error',  sumerrs[i]),
                    ('value',  valerrs[i]),
                    ('policy', polerrs[i])]
                i += 1
                progbar.update(i, values=values)
                progress(f'{NAME}, iter {i} of {iters}, {dict(values)}') 
        figure()
        plot(valerrs)
        plot(polerrs)
        plot(sumerrs,'r')
        title('objective history')
        ylabel('objective')
        xlabel(f'iterations')
        legend(['value error', 'policy error', 'total error'], loc='upper left')
        # Name the figure after this model, not after a global `model` variable.
        savefig(f'{DIR}/{self.name}.png')
        show()        
        return mean(sumerrs),mean(valerrs),mean(polerrs)
    
print(BabyAlphaZeroModel)    


<class '__main__.BabyAlphaZeroModel'>


In [15]:
### MCTS ###

def searchpi(model, iters=1, seconds=0, exp=1, tau=1):
    """Build a tree-search policy guided by `model`.

    Parameters
    ----------
    model   : callable; `model(s, policy=False)` must return (value, priors
              over all cells) for the flat board `s`
    iters   : the search runs iters+1 passes per move; a pass that reaches an
              unseen position only expands it and records no visit
    seconds : additionally keep searching until this much time has elapsed
    exp     : exploration weight of the prior term in the selection score
    tau     : temperature; visit counts are raised to 1/tau before being
              normalized into move probabilities

    Returns
    -------
    A policy function `pi(s) -> (actions, probabilities)`. The search tree is
    shared by all calls of the returned policy.
    """
    tree = {}   # digits(state) -> node with per-action priors, mean values and visit counts
    
    def puct(node, a, sum_n, exp=1): 
        """Selection score of action `a`: mean value plus prior-weighted exploration bonus."""
        p = node['pp'][a]
        q = node['qq'][a]
        n = node['nn'][a]
        return q + exp*p*sqrt(sum_n)/(1+n)

    def pi(node, tau=1):
        """Turn the node's visit counts into a distribution over its actions."""
        aa    = array([ a          for a in node['nn'].keys()   ])
        nn    = array([ n**(1/tau) for n in node['nn'].values() ])
        total = nn.sum()
        if total <= 0:
            # No visits recorded (e.g. iters=0 on an unseen position): fall
            # back to a uniform distribution instead of dividing by zero.
            return aa, ones(len(aa))/len(aa)
        return aa, nn/total

    def cma(cma,n,x):
        """Update a cumulative moving average `cma` over `n` values with new value `x`."""
        return cma + (x-cma)/(n+1), n+1

    def search(model, s, exp=1):
        """Run one search pass from `s`.

        Returns (-value, node): the outcome negated for the player who moved
        into `s`, and the tree node of `s`.
        """
        actor = player(s)
        aa    = actions(s)
        key   = digits(s)
        if key not in tree:
            # Unseen position: expand it with the model's priors and back up
            # the model's value estimate.
            value,pp  = model(s, policy=False)
            node      = {'pp': { a:pp[a] for a in aa }, 
                         'qq': { a:0     for a in aa },
                         'nn': { a:0     for a in aa }}
            tree[key] = node     
            return -value,node
        node     = tree[key]
        # Total visit count is the same for every action; compute it once.
        sum_n    = sum(list(node['nn'].values()))
        uu       = array([ puct(node,a,sum_n,exp) for a in aa ])
        # Pick a best-scoring action, breaking ties uniformly at random.
        a        = choice(aa, p=argmaxprob(uu))          
        s,winner = game(s, a)
        if winner is None: value,_  = search(model, s, exp)
        else:              value    = getreward(actor, winner)
        node['qq'][a],node['nn'][a] = cma(node['qq'][a], node['nn'][a], value)
        return -value,node
    
    @rename(f'searchpi({model.name},{iters},{seconds}s,exp={exp},tau={tau})')
    def searchpi(s):
        i,t  = 0,time()
        while i <= iters or time()-t < seconds:
            _,node = search(model,s,exp)   # the outermost call returns the node of `s`
            i     += 1
        aa,pp = pi(node,tau)            
        return aa,pp
    return searchpi

print(searchpi)


<function searchpi at 0x7f1e245f2d08>


In [11]:
stats = {}

def addcell(table,row,col,val):
    """Store `val` at table[str(row)][str(col)], creating the row dict on first use."""
    cells = table.setdefault(str(row), {})
    cells[str(col)] = val
    
print(stats)

{}


In [None]:
# Head-to-head evaluation of two models, each wrapped in a tree-search policy.
iters     = 10
ratios1   = zeros(iters)
ratios2   = zeros(iters)
# Both models are freshly initialized here; the checkpoint loads are commented out.
prevmodel = BabyAlphaZeroModel(); #prevmodel.load(f'{DIR}/model-000.npy')
currmodel = BabyAlphaZeroModel(); #currmodel.load(f'{DIR}/model-000.npy')
prevpi    = searchpi(prevmodel, iters=10, seconds=0, exp=0, tau=1)
currpi    = searchpi(currmodel, iters=10, seconds=0, exp=0, tau=1)
# Random base seed, printed so that a run can be repeated.
seed0     = int(100*rand())
print('seed0 =', seed0)
for i in range(iters):
    seed(seed0+i)
    # Play both seatings: the current model as player 1, then as player 2.
    # testgames reports the ratio for the first key of the dict, i.e. currpi.
    ratios1[i] = testgames({1:currpi, 2:prevpi}, iters=TESTS)
    ratios2[i] = testgames({2:currpi, 1:prevpi}, iters=TESTS)

ratio1 = mean(ratios1)
ratio2 = mean(ratios2)

print('ratio1 =', ratio1)
print('ratio2 =', ratio2)

NOTIFY(f'{NAME}, {currpi.__name__} vs {prevpi.__name__}, {ratio1} vs {ratio2}')

# Rows/columns of the stats table are keyed by str(model).
addcell(stats,prevmodel,currmodel,ratio1)   
addcell(stats,currmodel,prevmodel,ratio2)   

# Persist the table under a timestamped file name.
path = f'{DIR}/stats-{round(time())}.npy'
save(path, stats)
MTIME(path)

DataFrame(data=stats)

seed0 = 36
testing policy 1:searchpi(baby-alpha-zero-model,10,0s,exp=0,tau=1) vs 2:searchpi(baby-alpha-zero-model,10,0s,exp=0,tau=1)
  1/100 [..............................] - ETA: 67:46:00 - draws: 0.0000e+00 - wins1: 1.0000e-04 - wins2: 0.0000e+00 - win ratio: 1.0000

In [None]:
DataFrame(data=stats)

In [None]:
# Build a fresh model and print its layer-by-layer summary.
model = BabyAlphaZeroModel()
model.summary()