## Reinforcement Learning series: Tic-tac-toe game bot


In [1]:
### IMPORTS ###

ipynb = 'ttt-3'

import sys, os, json
import tensorflow    as tf
import keras.backend as K

from ipywidgets           import widgets, HBox, VBox, Layout
from IPython.core.display import display, HTML, Javascript as JS
from pandas               import DataFrame
from pathlib              import Path
from pprint               import pprint
from operator             import iconcat
from functools            import reduce, partial
from collections          import deque
from numpy                import *
from numpy.random         import *
from os.path              import isfile
from uuid                 import uuid4 as guid

# from tensorflow                      import keras
# from tensorflow.keras.layers         import Input, Dense, BatchNormalization, Activation, Multiply
# from tensorflow.keras.losses         import mse, categorical_crossentropy, binary_crossentropy
# from tensorflow.keras.optimizers     import Adam
# from tensorflow.keras.regularizers   import l2
# from tensorflow.keras.activations    import softmax
# from tensorflow.keras.models         import Model, load_model, clone_model
# from tensorflow.keras.callbacks      import LearningRateScheduler, LambdaCallback
# from tensorflow.keras.utils          import Progbar, to_categorical

from keras.layers         import Input, Dense, BatchNormalization, Activation, Multiply
from keras.losses         import mse, categorical_crossentropy, binary_crossentropy
from keras.optimizers     import Adam
from keras.regularizers   import l2
from keras.activations    import softmax
from keras.models         import Model, load_model, clone_model
from keras.callbacks      import LearningRateScheduler, LambdaCallback
from keras.utils          import Progbar, to_categorical

from matplotlib.pyplot    import *
from time                 import *

# %matplotlib inline

# Keep a reference to the stdout stream that is active when this cell runs.
STDOUT = sys.stdout

# NOTE(review): project-local helpers; `rename`, `MTIME` and `PROGRESS` used
# further down are presumably provided by this star import - confirm.
from ipynbutils import *

# Scratch directory for this notebook; saved tables, progress logs and plots
# are written under it.
DIR = f'tmp/{ipynb}'
Path(DIR).mkdir(parents=True, exist_ok=True)
print('DIR =', DIR)

# Number of games passed as `iters` to testgames() in the evaluation cells.
TESTS = 100


Using TensorFlow backend.


DIR = tmp/ttt-3


## Problem formulation: Treating after-states as action-value function inputs
...

In [2]:
### ENVIRONMENT ###

# Board cells are numbered row by row:
# 0 1 2
# 3 4 5
# 6 7 8

# winidx[a] lists every winning line (row, column, diagonal) that passes
# through cell `a`, each line given as its three cell indices. Only these
# lines need checking after a move into cell `a`.
winidx = [
    [ [0,1,2],          [0,3,6], [0,4,8] ], # 0 - (0,0)
    [ [0,1,2],          [1,4,7]          ], # 1 - (0,1)
    [ [0,1,2], [2,4,6], [2,5,8]          ], # 2 - (0,2)
    [ [3,4,5],          [0,3,6]          ], # 3 - (1,0)
    [ [3,4,5], [2,4,6], [1,4,7], [0,4,8] ], # 4 - (1,1)
    [ [3,4,5],          [2,5,8]          ], # 5 - (1,2)
    [ [6,7,8], [2,4,6], [0,3,6],         ], # 6 - (2,0)
    [ [6,7,8],          [1,4,7]          ], # 7 - (2,1)
    [ [6,7,8],          [2,5,8], [0,4,8] ], # 8 - (2,2)
]

# Two-bit code per cell value, indexed by that value:
# row 0 = empty cell, row 1 = player 1's mark, row 2 = player 2's mark.
encoding = array([
    [0,0], # 0
    [0,1], # 1
    [1,0], # 2
]) 

def onehot(s):
    """Encode board `s` by replacing each cell value with its `encoding` row and flattening."""
    codes = encoding[s.astype(int)]
    return concatenate(codes)

def digits(s):
    """Return the board as a string of integer digits, e.g. '012000000' (used as a dict key)."""
    cells = int64(s).ravel()
    return ''.join(cells.astype(str))

def other(p):
    """Return the opposing player id: 1 -> 2, 2 -> 1 (and 0 -> 1)."""
    return 1 + (p % 2)

def player(s):
    """Return the id of the player to move on board `s`.

    The number of occupied cells decides the turn: an even count means
    player 1 moves next, an odd count means player 2.
    """
    moves_made = count_nonzero(s)
    return other(moves_made)

def action2xy(a):
    """Convert a flat cell index (0..8) into the pair returned by unravel_index for a 3x3 grid."""
    coords = unravel_index(a, (3,3))
    return (coords[0], coords[1])

def xy2action(x,y):
    """Convert a 3x3 grid coordinate pair into the flat cell index (0..8)."""
    return ravel_multi_index((x,y), (3,3))

def enum_lines(a):
    """Return the winning lines (lists of three cell indices) that pass through cell `a`."""
    lines_through_cell = winidx[a]
    return lines_through_cell
            
def iswin(s,a):
    """Return True if the mark sitting in cell `a` completes any line through that cell."""
    mark = s[a]
    for cells in enum_lines(a):
        # skip lines that contain any cell not holding the same mark
        if not all(s[cells] == mark):
            continue
        return True
    return False

def canwin(s,a,p):
    """Return True if player `p` would win by marking cell `a` on board `s`.

    The move is tried in place on `s` and then undone. The cell is restored
    to whatever it held before the call (not unconditionally to 0), so
    probing an occupied cell no longer erases that cell, and the board is
    restored even if the win check raises.
    """
    a        = int(a)
    previous = s[a]
    s[a]     = p
    try:
        return iswin(s,a)
    finally:
        s[a] = previous

def hasbegun(s):
    """Return True once at least one cell of board `s` is occupied."""
    occupied = count_nonzero(s)
    return occupied > 0

def isover(s):
    """Return True when every cell of board `s` is occupied."""
    return len(s) == count_nonzero(s)

def game(s=None,a=None):
    """Create, parse or advance a board.

    game()     -> (empty board, None)
    game(s)    -> (`s` converted element-wise to a float array, None)
    game(s, a) -> (copy of `s` with the player to move placed in cell `a`, winner)

    `winner` is the moving player's id if the move completes a line, 0 if the
    board is full without a win, and None otherwise. The input board is never
    modified. Asserts that cell `a` is empty.
    """
    if s is None:
        return zeros(3*3), None
    if a is None:
        return array(list(s)).astype(float), None
    a = int(a)
    assert(s[a] == 0)
    board    = copy(s)
    mover    = player(board)
    board[a] = mover
    if iswin(board,a):
        return board, mover
    if isover(board):
        return board, 0
    return board, None

def actions(s):
    """Return the indices of the empty cells of `s` (an empty list when there are none)."""
    free = argwhere(s == 0)
    if len(free) == 0:
        return []
    return concatenate(free)

def actionmask(aa):
    """Return a length-9 float vector with 1 at every index listed in `aa` and 0 elsewhere."""
    mask = zeros(9)
    mask[aa.astype(int)] = 1
    return mask

def selectaction(s,pi):
    """Sample one action for state `s` from policy `pi`.

    `pi(s)` must return (actions, probabilities). Returns the sampled action
    together with the probability the policy assigned to it.
    """
    candidates, probs = pi(s)
    pick = choice(range(len(probs)), p=probs)
    return candidates[pick], probs[pick]

def getreward(agent, winner):
    """Return the reward from `agent`'s point of view: +1 win, -1 loss, 0 otherwise (draw or no winner)."""
    if winner == agent:
        return 1
    return -1 if winner == other(agent) else 0

def getoutcome(ss,aa,pp,rr,discount):
    """Turn per-move rewards into discounted returns, tracked separately per player.

    Samples are walked backwards. Each move's return is its own reward plus
    the discounted return of the same player's following move in the same
    episode. A move that ends a game (game() reports a winner or a draw)
    resets the carried values, which is what separates concatenated episodes.
    `pp` is accepted for signature symmetry but not used.
    """
    returns = zeros_like(rr)
    carry   = {1:0,2:0}
    for i in range(len(rr)-1, -1, -1):
        mover  = player(ss[i])
        winner = game(ss[i],aa[i])[1]
        if winner is not None:
            # last move of an episode: nothing to carry over from the episode after it
            carry = {1:0,2:0}
        returns[i]   = rr[i] + carry[mover]
        carry[mover] = returns[i] * discount
    return returns

def episode(policy, start=None):
    """Play one game to the end and record it.

    policy : dict mapping player id -> policy function `pi(s) -> (actions, probs)`.
    start  : optional board to start from; a fresh game is used when None.

    Returns (winner, ss, aa, pp): the winner id (0 for a draw), the states in
    which each move was made, the chosen actions and their probabilities,
    one row/entry per move.
    """
    ss = zeros((9,9))
    aa = zeros((9))
    pp = zeros((9))
    if start is None:
        s,winner = game()
    else:
        s,winner = start,None
    n = 0
    while winner is None:
        a,p = selectaction(s, policy[player(s)])
        ss[n,:], aa[n], pp[n] = s, a, p
        n += 1
        s,winner = game(s, a)
    # trim the preallocated buffers down to the moves actually played
    return winner, ss[0:n,:], aa[0:n], pp[0:n]

def winratio(wins, agent=1):
    """Return the agent's score divided by the rival's score.

    `wins` is indexed [draws, wins of player 1, wins of player 2]. A draw
    counts as half a point for each side. A rival score of 0 is replaced by 1
    to avoid dividing by zero.
    """
    half_draws = wins[0] * 0.5
    mine       = wins[agent] + half_draws
    theirs     = wins[other(agent)] + half_draws
    return mine / (theirs or 1)

def testgames(policy, iters=1000):
    """Play `iters` games under `policy` and return the final win ratio.

    policy : dict mapping player id -> policy function. The first key of the
             dict is treated as the agent, the other player as the rival.
    iters  : number of games to play.

    The progress bar reports draws and wins as fractions of `iters` (the
    divisor used to be a hard-coded 10_000, which only matched runs of
    exactly that many games) plus the running win ratio from winratio().
    Returns the ratio after the last game (0 when `iters` is 0).
    """
    ratio    = 0
    agent,*_ = policy.keys()
    rival    = other(agent)
    wins     = [0,0,0]
    progbar  = Progbar(target=iters, stateful_metrics=['draws','wins1','wins2','win ratio'])
    print(f'testing policy {agent}:{policy[agent].__name__} vs {rival}:{policy[rival].__name__}')
    for i in range(iters):
        winner,_,_,_  = episode(policy)
        wins[winner] += 1
        ratio         = winratio(wins,agent)
        progbar.update(i+1, values=[
            ('draws', wins[0]/iters),
            ('wins1', wins[1]/iters),
            ('wins2', wins[2]/iters),
            ('win ratio', ratio),
        ])
    return ratio

def samplegames(policy, iters=100, start=None, progress=None):
    """Play `iters` episodes and return all moves as flat sample arrays.

    policy   : dict mapping player id -> policy function.
    start    : optional board every episode starts from (fresh game if None).
    progress : truthy to show a Keras progress bar.

    Returns (sss, aaa, ppp, rrr): states, actions, action probabilities and
    rewards, one row/entry per move, episodes concatenated in play order.
    Only the last move of each player in an episode carries a non-zero
    reward: the final move is rewarded from its mover's point of view and the
    move before it from the other player's.
    """
    m       = 0
    sss     = zeros((9*iters,9))
    aaa     = zeros((9*iters))
    ppp     = zeros((9*iters))
    rrr     = zeros((9*iters))
    progbar = Progbar(target=iters, stateful_metrics=['total samples']) if progress else None
    for i in range(iters):
        winner,ss,aa,pp = episode(policy, start=start)
        rr              = zeros((len(ss)))
        rr[-1]          = getreward(player(ss[-1]), winner)
        # An episode begun from a late `start` board can consist of a single
        # move; there is then no previous move to reward.
        if len(rr) > 1:
            rr[-2]      = getreward(player(ss[-2]), winner)
        d               = len(rr)
        sss[m:m+d,:]    = ss
        aaa[m:m+d]      = aa
        ppp[m:m+d]      = pp
        rrr[m:m+d]      = rr
        m              += d
        if progbar is not None:
            progbar.update(i+1, values=[
                ('samples per episode', d),
                ('total samples', m),
            ])
    return sss[0:m,:], aaa[0:m], ppp[0:m], rrr[0:m]

class EnumProc:
    """Visitor for enum_policy() that counts visited states and game results.

    states[p] counts the states in which player p is to move;
    wins[w] counts finished games by result (0 = draw, 1 / 2 = winner id).
    """
    def __init__(self):
        self.__name__ = 'EnumProc'
        self.states   = [0] * 3
        self.wins     = [0] * 3
    def __call__(self, s, winner):
        self.states[player(s)] += 1
        if winner is None:
            return
        self.wins[winner] += 1
    
def enum_policy(policy, proc):
    """Walk every state reachable when both players follow `policy`.

    Starting from the empty board, each state is handed to `proc(state, winner)`
    the first time it is reached; from non-terminal states the walk recurses
    into every action to which the mover's policy gives positive probability.
    """
    print(f'enumerating policy 1:{policy[1].__name__} vs 2:{policy[2].__name__} with {proc.__name__}')
    visited = {}
    def visit(s,w):
        key = digits(s)
        first_time = key not in visited
        if first_time:
            visited[key] = 0
            proc(s,w)
            if w is None:
                aa,pp = policy[player(s)](s)
                for a,p in zip(aa,pp):
                    if p > 0:
                        visit(*game(s,a))
        # count every arrival, including repeats
        visited[key] += 1
    visit(*game())

def enum_policies(policies):
    """Enumerate each policy pair with enum_policy() and tabulate the counts.

    Returns a DataFrame with one row per entry of `policies`: both policy
    names, the number of states seen with each player to move, and the number
    of draws and wins per player.
    """
    columns = ('policy1','policy2','states1','states2','draws','wins1','wins2')
    table   = { column: [] for column in columns }
    for policy in policies:
        proc = EnumProc()
        enum_policy(policy, proc)
        row = (
            policy[1].__name__,
            policy[2].__name__,
            proc.states[1],
            proc.states[2],
            proc.wins[0],
            proc.wins[1],
            proc.wins[2],
        )
        for column,value in zip(columns,row):
            table[column].append(value)
    return DataFrame(data=table)
    
def argsmax(values):
    """Return the sorted indices of all entries equal to the maximum of `values` (ties included)."""
    peak = max(values)
    hits = argwhere(values == peak)
    return unique(ravel(hits))
        
def lookahead(s, aa):
    """Return the positions within `aa` of the actions that win immediately for the player to move.

    Only the mover's own winning moves are considered; blocking the rival's
    winning moves is not part of this check.
    """
    actor   = player(s)
    winning = []
    for i,a in enumerate(aa):
        if game(s,a)[1] == actor:
            winning.append(i)
    return winning
    
def uniformprob(aa,ii):
    """Return a float vector shaped like `aa` that spreads probability 1 evenly over positions `ii`."""
    share  = 1/len(ii)
    pp     = zeros_like(aa).astype(float)
    pp[ii] = share
    return pp

def argmaxprob(qq):
    """Return a flat probability vector that is uniform over the maximal entries of `qq` and 0 elsewhere."""
    best = argsmax(qq)
    return ravel(uniformprob(qq, best))

def softmaxprob(zz):
    """Return the softmax of `zz`; the maximum is subtracted before exponentiating."""
    shifted = zz - max(zz)
    weights = exp(shifted)
    return weights / sum(weights)

def randompi(s):
    """Uniform random policy: every empty cell of `s` gets the same probability."""
    aa    = actions(s)
    count = len(aa)
    return aa, array([1/count] * count)

def lookaheadpi(pi):
    """Wrap `pi` so that an immediately winning move is always taken.

    When one or more moves win on the spot, probability is spread uniformly
    over those moves; otherwise the wrapped policy decides.
    """
    @rename(f'lookaheadpi({pi.__name__})')
    def lookaheadpi(s):
        aa      = actions(s)
        winning = lookahead(s, aa)
        if len(winning) == 0:
            return pi(s)
        return aa, uniformprob(aa, winning)
    return lookaheadpi

def maxpi(pi):
    """Wrap `pi` so that only its most probable action(s) keep probability, shared uniformly."""
    @rename(f'maxpi({pi.__name__})')
    def maxpi(s):
        aa,pp = pi(s)
        return aa, argmaxprob(pp)
    return maxpi

def percentilepi(pi, centile=90):
    """Wrap `pi` to choose uniformly among actions whose probability is at or above the given percentile."""
    @rename(f'percentilepi({pi.__name__})')
    def percentilepi(s):
        aa,pp  = pi(s)
        cutoff = percentile(pp,centile)
        keep   = argwhere(pp >= cutoff)
        return aa, uniformprob(aa,keep)
    return percentilepi

def explorepi(pi, epsilon=0.1):
    """Epsilon-greedy wrapper: with probability `epsilon` act uniformly at random, otherwise follow `pi`."""
    @rename(f'explorepi({pi.__name__},{epsilon})')
    def explorepi(s):
        explore = random() < epsilon
        return randompi(s) if explore else pi(s)
    return explorepi

def choicepi(*policies):
    """Combine policies by picking one of them at random on every call."""
    names = ','.join([pi.__name__ for pi in policies])
    @rename(f'choicepi({names})')
    def choicepi(s):
        chosen = choice(policies)
        return chosen(s)
    return choicepi

def switchpi(*policies):
    """Combine policies by picking one at random per game rather than per move.

    A new policy is drawn whenever the board holds fewer than two marks
    (i.e. at either player's first move) and is then kept for later moves.
    """
    current = choice(policies)
    names   = ','.join([pi.__name__ for pi in policies])
    @rename(f'switchpi({names})')
    def switchpi(s):
        nonlocal current
        opening = count_nonzero(s) < 2
        if opening:
            current = choice(policies)
        return current(s)
    return switchpi

# Seed the random generator so the sampled games below are repeatable.
seed(42)

# Smoke test: sample 10 random-vs-random games and dump the raw sample arrays
# (states, actions, action probabilities, rewards).
ss,aa,pp,rr = samplegames({ 1:randompi, 2:randompi }, iters=10, progress=True)
print(ss)
print(aa)
print(pp)
print(rr)

# Random vs random baseline over TESTS games.
testgames({ 1:randompi, 2:randompi },iters=TESTS)

# Exhaustively enumerate the states reachable under each policy pairing;
# the resulting DataFrame is the cell's displayed output.
enum_policies([
    {1:randompi,              2:randompi},
    {1:lookaheadpi(randompi), 2:randompi},
    {1:randompi,              2:lookaheadpi(randompi)},
    {1:lookaheadpi(randompi), 2:lookaheadpi(randompi)},
])


[[0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 2.]
 [0. 0. 0. 1. 0. 0. 1. 0. 2.]
 [0. 0. 0. 1. 2. 0. 1. 0. 2.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [2. 1. 0. 0. 0. 0. 0. 0. 0.]
 [2. 1. 0. 0. 0. 0. 0. 0. 1.]
 [2. 1. 0. 0. 0. 2. 0. 0. 1.]
 [2. 1. 0. 0. 0. 2. 1. 0. 1.]
 [2. 1. 2. 0. 0. 2. 1. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 2. 0. 0. 0. 0. 0. 1. 0.]
 [0. 2. 1. 0. 0. 0. 0. 1. 0.]
 [0. 2. 1. 2. 0. 0. 0. 1. 0.]
 [0. 2. 1. 2. 1. 0. 0. 1. 0.]
 [0. 2. 1. 2. 1. 0. 2. 1. 0.]
 [0. 2. 1. 2. 1. 1. 2. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 2. 0. 0. 0. 1. 0. 0. 0.]
 [0. 2. 0. 1. 0. 1. 0. 0. 0.]
 [0. 2. 0. 1. 2. 1. 0. 0. 0.]
 [0. 2. 0. 1. 2. 1. 1. 0. 0.]
 [0. 2. 0. 1. 2. 1. 1. 0. 2.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 2. 0. 0. 0.]
 [1. 0. 0. 0. 1. 2. 0. 0. 0.]
 [1. 0. 0. 0. 1. 2. 2. 0. 0.]
 [1. 1. 0. 0. 1. 2. 2. 0. 0.]
 [1. 1. 2.

Unnamed: 0,policy1,policy2,states1,states2,draws,wins1,wins2
0,randompi,randompi,2739,2739,16,626,316
1,lookaheadpi(randompi),randompi,2701,2581,16,604,316
2,randompi,lookaheadpi(randompi),2701,2733,16,620,316
3,lookaheadpi(randompi),lookaheadpi(randompi),2657,2569,16,604,316


## Jupyter Widgets: Simple game frontend to try agents right here in the notebook
...

In [3]:
### FRONTEND ###

def play(policy):
    """Show a clickable 3x3 board and let a human play against `policy`.

    `policy` is a dict with a single entry, player id -> policy function; that
    player is controlled by the agent and the human plays the other side.
    When a game ends the result is printed, the board is disabled and a new
    board is opened by calling play() again.
    """
    state   = None
    board   = None
    agent   = None
    rival   = None
    
    def moveagent(s, a):
        # Sample the agent's move from its policy; the incoming `a` is ignored
        # and immediately overwritten.
        aa,pp    = policy[agent](s)
        i        = choice(range(len(pp)), p=pp)
        a,p      = aa[i],pp[i]
        s,winner = game(s, a)
        return s,winner

    def display_board(onclick):
        # Build nine buttons laid out as three rows; each button remembers its
        # cell index in `btn.action` and forwards it to `onclick` when pressed.
        board = []
        for i in range(9):
            btn = widgets.Button(
                description  = '',
                disabled     = False,
                button_style = '', # 'success', 'info', 'warning', 'danger' or ''
                tooltip      = 'Click me',
                icon         = '',
                layout       = Layout(width='40px', height='40px')
            )
            btn.action = i
            btn.on_click(lambda btn: onclick(btn.action))
            board.append(btn)
        display(VBox([
            HBox([board[0],board[1],board[2]]),
            HBox([board[3],board[4],board[5]]),
            HBox([board[6],board[7],board[8]])
        ]))
        return board
        
    def update_board(board, state):
        # Mirror the board state onto the button captions.
        chars = [' ', 'x', 'o']
        state = state.astype(int)
        for i in range(9):
            board[i].description = chars[state[i]]

    def gameturn(s=None, a=None):
        # Apply the human move (or create a new game when called without
        # arguments), then let the agent answer if it is its turn.
        s,winner = game(s,a)
#         print(reshape(s, (3,3)))
        update_board(board, s)
        if winner is None and player(s) == agent:
            s,winner = moveagent(s,a)
            update_board(board, s)
        if winner is not None:
            msgs = ['DRAW','X WINS','O WINS']
            print(msgs[winner])
            for i in range(9):
                board[i].disabled = True
            # start a fresh game on a new board
            play(policy)
        return s,winner

    def onclick(a): 
        nonlocal state
        # Ignore clicks on occupied cells and clicks made on the agent's turn.
        if (player(state) != agent) and (a in actions(state)):
            state,winner = gameturn(state,a)

    assert(1 in policy or 2 in policy)    
    if 1 in policy:
        agent,rival = 1,2
    elif 2 in policy:
        agent,rival = 2,1
    print(f'play against {policy[agent].__name__}')
    
    board        = display_board(onclick=onclick)
    state,winner = gameturn()

# The agent plays X (player 1) and therefore moves first.
play(policy={1:lookaheadpi(randompi)})
 

play against lookaheadpi(randompi)


VBox(children=(HBox(children=(Button(layout=Layout(height='40px', width='40px'), style=ButtonStyle(), tooltip=…

## Tabular Reinforcement Learning
...

## Dynamic Programming: Action-value iteration
Action-value iteration works by iteratively applying the Bellman optimality equation for $q_{\ast}$ to a working action-value function, as an update rule, as shown below.

$$\large q_{\ast}(s, a) = \sum_{s', r} p(s', r | s, a)[r + \gamma v_{\ast}(s')]$$

Alternatively we can express this equation in terms of $q_{\ast}$ itself.

$$\large q_{\ast}(s, a) = \sum_{s', r} p(s', r | s, a)[r + \gamma \max_{a'} q_{\ast}(s', a')]$$


In [4]:
### TABULAR-DYNAMIC-PROGRAMMING ###

class qtable(dict):
    """Tabular value function keyed by after-state.

    Keys are digit strings produced by digits() for the board obtained after
    applying an action; values are the stored estimates. Calling the table as
    `table(s, a)` returns the stored value for the after-state of taking `a`
    in `s`, or 0 if that after-state is not in the table.
    """
    def __init__(self, name):
        super().__init__()
        self.__name__ = name
    def __call__(self, s, a):
        after,_ = game(s,a)
        return self.get(digits(after), 0)
    def path(self):
        """Return the file path of this table inside DIR."""
        return f'{DIR}/{self.__name__}.npy'
    def save(self):
        """Write the table to path() with NumPy `save` (stored as a pickled object array)."""
        save(self.path(), self)
    def load(self):
        """Merge the entries stored at path() into this table."""
        # The file holds a pickled object array, which NumPy's `load` refuses
        # to read unless allow_pickle=True (the default is False since 1.16.3).
        # Unpickling can execute arbitrary code: only load files this
        # notebook wrote itself.
        tmp = load(self.path(), allow_pickle=True)
        self.update(tmp.item())
    def exists(self):
        """Return True if a saved table exists at path()."""
        return isfile(self.path())
    
def qpi(model):
    """Build a greedy policy from `model(s, a) -> value`.

    Probability is shared uniformly among the actions with the highest value.
    """
    @rename(f'qpi({model.__name__})')
    def qpi(s):
        aa     = actions(s)
        values = array([model(s,a) for a in aa])
        return aa, argmaxprob(values)
    return qpi

def enum_states(proc):
    """Call `proc(state, winner)` once for every distinct board reachable from the empty board."""
    visited = {}
    def visit(s,w):
        key = digits(s)
        if key in visited:
            return
        visited[key] = 1
        proc(s,w)
        if w is None:
            # only non-terminal boards have successors
            for a in actions(s):
                visit(*game(s,a))
    visit(*game())

def iter_qvalue(table, iters=10, target=1e-8):    
    """Fill `table` with after-state values by repeated sweeps over all states.

    Each sweep visits every reachable board via enum_states() and treats it as
    an after-state: the board left behind by `agent`, with `rival` to move.
    Sweeps stop after `iters` passes or as soon as the largest change made
    during a sweep is <= `target`.
    """
    def eval_afterstate(after_state, winner):    
        # Recompute one after-state's value, store it, and return |old - new|.
        new_value = 0
        rival     = player(after_state)
        agent     = other(rival)
        if winner is None: 
            # Non-terminal: average over the rival's replies, each weighted
            # with equal probability.
            rival_actions = actions(after_state)
            rival_prob    = 1 / len(rival_actions)
            for rival_action in rival_actions:
                # NOTE: this rebinds `winner` to the result of the rival's reply.
                next_state,winner = game(after_state, rival_action)
                if winner is None: 
                    # The agent then picks its best after-state (no reward, no discount).
                    max_next_value = max([table(next_state,a) for a in actions(next_state)])
                    new_value     += rival_prob * (0 + max_next_value)
                else:
                    assert(winner != agent)
                    reward     = -1 if winner == rival else 0
                    new_value += rival_prob * (reward + 0)
                    # If the rival has any winning reply, the whole after-state
                    # is valued as a certain loss and the averaging is abandoned.
                    if winner == rival:              # commenting this  
                        new_value = 1 * (reward + 0) # prevents player2
                        break                        # from learning optimal policy
        else:
            # Terminal after-state: the agent's own move ended the game.
            assert(winner != rival)
            reward    = 1 if winner == agent else 0
            new_value = 1 * (reward + 0)
        key        = digits(after_state)
        old_value  = table.get(key, 0)
        table[key] = new_value
        return abs(old_value - new_value)
    delta = 0
    def proc(state, winner):
        # Track the largest update seen in the current sweep.
        nonlocal delta
        delta = max(delta, eval_afterstate(state, winner))
    progbar = Progbar(target=iters, stateful_metrics=['delta'])
    for i in range(iters):
        delta = 0
        enum_states(proc)
        progbar.update(i+1, values=[('delta',  delta)]) 
        if delta <= target:
            break
    print('\n')
         
# Reuse a previously saved table when one exists; otherwise compute and save it.
dptable = qtable('dptable')  
if dptable.exists():
    dptable.load()
else:
    iter_qvalue(dptable, iters=100, target=1e-08)
    dptable.save()
print('dptable:', len(dptable))
MTIME(dptable.path())

# Greedy policy over the table, evaluated against a random player that takes
# immediate wins.
agentpi = qpi(dptable) 
rivalpi = lookaheadpi(randompi)

# Sampled evaluation: as player 1, as player 2, and in self-play.
testgames({1:agentpi, 2:rivalpi}, iters=TESTS)
testgames({2:agentpi, 1:rivalpi}, iters=TESTS)
testgames({1:agentpi, 2:agentpi}, iters=TESTS)

# Exhaustive evaluation of the same three pairings.
enum_policies([
    {1:agentpi, 2:rivalpi},
    {1:rivalpi, 2:agentpi},
    {1:agentpi, 2:agentpi},
])


dptable: 5478
tmp/ttt-3/dptable.npy Aug 01 2020 08:18:18
testing policy 1:qpi(dptable) vs 2:lookaheadpi(randompi)
testing policy 2:qpi(dptable) vs 1:lookaheadpi(randompi)
testing policy 1:qpi(dptable) vs 2:qpi(dptable)
enumerating policy 1:qpi(dptable) vs 2:lookaheadpi(randompi) with EnumProc
enumerating policy 1:lookaheadpi(randompi) vs 2:qpi(dptable) with EnumProc
enumerating policy 1:qpi(dptable) vs 2:qpi(dptable) with EnumProc


Unnamed: 0,policy1,policy2,states1,states2,draws,wins1,wins2
0,qpi(dptable),lookaheadpi(randompi),1219,1004,7,554,0
1,lookaheadpi(randompi),qpi(dptable),577,737,16,0,260
2,qpi(dptable),qpi(dptable),25,26,7,0,0


In [None]:
play(policy={2:agentpi})

## Monte-Carlo RL:

$$\large V(S_t) = V(S_t) + \alpha [ G_t - V(S_t) ]$$

$$\large Q(S_t,A_t) = Q(S_t,A_t) + \alpha [ G_t - Q(S_t,A_t) ]$$

...

In [5]:
### TABULAR-MONTE-CARLO ###

def iter_under_policy(policy, iters=100, games=1, rates={0:0.1}, epsilons={0:0.1}, proc=print):
    """Generic training loop: sample games under an exploring policy and feed them to `proc`.

    policy   : dict {1: pi, 2: pi} of base policies; each is wrapped with
               explorepi() using the currently scheduled epsilon.
    iters    : number of training iterations.
    games    : episodes sampled per iteration.
    rates    : schedule {iteration: learning rate}; a value applies from its
               iteration onwards until replaced.
    epsilons : schedule {iteration: exploration epsilon}, same convention.
    proc     : callable `proc(ss, aa, pp, rr, rate=...)` that performs the
               update and returns an error value (None is recorded as 0).
               NOTE: the default `print` does not accept a `rate` keyword, so
               a real update function has to be supplied.

    After training, the error and per-player reward histories are plotted,
    saved under DIR and shown.

    Raises ValueError if a schedule has no entry for iteration 0 while
    `iters` > 0 (previously this surfaced as an UnboundLocalError inside the
    loop).
    """
    # Both schedules are read lazily inside the loop, so each needs a value at
    # iteration 0 to define the initial rate / epsilon / exploring policy.
    if iters > 0:
        if 0 not in rates:
            raise ValueError('rates must contain an entry for iteration 0')
        if 0 not in epsilons:
            raise ValueError('epsilons must contain an entry for iteration 0')
    print(f'iterating under policy {1}:{policy[1].__name__} vs {2}:{policy[2].__name__}')
    progbar  = Progbar(target=iters, stateful_metrics=['rate', 'epsilon'])
    progress = PROGRESS(path=f'{DIR}/progress.log')
    errors   = zeros(iters) 
    rewards  = [None, zeros(iters), zeros(iters)]
    for i in range(iters):
        if i in rates:
            rate = rates[i]
        if i in epsilons:
            # rebuild the exploring policies whenever epsilon changes
            epsilon = epsilons[i]
            policy2 = {
                1:explorepi(policy[1], epsilon),
                2:explorepi(policy[2], epsilon),
            }
        ss,aa,pp,rr = samplegames(policy=policy2, iters=games)
        errors[i]   = proc(ss,aa,pp,rr,rate=rate) or 0
        # average reward per game, attributed to the player who made each move
        for s,r in zip(ss,rr):
            actor = player(s)
            rewards[actor][i] += r/games 
        values = [
            ('rate',      rate),
            ('epsilon',   epsilon), 
            ('error',     errors[i]),
            ('reward[X]', rewards[1][i]),
            ('reward[O]', rewards[2][i])
        ]
        progbar.update(i+1, values)
        progress(f'{ipynb}, iter {i} of {iters}, {dict(values)}')    
    figure()
    plot(errors,'r')
    plot(rewards[1])
    plot(rewards[2])
    title('objective history')
    ylabel('objective')
    xlabel(f'games x{games}')
    legend(['error', 'reward[X]', 'reward[O]'], loc='upper left')
    savefig(f'{DIR}/{policy[1].__name__}-vs-{policy[2].__name__}.png')
    show()     

def mctrain(table, discount=0.99, iters=100, games=1, rates={0:0.1}, epsilons={0:0.1}):
    """Train `table` with Monte-Carlo updates through self-play.

    Both players follow the greedy policy over `table` (with immediate-win
    lookahead); exploration is added by iter_under_policy(). Each sampled
    move's after-state value is moved towards the discounted return computed
    by getoutcome().
    """
    def mcproc(ss,aa,pp,rr,rate=0.1):
        # One Monte-Carlo sweep over a batch; returns the mean absolute error.
        targets = getoutcome(ss,aa,pp,rr,discount=discount)
        total   = 0
        for s,a,y in zip(ss,aa,targets):
            key        = digits(game(s,a)[0])
            old        = table.get(key, 0)
            delta      = y - old
            table[key] = old + rate*delta
            total     += abs(delta)
        return total / len(ss)
    learner = lookaheadpi(qpi(table))
    iter_under_policy({1:learner,2:learner}, iters=iters, games=games, rates=rates, epsilons=epsilons, proc=mcproc)
        
# Seed the random generator so a training run is repeatable.
seed(42)
# Reuse a previously saved table when one exists; otherwise train and save it.
mctable = qtable('mctable')   
if mctable.exists():
    mctable.load()
else:
    mctrain(mctable, 
            discount = 0.99, 
            iters    = 10_000, 
            games    = 10, 
            rates    = { 0:0.1 }, 
            epsilons = { 0:0.1, 5000:0.05 })
    mctable.save()
print('mctable:', len(mctable))
MTIME(mctable.path())
          
# Greedy policy over the learned table vs a random player that takes immediate wins.
agentpi = qpi(mctable) 
rivalpi = lookaheadpi(randompi)

# Sampled evaluation: as player 1, as player 2, and in self-play.
testgames({1:agentpi, 2:rivalpi}, iters=TESTS)
testgames({2:agentpi, 1:rivalpi}, iters=TESTS)
testgames({1:agentpi, 2:agentpi}, iters=TESTS)

# Exhaustive evaluation of the same three pairings.
enum_policies([
    {1:agentpi, 2:rivalpi},
    {1:rivalpi, 2:agentpi},
    {1:agentpi, 2:agentpi},
])


mctable: 4732
tmp/ttt-3/mctable.npy Aug 01 2020 08:57:32
testing policy 1:qpi(mctable) vs 2:lookaheadpi(randompi)
testing policy 2:qpi(mctable) vs 1:lookaheadpi(randompi)
testing policy 1:qpi(mctable) vs 2:qpi(mctable)
enumerating policy 1:qpi(mctable) vs 2:lookaheadpi(randompi) with EnumProc
enumerating policy 1:lookaheadpi(randompi) vs 2:qpi(mctable) with EnumProc
enumerating policy 1:qpi(mctable) vs 2:qpi(mctable) with EnumProc


Unnamed: 0,policy1,policy2,states1,states2,draws,wins1,wins2
0,qpi(mctable),lookaheadpi(randompi),94,87,3,63,0
1,lookaheadpi(randompi),qpi(mctable),264,318,12,0,148
2,qpi(mctable),qpi(mctable),5,5,1,0,0


In [None]:
play(policy={2:agentpi})

## Temporal difference: Q-Learning

$$\large V(S_t) = V(S_t) + \alpha [ R_{t+1} + \gamma V(S_{t+1}) - V(S_t) ]$$

$$\large Q(S_t, A_t) = Q(S_t, A_t) + \alpha [ R_{t+1} + \gamma Q(S_{t+1}, A_{t+1}) - Q(S_t, A_t) ]$$

$$\large Q(S_t, A_t) = Q(S_t, A_t) + \alpha [ R_{t+1} + \gamma \max_{a} Q(S_{t+1}, a) - Q(S_t, A_t) ]$$

...

In [6]:
### TABULAR-TEMPORAL-DIFFERENCE ###
        
def tdtrain(table, discount=0.99, iters=100, games=1, rates={0:0.1}, epsilons={0:0.1}):
    """Train `table` with one-step temporal-difference (Q-learning style) updates through self-play.

    Both players follow the greedy policy over `table` (with immediate-win
    lookahead); exploration is added by iter_under_policy().
    """
    def tdproc(ss,aa,pp,rr,rate=0.1):
        # One backward TD sweep over a batch; returns the mean absolute TD error.
        n     = len(ss)
        total = 0
        for j in range(n-1, -1, -1):
            after_s = game(ss[j],aa[j])[0]
            key     = digits(after_s)
            q       = table.get(key, 0)
            max_next_q = 0
            # The next sample belongs to the same episode only if it starts
            # from the board this move produced.
            continues = j+1 < n and all(after_s == ss[j+1])
            if continues:
                # board after the opponent's reply, from which the mover acts again
                next_s = game(ss[j+1],aa[j+1])[0]
                qq     = [table(next_s,a) for a in actions(next_s)]
                if len(qq) > 0:
                    max_next_q = max(qq)
            diff       = rr[j] + discount*max_next_q - q
            table[key] = q + rate*diff
            total     += abs(diff)
        return total / n
    learner = lookaheadpi(qpi(table))
    iter_under_policy({1:learner,2:learner}, iters=iters, games=games, rates=rates, epsilons=epsilons, proc=tdproc)
        
# Seed the random generator so a training run is repeatable.
seed(42)
# Reuse a previously saved table when one exists; otherwise train and save it.
tdtable = qtable('tdtable')   
if tdtable.exists():
    tdtable.load()
else:
    # The learning rate is lowered at iterations 3000 and 7000, epsilon at 5000.
    tdtrain(tdtable, 
        discount = 0.99, 
        iters    = 10_000, 
        games    = 30, 
        rates    = {0:0.1, 3_000:0.01, 7_000:0.001}, 
        epsilons = {0:0.1, 5_000: 0.05})
    tdtable.save()
print('tdtable:', len(tdtable))
MTIME(tdtable.path())
      
# Greedy policy over the learned table vs a random player that takes immediate wins.
agentpi = qpi(tdtable) 
rivalpi = lookaheadpi(randompi)

# Sampled evaluation: as player 1, as player 2, and in self-play.
testgames({1:agentpi, 2:rivalpi}, iters=TESTS)
testgames({2:agentpi, 1:rivalpi}, iters=TESTS)
testgames({1:agentpi, 2:agentpi}, iters=TESTS)

# Exhaustive evaluation of the same three pairings.
enum_policies([
    {1:agentpi, 2:rivalpi},
    {1:rivalpi, 2:agentpi},
    {1:agentpi, 2:agentpi},
])
    

tdtable: 4461
tmp/ttt-3/tdtable.npy Aug 01 2020 10:44:06
testing policy 1:qpi(tdtable) vs 2:lookaheadpi(randompi)
testing policy 2:qpi(tdtable) vs 1:lookaheadpi(randompi)
testing policy 1:qpi(tdtable) vs 2:qpi(tdtable)
enumerating policy 1:qpi(tdtable) vs 2:lookaheadpi(randompi) with EnumProc
enumerating policy 1:lookaheadpi(randompi) vs 2:qpi(tdtable) with EnumProc
enumerating policy 1:qpi(tdtable) vs 2:qpi(tdtable) with EnumProc


Unnamed: 0,policy1,policy2,states1,states2,draws,wins1,wins2
0,qpi(tdtable),lookaheadpi(randompi),90,87,4,62,0
1,lookaheadpi(randompi),qpi(tdtable),260,312,12,0,146
2,qpi(tdtable),qpi(tdtable),5,5,1,0,0


In [None]:
play(policy={2:agentpi})

## Monte Carlo Tree Search

<img align="left" src="MCTS.jpg">

$$\large UCT(S,a) = \frac{Q(S_a)}{N(S_a)} + \alpha*\sqrt{\frac{2\ln{N(S)}}{N(S_a)}}$$

$$\large PUCT(S,a) = \frac{Q(S_a)}{N(S_a)} + \alpha*P(S,a)\sqrt{\frac{\sum_b{N(S_b)}}{(1+N(S_a))}}$$


In [7]:
### MONTE-CARLO-TREE-SEARCH ###

class mctstable(qtable):
    """Table of MCTS statistics that can also be queried like a qtable.

    Board keys (digit strings) map to a uid, and each uid maps to a stats dict
    with 'value' and 'visits'. When `notfound` is set to a set, the keys of
    after-states that were looked up but are missing get collected in it.
    """
    def __init__(self, name):
        self.__name__ = name
        self.notfound = None
    def __call__(self, s, a):
        after,_ = game(s,a)
        key     = digits(after)
        if key not in self:
            if self.notfound is not None:
                self.notfound.add(key)
            return 0
        # two-step lookup: board key -> uid -> stats
        return self[self[key]]['value']
    
def rot0(M):
    """Identity transform."""
    return M

def rot180(M):
    """Apply rot90 twice."""
    return rot90(M, 2)

def rot270(M):
    """Apply rot90 three times."""
    return rot90(M, 3)

def fliptrbl(M):
    """Transpose the matrix."""
    return transpose(M)

def fliptlbr(M):
    """Transpose the matrix after a 180 degree rotation."""
    return transpose(rot180(M))

# The eight board symmetries: four rotations and four reflections.
rotations = [ rot0,   rot90,  rot180,   rot270   ]
flips     = [ fliplr, flipud, fliptlbr, fliptrbl ]

def map_rotations_and_flips():
    """Precompute index permutations for the eight board symmetries.

    Returns (statemap, actionmap), both keyed by transform name:
    statemap[name]  is the permutation p such that board[p] is the
                    transformed board;
    actionmap[name] is its inverse, mapping a cell index on the original
                    board to its index on the transformed board.
    """
    statemap  = {}
    actionmap = {}
    cells     = array(arange(9))
    for transform in rotations+flips:
        perm          = ravel(transform(reshape(cells,(3,3))))
        inverse       = zeros_like(perm)
        inverse[perm] = arange(len(perm))
        statemap [transform.__name__] = perm
        actionmap[transform.__name__] = inverse
    return statemap,actionmap
    
# Build the symmetry lookup tables once and print them for inspection:
# transform name, state permutation, action permutation.
statemap,actionmap = map_rotations_and_flips()
for f in rotations+flips:
    print(f.__name__, statemap[f.__name__], actionmap[f.__name__])
print()
    
def rotate_and_flip(x):
    """Return `x` under all eight board symmetries.

    A scalar is treated as an action (cell index) and mapped through
    `actionmap`; anything else is treated as a board and permuted through
    `statemap`. The result has one entry/row per symmetry.
    """
    if not isscalar(x):
        return array([ x[idx] for idx in statemap.values() ])
    cell = int(x)
    return array([ idx[cell] for idx in actionmap.values() ])
    
def getstats(table, state):
    """Return the stats dict ('value', 'visits') of `state`, creating it on first use.

    A new state gets a fresh uid with zeroed stats, and every symmetric
    variant of the board that is not yet in the table is pointed at that same
    uid, so symmetric boards share one set of statistics.
    """
    key = digits(state)
    if key in table:
        return table[table[key]]
    uid        = str(guid())
    table[uid] = { 'value': 0, 'visits': 0 }
    for variant in rotate_and_flip(state):
        variant_key = digits(variant)
        if variant_key not in table:
            table[variant_key] = uid
    return table[table[key]]

def makenode(table, state, parent=None, terminal=False):
    """Create a search-tree node for `state`.

    The node references its parent, the (shared) stats from `table`, and a
    dict of not-yet-expanded branches keyed by action. Terminal nodes have
    `branches` set to None.
    """
    if terminal:
        branches = None
    else:
        branches = { a:None for a in actions(state) }
    return {
        'parent'   : parent,
        'stats'    : getstats(table, state),
        'branches' : branches,
    }

def isterminal(node):
    """Return True for nodes that have no branch table (created with terminal=True)."""
    branches = node['branches']
    return branches is None

def isexpanded(node):
    """Return True if the node is non-terminal and every branch already has a child node."""
    if isterminal(node):
        return False
    children = node['branches'].values()
    # a branch still holding None has not been expanded yet
    return None not in children

def backpropagate(node, reward):
    """Propagate `reward` from `node` up to the root.

    Each node on the path gets its visit count incremented and its value
    updated as a running mean. The sign of the reward is flipped at every
    level on the way up.
    """
    current, signed = node, reward
    while current is not None:
        stats            = current['stats']
        visits           = stats['visits'] + 1
        stats['visits']  = visits
        stats['value']  += (signed - stats['value']) / visits
        current, signed  = current['parent'], -signed

def expandbranch(table, node, state):
    """Expand one randomly chosen unexplored action of `node`.

    Creates the child node for the resulting board and attaches it to the
    branch. If the move ends the game, the reward (from the mover's point of
    view) is backpropagated from the child right away.
    Returns (child, next_state).
    """
    assert(not isterminal(node))
    assert(not isexpanded(node))
    branches   = node['branches']
    actor      = player(state)
    unexplored = [ a for a,child in branches.items() if child is None ]
    action     = choice(unexplored)
    next_state,winner = game(state,action)
    child      = makenode(table, next_state, parent=node, terminal=winner is not None)
    branches[action] = child
    if isterminal(child):
        backpropagate(child, getreward(actor, winner))
    return child,next_state
                
def ucb1(node, action, exploration):
    """Return the UCB1 score of taking `action` from `node`.

    With exploration == 0 the score is just the child's stored value. An
    unvisited parent or child scores +inf. Otherwise the score is
    value + exploration * sqrt(2 ln(N_parent) / N_child).
    """
    parent_visits = node['stats']['visits']
    child_stats   = node['branches'][action]['stats']
    child_visits  = child_stats['visits']
    mean          = child_stats['value']
    if exploration == 0:
        return mean
    if parent_visits == 0 or child_visits == 0:
        return float('inf')
    bonus = sqrt(2*log(parent_visits)/child_visits)
    return mean + exploration*bonus

def selectbranchpi(node, exploration=0):
    """Return (actions, probs) for a fully expanded node: uniform over the branches with the best UCB1 score."""
    assert(isexpanded(node))
    candidates = list(node['branches'].keys())
    scores     = array([ ucb1(node,a,exploration) for a in candidates ])
    return candidates, argmaxprob(scores)

def selectbranch(node, state, exploration):
    """Descend one level: sample a best-scoring branch and return (child node, resulting board)."""
    candidates,probs = selectbranchpi(node, exploration)
    action           = choice(candidates, p=probs)
    child            = node['branches'][action]
    next_state       = game(state, action)[0]
    return child,next_state

def selectnode(table, node, state, exploration):
    """Selection + expansion: descend through fully expanded nodes, then expand one new branch.

    Returns (node, state) of the new child, or of the terminal node reached.
    """
    while isexpanded(node):
        node,state = selectbranch(node, state, exploration)
    if isterminal(node):
        return node,state
    return expandbranch(table, node, state)

def simulatefrom(leaf, state, policy):
    """Return the reward to backpropagate from `leaf`.

    A terminal leaf yields its stored value. Otherwise one playout is run from
    `state` under `policy` and scored from the point of view of the player to
    move in `state`.
    """
    if not isterminal(leaf):
        actor  = player(state)
        winner = episode(policy, start=state)[0]
        return getreward(actor, winner)
    return leaf['stats']['value']
    
def search(table, node, state, policy, exploration=1):
    """Run one MCTS iteration (select/expand, simulate, backpropagate) and return the simulated reward."""
    leaf,leaf_state = selectnode(table, node, state, exploration)
    outcome         = simulatefrom(leaf, leaf_state, policy)
    backpropagate(leaf, outcome)
    return outcome

def searchpi(table, timeout=1, policy={1:randompi,2:randompi}):
    """Build a policy that runs MCTS from the current state for `timeout` seconds.

    At least one search iteration is always performed. The returned
    distribution is uniform over the branches with the highest stored value
    (exploration disabled).
    """
    @rename(f'searchpi({len(table)},{timeout})')
    def searchpi(s):
        started = time()
        root    = makenode(table, s)
        while True:
            search(table, root, s, policy)
            if not (time() - started < timeout):
                break
        return selectbranchpi(root, exploration=0)
    return searchpi

def train(table, policy, iters, exploration=1):
    """Grow the MCTS table by running `iters` searches from the initial state.

    table       -- MCTS table that is filled in place.
    policy      -- playout policy handed to search().
    iters       -- number of search iterations.
    exploration -- UCB1 exploration weight used during selection.

    Progress is shown on a Keras Progbar and each iteration is also passed
    as a text line to the PROGRESS callable created for
    f'{DIR}/progress.log'.

    NOTE: later cells of this notebook define other functions that are also
    called `train`; the most recently executed definition wins.
    """
    start    = game()[0]
    root     = makenode(table, start)
    bar      = Progbar(target=iters, stateful_metrics=[])
    progress = PROGRESS(path=f'{DIR}/progress.log')
    for i in range(bar.target):
        reward  = search(table, root, start, policy, exploration)
        size    = len(table)
        metrics = [('reward',reward),('size',size)]
        bar.update(i+1, values=metrics)
        progress(f'{ipynb}, iter {i+1} of {iters}, reward {reward}, size {size}')

# --- MCTS: build (or load) the table, then evaluate it ---------------------
# Fix the random seed so the training run below is repeatable.
seed(41)

# Playout policy used for the simulations during training.
playoutpi = switchpi(randompi, lookaheadpi(randompi))

# Both players use the same playout policy.
mctsdata = mctstable('mctsdata')
policy   = {1:playoutpi,2:playoutpi}
# Cache guard: training is only run when no saved table exists yet.
if mctsdata.exists():
    mctsdata.load()
else:
    train(mctsdata, policy, iters=500_000, exploration=10)
    mctsdata.save()
# Prints the table's file path and timestamp (see the cell output).
MTIME(mctsdata.path())

# Quick test: live search with a 0.1 s budget per move, as player 1 and as player 2.
agentpi = searchpi(mctsdata,0.1)
rivalpi = lookaheadpi(randompi)
ratio1  = testgames({1:agentpi, 2:rivalpi}, iters=10)
ratio2  = testgames({2:agentpi, 1:rivalpi}, iters=10)

# Reset the table's `notfound` set before the table is used directly as a
# policy via qpi() -- presumably it records states missing from the table;
# TODO confirm against mctstable.
mctsdata.notfound = set()
agentpi = qpi(mctsdata)
# Enumerate the table-driven agent against each rival, first as player 1,
# then as player 2.
enum_policies([
    {1:agentpi, 2:randompi},
    {1:agentpi, 2:lookaheadpi(randompi)},
    {1:agentpi, 2:qpi(dptable)},
    {1:agentpi, 2:qpi(mctable)},
    {1:agentpi, 2:qpi(tdtable)},
    {2:agentpi, 1:randompi},
    {2:agentpi, 1:lookaheadpi(randompi)},
    {2:agentpi, 1:qpi(dptable)},
    {2:agentpi, 1:qpi(mctable)},
    {2:agentpi, 1:qpi(tdtable)},
])


rot0 [0 1 2 3 4 5 6 7 8] [0 1 2 3 4 5 6 7 8]
rot90 [2 5 8 1 4 7 0 3 6] [6 3 0 7 4 1 8 5 2]
rot180 [8 7 6 5 4 3 2 1 0] [8 7 6 5 4 3 2 1 0]
rot270 [6 3 0 7 4 1 8 5 2] [2 5 8 1 4 7 0 3 6]
fliplr [2 1 0 5 4 3 8 7 6] [2 1 0 5 4 3 8 7 6]
flipud [6 7 8 3 4 5 0 1 2] [6 7 8 3 4 5 0 1 2]
fliptlbr [8 5 2 7 4 1 6 3 0] [8 5 2 7 4 1 6 3 0]
fliptrbl [0 3 6 1 4 7 2 5 8] [0 3 6 1 4 7 2 5 8]

tmp/ttt-3/mctsdata.npy Aug 29 2020 07:36:29
testing policy 1:searchpi(6243,0.1) vs 2:lookaheadpi(randompi)
testing policy 2:searchpi(6243,0.1) vs 1:lookaheadpi(randompi)
enumerating policy 1:qpi(mctsdata) vs 2:randompi with EnumProc
enumerating policy 1:qpi(mctsdata) vs 2:lookaheadpi(randompi) with EnumProc
enumerating policy 1:qpi(mctsdata) vs 2:qpi(dptable) with EnumProc
enumerating policy 1:qpi(mctsdata) vs 2:qpi(mctable) with EnumProc
enumerating policy 1:qpi(mctsdata) vs 2:qpi(tdtable) with EnumProc
enumerating policy 1:randompi vs 2:qpi(mctsdata) with EnumProc
enumerating policy 1:lookaheadpi(randompi) vs 2:q

Unnamed: 0,policy1,policy2,states1,states2,draws,wins1,wins2
0,qpi(mctsdata),randompi,177,181,8,132,0
1,qpi(mctsdata),lookaheadpi(randompi),177,181,8,132,0
2,qpi(mctsdata),qpi(dptable),29,33,8,0,0
3,qpi(mctsdata),qpi(mctable),8,9,2,0,0
4,qpi(mctsdata),qpi(tdtable),8,9,2,0,0
5,randompi,qpi(mctsdata),449,589,16,0,228
6,lookaheadpi(randompi),qpi(mctsdata),449,589,16,0,228
7,qpi(dptable),qpi(mctsdata),25,26,7,0,0
8,qpi(mctable),qpi(mctsdata),17,16,3,0,0
9,qpi(tdtable),qpi(mctsdata),17,17,4,0,0


In [8]:
# Size of the table's `notfound` set after the enumeration above
# (presumably the states that were looked up but missing -- TODO confirm).
print(len(mctsdata.notfound))
# NOTE(review): this pretty-prints the ENTIRE table (every state key and
# every per-node {'value', 'visits'} record), which makes the saved output
# very large; consider printing only a small sample instead.
pprint(mctsdata)

0
{'000000000': '0fc03e19-9179-40ef-9e81-c0e68d7afed8',
 '000000001': '15aa1c0b-30cb-4d96-944b-cba9d3bf9f06',
 '000000010': '95a684e7-130f-4b9e-b900-7064ab9793b0',
 '000000012': 'd9365ff4-6b48-43cb-bbb8-5c612d0755c3',
 '000000021': 'fa4f099e-27fc-459b-bbc2-3cb827357db1',
 '000000100': '15aa1c0b-30cb-4d96-944b-cba9d3bf9f06',
 '000000102': '22847cef-1314-4c9c-a54e-b1727a9f8694',
 '000000112': '1e77b878-94be-4f0f-baa1-e44c07dc54b6',
 '000000120': 'fa4f099e-27fc-459b-bbc2-3cb827357db1',
 '000000121': 'a5c7c56e-50f6-4e1f-8673-9d1e3cd57597',
 '000000201': '22847cef-1314-4c9c-a54e-b1727a9f8694',
 '000000210': 'd9365ff4-6b48-43cb-bbb8-5c612d0755c3',
 '000000211': '1e77b878-94be-4f0f-baa1-e44c07dc54b6',
 '000001000': '95a684e7-130f-4b9e-b900-7064ab9793b0',
 '000001002': 'd9365ff4-6b48-43cb-bbb8-5c612d0755c3',
 '000001012': '130b1915-3ddf-4854-8d93-bab1e9c7dbb0',
 '000001020': 'f41eed97-ce91-46b6-b6c0-3f2fd423fbfa',
 '000001021': '4723bf0a-1ab4-49fc-9052-6add67ab5bc0',
 '000001102': '8ede71e4-38

 '000122100': '0306dd8e-c3d3-4809-a444-47eeb59c8cca',
 '000122101': '6b89ee01-ed31-4bce-91d5-95b4d351cdc2',
 '000122110': 'e5a4c251-ca19-4882-955f-74947d5f07c0',
 '000122112': '73dcef4f-867f-482d-8022-c7555aed3000',
 '000122121': '88d32ad9-7fe3-46e6-b30d-adfbffd92d1f',
 '000122211': '04aed953-6a7b-4448-90cc-7e239193d773',
 '000200001': '90171b68-a92f-4d6b-ad3a-486aff06695b',
 '000200010': 'f41eed97-ce91-46b6-b6c0-3f2fd423fbfa',
 '000200011': 'b8fa6de9-4283-4160-8a8e-9b04ee198742',
 '000200100': 'fa4f099e-27fc-459b-bbc2-3cb827357db1',
 '000200101': 'dfb6c484-5ea1-4510-a6f1-3ff11f68acfb',
 '000200110': '4723bf0a-1ab4-49fc-9052-6add67ab5bc0',
 '000200112': '0023c5e9-fdb8-428c-9f0d-e87c7625ece2',
 '000200121': '53b0880b-0418-4427-8d51-3384c9d99cc4',
 '000200211': '3bc69c1f-5863-4860-a25b-741f8fcbb8e1',
 '000201000': '47201c13-eae1-43ad-b0b8-3ca1da8c23e6',
 '000201001': 'f9ba3856-9d10-42af-846e-7f80f4edb926',
 '000201010': 'b2970038-9e97-4ba1-aa93-b5c61c34f127',
 '000201012': '433cd5a9-f50b

 '001211221': 'a4fc562f-fed8-41c5-9131-5f38c39cc5fd',
 '001212000': '423703ce-766b-4f90-a22d-02696db390c8',
 '001212001': '09bed0e0-04ad-4023-8ce7-713e819ad638',
 '001212010': '58a6a719-3fc2-4487-95da-4327eeab1e39',
 '001212012': 'd62b9c9b-e83c-4213-81e9-8e1b8a5ff4c0',
 '001212021': '35a061aa-beeb-45f2-8691-15cb2e35ddf3',
 '001212100': '98fa053e-1e56-450e-9aa4-9570e958db12',
 '001212112': '741181bf-21c3-4052-b7b9-241ebe12229d',
 '001212121': '5ece210c-37b5-4995-b003-bf6c9353c3e4',
 '001212201': '5e04c952-d545-4dae-95bf-b0131ce98a85',
 '001212210': 'f9a56f98-73e0-46e2-acb5-63847b4d4a60',
 '001212211': '0e1c86e5-5495-4632-83e7-3de95be6efb4',
 '001220001': '99aafc49-c45e-45b8-9cf9-45fd429863e9',
 '001220010': '4e217206-50ed-4337-940e-7abefb3695fc',
 '001220011': 'c8fa8214-dd15-416c-b5e8-1376df876907',
 '001220100': '14e17f0a-8f68-4f15-a1ab-0d85d62194e4',
 '001220101': 'f19db4d7-8712-4699-8dd7-537c75a9237a',
 '001220110': '06c58807-0dcd-4eb7-847b-7ba0533b9cb2',
 '001220112': '8be35008-24c4

 '010020112': '72d16f1d-4016-4557-b1e6-74a5cf7aced0',
 '010020120': 'cadad97e-9c52-4f58-aab8-7ebedb3f4b3e',
 '010020121': '71119735-0601-419e-95b8-5b3f1ff359f8',
 '010020201': '48671f9e-d52c-40e8-8753-2b22a7065fd9',
 '010020210': 'd240b444-da7d-42cc-a41c-e3eb274a8209',
 '010020211': '72d16f1d-4016-4557-b1e6-74a5cf7aced0',
 '010021000': 'ad52904c-a7fe-4171-acf7-e6b82627309b',
 '010021002': 'ada66488-5496-4299-b673-5b34248fc504',
 '010021012': '5e9755f7-c296-4a42-a9e4-8f562b720a20',
 '010021020': 'ccb24c13-620e-4c10-9eeb-3e99684cc070',
 '010021021': 'ce80eae8-9586-45a7-b05e-757082365600',
 '010021102': 'cf7b8088-cc84-4d41-98af-2b3e6938b017',
 '010021120': '9c07d7d8-6a03-40d9-9fff-f4e8b92f7275',
 '010021122': 'dc5fe2c1-8254-432f-8b69-ec677f49c0f5',
 '010021200': '817852c4-9c38-4415-899b-70c6fd25b9c0',
 '010021201': 'b98a6394-1216-468f-ae72-5f629ecbb67a',
 '010021210': '85b3c4df-2628-44aa-bf7e-13e0e9b7bfbc',
 '010021212': '0f21b0c7-19d6-4ec7-96f0-dfc11bb6a7c9',
 '010021221': 'e0eba746-0959

 '012110221': '72a9cb88-1f9e-4dcf-a5b7-a1c7a783413d',
 '012111022': 'baa976e5-da11-4042-9f56-f9de45a3e768',
 '012111202': 'c51f1e5d-1f55-4c26-a187-e2a9fdc511f3',
 '012111220': '4a0e824f-2148-42fc-a504-5a8676135070',
 '012112000': 'a2cd361d-7d0d-4fef-ac83-3669a19190f9',
 '012112002': '61e090a3-ffbe-47dc-8cf9-911c036d842d',
 '012112020': 'e6023091-d810-41fc-a78d-e532f02ff9b3',
 '012112021': '75fea4e3-2e51-4819-afb7-491a2896a84d',
 '012112120': '18d89a76-1b0d-4ad3-a280-faa7ba928079',
 '012112122': 'c3d77c31-9bab-460a-90c7-c70740b9a2bd',
 '012112200': '22ad2de6-67a9-4f59-8c48-a883ac4ddd59',
 '012112201': '72a9cb88-1f9e-4dcf-a5b7-a1c7a783413d',
 '012112210': '4a0e824f-2148-42fc-a504-5a8676135070',
 '012112221': '11f6f954-78d8-4e39-ae55-1fca88543310',
 '012120000': 'ada66488-5496-4299-b673-5b34248fc504',
 '012120001': 'cf7b8088-cc84-4d41-98af-2b3e6938b017',
 '012120010': '85b3c4df-2628-44aa-bf7e-13e0e9b7bfbc',
 '012120012': '19456f3a-8ce3-492f-90b7-58a6ea12a59a',
 '012120021': '6c53ddba-52e9

 '021201100': '3c43711e-a05e-4cf6-b210-38c2803950bf',
 '021201102': 'c5b9538f-ecd7-4f49-99ba-2d447fd5ca57',
 '021201112': '061bbbe6-9464-46ed-8898-2e125f7daf7a',
 '021201120': 'e78d09df-870d-4adf-a43b-e32ab66ec632',
 '021201121': '284dd840-2499-46e8-b476-6ae7211fad6a',
 '021201210': '299da04e-13ad-4db5-8f15-6ada90783860',
 '021201211': '75397f87-d0ee-459d-b43b-9539c006af4c',
 '021202011': 'ec27f96f-0285-4879-8fca-f89bd198efba',
 '021202101': '6cc5c91b-1266-42a1-b5b6-c1d1f549ba57',
 '021202110': 'e78d09df-870d-4adf-a43b-e32ab66ec632',
 '021202111': '284dd840-2499-46e8-b476-6ae7211fad6a',
 '021210000': '7078b7b7-acd6-4128-9bb9-2e9b281d3131',
 '021210001': '43af2922-cf7f-45ec-8230-bed5b57a222a',
 '021210010': '2b28f248-f99a-4e2c-9e0a-092dc64f2f0c',
 '021210012': '7dc84458-1ba1-4274-a13f-a52dfdefff6d',
 '021210021': 'cec67ea5-c953-4f97-a547-4bf11273de84',
 '021210100': '0d51a234-a8f7-46af-a75b-f8bd209ade7b',
 '021210112': '2962ed2d-0d45-431e-9b94-383c80f0fca7',
 '021210121': '92f24047-2829

 '100120120': '129fefc3-6847-46e2-a38f-33efcf1d1105',
 '100120200': '386ba8be-00bf-4733-94b8-3d857bcddda6',
 '100120201': 'e3d58dd5-1187-4b99-a95d-49fa2c6a35c1',
 '100120210': '8edd998c-dbaf-4af0-a5be-2a20fb796119',
 '100120212': '442b8df7-b3e6-4e52-a41f-24f9bae29f15',
 '100120221': 'dbc9f7ff-b794-456f-9138-63dd122bf669',
 '100121002': 'fd89878a-aae9-48bc-8c7f-bf8105f22b91',
 '100121020': '59884b80-d6b9-4098-8326-f3f253906d45',
 '100121022': 'c018fd9f-a6a0-4bfa-af46-3fa34e496f4d',
 '100121122': '247057a5-2260-465b-86a6-c38d7580b616',
 '100121200': '72d16f1d-4016-4557-b1e6-74a5cf7aced0',
 '100121202': 'e4a871ca-c83c-41b8-b80c-7e922da9af00',
 '100121212': '1384e204-08b1-440f-bdac-363e61a1ff75',
 '100121220': '88038839-411d-43ac-8005-00de9c3025db',
 '100121221': '68ff1abb-b9f9-4cae-96de-42064b7b8c63',
 '100122000': '0306dd8e-c3d3-4809-a444-47eeb59c8cca',
 '100122001': '11c90f94-fef6-4ca3-adf2-cc9ca135455f',
 '100122010': 'e33d2516-6719-4778-8b53-edfe1394f39f',
 '100122012': '76ef78a1-0c6b

 '102020101': '5661589b-1e84-49e9-972c-17b06950b8df',
 '102020110': 'c2daef48-0653-46a4-bde9-e8b7312a3a9a',
 '102020112': 'b995a811-9ab5-404a-b1dc-3b6fd7890382',
 '102020121': 'd06023bc-4a87-453f-9fd0-e56a21c6ae6d',
 '102020211': 'a5277a2a-c878-47db-9694-298c67b9e26f',
 '102021000': '004d3944-6f73-42d2-942e-2ee5e878f4c0',
 '102021001': 'e3d58dd5-1187-4b99-a95d-49fa2c6a35c1',
 '102021010': 'cf7b8088-cc84-4d41-98af-2b3e6938b017',
 '102021012': 'c605cfe2-8443-4feb-af9b-eb9c806b8de3',
 '102021021': '8be35008-24c4-4cde-84fc-bb1eb953f0bb',
 '102021100': '119a3999-e167-4843-8293-57a555bc880b',
 '102021102': '5c78be7e-1112-43e2-864c-46ea34d6e142',
 '102021112': '0bd24e4c-cb64-4b9a-8b8b-82c931e2ed88',
 '102021120': '487e21b9-e81a-4d1d-ad6b-b9a8a3ef2bc9',
 '102021121': '69eb357a-2069-4321-8449-a2b0eb98ce2d',
 '102021201': 'a5277a2a-c878-47db-9694-298c67b9e26f',
 '102021210': 'c24bd1f3-d31e-44ce-8561-906b56f0ba39',
 '102022011': '84ea9c1d-b481-47e5-9546-62e099eaab7a',
 '102022101': '00947276-0473

 '110200012': '34e0a6ce-a4f9-485b-9837-e30232095fa4',
 '110200020': '7dd1e97b-23e1-4652-83ae-28bad44695b5',
 '110200021': '3c43711e-a05e-4cf6-b210-38c2803950bf',
 '110200102': '739aeba0-7ec9-4205-b96f-84eb0f5ab5be',
 '110200120': 'f3026680-3d8f-425e-8b8d-a0524c2e452a',
 '110200122': 'f72825e9-6eb9-49d1-92a0-6836e4de50e8',
 '110200200': '0c1f3681-3b23-4b4c-974e-4ec5cbbecf62',
 '110200201': 'c1037c3f-f1fa-48de-8b37-2822ea887d11',
 '110200210': '118dfa96-c061-42bd-a8fa-f49ced258463',
 '110200212': '8627f778-d4a4-43d3-bd6a-351d2a9959b4',
 '110200221': 'd8b47f2e-5a43-4e8d-adc8-c92772a6791e',
 '110201002': '3a467242-da66-44d7-9ddd-fd139ddae146',
 '110201020': 'd25f2346-64b9-40dd-a238-306199761f65',
 '110201022': '299da04e-13ad-4db5-8f15-6ada90783860',
 '110201122': '21bd7a8b-ca1d-46bc-ad41-fb6ce422892b',
 '110201200': '5a4cd4b4-4443-46ca-b5ed-30e161234b12',
 '110201202': '3789e9fe-2200-419c-be6b-2212d1754907',
 '110201212': '66e72f2a-14a6-42a7-8170-ef8892786b5c',
 '110201220': '46144b0d-e4b5

 '112212100': '0e1c86e5-5495-4632-83e7-3de95be6efb4',
 '112212102': '4ea3e10a-e66d-4bf4-a512-0457e33aabac',
 '112212120': 'de087722-e4aa-4ac6-9645-4774ab1d5893',
 '112212121': 'bce78268-530d-4259-90cd-9c9d735e20cc',
 '112212211': 'e84cc889-8fea-40e9-8a59-ac028fbb4750',
 '112220001': '8be35008-24c4-4cde-84fc-bb1eb953f0bb',
 '112220010': 'c1fc7288-d244-4030-b410-6de9dcf77bc2',
 '112220011': '0330a270-a870-471e-9148-f961885a08c3',
 '112220100': 'bab84770-ba4a-433d-9356-530234288d8f',
 '112220101': '69eb357a-2069-4321-8449-a2b0eb98ce2d',
 '112220110': '123e8d33-0fbd-41b4-93f7-38beb2308835',
 '112220112': 'c7f20852-0ae6-4712-865b-513789ce190d',
 '112220121': '63c055c1-dea0-4a04-b184-0b1276da806e',
 '112220211': '430e4574-be9e-467f-b076-ed3a379b37ea',
 '112221000': '04aed953-6a7b-4448-90cc-7e239193d773',
 '112221001': 'f66d31d8-bfe4-4b2d-9e20-9a1a63eb92e7',
 '112221010': '958694c4-6a82-45c3-9cfb-d60dc28e8540',
 '112221012': '4c643047-6603-44eb-9c4b-8454c3853c28',
 '112221021': 'e6ca22e2-37c5

 '121021122': 'f48ed913-ac55-4b18-82a6-a26959cacca1',
 '121021200': 'ea7e1e28-1ee3-4c4f-8427-8074bd5b725a',
 '121021201': '76ee5b6b-4d51-46e4-a47d-d2a8c6754d3b',
 '121021210': 'ed93cb3f-a677-4a15-ba96-6cc4ea902bd2',
 '121021212': '68a869fe-be3f-4c02-8251-84e753f1911d',
 '121022001': 'a08171c1-0dfe-4e5b-b229-c6bf28e3e162',
 '121022010': '82d8c5aa-172d-4d89-9296-00ac19c4a441',
 '121022011': 'f639efe4-2cf5-4533-b73e-8b9e8d2a8849',
 '121022100': '95a710c4-27fa-4c1d-aa6f-bcf92fa722e0',
 '121022101': '29c1b5aa-91c8-4762-a186-820de7ffc3ae',
 '121022110': '04b521eb-df27-4151-970d-0ad43b010666',
 '121022112': '7755864a-e853-478a-8975-9a991ff1590e',
 '121022121': '5d5a27a5-0bfe-439d-8976-18c6511aa827',
 '121022211': '63c055c1-dea0-4a04-b184-0b1276da806e',
 '121100002': '739aeba0-7ec9-4205-b96f-84eb0f5ab5be',
 '121100020': 'ee7c0802-66fe-4860-8f40-4becf8f55429',
 '121100022': '43a4a048-51c7-4842-af45-5e6f61296291',
 '121100122': '9b8e26ee-5dbd-4036-8d86-45caf8bdb76c',
 '121100200': '12190310-fdbc

 '200100021': '759d3c4c-7f55-40e3-ae14-3590dca34eba',
 '200100100': '1e77b878-94be-4f0f-baa1-e44c07dc54b6',
 '200100102': '753ecc3c-cb5f-4b24-9a3d-197d17790c6f',
 '200100112': 'c195ddb4-1bee-4ca2-93e2-b1d4a391e335',
 '200100120': '0023c5e9-fdb8-428c-9f0d-e87c7625ece2',
 '200100121': '12190310-fdbc-4e2a-8b16-3c58b088d991',
 '200100201': '26f43d8e-3adc-465c-b8fe-ef02defe0dd3',
 '200100210': 'd52accbb-337c-499b-8619-6c1764218098',
 '200100211': '6a877552-0f03-4724-b52d-ba46cf895d02',
 '200101000': '1884387c-cbe9-4fe0-8b53-82befa512aad',
 '200101002': '1033e799-a7ab-4da5-a951-16e0afb3725d',
 '200101012': 'c552c4fb-72c6-47a4-93e0-6e647861ae20',
 '200101020': '741b3baf-8fdb-43ec-9302-58496c2d4e29',
 '200101021': '34e0a6ce-a4f9-485b-9837-e30232095fa4',
 '200101102': 'a332c2a5-5574-4d88-8c74-62099337cce8',
 '200101120': '2c88c9e7-0b32-41d1-808c-db273bc57229',
 '200101122': '78cd9d0e-30bd-43cd-bee0-17e30e2af3d7',
 '200101200': 'b4ee0810-3e76-44f8-8342-0cf56315d404',
 '200101201': '26a7c22f-83df

 '210000021': 'a7c04ad6-357f-475b-8720-0b678802d016',
 '210000100': '8ede71e4-38a3-4d46-8e3b-a903738d8907',
 '210000102': 'e974b3dc-b4c1-491b-ab98-f33149ab9172',
 '210000112': 'a332c2a5-5574-4d88-8c74-62099337cce8',
 '210000120': 'd83d2467-b98c-4814-983a-408322bf9687',
 '210000121': '2bfae856-0eb0-4890-9f17-ea3bbec21e8b',
 '210000201': '3a3d9be3-2089-4ad2-aac9-5ccc3718a2dc',
 '210000210': '6c367433-1336-4dac-a4b3-fb5171d356ac',
 '210000211': '618d7f44-0106-4e98-9579-2bf38575df15',
 '210001000': 'c8bae379-8786-4dfa-bfe8-4156449b6fe1',
 '210001002': '16129ddb-2d71-4940-8918-960293ed5786',
 '210001012': 'c552c4fb-72c6-47a4-93e0-6e647861ae20',
 '210001020': '860ddf75-08f8-4ffe-afb4-743512bdfb99',
 '210001021': '3a467242-da66-44d7-9ddd-fd139ddae146',
 '210001102': '264de447-ed62-4488-ab6c-a4fc47401d44',
 '210001120': '9870c168-b053-4b3f-903d-eccd943de601',
 '210001122': 'c97c80d0-8009-488a-98d6-6ddbda98d400',
 '210001200': '10bede57-73e0-409e-9eb2-29a52487e339',
 '210001201': '1ec5e8cc-420f

 '211002010': '2c88c9e7-0b32-41d1-808c-db273bc57229',
 '211002012': '78cd9d0e-30bd-43cd-bee0-17e30e2af3d7',
 '211002021': 'da1d0e8a-f64c-4444-bb81-d02cc18c29b3',
 '211002100': 'd144e31f-e5ab-4293-b694-47491bd66a9e',
 '211002102': '21cdcbc9-77e3-44c4-95eb-d764c57d8ef3',
 '211002112': '5f711135-a940-4cb7-bd41-a25de2bc67d2',
 '211002120': 'c5b9538f-ecd7-4f49-99ba-2d447fd5ca57',
 '211002121': '08af1249-e0ee-4803-b536-fd20ecb883f2',
 '211002201': '106433f9-9781-44a2-9f79-c4afda38554f',
 '211002210': '953b08a1-5976-43c8-a19a-0c10073b0ca8',
 '211002211': '368c7126-df26-4f2a-b9ab-6164d2453ee1',
 '211010002': 'ac29d051-13ff-452b-86fa-7a3ba2c370fe',
 '211010020': 'd5a419ef-c786-45bd-b183-e85d0e15fa78',
 '211010022': 'f744d97b-7034-449b-b0f1-85efcd7d7b88',
 '211010122': 'c2b986ba-e1d6-445f-9c14-1e06af73715a',
 '211010200': 'e01d314a-ae29-4b10-af89-d8cc34e491e4',
 '211010202': '28565f90-532c-42e7-89df-2d448ecb77ea',
 '211010212': '95a24b1b-747c-447e-9a29-15c13fa9a30c',
 '211010220': '66b3abb3-bf76

 '212100000': 'd52accbb-337c-499b-8619-6c1764218098',
 '212100001': '7d586ded-4001-456c-8818-d07bd7569b3d',
 '212100010': '0e09b004-0b4f-49db-b4f7-c0b9548e6326',
 '212100012': '0ccc4f65-c2ca-462b-b3d7-9256ff7607da',
 '212100021': 'e25b9651-4fda-42a9-a124-6c84532df7bf',
 '212100100': '6a877552-0f03-4724-b52d-ba46cf895d02',
 '212100102': '468c1f96-9b44-46ca-b494-7e81b7cd0288',
 '212100112': '158acf8b-f558-4b5a-ba1c-1b004a94d8fd',
 '212100120': '351c6ae2-3b3b-4f11-945b-1203d2d59df0',
 '212100121': 'bec65878-4cf7-448c-9208-b72d40872684',
 '212100201': '53338abb-5718-4777-a2a7-3f1764116da0',
 '212100210': 'ade4af8b-ec09-4115-b958-983a26e60cf7',
 '212100211': '410344a2-0269-496a-a813-69c243e5da2a',
 '212101000': 'c127b54e-10e6-4245-bae5-14ceff1fa246',
 '212101002': 'ade4af8b-ec09-4115-b958-983a26e60cf7',
 '212101012': 'ab33c075-1fee-4d39-9867-fa1fcdc6205f',
 '212101020': '4ad45626-7d56-473d-bbb2-e2b9bb08c5da',
 '212101021': '0e5a9bb1-d9dd-434c-834b-eb642b0bfc7a',
 '212101102': '410344a2-0269

 '264de447-ed62-4488-ab6c-a4fc47401d44': {'value': -0.5415019762845853,
                                          'visits': 253},
 '26a7c22f-83df-4a58-87bd-33d24d3ee0e3': {'value': 0.11779448621553895,
                                          'visits': 399},
 '26f43d8e-3adc-465c-b8fe-ef02defe0dd3': {'value': 0.3536977491961415,
                                          'visits': 311},
 '27908b83-023d-4df1-9390-117315f82dcc': {'value': -0.5862068965517243,
                                          'visits': 290},
 '27afb869-c75a-48de-b37b-1b1029ac578b': {'value': -0.5810810810810813,
                                          'visits': 444},
 '284dd840-2499-46e8-b476-6ae7211fad6a': {'value': 1.0, 'visits': 316},
 '28565f90-532c-42e7-89df-2d448ecb77ea': {'value': -0.23600973236009726,
                                          'visits': 411},
 '2860d559-cedc-4edf-8854-d2a3297fa3eb': {'value': 1.0, 'visits': 1507},
 '2871ebba-6569-4714-a281-e04c8e9721cf': {'value': -0.5390946502057613,
   

                                          'visits': 309},
 '4703ce6c-26fb-4776-95d6-a517aaaed473': {'value': 1.0, 'visits': 1422},
 '47201c13-eae1-43ad-b0b8-3ca1da8c23e6': {'value': -0.06567489114658952,
                                          'visits': 2756},
 '4723bf0a-1ab4-49fc-9052-6add67ab5bc0': {'value': 0.3381588715664434,
                                          'visits': 2694},
 '484d1a2a-a9b2-4a04-8a64-c63c75538411': {'value': 1.0, 'visits': 397},
 '48609a73-393a-450e-a2f8-b4635896ae25': {'value': -0.01820279451352392,
                                          'visits': 7801},
 '48671f9e-d52c-40e8-8753-2b22a7065fd9': {'value': -0.6696149843912602,
                                          'visits': 1922},
 '487e21b9-e81a-4d1d-ad6b-b9a8a3ef2bc9': {'value': -0.869369369369368,
                                          'visits': 1332},
 '48f4707f-556d-49ff-8546-28439e695b1d': {'value': 1.0, 'visits': 1575},
 '49914d41-52e9-4086-9507-7a5b01aa03e2': {'value': -0.515185601799775

                                          'visits': 394},
 '8b7deb5e-5410-4b00-9df9-73acf65ecb3b': {'value': -0.9166666666666666,
                                          'visits': 144},
 '8ba99cec-f9b9-456f-9db1-e7a3e9d464d7': {'value': 1.0, 'visits': 1578},
 '8bda1c90-7c17-44ba-8d67-286f4b60e12c': {'value': -0.013017500227472532,
                                          'visits': 164855},
 '8be35008-24c4-4cde-84fc-bb1eb953f0bb': {'value': 0.7042424242424244,
                                          'visits': 825},
 '8bfe5951-c6ee-4ecc-b5b1-3bca398c7d1c': {'value': -0.7260155574762325,
                                          'visits': 1157},
 '8c4ac9c3-377d-4ddd-a597-dbb498bd84e5': {'value': 0.8445807770961149,
                                          'visits': 978},
 '8c7c9ed7-0873-44a2-bd95-89ef07fba298': {'value': 0.4732394366197181,
                                          'visits': 355},
 '8c99b36b-ad27-4d4b-876c-5c576e371c8b': {'value': 0.37888198757764,
                 

 'e72c8a1e-c01d-4f65-b31e-fc38c481c5fc': {'value': 1.0, 'visits': 1787},
 'e78d09df-870d-4adf-a43b-e32ab66ec632': {'value': -0.19999999999999998,
                                          'visits': 140},
 'e79b5867-68f8-4ada-9f7b-9646f0f0182e': {'value': 0.716902581182349,
                                          'visits': 1201},
 'e7dd3826-47d1-455d-ac30-e9a3870524d3': {'value': -0.22857142857142854,
                                          'visits': 210},
 'e81f5db2-b4e7-4207-99db-c88a88b2a4cd': {'value': -0.6541353383458651,
                                          'visits': 266},
 'e84cc889-8fea-40e9-8a59-ac028fbb4750': {'value': 1.0, 'visits': 30},
 'e85f12a4-afeb-454e-a26c-6f5e34c39cae': {'value': 1.0, 'visits': 1069},
 'e974b3dc-b4c1-491b-ab98-f33149ab9172': {'value': 0.3224755700325736,
                                          'visits': 307},
 'e97c5ffb-39d8-44ac-a24b-9d3568e12a19': {'value': -0.8417132216014905,
                                          'visits': 1074},
 '

                                          'visits': 1143},
 'ff670f0b-6672-4b43-86b7-17ff23fbf149': {'value': 1.0, 'visits': 590},
 'ff9fac50-d263-44aa-ac43-f9d4127759df': {'value': 0.40121580547112445,
                                          'visits': 329}}


In [None]:
# Start a game via play() with the MCTS search policy (1 s budget per move)
# assigned to player 2; player 1 is left unassigned (presumably the human --
# confirm against play()).
play(policy={2:searchpi(mctsdata,1)})

## Approximate Reinforcement Learning:  Neural networks
...

## Policy Gradients: REINFORCE

$$\large \nabla J(\theta) \propto \sum_s \mu(s) \sum_a q_\pi(s,a) \nabla \pi(a|s, \theta)$$

...

In [8]:
### POLICY-GRADIENTS ###

class pimodel(Model):
    """Keras policy network with a masked-softmax output over actions.

    The model has two inputs -- a feature vector and an action mask with the
    same width as the output -- and produces a softmax in which every entry
    whose mask value is falsy receives (effectively) zero probability.
    Instances are callable as policies: `model(s)` returns
    `(actions, probabilities)` for state `s`.
    """
    def __init__(self, name, *nn):
        """Build and compile the network.

        name -- Keras model name; also the stem of the weights file (see path()).
        nn   -- layer widths: nn[0] is the input width, nn[1:-1] are hidden
                ReLU layers, nn[-1] is the output (and mask) width.
        """
        # Most negative finite float32, taken from TensorFlow's own dtype so
        # the class does not rely on an `np` alias that the notebook's import
        # cell never binds explicitly.
        lowest = tf.constant(tf.float32.min, dtype=tf.float32)
        mask   = Input((nn[-1],))
        def maskedsoftmax(x):
            # Logits whose mask entry is falsy are replaced by `lowest`, so
            # the softmax assigns them ~0 probability.
            ones     = tf.ones_like(x)
            boolmask = tf.cast(mask, dtype=tf.dtypes.bool)
            masked_x = tf.where(boolmask, x, ones*lowest)
            return softmax(masked_x)
        x     = Input((nn[0],))
        y     = x
        for n in nn[1:-1]:
            y = Dense(n, activation='relu', trainable=True)(y)
        y     = Dense(nn[-1], trainable=True)(y)
        z     = Activation(maskedsoftmax)(y)
        super(pimodel, self).__init__(inputs=[x,mask], outputs=z, name=name)
        optimizer = Adam(lr=0.001, clipnorm=1.0)
        self.compile(loss=categorical_crossentropy, optimizer=optimizer)
        self.__name__ = name
    def __call__(self, s):
        """Act as a policy: return (aa, pp) for state `s`.

        aa -- the actions returned by actions(s).
        pp -- the network outputs at those action indices.
        """
        aa   = actions(s)
        xx   = array([onehot(s)])
        mask = array([actionmask(aa)])
        zz   = self.predict([xx,mask])
        pp   = ravel(zz)[aa]
        return aa,pp
    def __str__(self):
        return self.name
    def path(self):
        """Return the file path used by save(), load() and exists()."""
        return f'{DIR}/{self.__name__}.npy'
    def save(self):
        """Write the model weights to path() as a pickled object array.

        The weight tensors have different shapes, so they are packed one by
        one into a 1-D object array; handing the raw list to numpy's save()
        is rejected as a ragged array by recent NumPy versions.
        """
        weights = self.get_weights()
        packed  = empty(len(weights), dtype=object)
        for i,w in enumerate(weights):
            packed[i] = w
        save(self.path(), packed, allow_pickle=True)
    def load(self):
        """Read the weights written by save() and install them.

        allow_pickle=True is required because the file holds an object array
        (numpy's load() refuses those by default since NumPy 1.16.3).
        SECURITY: unpickling executes code from the file -- only load weight
        files that this notebook produced itself.
        """
        weights = load(self.path(), allow_pickle=True)
        self.set_weights(list(weights))
    def exists(self):
        """Return True when a weights file is present at path()."""
        return isfile(self.path())

def train(models, discount=0.99, iters=100, games=1, rates={0:0.1}, epsilons={0:0.1}):
    """Train a policy network with sample-weighted cross-entropy (REINFORCE style).

    models   -- dict mapping player -> policy.  The FIRST key is the agent
                being trained and its value must be the trainable model; the
                other player's entry, when present, is used as the rival,
                otherwise the rival is lookaheadpi(randompi).
    discount -- discount factor forwarded to getoutcome().
    iters, games, rates, epsilons
             -- forwarded unchanged to iter_under_policy().

    For each batch handed over by iter_under_policy(), only the positions
    where the agent is the player to move are kept; the chosen actions become
    one-hot targets and the outcomes from getoutcome() become per-sample
    weights of the loss.

    NOTE: this notebook defines several functions called `train` in
    different cells; the most recently executed definition wins.
    """
    agent,*_      = models.keys()
    rival         = other(agent)
    model         = models[agent]
    policy        = {}
    policy[agent] = model
    policy[rival] = models[rival] if rival in models else lookaheadpi(randompi)
    def proc(ss,aa,pp,rr,rate=0.1):
        # Apply the learning rate scheduled for this step.
        K.set_value(model.optimizer.lr, rate)
        yy       = getoutcome(ss,aa,pp,rr,discount=discount)
        # Keep only the steps in which the agent was the one to move.
        ii       = [i for i,s in enumerate(ss) if agent == player(s)]
        ss,aa,yy = ss[ii],aa[ii],yy[ii]
        xx       = array([onehot(s) for s in ss])
        mask     = array([actionmask(actions(s)) for s in ss])
        # One-hot targets over the 9 board cells; outcomes weight each sample.
        ohaa     = to_categorical(aa,9)
        error    = model.train_on_batch(x=[xx,mask], y=ohaa, sample_weight=yy, reset_metrics=False)
        return error
    iter_under_policy(policy, iters=iters, games=games, rates=rates, epsilons=epsilons, proc=proc)
                
# --- Policy gradients, player 1: build (or load) pimodel1 and evaluate -----
seed(42)
# 18 inputs, one hidden layer of 32 units, 9 outputs.
pimodel1 = pimodel('pimodel1',18,32,9)
# summary() prints the table itself and returns None, hence the trailing
# "None" line in the cell output.
print(pimodel1.summary())

# Cache guard: train only when no saved weights exist yet.
if pimodel1.exists():
    pimodel1.load()
else:
    # NOTE(review): the epsilon entry at key 1000 lies beyond iters=300 and
    # repeats the value at key 0 -- presumably it never changes anything;
    # confirm against iter_under_policy.
    train({1:pimodel1}, 
        discount = 0.99, 
        iters    = 300, 
        games    = 20, 
        rates    = {0:0.01, 200:0.001}, 
        epsilons = {0:0.1, 1000: 0.1})
    pimodel1.save()
MTIME(pimodel1.path())

# Test the raw network policy (pimodel1 used directly as the agent).
agentpi = pimodel1
rivalpi = lookaheadpi(randompi)
testgames({1:agentpi, 2:rivalpi}, iters=TESTS)

# Test again with the network wrapped in maxpi().
agentpi = maxpi(pimodel1)
rivalpi = lookaheadpi(randompi)
testgames  ({1:agentpi, 2:rivalpi}, iters=TESTS)

# Enumerate the maxpi agent as player 1 against each rival.
enum_policies([
    {1:agentpi, 2:rivalpi},
    {1:agentpi, 2:qpi(dptable)},
    {1:agentpi, 2:qpi(mctable)},
    {1:agentpi, 2:qpi(tdtable)},
    {1:agentpi, 2:qpi(mctsdata)},
])



Instructions for updating:
Colocations handled automatically by placer.
Model: "pimodel1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 18)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                608       
_________________________________________________________________
dense_2 (Dense)              (None, 9)                 297       
_________________________________________________________________
activation_1 (Activation)    (None, 9)                 0         
Total params: 905
Trainable params: 905
Non-trainable params: 0
_________________________________________________________________
None
tmp/ttt-3/pimodel1.npy Aug 02 2020 16:43:32
testing policy 1:pimodel1 vs 2:lookaheadpi(randompi)
testing policy 1:maxpi(pimodel1) vs 2:lookaheadpi(randompi)
enumerating policy 1:max

Unnamed: 0,policy1,policy2,states1,states2,draws,wins1,wins2
0,maxpi(pimodel1),lookaheadpi(randompi),113,105,5,66,0
1,maxpi(pimodel1),qpi(dptable),18,17,4,0,0
2,maxpi(pimodel1),qpi(mctable),5,5,1,0,0
3,maxpi(pimodel1),qpi(tdtable),5,5,1,0,0
4,maxpi(pimodel1),qpi(mctsdata),18,17,4,0,0


In [None]:
# Start a game via play() with the maxpi-wrapped policy network as player 1;
# player 2 is left unassigned (presumably the human -- confirm against play()).
play(policy={1:maxpi(pimodel1)}) 

In [9]:
### POLICY-GRADIENTS PLAYER2 ###

# --- Policy gradients, player 2: build (or load) pimodel2 and evaluate -----
seed(1)
pimodel2 = pimodel('pimodel2',18,32,9)
# summary() prints the table itself and returns None, hence the trailing
# "None" line in the cell output.
print(pimodel2.summary())

# Rival used during training, built by choicepi() from several policies
# (presumably it picks among them -- confirm against choicepi).
allpi = choicepi(randompi,lookaheadpi(randompi),qpi(dptable),qpi(mctable),qpi(tdtable))

# Cache guard: train only when no saved weights exist yet.
if pimodel2.exists():
    pimodel2.load()
else:
    # Key order matters: train() treats the FIRST key (2) as the agent.
    train({2:pimodel2,1:allpi}, 
        discount = 0.99, 
        iters    = 100_000, 
        games    = 30, 
        rates    = {0:0.01, 30_000:0.001, 70_000:0.0001}, 
        epsilons = {0:0.0}
    )
    pimodel2.save()
MTIME(pimodel2.path())

# Test the raw network policy as player 2.
agentpi = pimodel2
rivalpi = lookaheadpi(randompi)
testgames({2:agentpi, 1:rivalpi}, iters=TESTS)

# Test again with the network wrapped in maxpi().
agentpi = maxpi(pimodel2)
rivalpi = lookaheadpi(randompi)
testgames  ({2:agentpi, 1:rivalpi}, iters=TESTS)

# Enumerate the maxpi agent as player 2 against each player-1 rival.
enum_policies([
    {2:agentpi, 1:lookaheadpi(randompi)},
    {2:agentpi, 1:qpi(dptable)},
    {2:agentpi, 1:qpi(mctable)},
    {2:agentpi, 1:qpi(tdtable)},
    {2:agentpi, 1:qpi(mctsdata)},
    {2:agentpi, 1:maxpi(pimodel1)},
])


Model: "pimodel2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 18)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 32)                608       
_________________________________________________________________
dense_4 (Dense)              (None, 9)                 297       
_________________________________________________________________
activation_2 (Activation)    (None, 9)                 0         
Total params: 905
Trainable params: 905
Non-trainable params: 0
_________________________________________________________________
None
tmp/ttt-3/pimodel2.npy Aug 02 2020 16:40:34
testing policy 2:pimodel2 vs 1:lookaheadpi(randompi)
testing policy 2:maxpi(pimodel2) vs 1:lookaheadpi(randompi)
enumerating policy 1:lookaheadpi(randompi) vs 2:maxpi(pimodel2) with EnumProc
enumerating policy

Unnamed: 0,policy1,policy2,states1,states2,draws,wins1,wins2
0,lookaheadpi(randompi),maxpi(pimodel2),240,316,12,0,116
1,qpi(dptable),maxpi(pimodel2),20,22,3,0,0
2,qpi(mctable),maxpi(pimodel2),5,5,1,0,0
3,qpi(tdtable),maxpi(pimodel2),5,5,1,0,0
4,qpi(mctsdata),maxpi(pimodel2),8,9,2,0,0
5,maxpi(pimodel1),maxpi(pimodel2),5,5,1,0,0


In [None]:
# Start a game via play() with the maxpi-wrapped policy network as player 2;
# player 1 is left unassigned (presumably the human -- confirm against play()).
play(policy={2:maxpi(pimodel2)}) 

## Approximate Q-Learning
...

In [10]:
### APPROXIMATE-Q-LEARNING ###

class qmodel(Model):
    """Keras action-value network with a small replay buffer.

    The network maps a feature vector to one tanh-squashed value per action.
    Instances are callable as policies: `model(s)` returns the actions of
    state `s` together with either their values (logits=True) or the
    probabilities that softmaxprob() derives from those values.
    """
    def __init__(self, name, *nn, activation=None, buffer=100, batch=10):
        """Build and compile the network and create the replay buffer.

        name       -- Keras model name; also the stem of the weights file.
        nn         -- layer widths: nn[0] inputs, nn[1:-1] hidden layers,
                      nn[-1] outputs.
        activation -- activation of the hidden layers.
        buffer     -- replay capacity; the deque holds buffer*9 samples.
        batch      -- sample size; pop() draws up to batch*9 samples.
        """
        def clone():
            # Forward buffer/batch as well, so the copy keeps this model's
            # replay configuration instead of falling back to the defaults.
            twin = qmodel(name,*nn,activation=activation,buffer=buffer,batch=batch)
            twin.set_weights(self.get_weights())
            return twin
        x     = Input((nn[0],))
        y     = x
        for n in nn[1:-1]:
            y = Dense(n, activation=activation)(y)
        z     = Dense(nn[-1], activation='tanh')(y)
        super(qmodel, self).__init__(inputs=x, outputs=z, name=name)
        optimizer = Adam(lr=0.001, clipnorm=1.0)
        self.compile(loss=mse, optimizer=optimizer)
        self.__name__ = name
        self.buffer   = deque(maxlen=buffer*9)
        self.batch    = batch*9
        self.clone    = clone
    def __call__(self, s, logits=False):
        """Act as a policy: return (aa, qq) if `logits` else (aa, pp).

        aa -- the actions returned by actions(s).
        qq -- the network outputs at those action indices.
        pp -- softmaxprob(qq).
        """
        aa   = actions(s)
        xx   = array([onehot(s)])
        zz   = self.predict(xx)
        qq   = ravel(zz)[aa]
        if logits:
            return aa,qq
        pp   = softmaxprob(qq)
        return aa,pp
    def push(self, xx, yy):
        """Append every (x, y) pair to the replay buffer (oldest entries drop out)."""
        for x,y in zip(xx,yy):
            self.buffer.append((x,y))
    def pop(self):
        """Draw a random sample (with replacement) from the replay buffer.

        Despite the name, nothing is removed from the buffer.
        Returns (xx, yy) arrays with min(self.batch, len(buffer)) rows.
        """
        n  = len(self.buffer)
        ii = choice(arange(n), size=min(self.batch,n))
        xx = array([self.buffer[i][0] for i in ii])
        yy = array([self.buffer[i][1] for i in ii])
        return xx,yy
    def exchange(self, xx, yy):
        """Store the new samples, then return a random sample of the buffer."""
        self.push(xx, yy)
        return self.pop()
    def __str__(self):
        return self.name
    def path(self):
        """Return the file path used by save(), load() and exists()."""
        return f'{DIR}/{self.__name__}.npy'
    def save(self):
        """Write the model weights to path() as a pickled object array.

        The weight tensors have different shapes, so they are packed one by
        one into a 1-D object array; handing the raw list to numpy's save()
        is rejected as a ragged array by recent NumPy versions.
        """
        weights = self.get_weights()
        packed  = empty(len(weights), dtype=object)
        for i,w in enumerate(weights):
            packed[i] = w
        save(self.path(), packed, allow_pickle=True)
    def load(self):
        """Read the weights written by save() and install them.

        allow_pickle=True is required because the file holds an object array
        (numpy's load() refuses those by default since NumPy 1.16.3).
        SECURITY: unpickling executes code from the file -- only load weight
        files that this notebook produced itself.
        """
        weights = load(self.path(), allow_pickle=True)
        self.set_weights(list(weights))
    def exists(self):
        """Return True when a weights file is present at path()."""
        return isfile(self.path())
    
def train(models, discount=0.9, iters=100, games=10, rates={0:0.1}, epsilons={0:0.1}):
    """Train the first model in `models` from games played under the given policies.

    models   -- dict player -> model; the first key is the agent being trained.
                If the other player has no entry it plays lookaheadpi(randompi).
    discount -- forwarded to getoutcome when computing targets.
    iters, games, rates, epsilons -- forwarded unchanged to iter_under_policy.
    """
    agent,*_ = models
    rival    = other(agent)
    model    = models[agent]
    opponent = models[rival] if rival in models else lookaheadpi(randompi)
    policy   = {agent: model, rival: opponent}
    def mcproc(ss,aa,pp,rr,rate=0.1):
        # Callback handed to iter_under_policy: one replay-buffered gradient step.
        K.set_value(model.optimizer.lr, rate)
        outcomes = getoutcome(ss,aa,pp,rr,discount=discount)
        inputs   = array([onehot(s) for s in ss])
        targets  = model.predict(inputs)
        # Only the output of the action actually taken is overwritten; every
        # other output keeps the network's own prediction (no action mask).
        for row,a,y in zip(targets,aa,outcomes):
            row[int(a)] = y
        inputs,targets = model.exchange(inputs,targets)
        return model.train_on_batch(x=inputs, y=targets, reset_metrics=False)
    iter_under_policy(policy, iters=iters, games=games, rates=rates, epsilons=epsilons, proc=mcproc)
        
# Seed the RNG before the network is built and trained.
seed(42)
# Layer widths 18 -> 36 -> 9 with a tanh hidden layer; qmodel multiplies the
# buffer and batch arguments by 9 internally.
qmodelxo = qmodel('qmodelxo',18,36,9,activation='tanh',buffer=100,batch=20)  
print(qmodelxo.summary())

# Reuse the saved weights when the weight file exists; otherwise train and save.
if qmodelxo.exists():
    qmodelxo.load()
else:
    # The same network is registered for both player 1 and player 2.
    # NOTE(review): how iter_under_policy consumes `rates` is not visible here;
    # keys 10_000 and 15_000 are not below iters=10_000 — confirm they are ever applied.
    train({1:qmodelxo,2:qmodelxo}, 
          discount = 0.9, 
          iters    = 10_000, 
          games    = 10, 
          rates    = {0:0.1, 3_000:0.05, 5_000:0.01, 10_000:0.005, 15_000:0.001}, 
          epsilons = {0:0.1})
    qmodelxo.save()
MTIME(qmodelxo.path())

# Policies used for evaluation: the trained network wrapped in maxpi versus
# lookaheadpi(randompi).
agentpi = maxpi(qmodelxo) 
rivalpi = lookaheadpi(randompi)

# Test the agent in both seats (as player 1, then as player 2).
testgames({1:agentpi, 2:rivalpi}, iters=TESTS)
testgames({2:agentpi, 1:rivalpi}, iters=TESTS)

# Enumerate match-ups against each earlier policy, first with the agent as
# player 1, then as player 2.
enum_policies([
    {1:agentpi, 2:rivalpi},
    {1:agentpi, 2:qpi(dptable)},
    {1:agentpi, 2:qpi(mctable)},
    {1:agentpi, 2:qpi(tdtable)},
    {1:agentpi, 2:qpi(mctsdata)},
    {1:agentpi, 2:maxpi(pimodel2)},
    {2:agentpi, 1:rivalpi},
    {2:agentpi, 1:qpi(dptable)},
    {2:agentpi, 1:qpi(mctable)},
    {2:agentpi, 1:qpi(tdtable)},
    {2:agentpi, 1:qpi(mctsdata)},
    {2:agentpi, 1:maxpi(pimodel1)},
])

Model: "qmodelxo"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 18)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 36)                684       
_________________________________________________________________
dense_6 (Dense)              (None, 9)                 333       
Total params: 1,017
Trainable params: 1,017
Non-trainable params: 0
_________________________________________________________________
None
tmp/ttt-3/qmodelxo.npy Aug 27 2020 17:28:23
testing policy 1:maxpi(qmodelxo) vs 2:lookaheadpi(randompi)
testing policy 2:maxpi(qmodelxo) vs 1:lookaheadpi(randompi)
enumerating policy 1:maxpi(qmodelxo) vs 2:lookaheadpi(randompi) with EnumProc
enumerating policy 1:maxpi(qmodelxo) vs 2:qpi(dptable) with EnumProc
enumerating policy 1:maxpi(qmodelxo) vs 2:qpi(mctable) with EnumProc
e

Unnamed: 0,policy1,policy2,states1,states2,draws,wins1,wins2
0,maxpi(qmodelxo),lookaheadpi(randompi),106,93,4,61,1
1,maxpi(qmodelxo),qpi(dptable),7,7,2,0,0
2,maxpi(qmodelxo),qpi(mctable),5,5,1,0,0
3,maxpi(qmodelxo),qpi(tdtable),5,5,1,0,0
4,maxpi(qmodelxo),qpi(mctsdata),7,7,2,0,0
5,maxpi(qmodelxo),maxpi(pimodel2),5,5,1,0,0
6,lookaheadpi(randompi),maxpi(qmodelxo),263,359,10,23,125
7,qpi(dptable),maxpi(qmodelxo),19,26,1,6,0
8,qpi(mctable),maxpi(qmodelxo),5,5,1,0,0
9,qpi(tdtable),maxpi(qmodelxo),3,3,0,1,0


In [None]:
# Start play() with maxpi(qmodelxo) as the policy for player 2; no policy is
# given for player 1 (presumably the interactive user — TODO confirm).
play(policy={2:maxpi(qmodelxo)})

## Scalable Approach: Monte Carlo Tree Search and Deep Q-Network
...

In [11]:
### MCTS & DQN ###

def makenode(value=0, parent=None, terminal=False):
    """Create a search-tree node as a plain dict.

    The node starts with zero visits. `branches` is an empty dict for an
    ordinary node and None for a terminal one.
    """
    branches = None if terminal else {}
    return dict(value=value, visits=0, parent=parent, branches=branches)

def isterminal(node):
    """A node is terminal exactly when its `branches` entry is None."""
    branches = node['branches']
    return branches is None

def isexpanded(node):
    """True when the node has at least one child; False for empty or terminal nodes."""
    branches = node['branches']
    if not branches:
        return False
    return len(branches) > 0

def backpropagate(node, reward, rate=0.1):
    """Fold `reward` into the running mean of `node` and each of its ancestors.

    The sign of the reward is flipped at every step towards the root.
    `rate` is not used by the running-mean update; it would only matter for a
    constant-step update (value += (reward - value) * rate).
    """
    current, signed = node, reward
    while current is not None:
        current['visits'] += 1
        # Incremental mean over all visits so far.
        current['value'] += (signed - current['value']) / current['visits']
        current, signed = current['parent'], -signed
                
def UCT(node, action, exploration):
    """Score the child reached by `action` from `node`.

    With exploration == 0 the score is the child's value alone. Otherwise a
    child (or parent) without visits scores +inf, and visited children get
    value + exploration * sqrt(2 * ln(parent visits) / child visits).
    """
    child         = node['branches'][action]
    parent_visits = node['visits']
    child_visits  = child['visits']
    mean          = child['value']
    if exploration == 0:
        return mean
    if parent_visits == 0 or child_visits == 0:
        return float('inf')
    bonus = sqrt(2*log(parent_visits)/child_visits)
    return mean + exploration*bonus

def selectbranchpi(node, exploration=0):
    """Return (actions, probabilities) over the children of an expanded node.

    Each child is scored with UCT; the scores are turned into probabilities
    by argmaxprob.
    """
    assert(isexpanded(node))
    moves  = list(node['branches'])
    scores = array([ UCT(node,move,exploration) for move in moves ])
    return moves,argmaxprob(scores)

def selectbranch(node, state, exploration):
    """Sample one child of `node` and return (child node, state after the move)."""
    moves,weights = selectbranchpi(node, exploration)
    move          = choice(moves, p=weights)
    child         = node['branches'][move]
    successor,_   = game(state, move)
    return child,successor

def expandbranch(model, node, state):
    """Attach one child per action returned by the model for `state`.

    Children start with the model's value for their action. A child whose move
    produces a winner is created terminal and its reward (from the point of
    view of the player moving in `state`) is backpropagated immediately.
    """
    assert(not isterminal(node))
    assert(not isexpanded(node))
    children     = node['branches']
    mover        = player(state)
    moves,values = model(state, logits=True)
    for move,q in zip(moves,values):
        _,winner = game(state,move)
        child    = makenode(value=q, parent=node, terminal=winner is not None)
        children[move] = child
        if not isterminal(child):
            continue
        backpropagate(child, getreward(mover, winner))

def selectnode(model, node, state, exploration):
    """Walk down the tree to a leaf, expanding it once if it is not terminal.

    Returns the (node, state) pair where the descent stopped: either a
    terminal node or a child of the freshly expanded leaf.
    """
    while isexpanded(node):
        node,state = selectbranch(node, state, exploration)
    if isterminal(node):
        return node,state
    expandbranch(model, node, state)
    return selectbranch(node, state, exploration)

def simulatefrom(leaf, state, policy):
    """Estimate a reward for `leaf`.

    A terminal leaf returns its stored value. Otherwise one episode is played
    from `state` under `policy` and the reward is computed for player(state).
    NOTE(review): expandbranch scores children for the player of the *parent*
    state, while this uses the player of the leaf state — confirm the sign
    convention expected by backpropagate.
    """
    if not isterminal(leaf):
        mover        = player(state)
        winner,_,_,_ = episode(policy, start=state)
        return getreward(mover, winner)
    return leaf['value']
        
def search(model, node, state, policy, exploration=1):
    """Run one select / simulate / backpropagate pass from `node`; return the reward."""
    leaf,leaf_state = selectnode(model, node, state, exploration)
    outcome         = simulatefrom(leaf, leaf_state, policy)
    backpropagate(leaf, outcome)
    return outcome

# Most recent action returned by selectaction; searchpi reads it to move down its tree.
last_action = None
def selectaction(s,pi):
    """Sample an action for state `s` from policy `pi`.

    Returns (action, probability of that action) and records the action in
    the module-level `last_action`.
    """
    global last_action
    moves,weights = pi(s)
    pick          = choice(range(len(weights)), p=weights)
    last_action   = moves[pick]
    return last_action,weights[pick]

def searchpi(model, 
             iters       = 1, 
             seconds     = 0, 
             exploration = 1, 
             policy      = {1:randompi,2:randompi}):
    """Build a tree-search policy function around `model`.

    The returned function keeps its search tree between calls: on each call it
    either starts a new root (first call, or `s` is all zeros) or steps into
    the child indexed by the global `last_action`. It then runs `search` at
    least `iters` times and for at least `seconds` seconds, and returns the
    exploration-free branch distribution of the current node.
    """
    root  = None
    label = f'searchpi({model.name},{iters},{seconds}s,{policy[1].__name__}-vs-{policy[2].__name__})'
    @rename(label)
    def searchpi(s):
        nonlocal root
        if root is None or all(s == 0):
            root = makenode(value=0)
        else:
            # Follow the move most recently recorded by selectaction.
            root = root['branches'][last_action]
        remaining,started = iters,time()
        while remaining > 0 or time() - started < seconds:
            search(model, root, s, policy, exploration)
            remaining -= 1
        result = selectbranchpi(root, exploration=0)
        if exploration == 0:
            # Without exploration the tree is discarded after every call.
            root = None
        moves,weights = result
        return moves,weights
    return searchpi

def pushsample(model, state, action, reward):
    """Push training samples for one (state, action, reward) into the model's buffer.

    Every variant returned by rotate_and_flip gets the model's current
    prediction as target, with the entry of the variant's action replaced by
    `reward`. (rotate_and_flip presumably yields board symmetries — TODO confirm.)
    """
    boards  = rotate_and_flip(state)
    moves   = rotate_and_flip(action)
    inputs  = array([onehot(board) for board in boards])
    targets = model.predict(inputs)
    for row,move in zip(targets,moves):
        row[int(move)] = reward
    model.push(inputs,targets)

def train(model, 
          iters       = 1_000, 
          games       = 10, 
          searches    = 10, 
          exploration = 1, 
          discount    = 0.9, 
          clone_every = -1,
          rates       = {0:0.1}):
    """Train `model` on self-play games generated by a tree-search policy.

    NOTE: this definition replaces the earlier `train(models, ...)` from the
    Approximate Q-Learning cell once this cell has run.

    model       -- qmodel to train; its replay buffer receives the samples.
    iters       -- number of training iterations (one gradient step each).
    games       -- self-play games sampled per iteration.
    searches    -- tree searches per move (passed to searchpi as `iters`).
    exploration -- exploration constant passed to searchpi.
    discount    -- forwarded to getoutcome when computing targets.
    clone_every -- if > 0, the search policy uses a clone of the model that is
                   refreshed every `clone_every` iterations; otherwise it
                   searches with the live model.
    rates       -- dict iteration -> learning rate, applied when the loop
                   reaches that iteration. Keys >= iters are never reached.

    Side effects: prints progress, appends to {DIR}/progress.log, shows and
    saves the objective-history figure to {DIR}/{model.name}.png.
    """
    agentpi  = searchpi(model, iters=searches, seconds=0, exploration=exploration)
    print(f'searching under policy {agentpi.__name__}')
    errors   = zeros(iters) 
    rewards  = [None, zeros(iters), zeros(iters)]
    progbar  = Progbar(target=iters, stateful_metrics=['clone','rate'])
    progress = PROGRESS(path=f'{DIR}/progress.log')
    # Start from the optimizer's current rate so `rate` is defined for the
    # progress report even when the schedule has no entry for iteration 0.
    rate     = float(K.get_value(model.optimizer.lr))
    for i in range(iters):
        if clone_every > 0: 
            if i % clone_every == 0:
                # Freeze a copy of the network for searching; `cloned_at`
                # remembers the iteration of the latest snapshot.
                snapshot  = model.clone()
                agentpi   = searchpi(snapshot, iters=searches, seconds=0, exploration=exploration)
                cloned_at = i
        else:
            cloned_at = i
        if i in rates:
            rate = rates[i]
            K.set_value(model.optimizer.lr, rate)
        ss,aa,pp,rr = samplegames(policy={1:agentpi,2:agentpi}, iters=games)
        yy          = getoutcome(ss,aa,pp,rr,discount=discount)
        for s,a,y in zip(ss,aa,yy):
            pushsample(model,s,a,y)
        # Average per-game reward of each player for this iteration.
        for s,r in zip(ss,rr):
            actor = player(s)
            rewards[actor][i] += r/games         
        xx,yy     = model.pop()
        errors[i] = model.train_on_batch(x=xx, y=yy, reset_metrics=False)
        values    = [
            ('clone',  cloned_at),
            ('rate',   rate),
            ('error',  errors[i]),
            ('reward', rewards[1][i]),
        ]        
        progbar.update(i+1, values=values)
        progress(f'{ipynb}, iter {i} of {iters}, {dict(values)}') 
        # The two players' averaged rewards are expected to cancel out.
        assert(rewards[1][i] == -rewards[2][i])
    figure()
    plot(rewards[1])
    plot(rewards[2])
    plot(errors,'r')
    title('objective history')
    ylabel('objective')
    xlabel(f'games x{games}')
    legend(['reward[X]', 'reward[O]', 'error'], loc='upper left')
    savefig(f'{DIR}/{model.name}.png')
    show()     

# Seed the RNG before the search-trained network is built.
seed(41)   
          
# Layer widths 18 -> 36 -> 9 with a tanh hidden layer; qmodel multiplies the
# buffer and batch arguments by 9 internally.
model = qmodel('mctsmodel',18,36,9,activation='tanh',buffer=100,batch=20)  
print(model.summary())

# Reuse the saved weights when the weight file exists; otherwise train and save.
if model.exists():
    model.load()
else:
    # NOTE(review): train() only looks up `rates` for i in range(iters), so the
    # 10_000, 20_000 and 25_000 entries are never reached with iters=10_000.
    train(model, 
          iters       = 10_000, 
          games       = 10,
          searches    = 10,
          exploration = 10,
          discount    = 0.9,
          clone_every = 1000,
          rates       = {0:0.1, 1_000:0.05, 2_000:0.01, 5_000:0.005, 10_000:0.001, 20_000:0.0005, 25_000:0.0001})
    model.save()
MTIME(model.path())

# Evaluate the bare network (wrapped in maxpi), without tree search at play time.
agentpi = maxpi(model) 

# Test the agent in both seats (as player 1, then as player 2).
testgames({1:agentpi, 2:lookaheadpi(randompi)}, iters=TESTS)
testgames({2:agentpi, 1:lookaheadpi(randompi)}, iters=TESTS)

# Enumerate match-ups against each earlier policy, first with the agent as
# player 1, then as player 2.
enum_policies([
    {1:agentpi, 2:randompi},
    {1:agentpi, 2:lookaheadpi(randompi)},
    {1:agentpi, 2:qpi(dptable)},
    {1:agentpi, 2:qpi(mctable)},
    {1:agentpi, 2:qpi(tdtable)},
    {1:agentpi, 2:qpi(mctsdata)},
    {1:agentpi, 2:maxpi(pimodel2)},
    {2:agentpi, 1:randompi},
    {2:agentpi, 1:lookaheadpi(randompi)},
    {2:agentpi, 1:qpi(dptable)},
    {2:agentpi, 1:qpi(mctable)},
    {2:agentpi, 1:qpi(tdtable)},
    {2:agentpi, 1:qpi(mctsdata)},
    {2:agentpi, 1:maxpi(pimodel1)},
])


Model: "mctsmodel"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         (None, 18)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 36)                684       
_________________________________________________________________
dense_8 (Dense)              (None, 9)                 333       
Total params: 1,017
Trainable params: 1,017
Non-trainable params: 0
_________________________________________________________________
None
tmp/ttt-3/mctsmodel.npy Aug 30 2020 02:28:57
testing policy 1:maxpi(mctsmodel) vs 2:lookaheadpi(randompi)
testing policy 2:maxpi(mctsmodel) vs 1:lookaheadpi(randompi)
enumerating policy 1:maxpi(mctsmodel) vs 2:randompi with EnumProc
enumerating policy 1:maxpi(mctsmodel) vs 2:lookaheadpi(randompi) with EnumProc
enumerating policy 1:maxpi(mctsmodel) vs 2:qpi(dptable) with EnumPro

Unnamed: 0,policy1,policy2,states1,states2,draws,wins1,wins2
0,maxpi(mctsmodel),randompi,95,88,6,59,0
1,maxpi(mctsmodel),lookaheadpi(randompi),95,88,6,59,0
2,maxpi(mctsmodel),qpi(dptable),20,20,6,0,0
3,maxpi(mctsmodel),qpi(mctable),5,5,1,0,0
4,maxpi(mctsmodel),qpi(tdtable),5,5,1,0,0
5,maxpi(mctsmodel),qpi(mctsdata),19,19,6,0,0
6,maxpi(mctsmodel),maxpi(pimodel2),5,5,1,0,0
7,randompi,maxpi(mctsmodel),267,345,12,7,140
8,lookaheadpi(randompi),maxpi(mctsmodel),265,343,12,7,138
9,qpi(dptable),maxpi(mctsmodel),20,24,5,0,0


In [None]:
# Alternative (disabled): play against the tree-search policy instead of the bare network.
# agentpi = searchpi(model, iters=100, seconds=1, exploration=0, policy={1:randompi,2:randompi})
# Active choice: the network wrapped in maxpi, assigned to player 2.
agentpi = maxpi(model)
play(policy={2:agentpi})

In [None]:
# Debug view of the replay buffer: its size, then every stored input ordered
# by its number of non-zero entries.
print(len(model.buffer))

xx = sorted((x for x,z in model.buffer), key=count_nonzero)

for i,x in enumerate(xx):
    print(i, x, count_nonzero(x))

## Reinforcement Learning: A bit of theory

###### Definitions

$$\large q_{\pi}(s_t, a_t) = \sum_{t'=t}^T \mathbb{E}_{\pi_\theta}[r(s_{t'}, a_{t'}) | s_t, a_t]$$

$$\large v_{\pi}(s_t) = \sum_{t'=t}^T \mathbb{E}_{\pi_\theta}[r(s_{t'}, a_{t'}) | s_t]$$

$$\large p_{\theta}(s_1,a_1,...,s_T,a_T) = p(s_1) \prod_{t=1}^T \pi_{\theta}(a_t|s_t)p(s_{t+1}|s_t,a_t)$$

###### Bellman Expectation Equations

$$\large v_{\pi}(s) = \mathbb{E}_{a \sim \pi(\cdot | s)}[q_{\pi}(s, a)]$$

$$\large v_{\pi}(s) = \sum_a \pi(a | s) q_{\pi}(s, a)$$

$$\large q_{\pi}(s, a) = \sum_{s', r} p(s', r | s, a)[r + \gamma v_{\pi}(s')]$$

$$\large v_{\pi}(s) = \sum_a \pi(a | s) \sum_{s', r} p(s', r | s, a)[r + \gamma v_{\pi}(s')]$$

$$\large q_{\pi}(s, a) = \sum_{s', r} p(s', r | s, a)[r + \gamma \sum_{a'} \pi(a' | s') q_{\pi}(s', a')]$$

###### Bellman Optimality Equations

$$\large v_{\ast}(s) = \max_a q_{\ast}(s, a)$$

$$\large q_{\ast}(s, a) = \sum_{s', r} p(s', r | s, a)[r + \gamma v_{\ast}(s')]$$

$$\large v_{\ast}(s) = \max_a \sum_{s', r} p(s', r | s, a)[r + \gamma v_{\ast}(s')]$$

$$\large q_{\ast}(s, a) = \sum_{s', r} p(s', r | s, a)[r + \gamma \max_{a'} q_{\ast}(s', a')]$$

###### Policy Improvement Theorem

$$\large q_\pi(s, \pi'(s)) \geq v_\pi(s) \;\; \forall s \implies v_{\pi'}(s) \geq v_{\pi}(s) \;\; \forall s$$
