In [94]:
from functools import reduce
def to_bits(*l):
    """Return an integer bitmask with bit ``i`` set for every index ``i`` given.

    Called with no arguments it returns 0; repeated indices have no extra
    effect because the bits are OR-ed together.
    """
    mask = 0
    for index in l:
        mask |= 1 << index
    return mask

def winning_patterns():
    """Return the eight three-in-a-row bitmasks for the 3x3 board.

    Squares are numbered 0-8 with three squares per printed row (the layout
    Game.repr uses), so bits 0,1,2 form a row and bits 0,3,6 form a column.
    The list is ordered: three rows, three columns, then the two diagonals.
    """
    first_row = to_bits(0, 1, 2)
    first_col = to_bits(0, 3, 6)
    rows = [first_row << offset for offset in (0, 3, 6)]
    cols = [first_col << offset for offset in (0, 1, 2)]
    diagonals = [to_bits(0, 4, 8), to_bits(2, 4, 6)]
    return rows + cols + diagonals


# Computed once; Game.move tests every mask against the mover's squares.
WINNING_PATTERNS = winning_patterns()

class Game:
    """Tic-tac-toe position stored as two 9-bit occupancy masks.

    Square ``i`` (0-8, three squares per printed row) corresponds to bit ``i``.

    Attributes:
        board: ``[mask_0, mask_1]``; ``mask_0`` holds the squares of the side
            that moves first (drawn as ``o``), ``mask_1`` those of the side
            that moves second (drawn as ``x``).
        step: index of the last move played, ``-1`` before the first move.
        result: ``None`` while undecided, ``-1`` if the first mover won,
            ``+1`` if the second mover won, ``0`` for a draw.
        end: ``False`` until the game is decided (win or draw), then ``True``.
    """

    # Number of squares on the board; also the width in bits of each mask.
    N_SQUARES = 9

    def __init__(self):
        self.board = [0, 0]
        self.step = -1
        self.result = None
        # Initialised here so the attribute can always be read; previously it
        # only came into existence once a winning move was played.
        self.end = False

    def possible_moves(self):
        """Return the indices of all empty squares in ascending order."""
        occupied = self.board[0] | self.board[1]
        return [i for i in range(self.N_SQUARES) if not (occupied >> i) & 1]

    def move(self, i):
        """Play square ``i`` for the side to move and update the outcome.

        The move is validated before any state is changed, so a rejected move
        leaves ``step`` and ``board`` untouched.

        Raises:
            ValueError: if ``i`` is outside ``0..8``.
            AssertionError: if square ``i`` is already occupied.
        """
        if not 0 <= i < self.N_SQUARES:
            # A bit outside the 9-bit mask would collide with the other
            # player's bits in the packed key built by sim_move.
            raise ValueError("square index out of range: {!r}".format(i))
        occupied = self.board[0] | self.board[1]
        assert (occupied >> i) & 1 == 0, "square {} is already occupied".format(i)

        # NOTE(review): moves are still accepted after the game is decided,
        # as before; callers are expected to check `result` first.
        self.step += 1
        color = self.step & 1  # even steps belong to side 0, odd steps to side 1
        self.board[color] |= 1 << i
        own = self.board[color]
        if any(p & own == p for p in WINNING_PATTERNS):
            self.end = True
            self.result = 2 * color - 1  # side 0 -> -1, side 1 -> +1
        elif self.step == self.N_SQUARES - 1:
            # Ninth move played without completing a line: draw.
            self.end = True
            self.result = 0

    def sim_move(self, i):
        """Return the packed key of the position after playing ``i``.

        The game itself is not modified. The key is
        ``(board[0] << 9) | board[1]`` of the resulting position.
        """
        color = (self.step + 1) & 1
        board = self.board[:]
        board[color] |= 1 << i
        return (board[0] << self.N_SQUARES) | board[1]

    def repr(self):
        """Return the board as three newline-terminated rows of ``.``/``o``/``x``."""
        cells = [
            '.ox'[((self.board[0] >> i) & 1) + 2 * ((self.board[1] >> i) & 1)]
            for i in range(self.N_SQUARES)
        ]
        return "".join(
            "".join(cells[start:start + 3]) + "\n"
            for start in range(0, self.N_SQUARES, 3)
        )

In [95]:
from random import choice, random, seed

In [221]:
# Value table with one slot per packed position key
# ((board[0] << 9) | board[1], i.e. 18 bits); None marks a position that has
# not been visited yet.
Q = [None for _ in range(1 << 18)]


def reg(x):
    """Score a successor position's table entry for move selection.

    An unvisited successor (``None``) scores 1.0; otherwise the stored value
    is negated.
    """
    if x is None:
        return 1.
    return -x

In [223]:
def run_game(game, verbose=False, epsilon=0.01, alpha=0.1):
    """Play ``game`` to the end by self-play and update the global table ``Q``.

    Each call chooses one move for the side to move, recurses on the resulting
    position, and on the way back moves ``Q[position]`` towards the best
    negated value among the already-visited successor positions.

    Args:
        game: a Game; it is mutated and is finished when the call returns.
        verbose: if true, print the result and, for every position on the
            played line, the updated value, the successor values and the board.
        epsilon: probability of playing a uniformly random legal move instead
            of the highest-scoring one (default 0.01, the former constant).
        alpha: step size of the value update (default 0.1, the former constant).
    """
    board = (game.board[0] << 9) | game.board[1]
    if game.result is not None:
        # Finished position: -1.0 after a win by the side that just moved,
        # -0.0 after a draw.
        Q[board] = -abs(float(game.result))
        if verbose:
            print("result", game.result)
        return

    color = (game.step + 1) & 1  # side to move; only used in the trace output
    moves = game.possible_moves()
    boards = [game.sim_move(m) for m in moves]
    if random() < epsilon:
        m = choice(moves)
    else:
        values = [reg(Q[b]) for b in boards]
        # Ties are resolved by tuple comparison, i.e. the largest move index.
        m = max(zip(values, moves))[1]

    # The board snapshot is only needed for the trace, so skip it otherwise.
    s = game.repr() if verbose else None
    game.move(m)
    run_game(game, verbose, epsilon, alpha)

    # The recursive call has written Q for the successor that was played, so
    # at least one entry is not None and max() never sees an empty sequence.
    estQ = max(-Q[b] for b in boards if Q[b] is not None)
    if Q[board] is None:
        Q[board] = 0.
    Q[board] += alpha * (estQ - Q[board])
    if verbose:
        print("color", color, "new score", Q[board], [Q[b] for b in boards])
        print(s)
        print()
# Fix the RNG state so a fresh-kernel run of the notebook reproduces the same
# exploration moves and therefore the same learned values.
SEED = 0
N_TRAINING_GAMES = 1000

seed(SEED)
for _ in range(N_TRAINING_GAMES):
    run_game(Game())

# Play one more game with tracing enabled to inspect the learned values.
run_game(Game(), True)

result 0
color 0 new score 0.0 [-0.0]
oxo
xx.
oox


color 1 new score -7.617734804586638e-07 [0.0, 0.1]
oxo
.x.
oox


color 0 new score 6.345573092220667e-06 [-7.617734804586638e-07, 0.19, 0.1]
.xo
.x.
oox


color 1 new score -3.001996931791502e-05 [0.1, 0.1, 6.345573092220667e-06, 0.1]
.xo
...
oox


color 0 new score 9.8264360692981e-05 [0.0, 0.0, 0.0, 0.0, -3.001996931791502e-05]
.xo
...
.ox


color 1 new score -0.00024862211302664484 [0.006246255889000001, 9.8264360692981e-05, 0.007205590000000001, 0.006741104410000001, 0.008115687379000001, 0.007048904410000001]
..o
...
.ox


color 0 new score 0.0005174197348110656 [-0.00010000000000000003, -0.00010000000000000003, -1.0000000000000004e-06, -0.00019000000000000006, 1.0000000000000004e-05, -0.00010000000000000003, -0.00024862211302664484]
..o
...
..x


color 1 new score -0.000901644119994498 [0.005055156089654183, 0.004759388248946839, 0.004595590000000001, 0.005028802314148901, 0.004685590000000001, 0.004806193811969446, 0.004595590