Goal: build a game-playing engine based on some form of Monte Carlo tree search (MCTS).

https://en.wikipedia.org/wiki/Monte_Carlo_tree_search

Make it generic enough to be applied to a variety of two-player games.

(i.e. completely separate the game rules from the MCTS code)

Once that's done, allow the option of an evaluation function in place of random playouts, or evaluation-driven playouts in place of random ones.

Define the interface that a game must expose to be plugged into this system

Something like:

In [1]:
from typing import Optional
import numpy as np
import time
from random import choice
from collections import Counter

In [2]:
from enum import Enum
from copy import copy

In [3]:
from dataclasses import dataclass

In [24]:
class Player(str, Enum):
    """The two sides of a two-player game.

    Each member is also a ``str`` whose value is identical to its name.
    """

    ONE = "ONE"
    TWO = "TWO"

    def __repr__(self) -> str:
        # Show only the bare member name instead of the default Enum repr.
        return self.name
    
    
    
    
def other_player(player) -> Player:
    """Return the opponent of ``player``.

    Raises KeyError when ``player`` matches neither Player member.
    """
    opponent_of = {Player.ONE: Player.TWO, Player.TWO: Player.ONE}
    return opponent_of[player]
    
@dataclass(frozen=True)
class Command:
    """A move: the board cell to mark. Frozen, so instances are hashable."""

    j: int  # first index into the board matrix
    i: int  # second index into the board matrix
        
class Illegal(Exception):
    """Raised by State.applyCommand when a command is not a legal move."""

class GameOver(Exception):
    """Raised by State.applyCommand when the game has already finished."""

class Result(str, Enum):
    """Outcome of a position: a win for either side, a draw, or still open.

    Each member is also a ``str`` whose value is identical to its name.
    """

    PLAYER1 = "PLAYER1"
    PLAYER2 = "PLAYER2"
    DRAW = "DRAW"
    INPROGRESS = "INPROGRESS"

    def __repr__(self) -> str:
        # Show only the bare member name instead of the default Enum repr.
        return self.name
    
# Side length of the square board that State() creates by default.
SIZE=3

def _result(m) -> Result:
    """Work out the outcome of the square board ``m``.

    Cells hold 1 (Player ONE's mark), -1 (Player TWO's mark) or 0 (empty).
    A full line of one player's marks is detected by that row, column or
    diagonal summing to +n or -n, where n is the side length of ``m``.

    The side length is read from ``m`` itself rather than from a global, so
    a board of any size is judged against its own dimensions.
    Assumes ``m`` is square -- only ``m.shape[0]`` is consulted.

    Returns:
        Result.PLAYER1 or Result.PLAYER2 on a completed line (PLAYER1 is
        checked first), Result.DRAW when no empty cell remains, otherwise
        Result.INPROGRESS.
    """
    m = np.asarray(m)
    n = m.shape[0]

    # Every line that can win: n rows, n columns and the two diagonals.
    line_sums = np.concatenate([
        m.sum(axis=1),
        m.sum(axis=0),
        [m.trace(), np.fliplr(m).trace()],
    ])

    if (line_sums == n).any():
        return Result.PLAYER1
    if (line_sums == -n).any():
        return Result.PLAYER2

    if not (m == 0).any():
        return Result.DRAW

    return Result.INPROGRESS

# We need at least an implicit guarantee that State is immutable
class State:
    """Snapshot of a game: the board plus whose turn it is.

    The board array is copied on the way in and marked read-only, so a State
    cannot be altered through the array the caller passed or through ``_m``.
    Applying a command returns a new State.

    Attributes:
        player: the Player who moves next.
        result: the Result of this position, computed once on construction.
        commands: a Command for every empty cell, in row-major order.
    """

    def __init__(self, m=None, player=None):
        """Create the empty starting position, or wrap an existing board.

        Pass neither argument for an empty SIZE x SIZE board with Player.ONE
        to move, or pass both ``m`` (a 2-D array of 1 / -1 / 0) and
        ``player``.
        """
        if m is None:
            assert player is None
            self.player = Player.ONE
            board = np.zeros((SIZE, SIZE), dtype=int)
        else:
            assert player is not None
            # Private copy: later writes to the caller's array must not
            # change this State.
            board = np.array(m)
            self.player = player

        # Enforce immutability instead of relying on convention.
        board.setflags(write=False)
        self._m = board

        self.result = _result(self._m)

        # argwhere yields (row, column) pairs in row-major order; the bounds
        # come from the board itself, not from the global SIZE.
        self.commands = [
            Command(int(j), int(i))
            for j, i in np.argwhere(self._m == 0)
        ]

    def __repr__(self):
        """Draw the board as text, followed by the player to move."""
        symbols = {0: " ", -1: "X", 1: "O"}
        width = self._m.shape[1]
        separator = "\n" + "-" * (width * 2 - 1) + "\n"
        rows = (
            "|".join(symbols[int(cell)] for cell in row)
            for row in self._m
        )
        return separator.join(rows) + f"    player {self.player}"

    def applyCommand(self, command: Command) -> "State":
        """Return the State reached when the current player plays ``command``.

        Raises:
            GameOver: the game has already finished.
            Illegal: ``command`` does not target an empty cell of the board.
        """
        if self.result != Result.INPROGRESS:
            raise GameOver()
        # self.commands lists exactly the empty cells of the (read-only)
        # board, so membership is the complete legality check.
        if command not in self.commands:
            raise Illegal()

        # copy() of a read-only array is writeable again.
        board = self._m.copy()
        board[command.j, command.i] = {Player.ONE: 1, Player.TWO: -1}[self.player]
        return State(board, other_player(self.player))
        

In [5]:
# Empty starting position with Player.ONE to move.
s1=State()

In [6]:
def playout(state: State) -> Result:
    """Play uniformly random legal moves from ``state`` until the game ends.

    Returns the Result of the final position. A ``state`` that is already
    finished is returned as-is without any move being played.
    """
    current = state
    while True:
        if current.result != Result.INPROGRESS:
            return current.result
        current = current.applyCommand(choice(current.commands))

In [7]:
def do_playouts(state, seconds):
    """Run random playouts from ``state`` for roughly ``seconds`` seconds.

    Args:
        state: position every playout starts from.
        seconds: time budget; must be positive.

    Returns:
        A Counter mapping Result names (e.g. 'PLAYER1', 'DRAW') to the number
        of playouts that ended with that result. At least one playout is
        always run, so the counts never sum to zero.

    Raises:
        ValueError: ``seconds`` is not positive.
    """
    if not seconds > 0:
        raise ValueError(f"seconds must be positive, got {seconds!r}")

    tally = Counter()
    # Monotonic clock: the deadline is unaffected by system clock changes.
    deadline = time.monotonic() + seconds
    while True:
        tally[playout(state).name] += 1
        if time.monotonic() >= deadline:
            return tally

In [8]:
def evaluate(state, player, seconds):
    """Estimate how good ``state`` is for ``player`` using random playouts.

    Args:
        state: position to evaluate.
        player: the Player whose wins are counted.
        seconds: time budget handed to do_playouts.

    Returns:
        The fraction of playouts won by ``player``, between 0.0 and 1.0.
        Draws and losses both contribute nothing. Returns 0.0 if no playout
        was completed.

    Raises:
        ValueError: ``player`` is neither Player.ONE nor Player.TWO.
    """
    # Validate before spending the time budget on playouts.
    if player == Player.ONE:
        winning_key = 'PLAYER1'
    elif player == Player.TWO:
        winning_key = 'PLAYER2'
    else:
        raise ValueError(f"unknown player: {player!r}")

    tally = do_playouts(state, seconds)
    total = sum(tally.values())
    if total == 0:
        # No data at all; avoid ZeroDivisionError.
        return 0.0
    return tally[winning_key] / total

In games where draws are possible, a draw causes the numerator for both black and white to be incremented by 0.5 and the denominator by 1. This ensures that during selection, each player's choices expand towards the most promising moves for that player, which mirrors the goal of each player to maximize the value of their move. (Note: `evaluate` above does not do this; it scores a draw as 0, the same as a loss.)

In [9]:
def get_command_scores(state, seconds):
    """Score every legal command in ``state`` for the player about to move.

    The time budget ``seconds`` is divided equally between the commands; each
    command is scored by evaluating the position it leads to.

    Returns a dict mapping each Command to its score, in ``state.commands``
    order. Asserts that at least one command is available.
    """
    commands = state.commands
    assert len(commands) > 0

    budget = seconds / len(commands)
    mover = state.player

    scores = {}
    for command in commands:
        scores[command] = evaluate(state.applyCommand(command), mover, budget)
    return scores

In [10]:
def pick_move(state, seconds) -> Command:
    """Return the highest-scoring command for the player to move.

    Spends about ``seconds`` seconds in total. On equal scores the command
    that comes first in ``state.commands`` wins.
    """
    scores = get_command_scores(state, seconds)
    return max(scores, key=scores.get)

In [98]:
# Smoke test: pick an opening move on the empty board with a 0.3 second budget.
pick_move(State(),.3)

Command(j=2, i=0)

# OK, can we build a repl?

In [99]:
def get_command():
    """Prompt on stdin for a move typed as "<j> <i>" and return it as a Command.

    Raises:
        ValueError: the input is not exactly two whitespace-separated integers.
    """
    # split() with no argument drops leading/trailing whitespace and treats
    # any run of spaces or tabs as a single separator.
    j, i = input("COMMAND> ").split()
    return Command(int(j), int(i))

In [102]:
State()

 | | 
-----
 | | 
-----
 | | 

In [103]:
# Human (Player.ONE) against the engine (Player.TWO), alternating moves.
s=State()
while s.result==Result.INPROGRESS:
    try:
        human = get_command()
        s=s.applyCommand(human)
    except (ValueError, Illegal):
        # Malformed input or an occupied / out-of-range cell: ask again
        # instead of aborting the game.
        print("Illegal move; enter an empty cell as: <j> <i>")
        continue

    # The human's move may have ended the game; the engine must not be asked
    # to move in a finished position.
    if s.result!=Result.INPROGRESS:
        print(s)
        break

    computer = pick_move(s,0.5)

    s=s.applyCommand(computer)
    print(s)
print(f"GAME OVER: {s.result}")


KeyboardInterrupt: 

OK so that's all pretty good. Next step is to implement MCTS

In [112]:
# Self-play: the engine chooses the moves for both players, 1 second per move.
s=State()
print(s)
# Keep every position reached so the game can be inspected afterwards.
history=[s]
while s.result==Result.INPROGRESS:
    print('\n====================\n')
    command = pick_move(s,1) 
    s=s.applyCommand(command)
    print(s)
    history.append(s)
print(s.result)

 | | 
-----
 | | 
-----
 | | 


 | | 
-----
 |O| 
-----
 | | 


 | |X
-----
 |O| 
-----
 | | 


O| |X
-----
 |O| 
-----
 | | 


O| |X
-----
 |O| 
-----
 | |X


O| |X
-----
 |O|O
-----
 | |X


O| |X
-----
 |O|O
-----
X| |X


O| |X
-----
O|O|O
-----
X| |X
Result.PLAYER1


In [26]:
# Position from the self-play game above after five moves, Player.TWO (X) to
# move: O already holds (1,1) and (1,2), so (1,0) would complete the middle row.
s=State(m=np.array([
    [1,0,-1],
    [0,1,1],
    [0,0,-1],
]), player=Player.TWO)

In [27]:
s

O| |X
-----
 |O|O
-----
 | |X    player TWO

In [28]:
s.player

TWO

In [29]:
s.commands

[Command(j=0, i=1), Command(j=1, i=0), Command(j=2, i=0), Command(j=2, i=1)]

In [30]:
# Ask the engine for Player.TWO's move in this position (0.5 second budget).
pick_move(s,.5)

Command(j=2, i=1)

In [33]:
# Win rate for Player.TWO, as a percentage rounded to 2 decimals, after each
# legal move (same order as s.commands); 0.5 seconds of playouts per move.
[
    round(100*evaluate(s.applyCommand(c),Player.TWO, .5),2)
    for c in s.commands
]

[0.0, 0.0, 16.07, 16.55]

Why is command 3 not considered strongest?

Probably because if I play it, I never win, and sometimes lose.
If I _don't_ play it, I may lose, but that loss is only guaranteed if the opponent plays well, not randomly.

So this would be a good test case for MCTS

Pretty sure something is wrong here. Up next - typing and unit tests!

In [543]:
# Grab the raw board matrix for inspection.
# NOTE(review): the 5x5 array displayed below does not match the 3x3 `s`
# defined earlier in this file -- presumably `s` came from a different
# session state (larger board); confirm before relying on these outputs.
m=s._m

In [544]:
m

array([[ 1,  1,  1, -1,  1],
       [ 1, -1,  0,  0,  0],
       [ 0,  0, -1,  0,  0],
       [ 0,  0,  0, -1,  0],
       [ 0,  0,  0,  0, -1]])

In [547]:
m.sum(axis=0)

array([ 2,  0,  0, -2,  0])

In [548]:
m.trace()

-3