In [1]:
#######################################################################
# Copyright (C)                                                       #
# 2016 Shangtong Zhang(zhangshangtong.cpp@gmail.com)                  #
# 2016 Jan Hakenberg(jan.hakenberg@gmail.com)                         #
# 2016 Tian Jun(tianjun.cpp@gmail.com)                                #
# 2016 Kenta Shimada(hyperkentakun@gmail.com)                         #
# Permission given to modify the code as long as you keep this        #
# declaration at the top                                              #
#######################################################################
#######################################################################
# Copyright (C)                                                       #
# 2017 Cheung Auyeung (yellowlab9@gmail.com)                          #                        
# Permission given to modify the code as long as you keep this        #
# declaration at the top                                              #
#######################################################################

In [2]:
from __future__ import print_function
import numpy as np
import pickle
from IPython.display import clear_output
import sys

In [3]:
# Board geometry: row count, column count and the resulting number of cells.
BOARD_ROWS, BOARD_COLS = 3, 3
BOARD_SIZE = BOARD_COLS * BOARD_ROWS

In [4]:
class State:
    """A single board position of the game.

    ``board`` is a BOARD_ROWS x BOARD_COLS numpy array holding 1 for the
    chessman of the player who moves first, -1 for the other player and 0
    for an empty cell.  ``winner``, ``hashVal`` and ``end`` start as None and
    are filled in lazily by isEnd() / getHash(); they are cached and are not
    refreshed if ``board`` is modified afterwards.
    """

    # printable token for every legal cell value
    _TOKENS = {1: 'x', 0: '.', -1: 'o'}

    def __init__(self):
        self.board = np.zeros((BOARD_ROWS, BOARD_COLS))
        self.winner = None
        self.hashVal = None
        self.end = None

    def getHash(self):
        """Return the integer hash of the board (computed once, then cached).

        Cells are read in row-major order and packed as base-4 digits
        (empty -> 0, first player -> 1, second player -> 2), i.e. two bits
        per cell, so two different boards never share a hash.
        """
        if self.hashVal is None:
            code = 0
            for cell in self.board.reshape(BOARD_ROWS * BOARD_COLS):
                # -1 cannot be used as a digit, so the second player maps to 2
                digit = 2 if cell == -1 else int(cell)
                code = code * 4 + digit
            self.hashVal = code
        return int(self.hashVal)

    def isEnd(self):
        """Return True if the game is over (a win or a tie), else False.

        As a side effect sets ``winner`` to 1 or -1 for a win and 0 for a
        tie; ``winner`` is left untouched while the game is still going on.
        The answer is cached in ``end``.
        """
        if self.end is not None:
            return self.end

        # (sum of the line, number of cells in the line) for every line that
        # can win: a line filled by one player sums to +/- its length.
        lines = [(np.sum(self.board[r, :]), BOARD_COLS) for r in range(BOARD_ROWS)]
        lines += [(np.sum(self.board[:, c]), BOARD_ROWS) for c in range(BOARD_COLS)]
        # main diagonal and anti-diagonal (the board is assumed to be n * n)
        lines.append((sum(self.board[k, k] for k in range(BOARD_ROWS)), BOARD_ROWS))
        lines.append((sum(self.board[k, BOARD_ROWS - 1 - k] for k in range(BOARD_ROWS)),
                      BOARD_ROWS))

        for total, length in lines:
            if total == length:
                self.winner = 1
                self.end = True
                return self.end
            if total == -length:
                self.winner = -1
                self.end = True
                return self.end

        # no winner and no empty cell left: it's a tie
        if np.sum(np.abs(self.board)) == BOARD_ROWS * BOARD_COLS:
            self.winner = 0
            self.end = True
            return self.end

        # game is still going on
        self.end = False
        return self.end

    # @symbol 1 or -1
    def nextState(self, i, j, symbol):
        """Return a new State with chessman ``symbol`` put in position (i, j).

        The current state is left unmodified; the board is copied.
        """
        newState = State()
        newState.board = np.copy(self.board)
        newState.board[i, j] = symbol
        return newState

    def show(self):
        """Print the board: 'x' for 1, 'o' for -1, '.' for an empty cell.

        Any other cell value is printed as '?' instead of failing.
        """
        # every cell is rendered as "<token> | " (4 chars) after a leading '|'
        separator = '-' * (4 * BOARD_COLS + 1)
        for i in range(BOARD_ROWS):
            print(separator)
            out = '| '
            for j in range(BOARD_COLS):
                out += self._TOKENS.get(self.board[i, j], '?') + ' | '
            print(out)
        print(separator)

Find all possible states of the game and keep them in the dictionary `allStates`, keyed by each state's hash.

In [5]:
def getAllStatesImpl(currentState, currentSymbol, allStates):
    """Recursively collect every position reachable from ``currentState``.

    @currentState: position to expand
    @currentSymbol: symbol (1 or -1) of the player who moves next
    @allStates: dict updated in place; maps a state's hash to the tuple
                (state, isEnd)

    Every empty cell is tried with ``currentSymbol``.  A resulting position
    that is not yet in ``allStates`` is recorded and, unless it ends the
    game, expanded in turn with the other player's symbol.
    """
    for i in range(BOARD_ROWS):
        for j in range(BOARD_COLS):
            if currentState.board[i][j] != 0:
                continue
            newState = currentState.nextState(i, j, currentSymbol)
            newHash = newState.getHash()
            # test membership on the dict itself: a hash lookup, instead of
            # building and scanning a key list for every visited position
            if newHash in allStates:
                continue
            isEnd = newState.isEnd()
            allStates[newHash] = (newState, isEnd)
            if not isEnd:
                getAllStatesImpl(newState, -currentSymbol, allStates)

def getAllStates():
    """Enumerate every reachable board position, starting from the empty board.

    Returns a dict mapping each state's hash to the tuple (state, isEnd).
    """
    root = State()
    table = {root.getHash(): (root, root.isEnd())}
    # symbol 1 belongs to the player who moves first
    getAllStatesImpl(root, 1, table)
    return table

# all possible board configurations, computed once when this cell runs:
# a dict mapping each state's hash to the tuple (state, isEnd)
allStates = getAllStates()

Print the hash of the first few states as an 18-bit binary code, and show each of these states as a 3-by-3 grid.

In [6]:
# Show the first few states: the hash as an 18-bit binary string, then the board.
# list(...) is needed because dict key views cannot be sliced on Python 3.
for stateHash in list(allStates.keys())[:4]:
    print("{0:018b}".format(stateHash))
    allStates[stateHash][0].show()
    print()

000000000000000000
-------------
| . | . | . | 
-------------
| . | . | . | 
-------------
| . | . | . | 
-------------

010000000000000000
-------------
| x | . | . | 
-------------
| . | . | . | 
-------------
| . | . | . | 
-------------

010000000000000010
-------------
| x | . | . | 
-------------
| . | . | . | 
-------------
| . | . | o | 
-------------

000000000000000100
-------------
| . | . | . | 
-------------
| . | . | . | 
-------------
| . | x | . | 
-------------



In [7]:
class Judger:
    """Referee that lets two players play games against each other.

    The judger hands out the symbols, keeps the current board state, asks
    the players for their moves in turn and feeds every new state (and, on
    request, the final reward) back to both of them.
    """

    # @player1: player who will move first, its chessman will be 1
    # @player2: another player with chessman -1
    def __init__(self, player1, player2):
        self.p1, self.p2 = player1, player2
        self.currentPlayer = None
        self.p1Symbol, self.p2Symbol = 1, -1
        self.p1.setSymbol(self.p1Symbol)
        self.p2.setSymbol(self.p2Symbol)
        self.currentState = State()
        # shared table: state hash -> (state, isEnd)
        self.allStates = allStates

    def giveReward(self):
        """Reward both players according to the winner of the current state.

        The winner gets 1 and the loser 0; when nobody won, the first player
        gets 0.1 and the second player 0.5.
        """
        winner = self.currentState.winner
        if winner == self.p1Symbol:
            rewards = (1, 0)
        elif winner == self.p2Symbol:
            rewards = (0, 1)
        else:
            rewards = (0.1, 0.5)
        self.p1.feedReward(rewards[0])
        self.p2.feedReward(rewards[1])

    def feedCurrentState(self):
        """Show the current state to both players, first player first."""
        for player in (self.p1, self.p2):
            player.feedState(self.currentState)

    def reset(self):
        """Reset both players and start over from an empty board."""
        for player in (self.p1, self.p2):
            player.reset()
        self.currentState = State()
        self.currentPlayer = None

    # @show: if True, print each board during the game
    def play(self, show=False):
        """Play one complete game and return the winner of the final state."""
        self.reset()
        self.feedCurrentState()
        finished = False
        while not finished:
            # alternate players; the first player opens the game
            self.currentPlayer = self.p2 if self.currentPlayer == self.p1 else self.p1
            if show:
                self.currentState.show()
            i, j, symbol = self.currentPlayer.takeAction()
            self.currentState = self.currentState.nextState(i, j, symbol)
            # swap in the precomputed state object and its end-of-game flag
            self.currentState, finished = self.allStates[self.currentState.getHash()]
            self.feedCurrentState()
        return self.currentState.winner

## AI player

In [8]:
class Player:
    """Learning player that keeps one value estimate per board state.

    ``estimations`` maps a state's hash to the estimated value of that
    state for this player.  During a game the player records the states it
    is fed; feedReward() then backs the final reward up through them.
    """

    # @stepSize: step size to update estimations
    # @exploreRate: possibility to explore
    def __init__(self, stepSize = 0.1, exploreRate=0.1):
        self.allStates = allStates
        self.estimations = dict()
        self.stepSize = stepSize
        self.exploreRate = exploreRate
        self.states = []

    def reset(self):
        """Forget the states recorded so far."""
        self.states = []

    def setSymbol(self, symbol):
        """Set the player's symbol and initialise every state estimate.

        Finished states are worth 1.0 when this player won them and 0
        otherwise; every unfinished state starts at 0.5.
        """
        self.symbol = symbol
        for stateHash, (state, isEnd) in self.allStates.items():
            if not isEnd:
                self.estimations[stateHash] = 0.5
            elif state.winner == self.symbol:
                self.estimations[stateHash] = 1.0
            else:
                self.estimations[stateHash] = 0

    # accept a state
    def feedState(self, state):
        """Record ``state`` as the latest state of the current game."""
        self.states.append(state)

    # update estimation according to reward after playing a game
    def feedReward(self, reward):
        """Back ``reward`` up through the recorded states, latest first.

        Each estimate moves towards its target by ``stepSize``; the updated
        estimate becomes the target for the state before it.  The recorded
        states are cleared afterwards.  Does nothing if no state is recorded.
        """
        if len(self.states) == 0:
            return
        self.states = [state.getHash() for state in self.states]
        target = reward
        for latestState in reversed(self.states):
            value = self.estimations[latestState] + \
                self.stepSize * (target - self.estimations[latestState])
            self.estimations[latestState] = value
            target = value
        self.states = []

    # determine next action according to the current state
    def takeAction(self):
        """Choose a move for the latest recorded state.

        Returns the list [i, j, symbol].  With probability ``exploreRate`` a
        random empty position is chosen; otherwise the position leading to
        the state with the highest estimate, ties broken at random.
        """
        state = self.states[-1]
        nextStates = []
        nextPositions = []
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                if state.board[i, j] == 0:
                    nextPositions.append([i, j])
                    nextStates.append(state.nextState(i, j, self.symbol).getHash())
        if np.random.binomial(1, self.exploreRate):
            np.random.shuffle(nextPositions)
            # Not sure if truncating is the best way to deal with exploratory step
            # Maybe it's better to only skip this step rather than forget all the history
            self.states = []
            action = nextPositions[0]
            action.append(self.symbol)
            return action

        # pair the estimate of every possible next state with its position
        values = [(self.estimations[stateHash], pos)
                  for stateHash, pos in zip(nextStates, nextPositions)]

        # Shuffle before the (stable) sort so that positions sharing the
        # highest estimate are selected at random.
        np.random.shuffle(values)
        values.sort(key=lambda x: x[0], reverse=True)

        # Return the action leading to the selected next position
        action = values[0][1]
        action.append(self.symbol)
        return action

    def savePolicy(self):
        """Pickle the estimates to the file 'optimal_policy_<symbol>'."""
        # 'with' closes the file even if pickling fails
        with open('optimal_policy_' + str(self.symbol), 'wb') as fw:
            pickle.dump(self.estimations, fw)

    def loadPolicy(self):
        """Replace the estimates with those in 'optimal_policy_<symbol>'."""
        # NOTE: unpickling can execute arbitrary code; only load policy files
        # written by savePolicy().
        with open('optimal_policy_' + str(self.symbol), 'rb') as fr:
            self.estimations = pickle.load(fr)

## Human interface

Enter a number from 1 to 9 to put a chessman on the corresponding position of the board:
```
| 1 | 2 | 3 |
| 4 | 5 | 6 |
| 7 | 8 | 9 |
```

In [9]:
class HumanPlayer:
    """Player whose moves are typed in by a human.

    Offers the same methods as Player so that a Judger can drive it; the
    learning-related methods do nothing.
    """

    def __init__(self, stepSize = 0.1, exploreRate=0.1):
        # stepSize and exploreRate are accepted only to match Player's
        # constructor; they are not used.
        self.symbol = None
        self.currentState = None

    def reset(self):
        """Nothing to reset for a human player."""
        return

    def setSymbol(self, symbol):
        """Remember the symbol this player puts on the board."""
        self.symbol = symbol

    def feedState(self, state):
        """Remember the latest state; it is used to validate the next move."""
        self.currentState = state

    def feedReward(self, reward):
        """A human player does not learn from rewards."""
        return

    def takeAction(self):
        """Ask for a position until a valid empty one is entered.

        Positions are numbered from 1, row by row.  Returns the tuple
        (i, j, symbol).  Text that is not a number, a number outside the
        board, or an occupied position makes the prompt appear again.
        """
        try:
            readLine = raw_input  # Python 2: input() would evaluate the text
        except NameError:
            readLine = input  # Python 3
        while True:
            text = readLine("Input your position:")
            clear_output(wait=True)
            try:
                data = int(text) - 1
            except ValueError:
                continue  # not a number, ask again
            # reject out-of-range numbers explicitly: a negative index would
            # otherwise silently wrap around to a different cell
            if not 0 <= data < BOARD_ROWS * BOARD_COLS:
                continue
            i, j = divmod(data, BOARD_COLS)
            if self.currentState.board[i, j] == 0:
                return (i, j, self.symbol)

Train two players by letting them play many rounds of the game against each other.

In [10]:
def train(epochs=20000):
    """Let two learning players play ``epochs`` games against each other.

    Both players are rewarded after every game.  Progress is printed every
    5000 games, followed by the ratio of first-player wins, second-player
    wins and ties; finally both players save their policies.
    """
    first, second = Player(), Player()
    judger = Judger(first, second)

    # float tallies keyed by outcome: 1 / -1 for a win, 0 for anything else
    tally = {1: 0.0, -1: 0.0, 0: 0.0}
    for epoch in range(epochs):
        if epoch % 5000 == 0:
            print("Epoch", epoch)

        winner = judger.play()
        tally[winner if winner in (1, -1) else 0] += 1

        judger.giveReward()
        judger.reset()

    print("Epoch", epochs-1)
    for label, outcome in (("player1Win", 1), ("player2Win", -1), ("playersTie", 0)):
        print("ratio of " + label + " = ", tally[outcome] / epochs)
    first.savePolicy()
    second.savePolicy()

In [11]:
def compete(turns=500):
    """Let the two saved policies play ``turns`` games without exploration.

    Both players are created with exploreRate=0 and load their policies.
    Progress is printed every 100 games, followed by the ratio of
    first-player wins, second-player wins and ties.
    """
    first = Player(exploreRate=0)
    second = Player(exploreRate=0)
    judger = Judger(first, second)
    for contender in (first, second):
        contender.loadPolicy()

    # float tallies keyed by outcome: 1 / -1 for a win, 0 for anything else
    tally = {1: 0.0, -1: 0.0, 0: 0.0}
    for turn in range(turns):
        if turn % 100 == 0:
            print("Epoch", turn)

        winner = judger.play()
        tally[winner if winner in (1, -1) else 0] += 1

        judger.reset()

    print("Epoch", turns-1)
    for label, outcome in (("player1Win", 1), ("player2Win", -1), ("playersTie", 0)):
        print("ratio of " + label + " = ", tally[outcome] / turns)

In [12]:
def play():
    """Play interactive games against the trained first player.

    Every round creates a non-exploring Player that loads its saved policy
    and moves first, and a HumanPlayer that moves second.  After the game the
    final board and the result from the human's point of view are printed.
    Hitting return starts another round; any other answer quits.
    """
    try:
        readLine = raw_input  # Python 2
    except NameError:
        readLine = input  # Python 3, where raw_input no longer exists
    while True:
        player1 = Player(exploreRate=0)
        player2 = HumanPlayer()
        judger = Judger(player1, player2)
        player1.loadPolicy()
        winner = judger.play(True)
        judger.currentState.show()
        if winner == player2.symbol:
            print("Win!")
        elif winner == player1.symbol:
            print("Lose!")
        else:
            print("Tie!")
        # make sure the result is displayed before the prompt appears
        sys.stdout.flush()
        data = readLine("Play Again? Hit return to play or otherwise to quit: ")
        if data != "":
            print("Thank you for playing. Bye!")
            break
        clear_output(wait=True)

In [13]:
# Self-play training with the default number of epochs; train() prints the
# outcome ratios and saves both players' policies to disk.
train()

Epoch 0
Epoch 5000
Epoch 10000
Epoch 15000
Epoch 19999
ratio of player1Win =  0.2184
ratio of player2Win =  0.2563
ratio of playersTie =  0.5253


In [14]:
# Let the two saved policies play each other with exploration switched off.
compete()

Epoch 0
Epoch 100
Epoch 200
Epoch 300
Epoch 400
Epoch 499
ratio of player1Win =  0.0
ratio of player2Win =  0.0
ratio of playersTie =  1.0


In [15]:
# Interactive game: the trained player moves first, the human moves second.
play()

-------------
| x | o | x | 
-------------
| o | o | x | 
-------------
| . | x | o | 
-------------
-------------
| x | o | x | 
-------------
| o | o | x | 
-------------
| x | x | o | 
-------------
Tie!
Play Again? Hit return to play or otherwise to quit: q
Thank you for playing. Bye!
