### Board representation
1) Each tile is marked 0, 1, or -1.

2) A vacant tile is marked 0, a tile taken by player 1 is marked 1, and a tile taken by player 2 is marked -1.


In [52]:
################################
# Tic tac toe using TD learning
################################

import numpy as np
# board dimensions (rows x columns)
NROWS, NCOLS = 3, 3
# total number of tiles on the board
NBOARD = NROWS * NCOLS

class State:
    """
    One tic-tac-toe board position.

    Tiles hold 0 (vacant), 1 (player 1) or -1 (player 2).  The hash and the
    end-of-game result are computed on first request and cached, so the board
    must not be edited after hash() or is_end() has been called; use
    next_state() to obtain the successor position instead.
    """

    def __init__(self):
        """
        Constructor: empty board, nothing cached yet
        """
        self.board = np.zeros((NROWS, NCOLS))
        # None means "not computed yet" for the three cached fields below
        self.end = None
        self.winner = None
        self.hash_val = None

    def hash(self):
        """
        Unique integer key of the game state
        :return: int obtained by reading the board in row-major order as a
                 base-3 number (tile values -1/0/1 become digits 0/1/2)
        """
        if self.hash_val is None:
            key = 0
            for tile in self.board.flat:
                # int() keeps the key an exact Python int instead of a numpy float
                key = key * 3 + int(tile) + 1
            self.hash_val = key
        return self.hash_val

    def is_end(self):
        """
        Determine if a game has ended and record the winner in self.winner
        :return: True if the game is over, False if it is still ongoing.
                 After the call self.winner is +1/-1 for a win, 0 for a tie,
                 and is left as None while the game is ongoing.
        """
        if self.end is not None:
            return self.end

        # (sum of the line, number of tiles in the line) for every winnable
        # line, in the order rows, columns, main diagonal, anti-diagonal
        lines = [(total, NCOLS) for total in self.board.sum(axis=1)]
        lines.extend((total, NROWS) for total in self.board.sum(axis=0))
        # corner-to-corner diagonals only exist on a square board
        if NROWS == NCOLS:
            lines.append((np.trace(self.board), NROWS))
            lines.append((np.trace(np.fliplr(self.board)), NROWS))

        # a line is won when every tile in it carries the same player's mark,
        # i.e. its sum is +length (player 1) or -length (player 2)
        for total, length in lines:
            if abs(total) == length:
                self.winner = 1 if total > 0 else -1
                self.end = True
                return self.end

        # tie: no vacant tile left and nobody completed a line
        if np.sum(np.abs(self.board)) == NBOARD:
            self.winner = 0
            self.end = True
            return self.end

        # ongoing
        self.end = False
        return self.end

    def next_state(self, i, j, symbol):
        """
        Return the position reached when a player marks tile (i, j)
        :param i: row index of the tile
        :param j: column index of the tile
        :param symbol: player making the move (+1/-1)
        :return: new State; this state is left unchanged
        """
        # NOTE: the tile is overwritten without checking that it is vacant;
        # choosing a legal move is left to the caller
        new_state = State()
        new_state.board = np.copy(self.board)
        new_state.board[i, j] = symbol
        return new_state

    def print_board(self):
        """
        Prints board ('*' for player 1, 'x' for player 2, '0' for vacant)
        """
        tokens = {1: '*', -1: 'x'}
        # each cell is printed as "| t " plus one closing "|" per row
        separator = '-' * (4 * NCOLS + 1)
        for i in range(NROWS):
            print(separator)
            row = [tokens.get(self.board[i, j], '0') for j in range(NCOLS)]
            print('| ' + ' | '.join(row) + ' | ')
        print(separator)
        
        
def get_all_states_recursion(curr_state, curr_symbol, all_states):
    """
    Recursively collect every state reachable by legal moves in a dictionary
    :param curr_state: current state
    :param curr_symbol: current player (+1/-1)
    :param all_states: dictionary mapping state hash to (state, is_end);
                       it is filled in place
    """
    # only vacant tiles are legal moves; placing on an occupied tile would
    # overwrite an earlier move and enumerate positions that cannot occur.
    # np.argwhere lists the vacant tiles in row-major order.
    for i, j in np.argwhere(curr_state.board == 0):
        new_state = curr_state.next_state(i, j, curr_symbol)
        new_hash = new_state.hash()
        # skip positions already reached through another move order
        if new_hash in all_states:
            continue
        is_end = new_state.is_end()
        all_states[new_hash] = (new_state, is_end)
        # continue recursing with the other player's input if game is still ongoing
        if not is_end:
            get_all_states_recursion(new_state, -curr_symbol, all_states)

def get_all_states():
    """
    Enumerate the game states, starting from the empty board
    :return: dictionary mapping state hash to a (state, is_end) tuple
    """
    # the empty board is registered first, with player 1 (+1) to move
    root = State()
    first_player = 1
    table = {root.hash(): (root, root.is_end())}
    # every other state is added by the recursive walk
    get_all_states_recursion(root, first_player, table)
    return table

# every board configuration found by get_all_states(), computed once when this
# statement runs: maps State.hash() -> (State, is_end); read by Player.set_symbol()
all_states = get_all_states()

In [None]:
class Player:
    def __init__(self, step_size=0.1, epsilon=0.1):
        self.estimations = dict()
        self.step_size = step_size
        self.epsilon = epsilon
        self.states = []
        self.greedy = []
        self.symbol = 0

    def reset(self):
        self.states = []
        self.greedy = []
        
    def set_state(self, state):
        self.states.append(state)
        self.greedy.append(True)
        
    def set_symbol(self, symbol):
        """
        Assign the side this player plays and initialise its value estimates
        :param symbol: value compared against state.winner to decide whether a
                       finished state counts as a win for this player

        Every hash in the module-level all_states dictionary gets an entry in
        self.estimations: 1.0 for a finished state this player won, 0 for a
        finished state the other side won, and 0.5 for ties and for states
        that are still ongoing.
        """
        self.symbol = symbol
        for hash_val in all_states:
            state, is_end = all_states[hash_val]
            if is_end:
                if state.winner == self.symbol:
                    # win
                    self.estimations[hash_val] = 1.0
                elif state.winner == 0:
                    # tie
                    self.estimations[hash_val] = 0.5
                else:
                    # lose
                    self.estimations[hash_val] = 0
            else:
                # ongoing, treated the same as tie
                self.estimations[hash_val] = 0.5