# In [3]:
#State setting of the board and players

import numpy as np
import pickle

# Dimensions of the game board; used to allocate the board array, to iterate
# over its rows/columns, and to flatten it when building a board hash.
BOARD_ROWS = 3
BOARD_COLS = 3


class State:
    """Board state of a game between two players.

    ``board`` is a BOARD_ROWS x BOARD_COLS array whose cells hold 0 for an
    empty cell, or the symbol (1 or -1) of whoever moved there.
    ``playerSymbol`` is the symbol that the next call to ``updateState``
    will place; it starts at 1 and alternates between 1 and -1.
    """

    def __init__(self, player1, player2):
        """Create an empty board for ``player1`` and ``player2``.

        Both player objects must provide ``feed_Reward(reward)``, which
        ``giveReward`` calls at the end of a game.
        """
        self.board = np.zeros((BOARD_ROWS, BOARD_COLS))
        self.player1 = player1
        self.player2 = player2
        self.isEnd = False
        self.boardHash = None
        # Symbol 1 moves first.
        self.playerSymbol = 1

    def getHash(self):
        """Return the string form of the flattened board, caching it in ``boardHash``."""
        self.boardHash = str(self.board.reshape(BOARD_COLS * BOARD_ROWS))
        return self.boardHash

    def winner(self):
        """Evaluate the board and update ``isEnd`` accordingly.

        Returns:
            1 if a row, column or diagonal sums to 3 (symbol 1 wins),
            -1 if one sums to -3 (symbol -1 wins),
            0 if there is no such line and no empty cell is left (tie),
            None if the game is still in progress.
        """
        # Rows: a line filled with one symbol sums to +3 or -3.
        for i in range(BOARD_ROWS):
            row_total = self.board[i, :].sum()
            if row_total == 3:
                self.isEnd = True
                return 1
            if row_total == -3:
                self.isEnd = True
                return -1

        # Columns.
        for j in range(BOARD_COLS):
            col_total = self.board[:, j].sum()
            if col_total == 3:
                self.isEnd = True
                return 1
            if col_total == -3:
                self.isEnd = True
                return -1

        # Both diagonals (top-left to bottom-right, top-right to bottom-left).
        main_diag = sum(self.board[i, i] for i in range(BOARD_COLS))
        anti_diag = sum(self.board[i, BOARD_COLS - i - 1] for i in range(BOARD_COLS))
        if max(abs(main_diag), abs(anti_diag)) == 3:
            self.isEnd = True
            return 1 if main_diag == 3 or anti_diag == 3 else -1

        # Tie: nobody has a full line and there is no empty cell left.
        if not self.availablePositions():
            self.isEnd = True
            return 0

        # The game is still running.
        self.isEnd = False
        return None

    def availablePositions(self):
        """Return the empty cells as ``(row, col)`` tuples in row-major order."""
        # A cell value of 0 means the cell has not been taken yet. Positions are
        # tuples so they can be used directly as board indices.
        return [
            (i, j)
            for i in range(BOARD_ROWS)
            for j in range(BOARD_COLS)
            if self.board[i, j] == 0
        ]

    def updateState(self, position):
        """Place the current symbol at ``position`` and hand the turn to the other symbol."""
        self.board[position] = self.playerSymbol
        self.playerSymbol = -1 if self.playerSymbol == 1 else 1

    def giveReward(self):
        """Feed the end-of-game reward to both players via ``feed_Reward``.

        The reward depends on ``winner()``: 1 / 0 when the result is 1,
        0 / 1 when it is -1, and 0.1 for player1 / 0.5 for player2 for any
        other result (a tie, and also an unfinished game where ``winner()``
        returns None).
        """
        result = self.winner()

        if result == 1:
            # Symbol 1 won: player1 is rewarded.
            self.player1.feed_Reward(1)
            self.player2.feed_Reward(0)
        elif result == -1:
            # Symbol -1 won: player2 is rewarded.
            self.player1.feed_Reward(0)
            self.player2.feed_Reward(1)
        else:
            # Tie: the two players receive different (asymmetric) rewards.
            self.player1.feed_Reward(0.1)
            self.player2.feed_Reward(0.5)

    def reset(self):
        """Clear the board and restore the initial turn and end-of-game flags."""
        self.board = np.zeros((BOARD_ROWS, BOARD_COLS))
        self.boardHash = None
        self.isEnd = False
        self.playerSymbol = 1
        
##################
# The middle part of the code still needs to be put here.
##################

class Player:
    """Computer player that keeps a learned value for each board hash.

    Moves are chosen epsilon-greedily: with probability ``exp_rate`` a random
    available position is taken, otherwise the position whose resulting board
    has the highest stored value.
    """

    def __init__(self, name, exp_rate=0.3):
        """Create a player.

        Args:
            name: identifier, also used as the suffix of the policy file name.
            exp_rate: probability of making a random (exploratory) move.
        """
        self.name = name
        # States recorded through add_State; used as keys into states_value.
        self.states = []
        self.lr = 0.2  # step size of the value update in feed_Reward
        self.exp_rate = exp_rate
        self.decay_gamma = 0.9  # discount applied to the reward in feed_Reward
        self.states_value = {}  # maps a state to its learned value

    def getHash(self, board):
        """Return the string form of ``board`` flattened to one dimension."""
        return str(board.reshape(BOARD_COLS * BOARD_ROWS))

    def make_Move(self, positions, current_board, symbol):
        """Choose one of ``positions`` to play.

        Args:
            positions: non-empty sequence of available board indices.
            current_board: the board array; it is copied, never modified.
            symbol: the value this player writes into a cell.

        Returns:
            The chosen element of ``positions``.

        Raises:
            ValueError: if ``positions`` is empty.
        """
        if len(positions) == 0:
            raise ValueError("positions must contain at least one available move")

        if np.random.uniform(0, 1) <= self.exp_rate:
            # Explore: make a random move on the board.
            return positions[np.random.choice(len(positions))]

        # Exploit: evaluate the board that each candidate move would produce.
        best_value = float("-inf")
        action = None
        for p in positions:
            next_board = current_board.copy()
            next_board[p] = symbol
            # Boards never seen before count as value 0.
            value = self.states_value.get(self.getHash(next_board), 0)
            # ">=" means that among equally valued moves the last one wins.
            if value >= best_value:
                best_value = value
                action = p
        return action

    def add_State(self, state):
        """Record ``state`` as visited during the current game."""
        self.states.append(state)

    def feed_Reward(self, reward):
        """Update the value of every recorded state from the final reward.

        The states are walked from last to first. Each value is moved by
        ``lr`` towards ``decay_gamma * reward``, and the updated value then
        serves as the reward for the state before it.
        """
        for st in reversed(self.states):
            current = self.states_value.get(st)
            if current is None:
                current = 0
            current += self.lr * (self.decay_gamma * reward - current)
            self.states_value[st] = current
            reward = current

    def reset(self):
        """Forget the states recorded for the game that just ended."""
        self.states = []

    def save_Policy(self):
        """Pickle ``states_value`` to the file ``policy_<name>``."""
        # The context manager closes the file even if pickling fails.
        with open('policy_' + str(self.name), 'wb') as fw:
            pickle.dump(self.states_value, fw)

    def load_Policy(self, file):
        """Replace ``states_value`` with the object pickled in ``file``.

        NOTE: unpickling can execute arbitrary code, so only load policy
        files that come from a trusted source.
        """
        with open(file, 'rb') as fr:
            self.states_value = pickle.load(fr)


class HumanPlayer:
    """Player whose moves are typed in on the console.

    ``add_State``, ``feed_Reward`` and ``reset`` exist only so that this class
    offers the same method names as ``Player``; they do nothing.
    """

    def __init__(self, name):
        self.name = name

    def make_Move(self, positions):
        """Ask for a row and a column until the pair is one of ``positions``.

        Entries that are not integers are rejected and the prompt is shown
        again instead of aborting the game.

        Returns:
            The chosen ``(row, col)`` tuple.
        """
        while True:
            try:
                row = int(input("Input your action row:"))
                col = int(input("Input your action col:"))
            except ValueError:
                # Non-numeric entry: start over with a fresh prompt.
                print("Invalid input, please enter whole numbers.")
                continue
            action = (row, col)
            if action in positions:
                return action

    def add_State(self, state):
        """Do nothing; a human player records no states."""
        pass

    def feed_Reward(self, reward):
        """Do nothing; a human player does not learn from rewards."""
        pass

    def reset(self):
        """Do nothing; there is no per-game data to clear."""
        pass