# Reinforcement Learning Practice: Tic Tac Toe
##### ***Jun 8***
##### ***Charles Zhang***

In [1]:
import numpy as np
import pickle

In [2]:
# Board dimensions; State allocates its grid as np.zeros((BOARD_ROWS, BOARD_COLS)).
BOARD_ROWS = 3
BOARD_COLS = 3

## Environment(State) Setting

In [3]:
class State:
    """Tic-tac-toe environment: owns the board and drives games between two players.

    Cells hold 1 for player 1 (drawn as 'x'), -1 for player 2 (drawn as 'o')
    and 0 when empty. ``self.player`` is the mark of the side to move next.
    """

    def __init__(self, p1, p2):
        """
        :param p1: player that moves first (its cells are marked with 1)
        :param p2: player that moves second (its cells are marked with -1)
        """
        self.board = np.zeros((BOARD_ROWS, BOARD_COLS))
        self.p1 = p1
        self.p2 = p2
        self.end = False
        self.board_hash = None
        self.player = 1    # default that p1 plays first

    def show_board(self):
        """Print the board to stdout (p1: x, p2: o, empty: blank)."""
        tokens = {1: 'x', -1: 'o', 0: ' '}
        # Each cell is rendered as 4 characters plus one leading '|'.
        rule = '-' * (4 * BOARD_COLS + 1)
        for i in range(BOARD_ROWS):
            print(rule)
            out = '| '
            for j in range(BOARD_COLS):
                out += tokens.get(int(self.board[i, j]), '?') + ' | '
            print(out)
        print(rule)

    def winner(self):
        """
        Evaluate the board and update ``self.end`` accordingly.

        :return: 1 if P1 wins, -1 if P2 wins, 0 if tie, None if the game
                 is still in progress
        """
        # Pair every line total with the number of cells in that line, so a
        # line is won exactly when it is completely filled by one player.
        diag_len = min(BOARD_ROWS, BOARD_COLS)
        lines = [(total, BOARD_COLS) for total in self.board.sum(axis=1)]
        lines += [(total, BOARD_ROWS) for total in self.board.sum(axis=0)]
        lines.append((np.trace(self.board), diag_len))
        lines.append((np.trace(np.fliplr(self.board)), diag_len))

        for total, needed in lines:
            if total == needed:
                self.end = True
                return 1
            if total == -needed:
                self.end = True
                return -1

        # for tie: every cell is occupied and nobody completed a line
        filled = np.sum(np.abs(self.board))
        if filled == BOARD_ROWS * BOARD_COLS:
            self.end = True
            return 0
        self.end = False
        return None

    # get available positions
    def get_space(self):
        """Return the empty cells as a list of (row, col) tuples."""
        return [
            (i, j)  # need to be tuple so it can index the board directly
            for i in range(BOARD_ROWS)
            for j in range(BOARD_COLS)
            if self.board[i, j] == 0
        ]

    def give_reward(self):
        """Feed the end-of-game reward to both players based on ``winner()``."""
        winner = self.winner()
        if winner == 1:
            self.p1.feed_reward(1)
            self.p2.feed_reward(0)
        elif winner == -1:
            self.p1.feed_reward(0)
            self.p2.feed_reward(1)
        else:
            self.p1.feed_reward(0.1)    # can be changed for tie
            self.p2.feed_reward(0.5)

    def play(self, player):
        """
        Let ``player`` make one move and record the resulting board hash on it.

        :param player: the player whose turn it is
        :return: the result of ``winner()`` after the move
        """
        positions = self.get_space()
        player_action = player.get_action(positions, self.board, self.player)
        self.update_state(player_action)
        board_hash = self.get_hash()
        player.add_state(board_hash)
        return self.winner()

    def play_against_self(self, rounds=5000):
        """
        Train p1 and p2 by letting them play ``rounds`` games against each other.

        :param rounds: number of games to play; progress is printed every 1000
        """
        for i in range(rounds):
            if i % 1000 == 0:
                print("Rounds {}".format(i))
            self._play_training_game()

    def _play_training_game(self):
        """Play one p1-vs-p2 game, hand out rewards, then reset players and board."""
        while not self.end:
            for player in (self.p1, self.p2):
                if self.play(player) is not None:
                    self.give_reward()
                    self.p1.reset()
                    self.p2.reset()
                    self.reset()
                    return

    def play_against_human(self):
        """Play one game with p1 moving first and p2 answering, printing the board after each move."""
        while not self.end:
            positions = self.get_space()
            p1_action = self.p1.get_action(positions, self.board, self.player)
            self.update_state(p1_action)
            self.show_board()
            winner = self.winner()
            if winner is not None:
                self._announce(winner, 1, self.p1)
                self.reset()
                return

            positions = self.get_space()
            # p2 is asked with the positions only (HumanPlayer's signature).
            p2_action = self.p2.get_action(positions)
            self.update_state(p2_action)
            self.show_board()
            winner = self.winner()
            if winner is not None:
                self._announce(winner, -1, self.p2)
                self.reset()
                return

    @staticmethod
    def _announce(winner, mover_mark, mover):
        """Print the outcome of a finished game right after ``mover`` has moved."""
        if winner == mover_mark:
            print(mover.name, "wins!")
        else:
            print("tie!")

    def update_state(self, position):
        """
        Place the current player's mark and hand the turn to the other player.

        :param position: (row, col) tuple of the cell to mark; not validated here
        """
        self.board[position] = self.player
        self.player *= -1

    def get_hash(self):
        """Return (and cache in ``self.board_hash``) the flattened board as a string."""
        self.board_hash = str(self.board.reshape(BOARD_COLS*BOARD_ROWS))
        return self.board_hash

    def reset(self):
        """Clear the board and restore the initial turn/end flags for a new game."""
        self.board = np.zeros((BOARD_ROWS, BOARD_COLS))
        self.board_hash = None
        self.end = False
        self.player = 1

## Player Setting



We use the ϵ-greedy method to balance exploration and exploitation. With ϵ = 0.3, the agent takes the greedy action (the move leading to the board with the highest currently estimated state value) 70% of the time, and a random action the remaining 30% of the time.

In [4]:
class Player:
    """ϵ-greedy agent that learns a value for each board state it has visited."""

    def __init__(self, name, exp_rate=0.3):
        """
        :param name: label for the player; also used in the policy file name
        :param exp_rate: probability of taking a random (exploratory) action
        """
        self.name = name
        self.states = []  # board hashes recorded via add_state in the current game
        self.alpha = 0.5  # learning rate
        self.exp_rate = exp_rate
        self.decay_gamma = 1  # discount factor, may be used in the future
        self.states_value = {}  # state value dictionary: board hash -> value

    @staticmethod
    def get_hash(board):
        """
        :param board: numpy array holding the board
        :return: the flattened board rendered as a string, usable as a dict key
        """
        # reshape(-1) flattens any board shape, so no module constants are needed.
        return str(board.reshape(-1))

    def get_action(self, positions, current_board, player_name):
        """
        Choose a move with the ϵ-greedy rule.

        :param positions: available positions(actions) for now
        :param current_board: numpy array of the board before the move
        :param player_name: mark to place on the board (1 or -1)
        :return: a random position with probability ``exp_rate``, otherwise
                 the position whose resulting board has the highest known
                 value (unknown boards count as 0; ties go to the last one)
        :raises ValueError: if ``positions`` is empty
        """
        if len(positions) == 0:
            raise ValueError("no available positions to choose from")

        if np.random.uniform(0, 1) <= self.exp_rate:
            idx = np.random.choice(len(positions))  # random action
            return positions[idx]

        best_value = float("-inf")
        action = None
        for p in positions:
            next_board = current_board.copy()
            next_board[p] = player_name
            value = self.states_value.get(self.get_hash(next_board))
            if value is None:
                value = 0
            # '>=' keeps the original tie-breaking: the last best move wins.
            if value >= best_value:
                best_value = value
                action = p
        return action

    def feed_reward(self, reward):
        """
        Propagate the end-of-game reward backwards through the visited states.

        :param reward: reward received when the game ended
        """
        # when the game ends, reversed to find st+1
        for st in reversed(self.states):
            if self.states_value.get(st) is None:
                self.states_value[st] = 0
            self.states_value[st] += self.alpha*(self.decay_gamma*reward - self.states_value[st])
            # The updated value becomes the target for the preceding state.
            reward = self.states_value[st]

    def add_state(self, state):
        """Record a board hash reached during the current game."""
        self.states.append(state)

    def reset(self):
        """Forget the states of the finished game (learned values are kept)."""
        self.states = []

    def save_policy(self):
        """Pickle the state value dictionary to the file 'policy_<name>'."""
        with open('policy_' + str(self.name), 'wb') as fw:
            pickle.dump(self.states_value, fw)

    def load_policy(self, file):
        """
        Replace the state value dictionary with one loaded from ``file``.

        :param file: path of a file written by ``save_policy``
        """
        # NOTE: pickle can execute arbitrary code; only load trusted files.
        with open(file, 'rb') as fr:
            self.states_value = pickle.load(fr)
                
            

## Human Player Setting

In [5]:
class HumanPlayer:
    """Player whose moves are typed on the console.

    It offers the same methods as Player (get_action, add_state,
    feed_reward, reset); the learning-related ones do nothing.
    """

    def __init__(self, name):
        """
        :param name: label printed when this player wins
        """
        self.name = name

    def get_action(self, positions, current_board=None, player_name=None):
        """
        Prompt for a row and column until an available position is entered.

        :param positions: available positions(actions) for now
        :param current_board: unused; accepted so the call signature matches
                              Player.get_action
        :param player_name: unused; accepted for the same reason
        :return: the chosen (row, col) tuple, guaranteed to be in ``positions``
        """
        while True:
            try:
                row = int(input("Input your action row:"))
                col = int(input("Input your action col:"))
            except ValueError:
                # Non-numeric input: ask again instead of crashing the game.
                print("Please enter integer row and col values.")
                continue
            action = (row, col)
            if action in positions:
                return action

    def add_state(self, state):
        """No-op: a human player keeps no state history."""
        pass

    def feed_reward(self, reward):
        """No-op: a human player does not learn from rewards."""
        pass

    def reset(self):
        """No-op: there is nothing to reset between games."""
        pass

## Training

In [6]:
# Create two learning agents (default exp_rate) and let them play each other
# for the default number of rounds of State.play_against_self.
p1, p2 = Player("p1"), Player("p2")
state = State(p1, p2)
print("training...")
state.play_against_self()
print("training finished")

training...
Rounds 0
Rounds 1000
Rounds 2000
Rounds 3000
Rounds 4000
training finished


In [7]:
# Persist both agents' learned state values; each file is named 'policy_' + the player's name.
p1.save_policy()
p2.save_policy()

## Play with Human

In [8]:
# exp_rate=0 disables random exploration, so the computer always takes the greedy move.
p1 = Player("computer", exp_rate=0)
p1.load_policy("policy_p1")    # update the state values after learning
p2 = HumanPlayer("human")
# The computer is p1 and therefore moves first; the human answers as p2.
state = State(p1, p2)
state.play_against_human()

-------------
|   |   |   | 
-------------
| x |   |   | 
-------------
|   |   |   | 
-------------
Input your action row:1
Input your action col:1
-------------
|   |   |   | 
-------------
| x | o |   | 
-------------
|   |   |   | 
-------------
-------------
|   |   |   | 
-------------
| x | o |   | 
-------------
|   |   | x | 
-------------
Input your action row:0
Input your action col:0
-------------
| o |   |   | 
-------------
| x | o |   | 
-------------
|   |   | x | 
-------------
-------------
| o |   |   | 
-------------
| x | o |   | 
-------------
| x |   | x | 
-------------
Input your action row:2
Input your action col:1
-------------
| o |   |   | 
-------------
| x | o |   | 
-------------
| x | o | x | 
-------------
-------------
| o | x |   | 
-------------
| x | o |   | 
-------------
| x | o | x | 
-------------
Input your action row:0
Input your action col:2
-------------
| o | x | o | 
-------------
| x | o |   | 
-------------
| x | o | x | 
-------------


In [9]:
# Start another game against the human using the same State object.
state.play_against_human()

-------------
|   |   |   | 
-------------
| x |   |   | 
-------------
|   |   |   | 
-------------
Input your action row:1
Input your action col:1
-------------
|   |   |   | 
-------------
| x | o |   | 
-------------
|   |   |   | 
-------------
-------------
|   |   |   | 
-------------
| x | o |   | 
-------------
|   |   | x | 
-------------
Input your action row:0
Input your action col:2
-------------
|   |   | o | 
-------------
| x | o |   | 
-------------
|   |   | x | 
-------------
-------------
|   |   | o | 
-------------
| x | o |   | 
-------------
| x |   | x | 
-------------
Input your action row:0
Input your action col:0
-------------
| o |   | o | 
-------------
| x | o |   | 
-------------
| x |   | x | 
-------------
-------------
| o |   | o | 
-------------
| x | o |   | 
-------------
| x | x | x | 
-------------
computer wins!





**References:**
    
http://incompleteideas.net/book/the-book-2nd.html
    
https://github.com/JaeDukSeo/reinforcement-learning-an-introduction
    
https://towardsdatascience.com/reinforcement-learning-implement-tictactoe-189582bea542