# 몬테 카를로 트리 탐색 tic-tac-toe agent

In [72]:
#################
# 선공:  1, 'O' #
# 후공: -1, 'X' #
#################

In [73]:
import numpy as np
import matplotlib.pyplot as plt
import math

## State

In [74]:
BOARD_SIZE = (3,3)

In [75]:
class State:
    def __init__(self, board_size=BOARD_SIZE, my_actions=None, enemy_actions=None):
        self.board_size = board_size # (3,3)
        self.num_actions = self.board_size[0] * self.board_size[1] # 3 * 3 = 9
        self.action_space = range(self.num_actions)

        self.my_actions = [] if my_actions is None else my_actions
        self.enemy_actions = [] if enemy_actions is None else enemy_actions

        self.board = self.create_board(self.my_actions, self.enemy_actions)

        self.available_actions = self.get_available_actions()

    def next(self, action):
        '''
        내 행동 이후 상대방 턴으로 변경
        '''
        my_actions = self.my_actions.copy()
        my_actions.append(action)

        return State(self.board_size, self.enemy_actions, my_actions)

    def create_board(self, my_actions, enemy_actions):
        total_board = np.zeros((2,self.board_size[0],self.board_size[1]))

        my_board = np.zeros(self.board_size).flatten()
        enemy_board = np.zeros(self.board_size).flatten()

        my_board[my_actions] = 1
        enemy_board[enemy_actions] = 1

        total_board[0] = my_board.reshape(self.board_size)
        total_board[1] = enemy_board.reshape(self.board_size)

        return total_board

    def get_available_actions(self):
        my_actions_set = set(self.my_actions)
        enemy_actions_set = set(self.enemy_actions)

        available_actions_set = set(range(self.num_actions)) - my_actions_set - enemy_actions_set

        return list(available_actions_set)

    def is_win(self):
        my_state = self.board[0]

        row_win = np.sum(my_state, axis=0).max() == self.board_size[0]
        col_win = np.sum(my_state, axis=1).max() == self.board_size[1]
        diag_win = np.trace(my_state) == self.board_size[0]
        anti_diag_win = np.trace(np.fliplr(my_state)) == self.board_size[0]

        return row_win or col_win or diag_win or anti_diag_win

    def is_draw(self):
        return (np.sum(self.board[0]) + np.sum(self.board[1])) >= self.num_actions

    def is_lose(self):
        enemy_state = self.board[1]

        row_lose = np.sum(enemy_state, axis=0).max() == self.board_size[0]
        col_lose = np.sum(enemy_state, axis=1).max() == self.board_size[1]
        diag_lose = np.trace(enemy_state) == self.board_size[0]
        anti_diag_lose = np.trace(np.fliplr(enemy_state)) == self.board_size[0]

        return row_lose or col_lose or diag_lose or anti_diag_lose

    def is_done(self):
        return self.is_win() or self.is_draw() or self.is_lose()

    def is_going_first(self):
        return len(self.my_actions) == len(self.enemy_actions)

## Environment

In [76]:
class TicTacToeEnv:
    def __init__(self):
        self.state = State()

        self.reward = {'win': 10, 'lose': -10, 'draw': 0, 'continue': 0}

    def reset(self):
        self.state = State()
        return self.state

    def step(self, action):
        my_actions = self.state.my_actions.copy()
        enemy_actions = self.state.enemy_actions.copy()

        my_actions.append(action)

        next_state = State(self.board_size, my_actions, enemy_actions)
        self.state = State(self.board_size, self.state.enemy_actions, my_actions) # 다음 스텝 - 상대방 턴

        if next_state.is_win():
            reward, done = self.reward['win'], True

        elif next_state.is_draw():
            reward, done = self.reward['draw'], True

        elif next_state.is_lose():
            reward, done = self.reward['lose'], True

        else:
            reward, done = self.reward['continue'], False

        return self.state, next_state, reward, done

    def reset(self):
        self.state = State()
        return self.state

    def render(self, state):
        board = state.board[0] + (-1 * state.board[1]) if state.is_going_first() else state.board[1] + (-1 * state.board[0])

        int_to_symbol = np.where(board == 1, 'O',
                                np.where(board == -1, 'X', '-'))

        rendering_board = '\n'.join([' '.join(row) for row in int_to_symbol])

        print()
        print(rendering_board)
        print()

## MCTS Agent

In [77]:
NUM_OF_SIMULATION = 100

In [78]:
class MCTSAgent:
    def __init__(self, env):
        self.env = env
        self.num_of_simulation = NUM_OF_SIMULATION

    def mcts_action(self, state):
        # 몬테카를로 트리 탐색의 노드 정의
        class Node:
            # 노드 초기화
            def __init__(self, state, agent):
                self.state = state # 상태
                self.agent = agent # MCTSAgent 인스턴스
                self.w = 0 # 보상 누계
                self.n = 0 # 시행 횟수
                self.child_nodes = None  # 자녀 노드 군

            # 국면 가치 계산
            def evaluate(self):
                # 게임 종료 시
                if self.state.is_done():
                    # 승패 결과로 가치 취득
                    value = -1 if self.state.is_lose() else 0 # 패배 시 -1, 무승부 시 0

                    # 보상 누계와 시행 횟수 갱신
                    self.w += value
                    self.n += 1
                    return value

                # 자녀 노드가 존재하지 않는 경우
                if not self.child_nodes:
                    # 플레이아웃으로 가치 얻기
                    value = self.agent.playout(self.state)  # self.playout -> self.agent.playout로 수정

                    # 보상 누계와 시행 횟수 갱신
                    self.w += value
                    self.n += 1

                    # 자녀 노드 전개
                    if self.n == 10:
                        self.expand()
                    return value

                # 자녀 노드가 존재하는 경우
                else:
                    # UCB1이 가장 큰 자녀 노드를 평가해 가치 얻기
                    value = -self.next_child_node().evaluate()

                    # 보상 누계와 시행 횟수 갱신
                    self.w += value
                    self.n += 1
                    return value

            # 자녀 노드 전개
            def expand(self):
                available_actions = self.state.available_actions
                self.child_nodes = []
                for action in available_actions:
                    self.child_nodes.append(Node(self.state.next(action), self.agent))  # 자녀 노드 생성 시에도 agent 전달

            # UCB1이 가장 큰 자녀 노드 얻기
            def next_child_node(self):
                # 시행 횟수가 0인 자녀 노드 반환
                for child_node in self.child_nodes:
                    if child_node.n == 0:
                        return child_node

                # UCB1 계산
                t = 0
                for c in self.child_nodes:
                    t += c.n
                ucb1_values = []
                for child_node in self.child_nodes:
                    ucb1_values.append(-child_node.w/child_node.n+(2*math.log(t)/child_node.n)**0.5)

                # UCB1이 가장 큰 자녀 노드 반환
                return self.child_nodes[self.agent.argmax(ucb1_values)]    # self.argmax -> self.agent.argmax로 수정

        # 현재 국면의 노드 생성
        root_node = Node(state, self)  # Node 생성 시 agent(self) 전달
        root_node.expand()

        # 100회 시뮬레이션 실행
        for _ in range(self.num_of_simulation):
            root_node.evaluate()

        # 시행 횟수가 가장 큰 값을 갖는 행동 반환
        available_actions = state.available_actions
        n_list = []
        for c in root_node.child_nodes:
            n_list.append(c.n)

        print('------------------')
        print(f"Available_actions : {available_actions}")
        print(f"Value_per_action : {n_list}")

        return available_actions[self.argmax(n_list)]

    def playout(self, state):
        if state.is_lose():
            return self.env.reward['lose']

        if state.is_draw():
            return  self.env.reward['draw']

        # 다음 상태의 상태 평가
        return -self.playout(state.next(self.random_action(state)))

    def random_action(self, state):
        available_actions = state.available_actions
        return np.random.choice(available_actions)

    def argmax(self, collection):
        max_idx_list = np.arange(len(collection))[collection == np.max(collection)]
        return np.random.choice(max_idx_list)

## Other Agents

In [79]:
class RandomAgent:
    def __init__(self, env):
        self.env = env

    def random_action(self, state):
        available_actions = state.available_actions
        return np.random.choice(available_actions)

In [80]:
class AlphaBetaAgent:
    def __init__(self, env):
        self.env = env

    def random_action(self, state):
        available_actions = state.available_actions
        return np.random.choice(available_actions)

    def alpha_beta(self, state, alpha, beta):
        if state.is_lose():
            return -1

        if state.is_draw():
            return 0

        score = 0
        for action in state.available_actions:
            score = -self.alpha_beta(state.next(action), -beta, -alpha)
            if score > alpha:
                alpha = score

            if alpha >= beta:
                return alpha

        return alpha

    def alpha_beta_action(self, state):
        best_action = 0
        alpha = -np.inf
        for action in state.legal_actions():
            score = -self.alpha_beta(state.next(action), -np.inf, -alpha)
            if score > alpha:
                best_action = action
                alpha = score

        return best_action

In [81]:
NUM_OF_PLAYOUT = 10

In [82]:
class MCSAgent:
    def __init__(self, env):
        self.env = env
        self.num_of_playout = NUM_OF_PLAYOUT

    def playout(self, state):
        if state.is_lose():
            return self.env.reward['lose']

        if state.is_draw():
            return  self.env.reward['draw']

        # 다음 상태의 상태 평가
        return -self.playout(state.next(self.random_action(state)))

    def random_action(self, state):
        available_actions = state.available_actions
        return np.random.choice(available_actions)

    def mcs_action(self, state):
        available_actions = state.available_actions
        values = np.zeros(shape=len(available_actions))

        for i, action in enumerate(available_actions):
            for _ in range(self.num_of_playout):
                values[i] += - self.playout(state.next(action))
            values[i] /= self.num_of_playout

        print('------------------')
        print(f"Available_actions : {available_actions}")
        print(f"Value_per_action : {values}")
        return available_actions[self.argmax(values)]

    def argmax(self, collection):
        max_idx_list = np.arange(len(collection))[collection == np.max(collection)]
        return np.random.choice(max_idx_list)

## Main

In [83]:
env = TicTacToeEnv()
state = State()
agent_1 = MCTSAgent(env=env)
agent_2 = RandomAgent(env=env)

while True:
    if state.is_done():
        break

    if state.is_going_first():
        action = agent_1.mcts_action(state)

    else:
        action = agent_2.random_action(state)

    state = state.next(action)

    print()
    print(f"Action : {action}")
    env.render(state)

------------------
Available_actions : [0, 1, 2, 3, 4, 5, 6, 7, 8]
Value_per_action : [4, 1, 14, 1, 54, 4, 1, 1, 20]

Action : 4

- - -
- O -
- - -


Action : 1

- X -
- O -
- - -

------------------
Available_actions : [0, 2, 3, 5, 6, 7, 8]
Value_per_action : [30, 42, 2, 2, 2, 1, 21]

Action : 2

- X O
- O -
- - -


Action : 5

- X O
- O X
- - -

------------------
Available_actions : [0, 3, 6, 7, 8]
Value_per_action : [49, 1, 1, 1, 48]

Action : 0

O X O
- O X
- - -


Action : 7

O X O
- O X
- X -

------------------
Available_actions : [8, 3, 6]
Value_per_action : [1, 98, 1]

Action : 3

O X O
O O X
- X -


Action : 8

O X O
O O X
- X X

------------------
Available_actions : [6]
Value_per_action : [100]

Action : 6

O X O
O O X
O X X

