- 일단 중복 고려 안함. 이동 시 발생하는 reward 0.
- 오직 게임이 끝나고 win, lose, draw에 따라서만 reward 발생
- 몬테카를로 구현~~

In [24]:
import time
import os
import pickle
import numpy as np
import pandas as pd
from typing import Tuple
from collections import deque
import copy
from scipy.special import softmax
import random
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# 틱택토 환경

In [69]:
class Environment:
    def __init__(self):
        self.n = 3
        self.num_actions = self.n**2
        self.present_state = np.zeros((self.n, self.n))
        self.action_space = np.arange(self.num_actions)
        self.available_actions = np.ones(self.num_actions)
        self.reward_dict = {'win':1, 'lose':-1, 'draw': -0.1, 'good_action':0, 'overlapped':0}
        self.done = False


    def step(self, action_idx:int, max_player:bool):
        '''
        에이전트가 선택한 action에 따라 주어지는 next_state, reward, done
        '''
        x, y = np.divmod(action_idx, self.n)

        self.present_state[x,y] = max_player*2 -1
        next_state = self.present_state
        done, is_win = self.is_done(next_state)
        reward = self.reward_dict['good_action']
        self.available_actions = self.check_available_action(self.present_state)

        if done:
            if is_win == "win":
                reward = self.reward_dict['win']
            elif is_win == "lose":
                reward = self.reward_dict['lose']
            else:
                reward = self.reward_dict['draw']

        self.done = done

        return next_state, reward, done, is_win


    def reset(self):
        '''
        게임판 초기화
        '''
        self.present_state = np.zeros((self.n, self.n))
        self.available_actions = np.ones(self.num_actions)
        self.done = False


    def render(self):
        '''
        print the current state
        '''
        render_state = np.array([['.','.','.'],
                                ['.','.','.'],
                                ['.','.','.']])
        render_str = ""
        for i in range(self.num_actions):
            x, y = np.divmod(i, 3)
            if self.present_state[x,y] == 1:
                render_state[x,y] = 'X'
            elif self.present_state[x,y] == -1:
                render_state[x,y] = 'O'

            render_str += f" {render_state[x,y]}"
            if (i+1) % 3 == 0:
                render_str += "\n" + "-"*11 + "\n"
            else:
                render_str += " |"

        print(render_str)


    def check_available_action(self, state):
        '''
        현재 state에서 가능한 actions array 반환
        '''
        impossible_actions = np.argwhere(state.reshape(-1) != 0)
        available_actions = np.ones(self.num_actions)
        available_actions[impossible_actions] = 0

        return available_actions


    def is_done(self, state):
        '''
        틱택토 게임 종료 조건 및 승리 여부 확인하는 함수
        '''
        is_done, is_win = False, "null"

        # 무승부 여부 확인
        if (state==0).sum()==0:
            is_done, is_win = True, "draw"

        else:
            axis_sum = np.concatenate((state.sum(axis=0), state.sum(axis=1)))
            diag_sum = np.array([state.trace(), np.fliplr(state).trace()])

            sum_array = np.concatenate((axis_sum, diag_sum))
            max_sum = np.max(sum_array)
            min_sum = np.min(sum_array)

            if max_sum == 3:
                is_done, is_win = True, "win"
            elif min_sum == -3:
                is_done, is_win = True, "lose"
            else:
                is_done, is_win = False, "null"

        return is_done, is_win

# 몬테카를로 알고리즘
- 랜덤 탐색 -> 가치함수 업데이트

In [70]:
class Agent:
    def __init__(self, env, max_player:bool):
        self.env = env
        self.max_player = max_player

        self.n = self.env.n
        self.num_actions = self.env.num_actions
        self.actions = self.env.action_space

        self.value_table = np.zeros(self.num_actions)
        self.returns = {i: [] for i in range(self.num_actions)}

        self.stepsize = STEPSIZE
        self.gamma = GAMMA
        self.epsilon = EPSILON
        self.epsilon_decay = EPSILON_DECAY
        self.epsilon_min = EPSILON_MIN


    def update_value_table(self, history):
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

        G = 0
        for t in reversed(range(len(history))):
            action_idx, reward = history[t]

            G = self.gamma * G + reward
            self.returns[action_idx].append(G)
            self.value_table[action_idx] = np.mean(self.returns[action_idx])


    def get_action(self, state, available_actions):
        available_action = np.where(available_actions != 0)[0]

        if (np.random.rand() <= self.epsilon) or (not self.max_player):
            act = random.choice(available_action)

        else:
            available_value = self.value_table * available_actions
            act = np.argmax(available_value)

        return act

## main

In [77]:
STEPSIZE = 0.1
GAMMA = 0.9
EPSILON = 0.999
EPSILON_DECAY = 0.999
EPSILON_MIN = 0.01
EPISODES = 1000

In [78]:
env = Environment()
agent = Agent(env, True)
player = Agent(env, False)
env.reset()

In [85]:
total_win = []

for episode in range(EPISODES):
    env.reset()
    state = env.present_state
    done = env.done

    history = []

    while not done:
        agent_action = agent.get_action(state, env.available_actions)
        next_state, reward, done, is_win = env.step(agent_action, True)
        history.append((agent_action, reward))

        if not done:
            player_action = player.get_action(next_state, env.available_actions)
            next_state, reward, done, is_win = env.step(player_action, False)

        state = next_state

    agent.update_value_table(history)

    total_win.append(is_win == "win")

    if (episode+1) % 100 == 0:
        print(f"Episode: {episode+1}, win?: {is_win}, win_rate: {sum(total_win[episode-100:episode+1])/100}")
        env.render()


Episode: 100, win?: win, win_rate: 0.01
 X | . | .
-----------
 . | X | .
-----------
 O | O | X
-----------

Episode: 200, win?: win, win_rate: 0.84
 O | . | X
-----------
 . | X | X
-----------
 O | O | X
-----------

Episode: 300, win?: win, win_rate: 0.78
 X | O | X
-----------
 . | X | O
-----------
 X | . | O
-----------

Episode: 400, win?: win, win_rate: 0.85
 X | . | .
-----------
 O | X | .
-----------
 O | . | X
-----------

Episode: 500, win?: win, win_rate: 0.82
 X | . | .
-----------
 O | X | .
-----------
 O | . | X
-----------

Episode: 600, win?: win, win_rate: 0.85
 X | . | O
-----------
 . | X | .
-----------
 O | . | X
-----------

Episode: 700, win?: lose, win_rate: 0.8
 O | . | X
-----------
 O | X | .
-----------
 O | . | X
-----------

Episode: 800, win?: draw, win_rate: 0.84
 O | X | O
-----------
 X | X | O
-----------
 X | O | X
-----------

Episode: 900, win?: win, win_rate: 0.83
 X | . | .
-----------
 . | X | .
-----------
 O | O | X
-----------

Episode: 

# minimax 알고리즘
- 입력 받은 상태에서 얻을 수 있는 최대값이 뭔지 알려주는 함수
- 개선점: 최대값을 얻을 수 있는 행동이 무엇인지 반환해야한다.
- 현재 모든 경우의 수에 대해 계산하는 minimax 함수이다. 틱택토 정도의 작은 상황에서는 가능하지만, 상태의 수가 많아지면 depth를 도입해 일정 깊이만큼만 탐색하도록 해야한다.

In [None]:
class Agent:
    def __init__(self, env, max_player:bool):
        self.env = env
        self.n = self.env.n
        self.num_actions = self.env.num_actions
        self.actions = torch.tensor(self.num_actions)

        self.best_action = None


    def minimax(self, present_state, depth, alpha, beta, max_player:bool):
        temp_env = Environment()
        state = copy.deepcopy(present_state)
        temp_env.present_state = state
        done, is_win = temp_env.is_done(state)
        reward = 0

        remain_actions = np.argwhere(state == 0)


        if (done == True) or (depth == 0):
            if is_win == "win":
                reward = temp_env.reward_dict['win']

            elif is_win == "lose":
                reward = temp_env.reward_dict['lose']

            else:
                reward = temp_env.reward_dict['draw']

            return reward


        if max_player:
            maxEval = -np.Inf
            best_action = None
            for (x, y) in remain_actions:
                idx = self.n * x + y
                child, _, _, _ = temp_env.step(idx, True)

                eval = self.minimax(child, depth-1, alpha, beta, False)

                if eval > maxEval:
                    best_action = idx
                    maxEval = eval

                alpha = max(alpha, eval)
                if beta <= alpha:
                    break

                if depth == DEPTH:  # 최상위 호출에서만 best_action을 저장
                    self.best_action = best_action

            return maxEval

        else:
            minEval = np.Inf
            for (x, y) in remain_actions:
                idx = self.n * x + y
                child, _, _, _ = temp_env.step(idx, False)

                eval = self.minimax(child, depth-1, alpha, beta, True)
                minEval = min(minEval, eval)

                beta = min(alpha, eval)
                if beta <= alpha:
                    break

            return minEval


    def get_action(self, state, agent_turn):
        self.minimax(state, DEPTH, -np.Inf, np.Inf, True)
        return self.best_action

In [None]:
DEPTH = 100