# Import: from local files (Google Drive)

In [29]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# project files stored on Drive can be reached by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [30]:
import os
# Make the project's monte_carlo folder (on the mounted Drive) the working
# directory so the relative script paths and module imports below resolve.
os.chdir('/content/drive/MyDrive/[RL] TicTacToe/code/monte_carlo')

In [31]:
# Execute each project module as a standalone script in a shell subprocess.
# NOTE(review): presumably a smoke test that the files run without errors;
# it does not appear to be required for the imports that follow — confirm.
! python environment.py
! python state.py
! python agent.py

In [33]:
from environment import TicTacToeEnvironment
from state import State
from agent import Agent

# Import: from GitHub

You can run this notebook in Colab by cloning the repository from GitHub, as shown in the cells below.

In [None]:
# Clone the project repository; git creates a `tic_tac_toe` folder in the
# current working directory.
!git clone https://github.com/Tonnonssi/tic_tac_toe.git

Cloning into 'tic_tac_toe'...
remote: Enumerating objects: 14, done.[K
remote: Counting objects: 100% (14/14), done.[K
remote: Compressing objects: 100% (11/11), done.[K
remote: Total 14 (delta 0), reused 11 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (14/14), 7.73 KiB | 7.73 MiB/s, done.


In [None]:
import os
# Make the cloned repository's monte_carlo folder the working directory so the
# relative script paths and module imports below resolve.
os.chdir('/content/tic_tac_toe/code/monte_carlo')

In [None]:
# Execute each project module as a standalone script in a shell subprocess.
# NOTE(review): presumably a smoke test that the files run without errors;
# it does not appear to be required for the imports that follow — confirm.
! python environment.py
! python state.py
! python agent.py

In [None]:
from environment import TicTacToeEnvironment
from state import State
from agent import Agent

# Main

In [37]:
env = TicTacToeEnvironment()
state = State()
agent_1 = Agent(env=env)
agent_2 = Agent(env=env)

# Play one game to completion, rendering the board after every move.
while not state.is_done():
    # The first player chooses with agent_1.mc_action; the other player
    # chooses with agent_2.random_available_action.
    if state.is_first_player():
        choose_action = agent_1.mc_action
    else:
        choose_action = agent_2.random_available_action

    action = choose_action(state)
    state = state.next(action)

    print('-----')
    print(f"Action : {action}")
    env.render(state)
    print()

available_actions : [0, 1, 2, 3, 4, 5, 6, 7, 8]
val_per_action : [ 2.  1.  3.  0.  7. -2.  6.  1.  2.]
-----
Action : 4


Unnamed: 0,0,1,2
0,,,
1,,O,
2,,,



-----
Action : 8


Unnamed: 0,0,1,2
0,,,
1,,O,
2,,,X



available_actions : [0, 1, 2, 3, 5, 6, 7]
val_per_action : [-1.  7.  9.  2.  3.  6.  9.]
-----
Action : 2


Unnamed: 0,0,1,2
0,,,O
1,,O,
2,,,X



-----
Action : 3


Unnamed: 0,0,1,2
0,,,O
1,X,O,
2,,,X



available_actions : [0, 1, 5, 6, 7]
val_per_action : [ 4.  4.  3. 10.  2.]
-----
Action : 6


Unnamed: 0,0,1,2
0,,,O
1,X,O,
2,O,,X





In [40]:
# Display the raw board array held by the current `state`.
# NOTE(review): the recorded output has shape (2, 3, 3) — apparently one 3x3
# plane per player; confirm the plane ordering in state.py.
state.board

array([[[0., 0., 0.],
        [1., 0., 0.],
        [0., 0., 1.]],

       [[0., 0., 1.],
        [0., 1., 0.],
        [1., 0., 0.]]])

# Extra : WinRate

- 알파베타기법 vs 몬테카를로
- random vs 몬테카를로

In [58]:
class AgentWithAB(Agent):
    """Agent extended with alpha-beta (negamax-style) move selection.

    Adds ``alpha_beta`` (recursive state evaluation) and
    ``alpha_beta_action`` (root move choice) on top of whatever ``Agent``
    already provides. Terminal values are read from ``self.env.reward``.
    """

    def __init__(self, env):
        super().__init__(env)

    def alpha_beta(self, state, alpha, beta):
        """Evaluate ``state`` within the search window ``(alpha, beta)``.

        Args:
            state: Position to evaluate. Must expose ``is_lose()``,
                ``is_draw()``, ``is_win()``, ``available_actions`` and
                ``next(action)``.
            alpha: Lower bound of the window (best value found so far).
            beta: Upper bound of the window; reaching it triggers a cutoff.

        Returns:
            The reward for a terminal state, otherwise the best (negated)
            child value found. After a cutoff the result is only a bound on
            the true value, not the exact value.
        """
        # Terminal states are scored straight from the environment's rewards.
        if state.is_lose():
            return self.env.reward['lose']

        if state.is_draw():
            return self.env.reward['draw']

        if state.is_win():
            return self.env.reward['win']

        # Evaluate every legal move.
        for action in state.available_actions:
            # The child is searched on the opponent's turn, so its value is
            # negated and the window is negated and swapped.
            score = -self.alpha_beta(state.next(action), -beta, -alpha)

            # Raise the lower bound when this move is better.
            if score > alpha:
                alpha = score

            # Cutoff: the window is closed, the remaining moves cannot matter.
            if alpha >= beta:
                return alpha

        # Best value among the searched moves.
        return alpha

    def alpha_beta_action(self, state, verbose=True):
        """Choose the best action for ``state`` by alpha-beta search.

        Args:
            state: Position to move from; see ``alpha_beta`` for the methods
                it must expose.
            verbose: When true (the default, matching the previous
                behaviour), print the available actions and the score
                recorded for each of them.

        Returns:
            The first action whose score strictly exceeds every earlier one.

        Raises:
            ValueError: If ``state`` has no available actions.
        """
        actions = state.available_actions
        if len(actions) == 0:
            raise ValueError("alpha_beta_action called on a state with no available actions")

        best_action = None
        alpha = -float('inf')

        action_values = []

        for action in actions:
            score = -self.alpha_beta(state.next(action), -float('inf'), -alpha)
            action_values.append(score)

            if score > alpha:
                best_action = action
                alpha = score

        if verbose:
            print("Available actions:", actions)
            print("Action values:", action_values)

        # Because the window is narrowed with the running `alpha`, scores of
        # actions searched after the current best are only upper bounds and
        # may tie with it even when their true value is lower. Re-selecting
        # with an argmax over `action_values` could therefore pick a worse
        # move if ties are not resolved towards the first maximum; the action
        # tracked in the loop is always the first exact maximum.
        if best_action is None:
            # Only reachable if no score ever exceeded -inf; fall back to the
            # first legal move rather than returning None.
            best_action = actions[0]

        return best_action

## Random vs MC

In [36]:
# Number of games played in each win-rate experiment below.
NUM_OF_EPISODES = 100

In [None]:
env = TicTacToeEnvironment()
agent_1 = Agent(env=env)
agent_2 = Agent(env=env)
num_win = 0

for _ in range(NUM_OF_EPISODES):
    state = State()

    # Play one game: the first player uses mc_action, the other player uses
    # random_available_action.
    while not state.is_done():
        if state.is_first_player():
            action = agent_1.mc_action(state)
        else:
            action = agent_2.random_available_action(state)
        state = state.next(action)

    # Score the finished game for the first player: when the terminal state
    # is on the first player's turn use is_win(), otherwise use is_lose().
    if state.is_first_player():
        first_player_won = state.is_win()
    else:
        first_player_won = state.is_lose()
    num_win += first_player_won

In [52]:
# Report the fraction of episodes counted in num_win.
print(f"Win Rate : {num_win / NUM_OF_EPISODES}")

Win Rate : 0.94


In [53]:
# Fraction of episodes counted as first-player wins. The counter filled by the
# evaluation loop is `num_win`; `win_rate` is never defined in this notebook
# and raised NameError on a fresh kernel.
w = num_win / NUM_OF_EPISODES

## Alpha-Beta vs MC

In [None]:
env = TicTacToeEnvironment()
agent_1 = AgentWithAB(env=env)
agent_2 = AgentWithAB(env=env)
num_win = num_lose = 0

for _ in range(NUM_OF_EPISODES):
    state = State()

    # Play one game: the first player uses mc_action, the other player uses
    # alpha_beta_action.
    while not state.is_done():
        if state.is_first_player():
            action = agent_1.mc_action(state)
        else:
            action = agent_2.alpha_beta_action(state)
        state = state.next(action)

    # Both rates are measured from the first player's perspective, so which
    # of is_win() / is_lose() counts depends on whose turn the terminal
    # state is on.
    if state.is_first_player():
        num_win += state.is_win()
        num_lose += state.is_lose()
    else:
        num_win += state.is_lose()
        num_lose += state.is_win()

    # Show the final board of every episode.
    print('-----')
    env.render(state)
    print()

In [62]:
# Report the first player's win and loss fractions; the remainder of the
# episodes were counted as neither.
print(f"Win Rate : {num_win / NUM_OF_EPISODES}")
print(f"Lose Rate : {num_lose / NUM_OF_EPISODES}")

Win Rate : 0.0
Lose Rate : 0.32
