In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np
import matplotlib.pyplot as plt

import random
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = 'cpu'

In [23]:
# Human: O, Computer: X
# Blank: 0, Human: 1, Computer: -1
from typing import List, Optional


class TicTacToe():
    def __init__(self):
        self.board = np.zeros((3, 3), dtype=np.int32)
    
    def place(self, action: int, player: int) -> bool:
        if self.board[action // 3, action % 3] == 0:
            self.board[action // 3, action % 3] = player
            return True
        return False
    
    def available(self) -> np.ndarray:
        return np.where(self.board == 0)[0]
    
    def _check(self, v: List[int]) -> bool:
        _sum = sum(v)
        if _sum == 3: 
            return 1
        elif _sum == -3:
            return -1
        else:
            return 0
    
    def win(self) -> int: # check the winner of the game. If it's not done, return -1. 
        vecs: list = [self.board.tolist()[0], self.board.T.tolist()[0], self.board.diagonal(), np.flip(self.board, 1).diagonal()]
        
        for vec in vecs: 
            checked = self._check(vec)
            if checked != 0:
                return checked
        return -1
    
    def all_occupied(self) -> bool: 
        return np.square(self.board).sum() == 9
    
    def done(self) -> bool: 
        return self.win() != -1 or self.all_occupied()
    
    def render(self, player: Optional[int] = None): 
        if player is None: 
            assert abs(player) == 1, 'player must be 1 or -1'
            print('=' * 3, 'human' if player == 1 else 'computer', '=' * 3)
        for row in self.board: 
            print(' '.join([str(c) for c in row.tolist()]))
        print('=' * 10)

In [182]:
class Agent(nn.Module): 
    def __init__(self, device) -> None:
        super().__init__()

        self.device = device

        self.conv1 = nn.Conv2d(1, 32, kernel_size=2, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=2, stride=1, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=2, stride=1, padding=1)
        self.conv4 = nn.Conv2d(128, 256, kernel_size=2, stride=1, padding=1)
        self.relu = nn.ReLU()
        
        self.fc = nn.Linear(1600, 9)
        self.softmax = nn.Softmax(dim=1)
    
    def forward(self, x: torch.Tensor) -> torch.Tensor: 
        x = self.relu(self.conv1(x))
        x = self.relu(self.conv2(x))
        x = self.fc(x.view(x.size(0), -1))
        return x
    
    def action(self, state: torch.Tensor) -> int: 
        x = self.forward(state)
        x = self.softmax(x)
        # x[torch.where(state.view(state.size(0), -1).square() == 1)] = 0
        x[np.where(state.view(state.size(0), -1).square() == 1)] = 0
        return torch.argmax(x).item()

In [193]:
def game_loop(agent: Agent, human_first=True, manual=False, random=False): 
    actions = []

    game = TicTacToe()
    turn = 1 if human_first else -1

    while not game.done(): 
        if manual: 
            if turn == 1: 
                if random: 
                    action = np.random.choice(game.available())
                else: 
                    action = int(input('Enter your action: '))
                if not game.place(action, 1): 
                    break
            else: 
                action = agent.action(torch.from_numpy(game.board).view(1, 1, 3, 3).float().to(device))
                game.place(action, turn)
            if not random: 
                game.render(turn)
        else: 
            action = agent.action(torch.from_numpy(game.board).view(1, 1, 3, 3).float().to(device))
            game.place(action, turn)
        actions.append(action)
        turn *= -1
    return game.win(), human_first, actions

In [184]:
agent = Agent(device).to(device)
criteria = nn.CrossEntropyLoss()
optimizer = optim.Adam(agent.parameters(), lr=1e-2)

In [185]:
# FIXME: Update the training manner (Enhance the self-play method)
n_iteration = 10000
desc = ''
for i in tqdm(range(n_iteration), desc=desc):
    # winner, human_first, actions = game_loop(agent, bool(random.getrandbits(1)))
    winner, human_first, actions = game_loop(agent, False)

    x_train = []
    y_train = []
    
    my_board = torch.FloatTensor(np.zeros((3, 3)))
    oppo_board = torch.FloatTensor(np.zeros((3, 3)))
    
    if winner != (1 if human_first else -1): 
        actions = actions[1:]
    for i, action in enumerate(actions): 
        my_board[action // 3, action % 3] = winner * coeff
        oppo_board[action // 3, action % 3] = -winner * coeff
        x_train.append(my_board if i % 2 == 0 else oppo_board)
        y_train.append([action])
        winner *= -1
    
    x_train = torch.stack(x_train).to(device)
    x_train = x_train.view(x_train.size(0), 1, 3, 3)
    y_train = torch.LongTensor(y_train, device=device).squeeze()

    y_hat = agent(x_train)
    cost = criteria(y_hat, y_train)
    desc = f'{i}/{n_iteration} : {winner} : {cost:.4f}'

    optimizer.zero_grad()
    cost.backward()
    optimizer.step()


100%|██████████| 10000/10000 [00:26<00:00, 374.34it/s]


In [186]:
win_count = [0, 0]
for _ in tqdm(range(50000)): 
    win_count[(game_loop(agent, human_first=True, manual=True, random=True)[0] + 1) // 2] += 1
print('The winning rate is', win_count[0] / sum(win_count))

100%|██████████| 50000/50000 [00:14<00:00, 3333.85it/s]

The winning rate is 0.91444





In [196]:
game_loop(agent, human_first=True, manual=True, random=False)

0 0 0
0 0 1
0 0 0
0 0 -1
0 0 1
0 0 0
0 0 -1
0 1 1
0 0 0
0 0 -1
-1 1 1
0 0 0
0 1 -1
-1 1 1
0 0 0
-1 1 -1
-1 1 1
0 0 0
-1 1 -1
-1 1 1
0 1 0
-1 1 -1
-1 1 1
-1 1 0


ValueError: invalid literal for int() with base 10: ''