In [1]:
import os
import json
import numpy as np

from kaggle_environments import make

import torch as T
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.utils.data import DataLoader, TensorDataset

from sklearn.metrics import f1_score, accuracy_score

device = T.device('cuda' if T.cuda.is_available() else 'cpu')
print(f'Use {device} device')

Use cuda device


In [2]:
CACHE_DIR = '.'
TRAP_STATE = np.ones((3, 3), dtype=np.int)

In [3]:
class QMemory:
    def __init__(self, max_size, dims):
        self.count = 0
        self.max_size = max_size
        
        self.dones = np.zeros(max_size)
        self.actions = np.zeros(max_size)
        self.rewards = np.zeros(max_size)

        self.prev_states = np.zeros((max_size, *dims))
        self.next_states = np.zeros((max_size, *dims))
    
    def add(self, prev_state, action, next_state, reward, done):
        k = self.count % self.max_size
        self.count += 1
        
        self.dones[k] = done
        self.rewards[k] = reward
        self.actions[k] = action
        self.prev_states[k] = prev_state
        self.next_states[k] = next_state
            
    def sample(self, batch_size):
        siz = min(self.count, batch_size)
        idx = np.random.choice(siz, batch_size, replace=False)
        return self.prev_states[idx], self.actions[idx], self.next_states[idx], self.rewards[idx], self.dones[idx]  
    
class QEpsilon:
    def __init__(self, val=1, min=1e-4, dec=1e-5):
        self.val = val
        self.min = min
        self.dec = dec

    def decrease(self):
        self.val = max(self.val - self.dec, self.min)
        
    @property
    def value(self):
        return self.val
    

In [4]:
class QClassicAgent:
    def __init__(self, alpha=0.1, gamma=0.9):
        self.alpha = alpha
        self.gamma = gamma
        self.Q = np.random.rand(3 ** 9, 9)
        self.eps = QEpsilon(1, 0, 1e-4)

        self.Q[self.to_state(TRAP_STATE), :] = 0
        
    @staticmethod
    def to_state(board):
        return ((3 ** np.arange(9)) * (board.flatten() + 1)).sum()
    
    def __call__(self, board):
        if np.random.rand() < self.eps.value:
            return np.random.randint(9)
        s = self.to_state(board)
        a = self.Q[s, :].argmax()
        return int(a)
    
    def learn(self, prev_state, action, next_state, reward, done):
        a0 = action
        if done:
            next_state = TRAP_STATE
            
        s0 = self.to_state(prev_state)
        s1 = self.to_state(next_state)
        
            
        q_upd = reward + self.gamma * self.Q[s1, :].max()
        
        self.Q[s0, a0] = (1 - self.alpha)*self.Q[s0, a0] + self.alpha*q_upd        
        self.eps.decrease()

In [5]:
class DQNAgent:
    def __init__(self, dims=(3,3), n_actions=9, gamma=0.99, batch_size=64, mem_size=512, device=device, replace_rate=1000, eps=None):

        if eps is None:
            eps = QEpsilon(1, 1e-5, 5e-5)
        elif eps == 0:
            eps = QEpsilon(0, 0, 0)            
            
        self.device = device

        self.memory = QMemory(mem_size, dims)
        self.q_eval = self.create_q(dims, n_actions, device, True)
        self.q_next = self.create_q(dims, n_actions, device, False)
        self.action = np.arange(n_actions)

        self.gamma = gamma
        self.eps = eps
        self.batch_size = batch_size
        
        self.loss = nn.MSELoss()
        self.optimizer = optim.Adam(self.q_eval.parameters())
        
        self.batch_count = 0
        self.replace_rate = replace_rate
    
    @staticmethod
    def create_q(dims, n_actions, device, is_eval):
        q = nn.Sequential(
            nn.Conv2d(1, 64, 2),
            nn.Conv2d(64, 256, 2),
            nn.Flatten(),
            nn.Tanh(),
            nn.Linear(256, n_actions),
        ).to(device)
        if is_eval:
            q.eval()
        else:
            q.train()
        return q
    
    def save(self):
        T.save(self.q_eval.state_dict(), f'{CACHE_DIR}/eval.q')
        T.save(self.q_next.state_dict(), f'{CACHE_DIR}/next.q')
        
    def load(self):
        self.q_eval.load_state_dict(T.load(f'{CACHE_DIR}/eval.q'))
        self.q_next.load_state_dict(T.load(f'{CACHE_DIR}/next.q'))
        
    def tensor(self, array, dtype=T.float):
        return T.tensor(array, dtype=dtype, device=self.device)
    
    def __call__(self, board):
        if np.random.rand() <= self.eps.value:            
            action = np.random.choice(self.action)
        else:
            action = self.q_eval(self.tensor(board.reshape(1, 1, 3, 3))).argmax().item()
        return int(action)
    
    def learn(self, prev_state, action, next_state, reward, done):
        
        self.memory.add(prev_state, action, next_state, reward, done)
        if self.memory.count < self.batch_size:
            return

        
        self.batch_count += 1
        if self.batch_count >= self.replace_rate:
            self.batch_count = 0
            self.q_next.load_state_dict(self.q_eval.state_dict())
            
        prev_states, actions, next_states, rewards, dones = self.memory.sample(self.batch_size)
        
        batch_idx = np.arange(self.batch_size)
        q_vals = self.q_eval(self.tensor(prev_states.reshape(-1, 1, 3, 3)))
        q_pred = q_vals[batch_idx, actions]
        q_next = self.q_next(self.tensor(next_states.reshape(-1, 1, 3, 3))).max(dim=1)[0]
        
        q_next[dones] = 0.0
        q_targ = self.tensor(rewards) + self.gamma * q_next
        
        loss = self.loss(q_pred.flatten(), q_targ.flatten())
        
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        
        self.eps.decrease()


In [6]:
class QTrainer:
    def __init__(self, env, other="random"):
        self.trainer = env.train([None, other])

    @staticmethod
    def to_state(obs):
        board = np.array(obs['board']).reshape(3,3)
        board[board == 2] = -1
        return board
    
    def reset(self):
        return self.to_state(self.trainer.reset())

    def step(self, action):
        state, reward, done, info = self.trainer.step(int(action))
        state = self.to_state(state)
        if done:
            state = TRAP_STATE
        return state, reward, done

    def train(self, agent, n_games= 50000, print_rate=1000):
        wins, loose, draw, invalid = 0, 0, 0, 0

        for game_no in range(n_games):
            done = False
            prev_state = trainer.reset()
            while not done: 
                action = agent(prev_state)
                next_state, reward, done = self.step(action)

                if reward == 1:
                    wins += 1
                elif reward == -1:
                    loose += 1
                elif reward == 0:
                    draw += 1
                else:
                    invalid += 1
                    reward = -1

                agent.learn(prev_state, action, next_state, reward, done)
                prev_state = next_state

            if game_no % 1000 == 999:
                print(f'Game {game_no + 1}. Wins: {wins}, Loose: {loose}, Draw: {draw}, Invalid Moves: {invalid}, Eps: {agent.eps.value}')
                wins, loose, draw, invalid = (0, 0, 0, 0)
    

In [7]:
env = make('tictactoe')
trainer = QTrainer(env)
trainer.train(QClassicAgent())

Game 1000. Wins: 79, Loose: 74, Draw: 2247, Invalid Moves: 844, Eps: 0.6756000000000357
Game 2000. Wins: 123, Loose: 70, Draw: 2400, Invalid Moves: 804, Eps: 0.33590000000007314
Game 3000. Wins: 186, Loose: 100, Draw: 2668, Invalid Moves: 708, Eps: 0
Game 4000. Wins: 295, Loose: 113, Draw: 2840, Invalid Moves: 578, Eps: 0
Game 5000. Wins: 331, Loose: 88, Draw: 2942, Invalid Moves: 561, Eps: 0
Game 6000. Wins: 421, Loose: 79, Draw: 2912, Invalid Moves: 481, Eps: 0
Game 7000. Wins: 487, Loose: 77, Draw: 2870, Invalid Moves: 415, Eps: 0
Game 8000. Wins: 566, Loose: 68, Draw: 2820, Invalid Moves: 343, Eps: 0
Game 9000. Wins: 659, Loose: 53, Draw: 2763, Invalid Moves: 272, Eps: 0
Game 10000. Wins: 804, Loose: 20, Draw: 2652, Invalid Moves: 159, Eps: 0
Game 11000. Wins: 824, Loose: 15, Draw: 2689, Invalid Moves: 142, Eps: 0
Game 12000. Wins: 843, Loose: 12, Draw: 2667, Invalid Moves: 130, Eps: 0
Game 13000. Wins: 889, Loose: 9, Draw: 2659, Invalid Moves: 90, Eps: 0
Game 14000. Wins: 925, Loo

In [8]:
dqn_agent = DQNAgent(batch_size=256, mem_size=1024*10, replace_rate=400)
trainer.train(dqn_agent)

Game 1000. Wins: 97, Loose: 67, Draw: 2196, Invalid Moves: 830, Eps: 0.8532500000000162
Game 2000. Wins: 84, Loose: 60, Draw: 2164, Invalid Moves: 855, Eps: 0.6951000000000336
Game 3000. Wins: 64, Loose: 47, Draw: 1970, Invalid Moves: 889, Eps: 0.5466000000000499
Game 4000. Wins: 92, Loose: 50, Draw: 2091, Invalid Moves: 857, Eps: 0.39210000000006695
Game 5000. Wins: 103, Loose: 43, Draw: 2066, Invalid Moves: 853, Eps: 0.23885000000008383
Game 6000. Wins: 99, Loose: 46, Draw: 1999, Invalid Moves: 854, Eps: 0.08895000000010034
Game 7000. Wins: 190, Loose: 44, Draw: 2167, Invalid Moves: 766, Eps: 1e-05
Game 8000. Wins: 248, Loose: 58, Draw: 2242, Invalid Moves: 694, Eps: 1e-05
Game 9000. Wins: 234, Loose: 52, Draw: 2182, Invalid Moves: 712, Eps: 1e-05
Game 10000. Wins: 297, Loose: 51, Draw: 2273, Invalid Moves: 651, Eps: 1e-05
Game 11000. Wins: 194, Loose: 27, Draw: 2208, Invalid Moves: 779, Eps: 1e-05
Game 12000. Wins: 75, Loose: 32, Draw: 1896, Invalid Moves: 893, Eps: 1e-05
Game 13000