In [117]:
from tictactoe_env import TicTacToe
import numpy as np
from tqdm.notebook import tqdm
from collections import defaultdict
from itertools import product

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
class RandomPlayer:
    def strategy(self, state):
        possible_steps = state[1]
        return possible_steps[np.random.randint(possible_steps.shape[0])]

In [95]:
class TicTacToeGame:
    def __init__(self, env):
        self.env = env

    def run_episode(self, player_x, player_o, return_history=False):
        state, _, is_done, _ = self.env.reset()
        states_x, states_o = [], []
        rewards_x, rewards_o = [], []
        players = [player_x, player_o]
        cur_player = 0
        while not is_done:
            step = players[cur_player].strategy(state)
            state, reward, is_done, _ = self.env.step(step)
            if cur_player == 0:
                rewards_x.append(reward)
                states_x.append(state)
            else:
                rewards_o.append(-reward)
                states_o.append(state)
            cur_player = (cur_player + 1) % 2
        if (len(rewards_x) > len(rewards_o)):
            if (rewards_x[-1] == 1):
                rewards_o.append(-1)
            else:
                rewards_o.append(0)
        if (rewards_o[-1] == 1):
            rewards_x[-1] = -1
        if return_history:
            return (states_x, rewards_x), (states_o, rewards_o)
        return rewards_x[-1], rewards_o[-1]

    def check_mean_reward(self, player_x, player_o, n_iter):
        rewards_x = []
        for _ in tqdm(range(n_iter)):
            reward_x, _ = self.run_episode(player_x, player_o)
            rewards_x.append(reward_x)
        return np.mean(rewards_x)


In [96]:
env = TicTacToe()
game = TicTacToeGame(env)

In [97]:
player_x = RandomPlayer()
player_o = RandomPlayer()

game.run_episode(player_x, player_o)

(1, -1)

In [100]:
game.check_mean_reward(player_x, player_o, n_iter=10000)

  0%|          | 0/10000 [00:00<?, ?it/s]

0.2974

In [125]:
class QLearningPlayer:
    def __init__(self, eps, alpha, gamma, n_rows=3, n_cols=3):
        self.eps = eps
        self.alpha = alpha
        self.gamma = gamma
        self.q_table = defaultdict(lambda: np.zeros(n_rows * n_cols))
        self.actions = list(product(range(n_rows), range(n_cols)))
        self.set_training()
    
    def set_training(self):
        self.training = True

    def set_evaluating(self):
        self.training = False

    def update_q(self):
        pass

    def strategy(self, state):
        state_hash = state[0]
        coin = np.random.rand()
        greedy_action = self.greedy_step(state_hash)
        random_action = self.random_step()
        if self.training:
            coin = np.random.rand() < self.eps
            if coin:
                action = random_action
            else:
                action = greedy_action
        else:
            action = greedy_action
        return action
    
    def greedy_step(self, state_hash):
        return self.actions[self.q_table[state_hash].argmax()]
    
    def random_step(self):
        return self.actions[np.random.randint(0, len(self.actions))]

In [126]:
qplayer = QLearningPlayer(1, 0, 0)

In [128]:
game.check_mean_reward(qplayer, qplayer, 100000)

  0%|          | 0/100000 [00:00<?, ?it/s]

-4.63266

In [162]:
game.run_episode(qplayer, qplayer)

(-10, 0)