# TicTacToe Game & Tabular Q Learning

In [23]:
import numpy as np
import matplotlib.pyplot as plt 

rng = np.random.default_rng(2021)  # random number generator

In [80]:
class EnvTTT():
    def __init__(self,):
        self.reset()
        
    def reset(self,):
        self.board = '---------'
        self.state = self.board
        self.winner = None 

        return self.state 
        
    def step(self, action, ox):
        """ put X/O at the location specified by action
            action: a number 0 ~ 9 
            return: state, reward, done, info """
        done = False 
        reward = 0
        info = {}
        if self.state[action] != '-':
            done = True 
            info = {'state' : self.state, 'code': 'bad action: occupied' }
        else:
            self.state = self.state[:action] + ox + self.state[action+1:]

        if self.game_over() == True:
            if self.winner == 'D':
                reward = 0
            else:
                reward = 1
            done = True

        return self.state, reward, done, info 

    def game_over(self,):            
        # Each list corresponds to the values to check to see if a winner is there
        checks = [[0, 1, 2], [3, 4, 5], [6, 7, 8], [0, 3, 6], [1, 4, 7] , [2, 5, 8], [0, 4, 8], [2, 4, 6]]

        for check in checks:
            # Check to see if the strings have a winner
            test = self.state[check[0]] + self.state[check[1]] + self.state[check[2]]
            if test == 'XXX':
                self.winner = 'X'
                # print('>>>', test, self.winner)
                return True
            elif test == 'OOO':
                self.winner = 'O'
                # print('>>>', test, self.winner)
                return True

        if '-' not in self.state:
            """ draw """
            self.winner = 'D'  # draw
            return True 
        return False

    def render(self,):
        for i in range(0, 9, 3):
            print(self.state[i:i+3])
    

In [116]:
class QAgent():
    def __init__(self, ox):
        self.ox = ox 
        # { state: [q1, q2, ..., q9] } where q_n = -inf if action n is impossible.
        # use self.get_Qvalues(state) interface. Do not try to access self.Q directly, because of initialization
        self.Q = {}  
        self.qval_init = .5
        #
        self.alpha = 0.05
        self.gamma = 1.  # discount factor 
        pass

    def get_random_action(self, state):
        possible = [i for i, s in enumerate(state) if s == '-']
        chosen = np.random.choice(possible)
        return chosen 

    def get_action(self, state, random=False):
        if random:
            return self.get_random_action(state)
        #
        # Get the q-values for the state
        q_vals = self.get_Qvalues(state)

        # Get location of all max values and select a random one
        max_q = np.round(max(q_vals), 7)
        action_candidates = [i for i, qsa in enumerate(q_vals) if qsa == max_q]
        print('action_cand: ', action_candidates, q_vals, max_q)
        action = np.random.choice(action_candidates)
        return action  # the  place to put 'O' or 'X"

    def get_Qvalues(self, state):
        if state not in self.Q.keys():
            values = np.array([self.qval_init if ox == '-' else float('-inf') for ox in state])
            self.Q[state] = values  # register a new action-values
        return self.Q[state]

    def q_update(self, state, action, reward, new_state, done):
        qsa = self.Q[state][action]  # Q(S,A)
        pred = reward
        if not done:
            max_a = max(self.get_Qvalues(new_state))
            pred += self.gamma * max_a
        #
        self.Q[state][action] += self.alpha * (pred - qsa)   # Q-learning or TD(0)

In [117]:
env = EnvTTT()

## Test running of the environment

In [118]:
def run(agents, update=False):
    done = False
    state = env.reset()
    print('initial state: ', state)
    while not done:
        action = agents[0].get_random_action(state)
        state, reward, done, info = env.step(action, agents[0].ox)
        print(f'Turn: {agents[0].ox}, A: {action}, S: {state}, R: {reward}')
        # env.render()

        if done: 
            break 

        action = agents[1].get_random_action(state)
        state, reward, done, info = env.step(action, agents[1].ox)
        print(f'Turn: {agents[1].ox}, A: {action}, S: {state}, R: {reward}')
        # env.render()
        
    print(f'Winner: {env.winner}')
    env.render()

In [119]:
# we need two agents to play the game
agents = [QAgent('X'), QAgent('O')]

run(agents)

initial state:  ---------
Turn: X, A: 5, S: -----X---, R: 0
Turn: O, A: 4, S: ----OX---, R: 0
Turn: X, A: 3, S: ---XOX---, R: 0
Turn: O, A: 8, S: ---XOX--O, R: 0
Turn: X, A: 0, S: X--XOX--O, R: 0
Turn: O, A: 1, S: XO-XOX--O, R: 0
Turn: X, A: 7, S: XO-XOX-XO, R: 0
Turn: O, A: 2, S: XOOXOX-XO, R: 0
Turn: X, A: 6, S: XOOXOXXXO, R: 1
Winner: X
XOO
XOX
XXO


In [120]:
# without learning, Q table is empty
agents[0].Q

{}

In [121]:
def run_X(agents):
    done = False
    state = env.reset()
    print('initial state: ', state)
    while not done:
        action = agents[0].get_action(state)
        new_state, reward, done, info = env.step(action, agents[0].ox)
        print(f'Turn: {agents[0].ox}, A: {action}, S: {state}, R: {reward}')
        # env.render()

        # agents[1].q_update(state, action, reward, new_state, done)

        if done: 
            agents[0].q_update(state, action, reward, new_state, done)  # if game is over (win), then update
            state = new_state
            break 

        action = agents[1].get_random_action(state)
        new_state, reward, done, info = env.step(action, agents[1].ox)
        print(f'Turn: {agents[1].ox}, A: {action}, S: {state}, R: {reward}')
        # env.render()

        # Now, the opponant finished its move. Update.
        agents[0].q_update(state, action, reward, new_state, done)

        state = new_state 

    print(f'Winner: {env.winner}')
    env.render()

In [122]:
run_X(agents)

initial state:  ---------
action_cand:  [0, 1, 2, 3, 4, 5, 6, 7, 8] [0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5] 0.5
Turn: X, A: 0, S: ---------, R: 0
Turn: O, A: 0, S: ---------, R: 0
Winner: None
X--
---
---


In [123]:
from collections import defaultdict
import numpy as np 

q = defaultdict(lambda : np.ones((9,))*.5)

In [125]:
q['hello']

array([0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5])

In [126]:
q

defaultdict(<function __main__.<lambda>()>,
            {'hello': array([0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5])})

In [131]:
a = np.array(['X', 'X', 'O'])

b = a == 'X'
b

array([ True,  True, False])