In [25]:
import numpy as np
import random
from collections import defaultdict

In [26]:
# b. Defining the Tic-Tac-Toe game class
class TicTacToeGame:
    """Tabular Q-learning driver for a Tic-Tac-Toe environment.

    Owns a ``TicTacToeEnv`` instance (defined outside this class) and a Q-table
    keyed by ``(state, action)``, where ``state`` is the flattened board as a
    tuple.

    The methods below rely on the environment exposing ``board`` (an object
    with ``flatten()``), ``reset()``, ``available_actions()`` and
    ``take_action(action)`` returning ``(next_board, reward, done)``.

    NOTE(review): one Q-table drives every move of an episode. If the
    environment alternates players between moves, confirm that its reward
    sign convention is compatible with that -- TODO verify against
    TicTacToeEnv.
    """

    def __init__(self):
        """Create a fresh environment and an empty Q-table (missing entries read as 0.0)."""
        self.env = TicTacToeEnv()
        self.q_table = defaultdict(float)

    def get_state(self):
        """Return the environment's current board as a hashable, flattened tuple."""
        return tuple(self.env.board.flatten())

    def choose_action(self, epsilon=0.1):
        """Pick an action for the current board with an epsilon-greedy policy.

        Args:
            epsilon: Chance of returning a uniformly random available action
                instead of the greedy one. ``0`` makes the choice fully greedy.

        Returns:
            One of ``self.env.available_actions()``, or ``None`` when that list
            is empty. Ties between equal Q-values go to the earliest action.
        """
        state = self.get_state()
        available_actions = self.env.available_actions()
        if not available_actions:
            return None  # No actions available
        if random.uniform(0, 1) < epsilon:
            return random.choice(available_actions)
        # Read with .get so that merely inspecting a state does not insert
        # zero-valued entries into the Q-table.
        return max(
            available_actions,
            key=lambda action: self.q_table.get((state, action), 0.0),
        )

    def _best_value(self, state):
        """Return the highest Q-value of ``state`` over the currently available actions (0.0 if none)."""
        actions = self.env.available_actions()
        if not actions:
            return 0.0
        return max(self.q_table.get((state, action), 0.0) for action in actions)

    def train(self, episodes=1000, alpha=0.1, gamma=0.9, epsilon=0.1):
        """Run Q-learning episodes against the environment, updating ``q_table`` in place.

        Args:
            episodes: Number of games to play.
            alpha: Learning rate applied to each temporal-difference error.
            gamma: Discount factor applied to the successor state's best value.
            epsilon: Exploration rate passed to ``choose_action``.
        """
        for _ in range(episodes):
            self.env.reset()
            state = self.get_state()
            done = False
            while not done:
                action = self.choose_action(epsilon)
                if action is None:
                    break  # No actions available, end the episode
                next_board, reward, done = self.env.take_action(action)
                next_state = tuple(next_board.flatten())
                if done:
                    # Terminal transition: there is no future return to
                    # bootstrap from, so the target is the reward alone.
                    td_target = reward
                else:
                    td_target = reward + gamma * self._best_value(next_state)
                key = (state, action)
                current = self.q_table.get(key, 0.0)
                self.q_table[key] = current + alpha * (td_target - current)
                state = next_state

    def test(self, episodes=10):
        """Play greedy games, printing the board after every move and the result.

        The result line is chosen from the last reward returned by the
        environment: ``1`` prints "X wins!", ``-1`` prints "O wins!", anything
        else prints "It's a draw!".

        Args:
            episodes: Number of games to play and print.
        """
        for episode in range(episodes):
            self.env.reset()
            done = False
            reward = 0  # reported as a draw if the game stops before any move
            print(f"Game {episode + 1}")
            while not done:
                action = self.choose_action(epsilon=0)  # No exploration during testing
                if action is None:
                    # Same guard as in train(): never hand the environment a
                    # missing action.
                    break
                _, reward, done = self.env.take_action(action)
                print(self.env.board)
                print()
            if reward == 1:
                print("X wins!")
            elif reward == -1:
                print("O wins!")
            else:
                print("It's a draw!")

In [28]:
if __name__ == "__main__":
    # Seed the `random` module, which choose_action draws from for exploration,
    # so that a fresh-kernel re-run trains the agent the same way each time.
    random.seed(42)

    game = TicTacToeGame()
    game.train(episodes=1000)
    game.test(episodes=3)

Game 1
[[1 0 0]
 [0 0 0]
 [0 0 0]]

[[ 1 -1  0]
 [ 0  0  0]
 [ 0  0  0]]

[[ 1 -1  1]
 [ 0  0  0]
 [ 0  0  0]]

[[ 1 -1  1]
 [-1  0  0]
 [ 0  0  0]]

[[ 1 -1  1]
 [-1  1  0]
 [ 0  0  0]]

[[ 1 -1  1]
 [-1  1 -1]
 [ 0  0  0]]

[[ 1 -1  1]
 [-1  1 -1]
 [ 1  0  0]]

X wins!
Game 2
[[1 0 0]
 [0 0 0]
 [0 0 0]]

[[ 1 -1  0]
 [ 0  0  0]
 [ 0  0  0]]

[[ 1 -1  1]
 [ 0  0  0]
 [ 0  0  0]]

[[ 1 -1  1]
 [-1  0  0]
 [ 0  0  0]]

[[ 1 -1  1]
 [-1  1  0]
 [ 0  0  0]]

[[ 1 -1  1]
 [-1  1 -1]
 [ 0  0  0]]

[[ 1 -1  1]
 [-1  1 -1]
 [ 1  0  0]]

X wins!
Game 3
[[1 0 0]
 [0 0 0]
 [0 0 0]]

[[ 1 -1  0]
 [ 0  0  0]
 [ 0  0  0]]

[[ 1 -1  1]
 [ 0  0  0]
 [ 0  0  0]]

[[ 1 -1  1]
 [-1  0  0]
 [ 0  0  0]]

[[ 1 -1  1]
 [-1  1  0]
 [ 0  0  0]]

[[ 1 -1  1]
 [-1  1 -1]
 [ 0  0  0]]

[[ 1 -1  1]
 [-1  1 -1]
 [ 1  0  0]]

X wins!
