In [None]:
import random
import numpy as np

class TicTacToe:
    """A 3x3 tic-tac-toe game for two players encoded as 1 and -1.

    Cells are addressed by a single index 0-8 in row-major order
    (``row, col = divmod(action, 3)``).  An empty cell holds 0.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self):
        # 3x3 float array: 0 = empty, 1 = player 1, -1 = player 2
        self.gameBoard = np.zeros((3, 3))
        # 1 for agent 1, -1 for agent 2; player 1 always moves first
        self.player = 1
        # None until somebody wins; stays None for a drawn game
        self.gameWinner = None
        # True once there is a winner or the board is full
        self.isGameDone = False

    def reset_game_board(self):
        """Reset the board to an empty grid with player 1 to move."""
        self.gameBoard = np.zeros((3, 3))
        self.player, self.gameWinner, self.isGameDone = 1, None, False

    def get_flattened_state(self):
        """Return the board as a hashable 9-tuple in row-major order."""
        return tuple(self.gameBoard.flatten())

    def is_valid_move(self, action):
        """Return True if ``action`` is in 0-8 and that cell is empty."""
        if not 0 <= action < 9:
            return False
        row, col = divmod(action, 3)
        return bool(self.gameBoard[row, col] == 0)

    def make_board_move(self, action):
        """Play ``action`` for the current player and advance the turn.

        Returns:
            False (and leaves the game untouched) if the move is invalid or
            the game is already over; True once the move has been applied.
        """
        # Refuse moves on a finished game: otherwise a decided board could
        # still be mutated and the recorded winner overwritten.
        if self.isGameDone or not self.is_valid_move(action):
            return False

        self._apply_move(action)
        # The winner check must run before the turn flips, because it only
        # looks for lines owned by the player who just moved.
        self.check_game_winner()
        self.player *= -1

        # if there is a winner or the game board is full, the game is over
        if self.gameWinner is not None or not np.any(self.gameBoard == 0):
            self.isGameDone = True

        return True

    def _apply_move(self, action):
        """Write the current player's mark into cell ``action`` (no checks)."""
        row, col = divmod(action, 3)
        self.gameBoard[row, col] = self.player

    def check_game_winner(self):
        """Set ``gameWinner`` to the current player if they own a full line.

        Only the current player's marks are examined; ``gameWinner`` is left
        unchanged when no row, column or diagonal is complete.
        """
        board = self.gameBoard
        mover = self.player

        lines = [board[r, :] for r in range(3)]       # rows
        lines += [board[:, c] for c in range(3)]      # columns
        lines.append(np.diag(board))                  # main diagonal
        lines.append(np.diag(np.fliplr(board)))       # anti-diagonal

        if any(np.all(line == mover) for line in lines):
            self.gameWinner = mover

    def display_board(self):
        """Print the board using X for 1, O for -1 and a blank for empty."""
        border = "-------------"

        for i, row in enumerate(self.gameBoard):
            print(" " + " | ".join(["X" if cell == 1 else "O" if cell == -1 else " " for cell in row]))
            if i < 2:
                print(border)

        print()

class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy move policy.

    Q-values live in ``qtable``, a dict keyed by ``(state, action)`` where
    ``state`` is the flattened board tuple and ``action`` a cell index.
    Pairs that have never been updated read as 0.0.
    """

    def __init__(self, player, ee_threshold=0.1, learn_rate=0.1, decay_rate=0.9):
        self.qtable = {}
        self.ee_threshold = ee_threshold  # Explore-exploit threshold (probability of a random move)
        self.learn_rate = learn_rate  # Learning rate
        self.decay_rate = decay_rate  # Decay rate applied to the best follow-up value
        self.player = player  # mark this agent plays (1 or -1)

    def return_q_value(self, state, action):
        """Return the stored Q-value for ``(state, action)``, or 0.0 if unseen."""
        return self.qtable.get((state, action), 0.0)

    def pick_next_action(self, state, available_moves):
        """Choose a move from ``available_moves``.

        With probability ``ee_threshold`` a uniformly random move is explored;
        otherwise the move with the highest Q-value is exploited (the first
        such move wins ties, as ``np.argmax`` returns the first maximum).
        """
        if np.random.rand() < self.ee_threshold:
            return np.random.choice(available_moves)

        q_values = [self.return_q_value(state, action) for action in available_moves]
        return available_moves[np.argmax(q_values)]

    def update_q_value(self, state, action, reward, next_state):
        """Apply one Q-learning update for taking ``action`` in ``state``.

        Args:
            state: flattened board before the move.
            action: cell index that was played.
            reward: reward observed for the move; a reward of exactly 0 is
                replaced by 0.8 (the tie reward).
            next_state: flattened board after the move; entries equal to 0
                are treated as the open cells.
        """
        if reward == 0:  # Update the reward for a tie - reward for tie is 0.8
            reward = 0.8

        # Bootstrap only from moves that are legal in next_state.  Occupied
        # cells are never played from that state, so their entries stay at the
        # 0.0 default and would wrongly floor the maximum at 0 whenever every
        # legal follow-up has a negative value.
        open_cells = [cell for cell, mark in enumerate(next_state) if mark == 0]
        best_next = max(
            (self.return_q_value(next_state, move) for move in open_cells),
            default=0.0,  # full board: nothing left to bootstrap from
        )

        current_qvalue = self.return_q_value(state, action)
        # move the current estimate towards reward + discounted best follow-up
        target = reward + self.decay_rate * best_next
        self.qtable[(state, action)] = current_qvalue + self.learn_rate * (target - current_qvalue)

class RandomAgent:
    """Baseline opponent that plays uniformly at random."""

    def __init__(self, player):
        # mark this agent plays on the board
        self.player = player

    def pick_next_action(self, state, available_moves):
        """Return a random entry of ``available_moves``; ``state`` is ignored."""
        chosen_move = random.choice(available_moves)
        return chosen_move

def train_agent(agent, episodes=10000):
    """Train ``agent`` for ``episodes`` games and print a win/draw/loss tally.

    The same agent chooses the move on every turn (for both marks), and its
    Q-table is updated after each move.  The reward passed to the update is
    1 when player 1 has won, -1 when player -1 has won, and 0.8 otherwise
    (which includes every move that does not end the game with a winner).

    Args:
        agent: object exposing ``pick_next_action(state, available_moves)``
            and ``update_q_value(state, action, reward, next_state)``.
        episodes: number of games to play.
    """
    env = TicTacToe()
    gameWin = 0
    gameTie = 0
    gameLost = 0

    for _ in range(episodes):
        env.reset_game_board()
        state = env.get_flattened_state()

        while not env.isGameDone:
            # get the possible move choices
            available_moves = get_available_moves(env)
            action = agent.pick_next_action(state, available_moves)

            # make the next move
            env.make_board_move(action)
            next_state = env.get_flattened_state()

            if env.gameWinner == 1:
                reward = 1
            elif env.gameWinner == -1:
                reward = -1
            else:
                reward = 0.8

            # update the q table with the current action, reward, and next state
            agent.update_q_value(state, action, reward, next_state)

            state = next_state

        # A finished game with no winner is a draw.  gameWinner stays None in
        # that case, so comparing it against 0 would never count a draw.
        if env.gameWinner == 1:
            gameWin += 1
        elif env.gameWinner == -1:
            gameLost += 1
        else:
            gameTie += 1

    display_training_results(gameWin, gameTie, gameLost)


def display_training_results(gameWin, gameTie, gameLost):
    """Print the win, draw and loss totals collected during training."""
    report_lines = [
        "=== Training Results ===",
        f"Total Wins: {gameWin}",
        f"Total Draws: {gameTie}",
        f"Total Losses: {gameLost}",
        "=======================",
    ]
    for line in report_lines:
        print(line)

def run_agents(agent1, agent2, episodes=100):
    """Play ``episodes`` games between two agents and print the outcome tally.

    ``agent1`` moves whenever the board's current player is 1 and ``agent2``
    otherwise.  A fresh board is created for every game.
    """
    player1_wins, player2_wins, ties = 0, 0, 0

    for _ in range(episodes):
        env = TicTacToe()
        state = env.get_flattened_state()

        # keep alternating turns until the board reports the game is over
        while not env.isGameDone:
            mover = agent1 if env.player == 1 else agent2
            options = get_available_moves(env)
            env.make_board_move(mover.pick_next_action(state, options))
            state = env.get_flattened_state()

        winner = env.gameWinner
        if winner == 1:
            player1_wins += 1
        elif winner == -1:
            player2_wins += 1
        else:
            ties += 1

    display_results(player1_wins, player2_wins, ties, episodes)

def get_available_moves(env):
    """Return the cell indices in 0-8 that ``env.is_valid_move`` accepts."""
    open_cells = []
    for cell in range(9):
        if env.is_valid_move(cell):
            open_cells.append(cell)
    return open_cells


def display_results(player1_wins, player2_wins, ties, episodes):
    """Print win and tie counts together with their share of ``episodes``.

    The share is printed as a fraction of ``episodes`` (e.g. 0.75), not
    multiplied by 100.  When ``episodes`` is 0 every share is reported as
    0.0 instead of raising ``ZeroDivisionError``.
    """
    def share(count):
        # guard against an empty run so reporting never crashes
        return count / episodes if episodes else 0.0

    print("Results:")
    print(f"Agent 1 wins: {player1_wins}, Agent 1 win %: {share(player1_wins)}")
    print(f"Agent 2 wins: {player2_wins}, Agent 2 win %: {share(player2_wins)}")
    print(f"Ties: {ties}, Tie %: {share(ties)}")


def compete_with_agent(agent):
    """Let a human (X, player 1) play console games against ``agent`` (O).

    The user types a cell index 0-8 on their turn; invalid or non-numeric
    input is rejected and asked for again.  After each game the result is
    printed and the user is asked whether to play another round.
    """
    env = TicTacToe()
    outcome_text = {1: "User wins!", -1: "User lost!"}

    keep_playing = True
    while keep_playing:
        # start a new game
        env.reset_game_board()
        state = env.get_flattened_state()

        while not env.isGameDone:
            # show the current game state before every turn
            env.display_board()

            if env.player != 1:
                # the q-learning agent's turn to play
                options = get_available_moves(env)
                env.make_board_move(agent.pick_next_action(state, options))
            else:
                # player = 1 is the user
                print("User's (X) turn. Pick any number 0 through 8:")
                move_made = False
                while not move_made:
                    raw_text = input()
                    try:
                        chosen = int(raw_text)
                    except ValueError:
                        print("Invalid input. Pick any number 0 through 8.")
                        continue

                    if env.is_valid_move(chosen):
                        env.make_board_move(chosen)
                        move_made = True
                    else:
                        print("Invalid input. Pick any number 0 through 8.")

            state = env.get_flattened_state()

        # display the end result of the game
        env.display_board()
        print(outcome_text.get(env.gameWinner, "It's a tie!"))

        print("Want to play another round? (Y/N)")
        keep_playing = input().upper() == 'Y'

In [None]:
def main():
    """Train and evaluate a Q-learning agent over two hyperparameter sweeps.

    Each sweep is a grid of (iterations, learn rate, decay rate,
    explore-exploit threshold).  For every combination a fresh Q-learning
    agent is trained and then tested against a random-move opponent.
    """
    sweeps = [
        # full grid over all four hyperparameters
        ([1000, 10000], [.1, .2], [.8, .7], [0.2, 0.3]),
        # fixed hyperparameters, varying only the number of training games
        ([1000, 2000, 5000, 10000, 15000, 20000], [.1], [.8], [0.2]),
    ]

    for iterations, learn_rate, decay_rate, explore_exploit in sweeps:
        for i in iterations:
            for l in learn_rate:
                for d in decay_rate:
                    for e in explore_exploit:
                        print(f"Iterations: {i}, Learn Rate: {l}, Decay Rate: {d}, Explore-Exploit Threshold: {e}")
                        # agent is trained and tested against an agent that makes random choices among available spots
                        agent1 = QLearningAgent(player=1, ee_threshold=e, learn_rate=l, decay_rate=d)
                        agent2 = RandomAgent(player=-1)

                        train_agent(agent1, episodes=i)

                        run_agents(agent1, agent2)
                        print()

    # UNCOMMENT TO PLAY AGAINST THE Q-LEARNING AGENT
    # compete_with_agent(agent1)

# Entry point: run the training/evaluation sweeps defined in main().
main()

Iterations: 1000, Learn Rate: 0.1, Decay Rate: 0.8, Explore-Exploit Threshold: 0.2
=== Training Results ===
Total Wins: 761
Total Draws: 0
Total Losses: 70
Results:
Agent 1 wins: 75, Agent 1 win %: 0.75
Agent 2 wins: 13, Agent 2 win %: 0.13
Ties: 12, Tie %: 0.12

Iterations: 1000, Learn Rate: 0.1, Decay Rate: 0.8, Explore-Exploit Threshold: 0.3
=== Training Results ===
Total Wins: 696
Total Draws: 0
Total Losses: 144
Results:
Agent 1 wins: 65, Agent 1 win %: 0.65
Agent 2 wins: 26, Agent 2 win %: 0.26
Ties: 9, Tie %: 0.09

Iterations: 1000, Learn Rate: 0.1, Decay Rate: 0.7, Explore-Exploit Threshold: 0.2
=== Training Results ===
Total Wins: 743
Total Draws: 0
Total Losses: 98
Results:
Agent 1 wins: 74, Agent 1 win %: 0.74
Agent 2 wins: 20, Agent 2 win %: 0.2
Ties: 6, Tie %: 0.06

Iterations: 1000, Learn Rate: 0.1, Decay Rate: 0.7, Explore-Exploit Threshold: 0.3
=== Training Results ===
Total Wins: 751
Total Draws: 0
Total Losses: 147
Results:
Agent 1 wins: 66, Agent 1 win %: 0.66
Agent 