This Tic-Tac-Toe program uses Q-learning, a reinforcement learning algorithm, to train an AI agent to play against a human player. The game environment is encapsulated in the TicTacToe class, which manages the state of the board, reports the available actions, and determines whether a player has won or the game is tied. The QLearningAgent class implements the Q-learning algorithm, with which the agent learns to select the best move for the current state of the game. It maintains a Q-table that stores a value for each state-action pair and updates that value with the Bellman equation after each move. The agent chooses each move by either exploration (a random move) or exploitation (the best-known move). Play proceeds with the AI making its move, followed by the player's move, and the board is updated after each action. The draw_board function displays the board in a clean, user-friendly format. The AI is trained for 1,000 episodes of gameplay, learning from its experience before it plays against the human player. The game continues until either the player or the AI wins, or a tie occurs. The main flow of the game alternates between the AI, which moves according to its learned strategy, and the human player, who enters each move manually.

In [2]:
import random
import numpy as np

# Define the Tic-Tac-Toe environment
class TicTacToe:
    """Tic-Tac-Toe environment on a flat, 9-cell board.

    Cells are indexed 0-8 in row-major order and hold " " (empty), "X" or "O".

    Attributes are documented inline in ``__init__``.
    """

    # Every triple of cell indices that forms a winning line.
    WIN_POSITIONS = (
        (0, 1, 2), (3, 4, 5), (6, 7, 8),  # Rows
        (0, 3, 6), (1, 4, 7), (2, 5, 8),  # Columns
        (0, 4, 8), (2, 4, 6),             # Diagonals
    )

    def __init__(self):
        self.board = [" " for _ in range(9)]  # Empty board
        self.done = False    # True once somebody has won or the board is full
        self.winner = None   # "X", "O", "Tie", or None while the game is running

    def reset(self):
        """Clear the board and game status; return the (live) board list."""
        self.board = [" " for _ in range(9)]
        self.done = False
        self.winner = None
        return self.board

    def available_actions(self):
        """Return the indices of all empty cells, in ascending order."""
        return [i for i, x in enumerate(self.board) if x == " "]

    def take_action(self, action, player):
        """Place ``player``'s mark on cell ``action`` if the move is legal.

        A move is rejected (nothing changes) when the game is already over,
        when ``action`` is outside 0-8, or when the cell is occupied.

        Args:
            action: Integer index of the target cell.
            player: Mark to place, "X" or "O".

        Returns:
            True if the mark was placed, False if the move was rejected.
        """
        # A finished game must stay finished; otherwise a late move could
        # overwrite the recorded winner.
        if self.done:
            return False
        # Explicit range check: a negative index would otherwise wrap around
        # to a different cell, and a large one would raise IndexError.
        if not 0 <= action < len(self.board):
            return False
        if self.board[action] != " ":
            return False

        self.board[action] = player
        if self.check_winner(player):
            self.done = True
            self.winner = player
        elif " " not in self.board:
            self.done = True
            self.winner = "Tie"
        return True

    def check_winner(self, player):
        """Return True if ``player`` occupies all three cells of any line."""
        return any(
            self.board[a] == self.board[b] == self.board[c] == player
            for a, b, c in self.WIN_POSITIONS
        )

    def get_reward(self):
        """Return the reward from the AI's ("X") point of view.

        Returns:
            1 if "X" has won, -1 if "O" has won, and 0 for a tie or a game
            that is still in progress.
        """
        if self.winner == "X":
            return 1  # AI wins
        if self.winner == "O":
            return -1  # Player wins
        return 0  # Tie, or game not finished yet

# Define the Q-Learning agent
class QLearningAgent:
    """Tabular Q-learning agent that plays Tic-Tac-Toe as "X".

    Q-values are kept in a dict keyed by ``(board_as_tuple, action)``;
    pairs that have not been seen yet are treated as 0.
    """

    def __init__(self, epsilon=0.1, alpha=0.5, gamma=0.9):
        self.q_table = {}  # Q-values will be stored here
        self.epsilon = epsilon  # Exploration rate
        self.alpha = alpha  # Learning rate
        self.gamma = gamma  # Discount factor

    def get_q_value(self, state, action):
        """Return Q(state, action), initializing the entry to 0 if unseen.

        ``state`` may be any sequence of cells; it is converted to a tuple
        so it can serve as a dictionary key.
        """
        return self.q_table.setdefault((tuple(state), action), 0)

    def update_q_value(self, state, action, reward, next_state, next_action=None):
        """Apply one Q-learning update for a single transition.

        Q(s, a) <- Q(s, a) + alpha * (reward + gamma * max_a' Q(s', a') - Q(s, a))

        Args:
            state: Board before the agent moved.
            action: Cell index the agent played.
            reward: Reward observed for the transition.
            next_state: Board the agent will face on its next turn.
            next_action: Ignored. Q-learning is off-policy and bootstraps
                from the best next action, not the one actually taken; the
                parameter is kept so existing callers keep working.
        """
        # Only empty cells are legal follow-up moves; occupied cells must not
        # take part in the max. A full board has no follow-up, hence default 0.
        # Finished games are never acted from, so their entries stay at 0 and
        # contribute no future value.
        legal_next = [i for i, cell in enumerate(next_state) if cell == " "]
        max_q_value = max(
            (self.get_q_value(next_state, a) for a in legal_next),
            default=0,
        )
        current_q_value = self.get_q_value(state, action)
        new_q_value = current_q_value + self.alpha * (
            reward + self.gamma * max_q_value - current_q_value
        )
        self.q_table[(tuple(state), action)] = new_q_value

    def choose_action(self, state, available_actions):
        """Pick a move epsilon-greedily.

        With probability ``epsilon`` a random legal move is returned;
        otherwise one of the highest-valued legal moves is returned, with
        ties broken at random.

        Raises:
            ValueError: If ``available_actions`` is empty.
        """
        if not available_actions:
            raise ValueError("no available actions to choose from")
        if random.random() < self.epsilon:
            return random.choice(available_actions)  # Explore
        q_values = [self.get_q_value(state, a) for a in available_actions]
        max_q_value = max(q_values)
        best_actions = [
            a for a, q in zip(available_actions, q_values) if q == max_q_value
        ]
        return random.choice(best_actions)  # Exploit

    def train(self, episodes=1000):
        """Train the agent by playing ``episodes`` games as "X".

        The agent moves first; the opponent ("O") replies with a uniformly
        random legal move. One transition spans the agent's move plus the
        opponent's reply, so the agent learns from the boards it will really
        face on its own turns.
        """
        for _ in range(episodes):
            env = TicTacToe()
            env.reset()
            while not env.done:
                # Snapshot the board: env.board is mutated in place, so a
                # live reference would silently turn into the next state.
                state = tuple(env.board)
                action = self.choose_action(state, env.available_actions())
                env.take_action(action, "X")
                if not env.done:
                    # Opponent's reply is part of the environment's response.
                    env.take_action(random.choice(env.available_actions()), "O")
                reward = env.get_reward()
                next_state = tuple(env.board)
                self.update_q_value(state, action, reward, next_state, None)

# Print the board in the requested format
def draw_board(board):
    """Print the 9-cell board as three rows separated by dashed lines.

    ``board`` is indexed 0-8 in row-major order; each row is rendered as
    ``a | b | c``.
    """
    divider = "---------"
    for row_start in (0, 3, 6):
        if row_start:
            # Divider goes between rows, never before the first one.
            print(divider)
        left = board[row_start]
        middle = board[row_start + 1]
        right = board[row_start + 2]
        print(f"{left} | {middle} | {right}")

# Test the Q-Learning agent
if __name__ == "__main__":
    # Train the agent, then play one interactive game: AI is "X" and moves
    # first, the human is "O".
    agent = QLearningAgent()
    agent.train(1000)  # Train the agent
    # Exploration is only useful while learning; against the human the agent
    # should always play its best-known move.
    agent.epsilon = 0.0

    # Play a game
    env = TicTacToe()
    env.reset()
    print("Welcome to Tic-Tac-Toe!")
    draw_board(env.board)

    while not env.done:
        # AI move
        action = agent.choose_action(env.board, env.available_actions())
        print(f"AI chooses move {action}")
        env.take_action(action, "X")
        draw_board(env.board)
        if env.done:
            break

        # Player move: keep asking until the input is a number that names an
        # empty square, so a typo neither crashes the game nor costs a turn.
        while True:
            raw_move = input("Enter your move (0-8): ")
            try:
                player_move = int(raw_move)
            except ValueError:
                print("Invalid input. Please enter a number from 0 to 8.")
                continue
            if player_move not in env.available_actions():
                print("That square is not available. Choose an empty square from 0 to 8.")
                continue
            break
        env.take_action(player_move, "O")
        draw_board(env.board)

    print("Game Over!")

Welcome to Tic-Tac-Toe!
  |   |  
---------
  |   |  
---------
  |   |  
AI chooses move 1
  | X |  
---------
  |   |  
---------
  |   |  


Enter your move (0-8):  4


  | X |  
---------
  | O |  
---------
  |   |  
AI chooses move 8
  | X |  
---------
  | O |  
---------
  |   | X


Enter your move (0-8):  3


  | X |  
---------
O | O |  
---------
  |   | X
AI chooses move 0
X | X |  
---------
O | O |  
---------
  |   | X


Enter your move (0-8):  2


X | X | O
---------
O | O |  
---------
  |   | X
AI chooses move 5
X | X | O
---------
O | O | X
---------
  |   | X


Enter your move (0-8):  6


X | X | O
---------
O | O | X
---------
O |   | X
Game Over!
