In [1]:
# Import necessary libraries
import numpy as np
import random
from collections import defaultdict

In [2]:
# Tic-Tac-Toe environment setup
class TicTacToe:
    """Minimal 3x3 Tic-Tac-Toe environment.

    The board is a 3x3 integer array in which 0 marks an empty cell and any
    other value is the mark of the player who moved there.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Empty the board, clear the finished flag and return the start state."""
        self.done = False
        self.board = np.zeros((3, 3), dtype=int)
        return self.get_state()

    def get_state(self):
        """Return the board as a flat, hashable 9-tuple (row-major order)."""
        return tuple(self.board.reshape(-1))

    def available_actions(self):
        """Return the (row, col) index of every empty cell, in row-major order."""
        empty_cells = []
        for row in range(3):
            for col in range(3):
                if self.board[row, col] == 0:
                    empty_cells.append((row, col))
        return empty_cells

    def step(self, action, player):
        """Place ``player``'s mark at ``action`` and return ``(state, reward, done)``.

        ``action`` is a (row, col) tuple. If the cell is already occupied the
        board is left untouched and the reward is -1.
        """
        if self.board[action] != 0:
            return self.get_state(), -1, self.done  # Illegal move

        self.board[action] = player
        reward, self.done = self.check_winner(player)
        return self.get_state(), reward, self.done

    def check_winner(self, player):
        """Score the position for ``player`` as ``(reward, done)``.

        (1, True) when ``player`` holds a full row, column or diagonal;
        (0.5, True) when no empty cell is left; (0, False) otherwise.
        """
        owned = self.board == player
        has_line = (
            owned.all(axis=1).any()                 # any complete row
            or owned.all(axis=0).any()              # any complete column
            or np.diag(owned).all()                 # main diagonal
            or np.diag(np.fliplr(owned)).all()      # anti-diagonal
        )
        if has_line:
            return 1, True
        if not (self.board == 0).any():
            return 0.5, True  # board full without a line: draw
        return 0, False


In [3]:
# Game display function for visual debugging
def display_board(board):
    """Print ``board`` one row per line, then a blank line.

    Cell values 0, 1 and 2 are drawn as '.', 'X' and 'O', joined by ' | '.
    """
    glyphs = {0: '.', 1: 'X', 2: 'O'}
    for row in board:
        # Rows are printed one at a time so partial output matches a per-row loop.
        print(*(glyphs[cell] for cell in row), sep=" | ")
    print()

In [4]:
class QLearningAgent:
    """Tabular Q-learning agent with epsilon-greedy action selection.

    Q-values are stored in ``q_table`` keyed by ``(state, action)``; a pair
    that has never been updated is treated as having value 0.

    Args:
        learning_rate: Step size used when blending the old Q-value with the
            new target.
        discount_factor: Weight applied to the best next-state Q-value.
        exploration_rate: Initial probability of picking a random action.
        exploration_decay: Factor the exploration rate is multiplied by each
            time ``update_q_value`` is called with ``done`` true.
    """

    def __init__(self, learning_rate=0.1, discount_factor=0.9, exploration_rate=1.0, exploration_decay=0.99):
        self.q_table = defaultdict(float)
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate
        self.exploration_decay = exploration_decay

    def _q(self, state, action):
        """Read a Q-value without inserting a default entry into ``q_table``."""
        # defaultdict.__getitem__ would store 0.0 for every key that is merely
        # looked at; .get() leaves the table holding only learned entries.
        return self.q_table.get((state, action), 0.0)

    def choose_action(self, state, available_actions):
        """Pick an action from ``available_actions`` using epsilon-greedy.

        With probability ``exploration_rate`` a random action is returned;
        otherwise the action with the highest Q-value for ``state``. Ties go
        to the action listed first. ``available_actions`` must be a non-empty
        sequence.
        """
        if random.uniform(0, 1) < self.exploration_rate:
            return random.choice(available_actions)
        # max() keeps the first maximal element, so ties resolve by list order.
        return max(available_actions, key=lambda action: self._q(state, action))

    def update_q_value(self, state, action, reward, next_state, done, available_actions):
        """Apply one Q-learning update for the transition just observed.

        Args:
            state: State in which ``action`` was taken.
            action: Action that was taken.
            reward: Reward received for the transition.
            next_state: State in which the agent will choose its next action.
            done: True when the episode ended with this transition; the
                bootstrap term is then dropped.
            available_actions: Actions that are legal in ``next_state`` (not
                in ``state``); the bootstrap term is the maximum Q-value over
                these. An empty list yields a bootstrap value of 0.
        """
        old_q_value = self._q(state, action)
        if done:
            target = reward
        else:
            next_max = max((self._q(next_state, a) for a in available_actions), default=0)
            target = reward + self.discount_factor * next_max
        self.q_table[(state, action)] = (1 - self.learning_rate) * old_q_value + self.learning_rate * target
        if done:
            # One decay step per finished episode.
            self.exploration_rate *= self.exploration_decay

In [5]:
# Training the agent
# Seed the RNG so that Restart & Run All reproduces the same training run.
SEED = 42
random.seed(SEED)

env = TicTacToe()
agent = QLearningAgent()

num_episodes = 10000

for episode in range(num_episodes):
    state = env.reset()
    done = False
    while not done:
        action = agent.choose_action(state, env.available_actions())
        next_state, reward, done = env.step(action, 1)

        if not done:
            # Simulate random opponent's move
            opp_action = random.choice(env.available_actions())
            next_state, opp_reward, done = env.step(opp_action, 2)
            if done:
                # env.step scores the result for the player who just moved,
                # so an opponent win (1) is a loss for the agent.
                reward = -1 if opp_reward == 1 else opp_reward

        # Update only after the opponent has replied: next_state is then the
        # board the agent will actually act on next, and the bootstrap term
        # uses the actions that are legal on that board.
        available_actions = env.available_actions()
        agent.update_q_value(state, action, reward, next_state, done, available_actions)
        state = next_state


In [6]:
# Testing the trained agent
state = env.reset()
done = False

print("Starting Tic-Tac-Toe Game!")
display_board(env.board)

# Player 1 is the trained agent and moves first; player 2 plays a uniformly
# random legal move. The two alternate until env.step reports the game over.
player = 1
while not done:
    legal_moves = env.available_actions()
    if player == 1:
        move = agent.choose_action(state, legal_moves)
    else:
        move = random.choice(legal_moves)

    state, reward, done = env.step(move, player)
    display_board(env.board)

    if done:
        # reward is scored for the player who just moved.
        if reward == 1:
            print("Agent wins!" if player == 1 else "Opponent wins!")
        elif reward == 0.5:
            print("It's a draw!")
    else:
        player = 2 if player == 1 else 1

Starting Tic-Tac-Toe Game!
. | . | .
. | . | .
. | . | .

X | . | .
. | . | .
. | . | .

X | . | .
. | . | .
. | . | O

X | X | .
. | . | .
. | . | O

X | X | .
. | O | .
. | . | O

X | X | X
. | O | .
. | . | O

Agent wins!
