In [1]:
import numpy as np
class TicTacToe:
    """3x3 tic-tac-toe environment with a gym-like ``reset``/``step`` API.

    Cells of ``self.board`` hold 1 (player X), -1 (player O) or 0 (empty).
    ``self.current_player`` is the mark that the next ``step`` call places.
    """

    def __init__(self):
        self.board = np.zeros((3, 3), dtype=int)
        self.current_player = 1  # Player 1 starts

    def reset(self):
        """Clear the board, give the move to player 1 and return the empty board.

        A copy is returned so that later moves do not mutate an observation
        the caller is still holding.
        """
        self.board = np.zeros((3, 3), dtype=int)
        self.current_player = 1
        return self.board.copy()

    def display_board(self):
        """Print the board, one row per line, followed by a separator line."""
        symbols = {1: 'X', -1: 'O', 0: ' '}
        for row in self.board:
            print(" | ".join([symbols[cell] for cell in row]))
        print("-------------")

    def is_winner(self, player):
        """Return True if ``player`` fills a complete row, column or diagonal."""
        marks = self.board == player
        return bool(
            marks.all(axis=1).any()                  # any full row
            or marks.all(axis=0).any()               # any full column
            or marks.diagonal().all()                # main diagonal
            or np.fliplr(marks).diagonal().all()     # anti-diagonal
        )

    def is_draw(self):
        """Return True if the board is full and neither player has won."""
        # bool(...) keeps the result a plain Python bool instead of numpy.bool_.
        board_full = bool(np.all(self.board != 0))
        return board_full and not self.is_winner(1) and not self.is_winner(-1)

    def available_actions(self):
        """Return the ``(row, col)`` coordinates of every empty cell."""
        return [(i, j) for i in range(3) for j in range(3) if self.board[i, j] == 0]

    def step(self, action):
        """Place the current player's mark at ``action`` and report the outcome.

        Args:
            action: ``(row, col)`` index of the target cell.

        Returns:
            ``(board, reward, done, None)``. ``board`` is a copy of the board
            after the move. ``reward`` is 1 when the move wins the game and 0
            otherwise. ``done`` is True on a win or a draw.

        Raises:
            ValueError: if the target cell is already occupied.
        """
        if self.board[action] != 0:
            raise ValueError("Invalid action: cell already occupied")

        self.board[action] = self.current_player

        # Check game outcome
        if self.is_winner(self.current_player):
            reward = 1
            done = True
        elif self.is_draw():
            reward = 0
            done = True
        else:
            reward = 0
            done = False
            # The turn only passes on while the game is still running, so after
            # a finished game current_player is still the last mover.
            self.current_player *= -1

        # Return a snapshot rather than the live array (see reset()).
        return self.board.copy(), reward, done, None




In [2]:
import random

class QLearningAgent:
    """Tabular Q-learning agent with epsilon-greedy action selection.

    Q-values are stored in ``q_table``, a dict keyed by ``(state, action)``.
    Pairs that have never been updated are read as 0.
    """

    def __init__(self, learning_rate=0.1, discount_factor=0.9, epsilon=1.0, epsilon_decay=0.99):
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.q_table = {}  # (state, action) -> Q-value

    def get_state(self, board):
        """Turn a 2-D board into a hashable tuple of row tuples."""
        return tuple(tuple(row) for row in board)

    def choose_action(self, state, available_actions):
        """Pick an action: random with probability ``epsilon``, else greedy.

        Ties between equally valued greedy actions are broken at random.
        """
        exploring = random.uniform(0, 1) < self.epsilon
        if exploring:
            return random.choice(available_actions)

        # Greedy branch: score every candidate, then keep the top scorers.
        scored = [(self.q_table.get((state, move), 0), move) for move in available_actions]
        best_value = max(value for value, _ in scored)
        best_moves = [move for value, move in scored if value == best_value]
        return random.choice(best_moves)

    def update_q_value(self, state, action, reward, next_state, next_available_actions):
        """Apply one Q-learning update to the entry for ``(state, action)``.

        The bootstrap term is the largest stored Q-value over
        ``next_available_actions`` in ``next_state``, or 0 when that
        collection is empty.
        """
        key = (state, action)
        current = self.q_table.get(key, 0)
        best_future = max(
            (self.q_table.get((next_state, nxt), 0) for nxt in next_available_actions),
            default=0,
        )
        td_target = reward + self.discount_factor * best_future
        self.q_table[key] = current + self.learning_rate * (td_target - current)

    def decay_epsilon(self):
        """Multiply ``epsilon`` by ``epsilon_decay``."""
        self.epsilon = self.epsilon * self.epsilon_decay


In [3]:
def train(agent, environment, episodes=10000, display_games=5):
    """Train ``agent`` by self-play on ``environment`` using Q-learning updates.

    The same agent chooses the moves of both players. A move's Q-value is not
    updated right after the move is made, because the resulting position
    belongs to the opponent. The update is deferred until the opponent has
    replied:

    * if the game goes on, the move is updated with its own reward and the
      position the same player faces next (its real successor state);
    * if the reply ends the game, the move is updated with its own reward
      minus the reply's reward (so -1 when the reply wins, 0 on a draw) and
      no successor actions;
    * the move that ends the game is updated immediately with the reward
      returned by ``environment.step`` and no successor actions.

    Args:
        agent: object providing ``get_state``, ``choose_action``,
            ``update_q_value`` and ``decay_epsilon``.
        environment: object providing ``reset``, ``step``,
            ``available_actions`` and ``display_board``.
        episodes: number of games to play.
        display_games: the first ``display_games`` games are printed move by
            move.
    """
    for episode in range(episodes):
        state = environment.reset()
        done = False
        show_game = episode < display_games
        # (state_key, action, reward) of the previous mover, still waiting for
        # the outcome of the opponent's reply before it can be updated.
        pending = None

        if show_game:
            print(f"--- Training Game {episode + 1} ---")
            environment.display_board()

        while not done:
            state_key = agent.get_state(state)
            available_actions = environment.available_actions()
            action = agent.choose_action(state_key, available_actions)

            # Take the action in the environment
            next_state, reward, done, _ = environment.step(action)
            if show_game:
                environment.display_board()  # Show board after each move

            next_state_key = agent.get_state(next_state)
            next_available_actions = environment.available_actions() if not done else []

            if pending is not None:
                # Players alternate, so next_state is either the previous
                # mover's next decision point or the end of the game; a
                # winning reply (reward 1) counts as -1 for the previous mover.
                prev_key, prev_action, prev_reward = pending
                agent.update_q_value(
                    prev_key, prev_action, prev_reward - reward,
                    next_state_key, next_available_actions,
                )

            if done:
                # Nobody replies to the final move: settle it right away.
                agent.update_q_value(state_key, action, reward, next_state_key, [])
            else:
                pending = (state_key, action, reward)

            state = next_state

        agent.decay_epsilon()

    print("Training complete!")

def test(agent, environment, games=5):
    wins = 0
    draws = 0
    losses = 0

    for game in range(games):
        state = environment.reset()
        done = False
        print(f"--- Testing Game {game + 1} ---")
        environment.display_board()

        while not done:
            state_key = agent.get_state(state)
            available_actions = environment.available_actions()
            action = agent.choose_action(state_key, available_actions)
            state, reward, done, _ = environment.step(action)
            environment.display_board()  # Show board after each move

            # Opponent's turn (random)
            if not done:
                opponent_action = random.choice(environment.available_actions())
                state, _, done, _ = environment.step(opponent_action)
                environment.display_board()  # Show board after opponent's move

        if reward == 1:
            wins += 1
        elif reward == 0:
            draws += 1
        else:
            losses += 1

    print(f"Testing results: Wins: {wins}, Draws: {draws}, Losses: {losses}")


In [4]:
# Seed Python's RNG so that exploration, greedy tie-breaking and the random
# test opponent (all drawn from the `random` module) repeat on every
# Restart & Run All.
SEED = 42
random.seed(SEED)

# Create environment and agent
env = TicTacToe()
agent = QLearningAgent()

# Train the agent, displaying the first 5 games
train(agent, env, episodes=10000, display_games=5)

# Test the agent, displaying each game
test(agent, env, games=5)



--- Training Game 1 ---
  |   |  
  |   |  
  |   |  
-------------
  |   |  
X |   |  
  |   |  
-------------
  |   | O
X |   |  
  |   |  
-------------
  |   | O
X |   | X
  |   |  
-------------
  |   | O
X |   | X
  | O |  
-------------
  |   | O
X | X | X
  | O |  
-------------
--- Training Game 2 ---
  |   |  
  |   |  
  |   |  
-------------
  |   | X
  |   |  
  |   |  
-------------
O |   | X
  |   |  
  |   |  
-------------
O | X | X
  |   |  
  |   |  
-------------
O | X | X
  |   |  
  | O |  
-------------
O | X | X
  |   |  
X | O |  
-------------
O | X | X
  |   | O
X | O |  
-------------
O | X | X
X |   | O
X | O |  
-------------
O | X | X
X |   | O
X | O | O
-------------
O | X | X
X | X | O
X | O | O
-------------
--- Training Game 3 ---
  |   |  
  |   |  
  |   |  
-------------
  |   |  
  |   |  
X |   |  
-------------
  |   |  
  |   | O
X |   |  
-------------
  |   |  
  | X | O
X |   |  
-------------
  |   | O
  | X | O
X |   |  
-------------
  | 