In [1]:
import numpy as np 
import random 

In [2]:
 board = np.zeros((3, 3), dtype=int) 

In [5]:
import numpy as np

def display_board(board):
    for row in board:
        print(" | ".join(["X" if cell == 1 else "O" if cell == -1 else " " for cell in row]))
        print("-" * 9)

def is_valid_move(board, row, col):
    return board[row][col] == 0

def check_win(board, player):
    # Check rows and columns
    for row in board:
        if all(cell == player for cell in row):
            return True
    for col in range(3):
        if all(board[row][col] == player for row in range(3)):
            return True
    # Check diagonals
    if all(board[i][i] == player for i in range(3)) or all(board[i][2 - i] == player for i in range(3)):
        return True
    return False

def check_draw(board):
    return np.all(board != 0)


In [6]:
class QLearningAgent:
    def __init__(self, epsilon, alpha, gamma):
        self.epsilon = epsilon  # Exploration rate
        self.alpha = alpha  # Learning rate
        self.gamma = gamma  # Discount factor
        self.q_table = {}  # Q-value table

    def get_action(self, state):
        state_tuple = tuple(map(tuple, state))
        if np.random.rand() < self.epsilon:
            # Explore: choose a random valid move
            valid_moves = [i for i in range(9) if state[i // 3][i % 3] == 0]
            return random.choice(valid_moves)
        else:
            # Exploit: choose the action with the highest Q-value
            return max(
                (i for i in range(9) if state[i // 3][i % 3] == 0),
                key=lambda i: self.q_table.get(state_tuple, {}).get(i, 0),
                default=random.choice([i for i in range(9) if state[i // 3][i % 3] == 0])
            )

    def learn(self, state, action, reward, next_state):
        state_tuple = tuple(map(tuple, state))
        next_state_tuple = tuple(map(tuple, next_state))

        if state_tuple not in self.q_table:
            self.q_table[state_tuple] = {}
        if next_state_tuple not in self.q_table:
            self.q_table[next_state_tuple] = {}

        if action not in self.q_table[state_tuple]:
            self.q_table[state_tuple][action] = 0

        # Get the best next action's Q-value
        best_next_action = max(
            (i for i in range(9) if next_state[i // 3][i % 3] == 0),
            key=lambda i: self.q_table.get(next_state_tuple, {}).get(i, 0),
            default=None
        )

        if best_next_action is not None:
            self.q_table[state_tuple][action] += self.alpha * (
                reward + self.gamma * self.q_table[next_state_tuple].get(best_next_action, 0)
                - self.q_table[state_tuple][action]
            )
        else:
            self.q_table[state_tuple][action] += self.alpha * (reward - self.q_table[state_tuple][action])

In [7]:
import numpy as np
import random

# Assuming the existence of the QLearningAgent class
# Define the check_win and check_draw functions used in play_game

def check_win(state, player):
    # Check rows, columns, and diagonals for a win
    for i in range(3):
        if all([state[i][j] == player for j in range(3)]) or all([state[j][i] == player for j in range(3)]):
            return True
    if state[0][0] == player and state[1][1] == player and state[2][2] == player:
        return True
    if state[0][2] == player and state[1][1] == player and state[2][0] == player:
        return True
    return False

def check_draw(state):
    # Check if the board is full with no winner
    return all(state[i][j] != 0 for i in range(3) for j in range(3))

def play_game(agent1, agent2, board):
    state = board.copy()
    agent1_turn = True  # Alternate turns between agent1 and agent2
    
    while True:
        # Agent 1's turn
        if agent1_turn:
            action1 = agent1.get_action(state)
            row, col = divmod(action1, 3)
            state[row][col] = 1

            if check_win(state, 1):
                agent1.learn(state, action1, 1, state)
                return 1  # Agent 1 wins

            if check_draw(state):
                return 0  # Draw

            agent1_turn = False  # Switch turn to agent 2

        # Agent 2's turn
        else:
            action2 = agent2.get_action(state)
            row, col = divmod(action2, 3)
            state[row][col] = -1

            if check_win(state, -1):
                agent1.learn(state, action1, -1, state)
                return -1  # Agent 2 wins

            if check_draw(state):
                return 0  # Draw

            agent1_turn = True  # Switch turn to agent 1

def train_q_learning_agents(agent1, agent2, num_episodes):
    for episode in range(num_episodes):
        board = np.zeros((3, 3), dtype=int)  # Initialize an empty board
        
        # Alternate which agent starts each game
        if episode % 2 == 0:
            result = play_game(agent1, agent2, board)
            if result == 1:
                agent1.learn(board, None, 1, board)
                agent2.learn(board, None, -1, board)
            elif result == -1:
                agent1.learn(board, None, -1, board)
                agent2.learn(board, None, 1, board)
            else:
                agent1.learn(board, None, 0, board)
                agent2.learn(board, None, 0, board)
        else:
            result = play_game(agent2, agent1, board)
            if result == 1:
                agent1.learn(board, None, -1, board)
                agent2.learn(board, None, 1, board)
            elif result == -1:
                agent1.learn(board, None, 1, board)
                agent2.learn(board, None, -1, board)
            else:
                agent1.learn(board, None, 0, board)
                agent2.learn(board, None, 0, board)

# Initialize Q-learning agents with specified parameters
agent1 = QLearningAgent(epsilon=0.2, alpha=0.1, gamma=0.9)
agent2 = QLearningAgent(epsilon=0.2, alpha=0.1, gamma=0.9)

# Train agents over a specified number of episodes
train_q_learning_agents(agent1, agent2, num_episodes=10000)

# Test the board after training (optional)
board = np.zeros((3, 3), dtype=int)


In [8]:
 while True: 
    display_board(board) 
    action1 = agent1.get_action(board) 
    row, col = divmod(action1, 3) 
    board[row][col] = 1 
    if check_win(board, 1): 
        display_board(board) 
        print("Agent 1 wins!") 
        break 
    if check_draw(board): 
        display_board(board) 
        print("It's a draw!") 
        break 
    display_board(board) 
    action2 = agent2.get_action(board) 
    row, col = divmod(action2, 3) 
    board[row][col] = -1 
    if check_win(board, -1): 
        display_board(board) 
        print("Agent 2 wins!") 
        break

  |   |  
---------
  |   |  
---------
  |   |  
---------
  |   |  
---------
  | X |  
---------
  |   |  
---------
O |   |  
---------
  | X |  
---------
  |   |  
---------
O | X |  
---------
  | X |  
---------
  |   |  
---------
O | X | O
---------
  | X |  
---------
  |   |  
---------
O | X | O
---------
  | X |  
---------
  | X |  
---------
Agent 1 wins!
