In [4]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import random
from collections import defaultdict

In [2]:
class TicTacToeEnv(gym.Env):
    def __init__(self):
        super(TicTacToeEnv, self).__init__()
        
        # 9 discrete positions to place X or O
        self.action_space = spaces.Discrete(9)

        # Observation: 9 cells with values {0: empty, 1: agent, -1: opponent}
        self.observation_space = spaces.Box(low=-1, high=1, shape=(9,), dtype=np.int8)

        self.reset()

    def reset(self, seed=None, options=None):
        self.board = np.zeros(9, dtype=np.int8)
        self.done = False
        self.winner = None
        return self.board.copy(), {}

    def step(self, action):
        if self.done or self.board[action] != 0:
            return self.board.copy(), -10, True, False, {}  # illegal move

        # Agent (1) moves
        self.board[action] = 1
        reward, terminated = self.check_winner(1)
        if terminated:
            return self.board.copy(), reward, True, False, {}

        # Opponent (-1) random move
        available = np.where(self.board == 0)[0]
        if len(available) > 0:
            opp_action = np.random.choice(available)
            self.board[opp_action] = -1
            reward, terminated = self.check_winner(-1)
            if terminated:
                return self.board.copy(), -1, True, False, {}

        # Draw?
        if np.all(self.board != 0):
            return self.board.copy(), 0.5, True, False, {}

        return self.board.copy(), 0, False, False, {}

    def check_winner(self, player):
        b = self.board.reshape(3, 3)
        for i in range(3):
            if np.all(b[i, :] == player) or np.all(b[:, i] == player):
                return (1 if player == 1 else -1), True
        if np.all(np.diag(b) == player) or np.all(np.diag(np.fliplr(b)) == player):
            return (1 if player == 1 else -1), True
        return 0, False

    def render(self):
        symbols = {1: 'X', -1: 'O', 0: '.'}
        b = self.board.reshape(3, 3)
        for row in b:
            print(" ".join(symbols[cell] for cell in row))
        print()


In [3]:
env = TicTacToeEnv()
obs, _ = env.reset()
env.render()
obs, reward, done, _, _ = env.step(0)  # Agent places at position 0
env.render()


. . .
. . .
. . .

X . .
. . O
. . .



In [5]:
class QLearningAgent:
    def __init__(self, learning_rate=0.1, discount=0.95, epsilon=1.0, epsilon_decay=0.9995, epsilon_min=0.1):
        self.q_table = defaultdict(float)
        self.alpha = learning_rate
        self.gamma = discount
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min

    def get_qs(self, state):
        return [self.q_table[(tuple(state), a)] for a in range(9)]

    def choose_action(self, state, available_actions):
        if random.random() < self.epsilon:
            return random.choice(available_actions)
        qs = self.get_qs(state)
        # Choose max Q only among available actions
        qs_avail = {a: qs[a] for a in available_actions}
        return max(qs_avail, key=qs_avail.get)

    def update(self, state, action, reward, next_state, done, available_actions):
        max_future_q = max([self.q_table[(tuple(next_state), a)] for a in available_actions]) if not done else 0
        current_q = self.q_table[(tuple(state), action)]

        new_q = current_q + self.alpha * (reward + self.gamma * max_future_q - current_q)
        self.q_table[(tuple(state), action)] = new_q

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay


In [6]:
env = TicTacToeEnv()
agent = QLearningAgent()

num_episodes = 50000
win_count = 0
draw_count = 0
lose_count = 0

for episode in range(num_episodes):
    state, _ = env.reset()
    done = False

    while not done:
        available = np.where(state == 0)[0]
        action = agent.choose_action(state, available)

        next_state, reward, done, _, _ = env.step(action)

        next_available = np.where(next_state == 0)[0]
        agent.update(state, action, reward, next_state, done, next_available)

        state = next_state

        if done:
            if reward == 1:
                win_count += 1
            elif reward == 0.5:
                draw_count += 1
            elif reward == -1:
                lose_count += 1

    # Print every 5000 episodes
    if (episode + 1) % 5000 == 0:
        print(f"Episode {episode+1} - Win: {win_count}, Draw: {draw_count}, Loss: {lose_count}")
        win_count = draw_count = lose_count = 0


Episode 5000 - Win: 4315, Draw: 296, Loss: 389
Episode 10000 - Win: 4550, Draw: 239, Loss: 211
Episode 15000 - Win: 4587, Draw: 246, Loss: 167
Episode 20000 - Win: 4620, Draw: 223, Loss: 157
Episode 25000 - Win: 4635, Draw: 207, Loss: 158
Episode 30000 - Win: 4633, Draw: 213, Loss: 154
Episode 35000 - Win: 4656, Draw: 192, Loss: 152
Episode 40000 - Win: 4628, Draw: 217, Loss: 155
Episode 45000 - Win: 4675, Draw: 187, Loss: 138
Episode 50000 - Win: 4709, Draw: 186, Loss: 105


In [10]:
def evaluate_agent(agent, env, episodes=1000):
    wins, draws, losses = 0, 0, 0

    for _ in range(episodes):
        state, _ = env.reset()
        done = False

        while not done:
            available = np.where(state == 0)[0]
            action = agent.choose_action(state, available)
            next_state, reward, done, _, _ = env.step(action)
            state = next_state

            if done:
                if reward == 1:
                    wins += 1
                elif reward == 0.5:
                    draws += 1
                elif reward == -1:
                    losses += 1
                break

    total = wins + draws + losses
    print(f"Out of {total} games:")
    print(f" Wins: {wins}")
    print(f" Draws: {draws}")
    print(f" Losses: {losses}")
    print(f" Win Rate: {wins / total:.2%}")


In [11]:
evaluate_agent(agent, env, episodes=1000)


Out of 1000 games:
 Wins: 927
 Draws: 45
 Losses: 28
 Win Rate: 92.70%


In [7]:
def play_vs_agent(agent, env):
    state, _ = env.reset()
    env.render()

    while True:
        # Agent move
        available = np.where(state == 0)[0]
        action = agent.choose_action(state, available)
        print(f"Agent (X) chooses: {action}")
        state, reward, done, _, _ = env.step(action)
        env.render()

        if done:
            if reward == 1:
                print("Agent wins!")
            elif reward == 0.5:
                print("Draw!")
            elif reward == -1:
                print("You win!")
            break

        # Human move
        while True:
            try:
                user_action = int(input("Your move (0-8): "))
                if state[user_action] == 0:
                    break
                else:
                    print("Invalid move! Cell is already taken.")
            except:
                print("Please enter a valid number from 0 to 8.")

        # Apply human move
        row, col = divmod(user_action, 3)
        env.board[row * 3 + col] = -1  # Human is -1
        state = env.board.copy()
        env.render()

        # Check if human wins
        r, done_flag = env.check_winner(-1)
        if done_flag:
            if r == -1:
                print("You win!")
            else:
                print("Draw!")
            break


In [13]:
play_vs_agent(agent, env)


. . .
. . .
. . .

Agent (X) chooses: 0
X . O
. . .
. . .



X O O
. . .
. . .

Agent (X) chooses: 6
X O O
. O .
X . .

X O O
. O O
X . .

Agent (X) chooses: 8
X O O
. O O
X O X

You win!
