In [1]:
import numpy as np
import random

In [10]:
class TicTacToe:
    def __init__(self):
        self.board = np.zeros((3, 3), dtype=int)
        self.done = False

    def reset(self):
        self.board[:] = 0
        self.done = False
        return self.board.copy()

    def available_actions(self):
        return [(i, j) for i in range(3) for j in range(3) if self.board[i, j] == 0]

    def step(self, action, player):
        i, j = action
        if self.board[i, j] != 0:
            return self.board.copy(), -10, True  # Invalid move
        self.board[i, j] = player
        reward, self.done = self.check_winner(player)
        return self.board.copy(), reward, self.done

    def check_winner(self, player):
        for i in range(3):
            if all(self.board[i, :] == player) or all(self.board[:, i] == player):
                return 1, True
        if all(np.diag(self.board) == player) or all(np.diag(np.fliplr(self.board)) == player):
            return 1, True
        if not self.available_actions():
            return 0, True  # Draw
        return 0, False  # Game continues

In [11]:
def board_to_tuple(board):
    return tuple(board.flatten())

In [12]:
Q = {}
alpha = 0.1
gamma = 0.9
epsilon = 0.2
episodes = 1000000

env = TicTacToe()

In [13]:
for ep in range(episodes):
    state = env.reset()
    player = 1
    while True:
        state_tuple = board_to_tuple(state)
        actions = env.available_actions()
        if random.random() < epsilon:
            action = random.choice(actions)
        else:
            qs = [Q.get((state_tuple, a), 0) for a in actions]
            action = actions[np.argmax(qs)]
        next_state, reward, done = env.step(action, player)
        next_state_tuple = board_to_tuple(next_state)
        if done:
            Q[(state_tuple, action)] = Q.get((state_tuple, action), 0) + alpha * (reward - Q.get((state_tuple, action), 0))
            break
        else:
            next_actions = env.available_actions()
            max_next_q = max([Q.get((next_state_tuple, a), 0) for a in next_actions])
            Q[(state_tuple, action)] = Q.get((state_tuple, action), 0) + alpha * (reward + gamma * max_next_q - Q.get((state_tuple, action), 0))
            state = next_state
            player = 3 - player  # Switch player
    if ep % 1000 == 0:
        print(f"Episode {ep}, Q-table size: {len(Q)}")
print("Training complete. Final Q-table size:", len(Q))
print("Sample Q-values:", {k: Q[k] for k in list(Q)[:10]})  # Display first 10 Q-values

Episode 0, Q-table size: 8
Episode 1000, Q-table size: 888
Episode 2000, Q-table size: 1287
Episode 3000, Q-table size: 1603
Episode 4000, Q-table size: 1783
Episode 5000, Q-table size: 1943
Episode 6000, Q-table size: 2059
Episode 7000, Q-table size: 2163
Episode 8000, Q-table size: 2287
Episode 9000, Q-table size: 2406
Episode 10000, Q-table size: 2485
Episode 11000, Q-table size: 2570
Episode 12000, Q-table size: 2651
Episode 13000, Q-table size: 2734
Episode 14000, Q-table size: 2829
Episode 15000, Q-table size: 2918
Episode 16000, Q-table size: 2981
Episode 17000, Q-table size: 3028
Episode 18000, Q-table size: 3063
Episode 19000, Q-table size: 3122
Episode 20000, Q-table size: 3178
Episode 21000, Q-table size: 3232
Episode 22000, Q-table size: 3280
Episode 23000, Q-table size: 3345
Episode 24000, Q-table size: 3393
Episode 25000, Q-table size: 3442
Episode 26000, Q-table size: 3492
Episode 27000, Q-table size: 3538
Episode 28000, Q-table size: 3592
Episode 29000, Q-table size: 36

In [14]:
def print_board(board):
    symbols = {0: ' ', 1: 'X', 2: 'O'}
    for row in board:
        print('|'.join(symbols[x] for x in row))
        print('-' * 5)

In [15]:
def play_vs_agent():
    state = env.reset()
    player = 1  # Human is X (1), agent is O (2)
    while True:
        print_board(state)
        if player == 1:
            # Human move
            try:
                move = input("Enter your move as row,col (e.g., 0,2): ")
                i, j = map(int, move.split(','))
                if state[i, j] != 0:
                    print("Invalid move. Try again.")
                    continue
                action = (i, j)
            except Exception:
                print("Invalid input. Try again.")
                continue
        else:
            # Agent move
            state_tuple = board_to_tuple(state)
            actions = env.available_actions()
            qs = [Q.get((state_tuple, a), 0) for a in actions]
            action = actions[np.argmax(qs)]
            print(f"Agent chooses: {action}")

        next_state, reward, done = env.step(action, player)
        state = next_state
        if done:
            print_board(state)
            if reward == 1:
                print("Player" if player == 1 else "Agent", "wins!")
            elif reward == 0:
                print("It's a draw!")
            else:
                print("Invalid move!")
            break
        player = 3 - player  # Switch player


In [16]:
play_vs_agent()

 | | 
-----
 | | 
-----
 | | 
-----
 | | 
-----
 |X| 
-----
 | | 
-----
Agent chooses: (0, 0)
O| | 
-----
 |X| 
-----
 | | 
-----
Invalid move. Try again.
O| | 
-----
 |X| 
-----
 | | 
-----
O| |X
-----
 |X| 
-----
 | | 
-----
Agent chooses: (0, 1)
O|O|X
-----
 |X| 
-----
 | | 
-----
Invalid move. Try again.
O|O|X
-----
 |X| 
-----
 | | 
-----
O|O|X
-----
 |X| 
-----
X| | 
-----
Player wins!
