In [1]:
import numpy as np
import random

In [2]:
class TicTacToe:
    def __init__(self):
        self.board = np.zeros((3, 3), dtype=int)
        self.done = False

    def reset(self):
        self.board[:] = 0
        self.done = False
        return self.board.copy()

    def available_actions(self):
        return [(i, j) for i in range(3) for j in range(3) if self.board[i, j] == 0]

    def step(self, action, player):
        i, j = action
        if self.board[i, j] != 0:
            return self.board.copy(), -10, True
        self.board[i, j] = player
        reward, self.done = self.check_winner(player)
        return self.board.copy(), reward, self.done

    def check_winner(self, player):
        for i in range(3):
            if all(self.board[i, :] == player) or all(self.board[:, i] == player):
                return 1, True
        if all(np.diag(self.board) == player) or all(np.diag(np.fliplr(self.board)) == player):
            return 1, True
        if not self.available_actions():
            return 0, True
        return 0, False


def board_to_tuple(board):
    return tuple(board.flatten())

In [6]:
# === Encouragement-Exploitation structures ===
N = {}   # visit counts: N[(state, action)]
R = {}   # average rewards: R[(state, action)]

episodes = 500000
epsilon = 0.1   # small probability of random exploration
env = TicTacToe()

In [7]:
for ep in range(episodes):
    state = env.reset()
    player = 1

    while True:
        state_tuple = board_to_tuple(state)
        actions = env.available_actions()

        # --- Encourage unexplored actions ---
        unexplored = [a for a in actions if (state_tuple, a) not in N]

        if unexplored:
            action = random.choice(unexplored)  # encourage exploration
        else:
            # Exploitation: choose action with best average reward
            if random.random() < epsilon:
                action = random.choice(actions)
            else:
                values = [R.get((state_tuple, a), 0) for a in actions]
                action = actions[np.argmax(values)]

        # Step environment
        next_state, reward, done = env.step(action, player)
        key = (state_tuple, action)

        # Update visit count and running average reward
        N[key] = N.get(key, 0) + 1
        R[key] = R.get(key, 0) + (reward - R.get(key, 0)) / N[key]

        state = next_state
        if done:
            break
        player = 3 - player

    if ep % 5000 == 0:
        print(f"Episode {ep}, N-size: {len(N)}, R-size: {len(R)}")

print("Training complete.")

Episode 0, N-size: 7, R-size: 7
Episode 5000, N-size: 2789, R-size: 2789
Episode 10000, N-size: 3737, R-size: 3737
Episode 15000, N-size: 4433, R-size: 4433
Episode 20000, N-size: 4976, R-size: 4976
Episode 25000, N-size: 5422, R-size: 5422
Episode 30000, N-size: 5740, R-size: 5740
Episode 35000, N-size: 6037, R-size: 6037
Episode 40000, N-size: 6232, R-size: 6232
Episode 45000, N-size: 6446, R-size: 6446
Episode 50000, N-size: 6565, R-size: 6565
Episode 55000, N-size: 6736, R-size: 6736
Episode 60000, N-size: 6830, R-size: 6830
Episode 65000, N-size: 6931, R-size: 6931
Episode 70000, N-size: 6988, R-size: 6988
Episode 75000, N-size: 7050, R-size: 7050
Episode 80000, N-size: 7105, R-size: 7105
Episode 85000, N-size: 7152, R-size: 7152
Episode 90000, N-size: 7220, R-size: 7220
Episode 95000, N-size: 7285, R-size: 7285
Episode 100000, N-size: 7328, R-size: 7328
Episode 105000, N-size: 7379, R-size: 7379
Episode 110000, N-size: 7432, R-size: 7432
Episode 115000, N-size: 7469, R-size: 7469

In [8]:
def print_board(board):
    symbols = {0: ' ', 1: 'X', 2: 'O'}
    for i in range(3):
        print(" | ".join(symbols[x] for x in board[i]))
        if i < 2:
            print("-----")


def play_vs_agent():
    state = env.reset()
    player = 1  # human = X
    while True:
        print_board(state)

        if player == 1:  # human turn
            try:
                move = input("Enter your move (row,col): ")
                i, j = map(int, move.split(','))
                if state[i, j] != 0:
                    print("Invalid move. Try again.")
                    continue
                action = (i, j)
            except:
                print("Invalid input. Try again.")
                continue
        else:  # agent turn (O)
            actions = env.available_actions()
            state_tuple = board_to_tuple(state)
            unexplored = [a for a in actions if (state_tuple, a) not in N]

            if unexplored:
                action = random.choice(unexplored)
            else:
                values = [R.get((state_tuple, a), 0) for a in actions]
                action = actions[np.argmax(values)]
            print(f"Agent chooses: {action}")

        next_state, reward, done = env.step(action, player)
        state = next_state

        if done:
            print_board(state)
            if reward == 1:
                print("Player wins!" if player == 1 else "Agent wins!")
            elif reward == 0:
                print("It's a draw!")
            else:
                print("Invalid move. You lose.")
            break

        player = 3 - player


# Start playing
play_vs_agent()


  |   |  
-----
  |   |  
-----
  |   |  
  |   |  
-----
  | X |  
-----
  |   |  
Agent chooses: (0, 0)
O |   |  
-----
  | X |  
-----
  |   |  
O |   | X
-----
  | X |  
-----
  |   |  
Agent chooses: (0, 1)
O | O | X
-----
  | X |  
-----
  |   |  
O | O | X
-----
  | X |  
-----
X |   |  
Player wins!
