In [1]:
import numpy as np

In [2]:
class TicTacToe:
    def __init__(self):
        self.board = [0] * 9
        self.current_player = 1
    
    def reset(self):
        self.board = [0] * 9
        self.current_player = 1
        return self.board
    
    def get_available_actions(self):
        return [i for i, x in enumerate(self.board) if x == 0]
    
    def make_move(self, action):
        self.board[action] = self.current_player
        if self.check_win(self.current_player):
            return self.current_player
        elif self.is_full():
            return 0
        else:
            self.current_player *= -1
            return None
    
    def check_win(self, player):
        win_conditions = [
            [0, 1, 2], [3, 4, 5], [6, 7, 8],
            [0, 3, 6], [1, 4, 7], [2, 5, 8],
            [0, 4, 8], [2, 4, 6]
        ]
        for condition in win_conditions:
            if all(self.board[i] == player for i in condition):
                return True
        return False
    
    def is_full(self):
        return all(x != 0 for x in self.board)

    def render(self):
        for i in range(0, 9, 3):
            print(self.board[i:i+3])


In [3]:
class QLearningAgent:
    def __init__(self, environment, alpha=0.1, gamma=0.9, epsilon=0.1):
        self.env = environment
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.q_table = {}
        
    def get_state(self, board):
        return tuple(self.env.board)
    
    def choose_action(self, state):
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.env.get_available_actions())
        else:
            if state not in self.q_table:
                self.q_table[state] = np.zeros(9)
            available_actions = self.env.get_available_actions()
            q_values = [self.q_table[state][i] if i in available_actions else -np.inf for i in range(9)]
            max_q_value = np.max(q_values)
            max_actions = [i for i in range(9) if q_values[i] == max_q_value]
            chosen_action = np.random.choice(max_actions)
            return available_actions[available_actions.index(chosen_action)]
    
    def update_q_table(self, state, action, reward, next_state):
        if next_state not in self.q_table:
            self.q_table[next_state] = np.zeros(9)
        self.q_table[state][action] += self.alpha * (
            reward + self.gamma * np.max(self.q_table[next_state]) - self.q_table[state][action]
        )

## Train one player

In [4]:
def train(agent, num_episodes=5000):
    for episode in range(num_episodes):
        state = agent.env.reset()
        done = False
        while not done:
            if agent.env.current_player == -1:
                action = np.random.choice(env.get_available_actions())
            else:
                action = agent.choose_action(agent.get_state(state))
            reward = agent.env.make_move(action)
            next_state = state.copy()
            if reward is not None:
                done = True
                agent.update_q_table(agent.get_state(state), action, reward, agent.get_state(next_state))
            state = next_state
    
        if (episode + 1) % 500 == 0:
            print(f"Episode {episode + 1}/{num_episodes} completed.")

In [5]:
def evaluate(agent, num_episodes=100, render=False):
    wins, losses, draws = 0, 0, 0
    for _ in range(num_episodes):
        state = agent.env.reset()
        done = False
        while not done:
            if render: agent.env.render()
            action = agent.choose_action(agent.get_state(state))
            if render: print(f"Player {1 if agent.env.current_player == 1 else -1} chooses position {action}")
            reward = agent.env.make_move(action)
            if reward is not None:
                if render: agent.env.render()
                if reward == 1:
                    wins += 1
                    if render: print("WIN!")
                elif reward == -1:
                    losses += 1
                    if render: print("LOSE!")
                elif reward == 0:
                    draws += 1
                    if render: print("It's a draw!")
                break
            state = state.copy()
        if render: print()
    print(f"Evaluation Results: Wins: {wins}, Losses: {losses}, Draws: {draws}")

## 5000

In [6]:
env = TicTacToe()
agent = QLearningAgent(env)
train(agent, 5000)

Episode 500/5000 completed.
Episode 1000/5000 completed.
Episode 1500/5000 completed.
Episode 2000/5000 completed.
Episode 2500/5000 completed.
Episode 3000/5000 completed.
Episode 3500/5000 completed.
Episode 4000/5000 completed.
Episode 4500/5000 completed.
Episode 5000/5000 completed.


In [7]:
# import random
# for _ in range(3):
#     random_key = random.choice(list(agent.q_table.keys()))
#     print(random_key)
#     print(agent.q_table[random_key])
#     print()

In [8]:
evaluate(agent)

Evaluation Results: Wins: 62, Losses: 25, Draws: 13


## 10000

In [9]:
env = TicTacToe()
agent = QLearningAgent(env)
train(agent, 10000)

Episode 500/10000 completed.
Episode 1000/10000 completed.
Episode 1500/10000 completed.
Episode 2000/10000 completed.
Episode 2500/10000 completed.
Episode 3000/10000 completed.
Episode 3500/10000 completed.
Episode 4000/10000 completed.
Episode 4500/10000 completed.
Episode 5000/10000 completed.
Episode 5500/10000 completed.
Episode 6000/10000 completed.
Episode 6500/10000 completed.
Episode 7000/10000 completed.
Episode 7500/10000 completed.
Episode 8000/10000 completed.
Episode 8500/10000 completed.
Episode 9000/10000 completed.
Episode 9500/10000 completed.
Episode 10000/10000 completed.


In [10]:
evaluate(agent)

Evaluation Results: Wins: 63, Losses: 20, Draws: 17


## 15000

In [11]:
env = TicTacToe()
agent = QLearningAgent(env)
train(agent, 15000)

Episode 500/15000 completed.
Episode 1000/15000 completed.
Episode 1500/15000 completed.
Episode 2000/15000 completed.
Episode 2500/15000 completed.
Episode 3000/15000 completed.
Episode 3500/15000 completed.
Episode 4000/15000 completed.
Episode 4500/15000 completed.
Episode 5000/15000 completed.
Episode 5500/15000 completed.
Episode 6000/15000 completed.
Episode 6500/15000 completed.
Episode 7000/15000 completed.
Episode 7500/15000 completed.
Episode 8000/15000 completed.
Episode 8500/15000 completed.
Episode 9000/15000 completed.
Episode 9500/15000 completed.
Episode 10000/15000 completed.
Episode 10500/15000 completed.
Episode 11000/15000 completed.
Episode 11500/15000 completed.
Episode 12000/15000 completed.
Episode 12500/15000 completed.
Episode 13000/15000 completed.
Episode 13500/15000 completed.
Episode 14000/15000 completed.
Episode 14500/15000 completed.
Episode 15000/15000 completed.


In [12]:
evaluate(agent)

Evaluation Results: Wins: 64, Losses: 22, Draws: 14
