In [1]:
import numpy as np

In [2]:
class TicTacToe:
    """Minimal 3x3 tic-tac-toe environment on a flat nine-cell board.

    Cells are indexed 0-8, row by row. A cell holds 0 when empty, 1 for the
    player who moves first and -1 for the other player.

    Attributes:
        board: list of nine ints (0, 1 or -1).
        current_player: 1 or -1, the player whose move is next.
    """

    # Every triple of cell indices that forms a winning line:
    # three rows, three columns, two diagonals.
    WIN_CONDITIONS = (
        (0, 1, 2), (3, 4, 5), (6, 7, 8),
        (0, 3, 6), (1, 4, 7), (2, 5, 8),
        (0, 4, 8), (2, 4, 6),
    )

    def __init__(self):
        self.board = [0] * 9
        self.current_player = 1

    def reset(self):
        """Start a new game and return the fresh board.

        The returned list is the live ``self.board`` object (not a copy), so
        later moves are visible through it.
        """
        self.board = [0] * 9
        self.current_player = 1
        return self.board

    def get_available_actions(self):
        """Return the indices of all empty cells, in ascending order."""
        return [i for i, x in enumerate(self.board) if x == 0]

    def make_move(self, action):
        """Place the current player's mark on cell ``action``.

        Args:
            action: index of the cell to mark.

        Returns:
            The current player's value (1 or -1) if this move wins the game,
            0 if the board is now full without a winner, or None if the game
            continues. The turn passes to the other player only when None is
            returned.

        Raises:
            ValueError: if the chosen cell is already occupied.
            IndexError: if ``action`` is outside the board.
        """
        # Overwriting an occupied cell would silently corrupt the game (and
        # could manufacture a win), so reject it explicitly.
        if self.board[action] != 0:
            raise ValueError(f"Cell {action} is already occupied")
        self.board[action] = self.current_player
        if self.check_win(self.current_player):
            return self.current_player
        elif self.is_full():
            return 0
        else:
            self.current_player *= -1
            return None

    def check_win(self, player):
        """Return True if ``player`` occupies all three cells of any winning line."""
        return any(
            all(self.board[i] == player for i in line)
            for line in self.WIN_CONDITIONS
        )

    def is_full(self):
        """Return True when no cell is empty."""
        return all(x != 0 for x in self.board)

    def render(self):
        """Print the board as three rows of three cells."""
        for i in range(0, 9, 3):
            print(self.board[i:i+3])


In [3]:
class QLearningAgent:
    """Tabular epsilon-greedy Q-learning agent for a nine-action board game.

    Attributes:
        env: the environment; must expose ``board`` and
            ``get_available_actions()``.
        alpha: learning rate used in the Q-value update.
        gamma: discount factor applied to the best next-state value.
        epsilon: probability of picking a uniformly random legal action.
        q_table: dict mapping a state tuple to a NumPy array of nine
            Q-values; rows are created lazily as zeros.
    """

    def __init__(self, environment, alpha=0.1, gamma=0.9, epsilon=0.1):
        self.env = environment
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.q_table = {}

    def _q_row(self, state):
        """Return the Q-value row for ``state``, creating a zero row on first use."""
        if state not in self.q_table:
            self.q_table[state] = np.zeros(9)
        return self.q_table[state]

    def get_state(self, board):
        """Return the environment's current board as a hashable tuple.

        NOTE: the ``board`` argument is ignored; the state is always read
        from ``self.env.board`` at call time. The parameter is kept so
        existing callers keep working.
        """
        return tuple(self.env.board)

    def choose_action(self, state):
        """Pick a legal action for ``state`` using an epsilon-greedy policy.

        With probability ``epsilon`` a uniformly random legal action is
        returned. Otherwise the legal action with the highest Q-value is
        returned, with ties broken uniformly at random.

        Raises:
            ValueError: if the environment reports no available actions.
        """
        available_actions = self.env.get_available_actions()
        if not available_actions:
            raise ValueError("No available actions to choose from")

        if np.random.rand() < self.epsilon:
            return int(np.random.choice(available_actions))

        row = self._q_row(state)
        # Only legal actions compete for the maximum; occupied cells are
        # never considered.
        best_value = max(row[a] for a in available_actions)
        best_actions = [a for a in available_actions if row[a] == best_value]
        return int(np.random.choice(best_actions))

    def update_q_table(self, state, action, reward, next_state):
        """Apply one Q-learning update for the transition.

        Q[state][action] += alpha * (reward + gamma * max(Q[next_state])
                                     - Q[state][action])

        Rows for ``state`` and ``next_state`` are created as zeros if they do
        not exist yet, so the update is safe for states first reached through
        an exploratory move.
        """
        best_next = np.max(self._q_row(next_state))
        row = self._q_row(state)
        row[action] += self.alpha * (
            reward + self.gamma * best_next - row[action]
        )

In [4]:
def train(agent, num_episodes=10000):
    """Train ``agent`` by playing ``num_episodes`` games against itself.

    Every move produces one Q-learning update. The state is snapshotted from
    the live board before the move and again after it, so ``state`` and
    ``next_state`` are distinct keys and value can propagate back from the
    end of the game to earlier positions.

    Args:
        agent: object exposing ``env``, ``q_table``, ``get_state``,
            ``choose_action`` and ``update_q_table``.
        num_episodes: number of complete games to play.

    The reward passed to the update is the environment's game result
    (1, -1 or 0) on the final move and 0 on every other move.

    NOTE(review): both players share one Q-table and the result is not
    converted to the mover's point of view, so player -1 is also rewarded
    for player 1 winning - confirm this is the intended self-play setup.
    """
    for episode in range(num_episodes):
        agent.env.reset()
        done = False
        while not done:
            # Snapshot the position *before* moving; the board is mutated
            # in place by make_move.
            state = agent.get_state(agent.env.board)
            action = agent.choose_action(state)
            outcome = agent.env.make_move(action)
            next_state = agent.get_state(agent.env.board)

            done = outcome is not None
            reward = outcome if done else 0

            # An exploratory move may have been chosen without creating a
            # row for this state; make sure it exists before updating.
            if state not in agent.q_table:
                agent.q_table[state] = np.zeros(len(agent.env.board))
            agent.update_q_table(state, action, reward, next_state)

        if (episode + 1) % 1000 == 0:
            print(f"Episode {episode + 1}/{num_episodes} completed.")

# Build a fresh game, wrap it in an agent with the default hyperparameters,
# and run the default 10000 training episodes.
env = TicTacToe()
agent = QLearningAgent(environment=env)
train(agent, num_episodes=10000)

Episode 1000/10000 completed.
Episode 2000/10000 completed.
Episode 3000/10000 completed.
Episode 4000/10000 completed.
Episode 5000/10000 completed.
Episode 6000/10000 completed.
Episode 7000/10000 completed.
Episode 8000/10000 completed.
Episode 9000/10000 completed.
Episode 10000/10000 completed.


In [5]:
import random
# Spot-check three randomly picked states and their learned Q-value rows.
for _ in range(3):
    known_states = list(agent.q_table)
    sampled_state = random.choice(known_states)
    print(sampled_state)
    print(agent.q_table[sampled_state])
    print()

(0, 0, 1, 0, -1, 1, 1, -1, -1)
[0. 0. 0. 0. 0. 0. 0. 0. 0.]

(1, -1, 0, -1, -1, 1, -1, 1, 1)
[0. 0. 0. 0. 0. 0. 0. 0. 0.]

(0, 1, 1, 0, -1, 0, -1, 1, 0)
[0. 0. 0. 0. 0. 0. 0. 0. 0.]



In [6]:
def test(agent, num_episodes=10): 
    for _ in range(num_episodes):
        state = agent.env.reset()
        done = False
        while not done:
            agent.env.render()
            action = agent.choose_action(agent.get_state(state))
            print(f"Player {1 if agent.env.current_player == 1 else -1} chooses position {action}")
            reward = agent.env.make_move(action)
            if reward is not None:
                agent.env.render()
                if reward == 1:
                    print("WIN!")
                elif reward == -1:
                    print("LOSE!")
                elif reward == 0:
                    print("It's a draw!")
                break
            state = state.copy()
        print()

# Watch the trained agent play ten games against itself.
test(agent, num_episodes=10)

[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
Player 1 chooses position 0
[1, 0, 0]
[0, 0, 0]
[0, 0, 0]
Player -1 chooses position 8
[1, 0, 0]
[0, 0, 0]
[0, 0, -1]
Player 1 chooses position 3
[1, 0, 0]
[1, 0, 0]
[0, 0, -1]
Player -1 chooses position 2
[1, 0, -1]
[1, 0, 0]
[0, 0, -1]
Player 1 chooses position 5
[1, 0, -1]
[1, 0, 1]
[0, 0, -1]
Player -1 chooses position 7
[1, 0, -1]
[1, 0, 1]
[0, -1, -1]
Player 1 chooses position 4
[1, 0, -1]
[1, 1, 1]
[0, -1, -1]
WIN!

[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
Player 1 chooses position 4
[0, 0, 0]
[0, 1, 0]
[0, 0, 0]
Player -1 chooses position 0
[-1, 0, 0]
[0, 1, 0]
[0, 0, 0]
Player 1 chooses position 6
[-1, 0, 0]
[0, 1, 0]
[1, 0, 0]
Player -1 chooses position 8
[-1, 0, 0]
[0, 1, 0]
[1, 0, -1]
Player 1 chooses position 2
[-1, 0, 1]
[0, 1, 0]
[1, 0, -1]
WIN!

[0, 0, 0]
[0, 0, 0]
[0, 0, 0]
Player 1 chooses position 7
[0, 0, 0]
[0, 0, 0]
[0, 1, 0]
Player -1 chooses position 2
[0, 0, -1]
[0, 0, 0]
[0, 1, 0]
Player 1 chooses position 1
[0, 1, -1]
[0, 0, 0]
[0, 1, 0]
Pl