Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: Sunday, December 17 ([CET](https://www.timeanddate.com/time/zones/cet))
* Reviews: Dies Natalis Solis Invicti ([CET](https://en.wikipedia.org/wiki/Sol_Invictus))

Notes:

* Reviews will be assigned  on Monday, December 4
* You need to commit in order to be selected as a reviewer (ie. better to commit an empty work than not to commit)

In [75]:
import numpy as np
import random
from collections import namedtuple, defaultdict
from tqdm.auto import tqdm
from copy import deepcopy

In [76]:
# Create the Tic Tac Toe environment
class TicTacToe:
    def __init__(self):
        self.board = [' ' for _ in range(9)]  # Representing the Tic Tac Toe board
        self.current_player = 'X'  # Player 'X' starts the game
        self.winning_combinations = [
            [0, 1, 2], [3, 4, 5], [6, 7, 8],  # Rows
            [0, 3, 6], [1, 4, 7], [2, 5, 8],  # Columns
            [0, 4, 8], [2, 4, 6]             # Diagonals
        ]
    
    def print_board(self):
        for i in range(0, 9, 3):
            print(f"{self.board[i]} | {self.board[i+1]} | {self.board[i+2]}")
        print('---------')
    
    def available_moves(self, board = None):
        if board is None:
            return [i for i, val in enumerate(self.board) if val == ' ']
        else:
            return [i for i, val in enumerate(board) if val == ' ']

    
    def make_move(self, position):
        self.board[position] = self.current_player
        self.current_player = 'O' if self.current_player == 'X' else 'X'
    
    def check_winner(self):
        for combo in self.winning_combinations:
            if (self.board[combo[0]] == self.board[combo[1]] == self.board[combo[2]]) and (self.board[combo[0]] != ' '):
                return self.board[combo[0]]
        return None
    
    def game_over(self):
        return self.check_winner() or ' ' not in self.board
    
    def reset(self):
        self.board = [' ' for _ in range(9)]
        self.current_player = 'X'

# Q-Learning agent to play Tic Tac Toe
class QLearningAgent:
    def __init__(self, epsilon, alpha=0.5, gamma=0.1):
        self.epsilon = epsilon  # Exploration rate
        self.alpha = alpha  # Learning rate
        self.gamma = gamma  # Discount factor
        self.q_table = defaultdict(float)  # Q-table to store state-action values
        self.env = None
    
    def get_q_value(self, state, action):
        return self.q_table.get((state, action), 0.0)
    
    def update_q_value(self, state, action, reward, next_state, player):
        if player == 'X':
            old_value = self.get_q_value(state, action)
            l = [self.get_q_value(next_state, a) for a in self.env.available_moves(next_state)]
            if len(l) > 0:
                best_next_action = max([self.get_q_value(next_state, a) for a in self.env.available_moves(next_state)])
            else:
                best_next_action = 0
            new_value = (1 - self.alpha) * old_value + self.alpha * (reward + self.gamma * best_next_action)
            self.q_table[(state, action)] = new_value
        else:
            old_value = self.get_q_value(state, action)
            l = [self.get_q_value(next_state, a) for a in self.env.available_moves(next_state)]
            if len(l) > 0:
                best_next_action = min([self.get_q_value(next_state, a) for a in self.env.available_moves(next_state)])
            else:
                best_next_action = 0
            new_value = (1 - self.alpha) * old_value + self.alpha * (reward + self.gamma * best_next_action)
            self.q_table[(state, action)] = new_value


    
    def choose_action(self, state, available_moves,play_as = None, playing = False):
        if not playing: #training
            if random.uniform(0, 1) < self.epsilon.get():
                return random.choice(available_moves)
            else:
                if play_as == 'X' :
                    return max(available_moves, key=lambda a: self.get_q_value(state, a))
                else:
                    return min(available_moves, key=lambda a: self.get_q_value(state, a))

        else:
            if play_as == 'X':
                return max(available_moves, key=lambda a: self.get_q_value(state, a))
            else:
                return min(available_moves, key=lambda a: self.get_q_value(state, a))

class RandomAgent:
    def __init__(self):
        pass

    def choose_action(self, state, available_moves, play_as = None, playing = False):
        return random.choice(available_moves)


class EpsilonScheduler():
    def __init__(self, low, high, num_round):
        self.low = low
        self.high = high
        self.num_round = num_round
        self.step = (high - low) / num_round

        self.counter = 0

    def get(self):
        return_val = self.high - self.counter * self.step / 9
        self.counter += 1
        return return_val


# Training the agent
def train_agent(episodes):
    epsilon = EpsilonScheduler(low = 0.1, high = 1, num_round = episodes)
    agent = QLearningAgent(epsilon= epsilon)
    env = TicTacToe()
    agent.env = env
    
    for episode in tqdm(range(episodes)):
        env.reset()
        state = tuple(env.board)
        
        while not env.game_over():
            available_moves = env.available_moves()
            player = env.current_player
            action = agent.choose_action(state, available_moves, play_as=player)
            env.make_move(action)
            next_state = tuple(env.board)
            
            if env.check_winner() == 'X':
                reward = 1


            elif env.check_winner() == 'O':
                reward = -1

            else:
                reward = simulate_reward(next_state, player)
            
            
            agent.update_q_value(state, action, reward, next_state, player = player)
            state = next_state
    
    return agent


# Playing against the trained agent
def play_vs_agent(agent):
    env = TicTacToe()
    state = tuple(env.board)
    
    while not env.game_over():
        if env.current_player == 'X':
            env.print_board()
            print("Your turn! Choose a position (0-8):")
            while True:
                try:
                    user_move = int(input())
                    if user_move in env.available_moves():
                        break
                    else:
                        print("Invalid move! Choose an available position.")
                except ValueError:
                    print("Invalid input! Enter a number.")
            env.make_move(user_move)
        else:
            available_moves = env.available_moves()
            action = agent.choose_action(state, available_moves, play_as = 'O', playing = True)
            env.make_move(action)
        
        state = tuple(env.board)
    
    env.print_board()
    winner = env.check_winner()
    if winner:
        print(f"{winner} wins!")
    else:
        print("It's a tie!")

def agent_vs_agent(agentX, agentO):
    env = TicTacToe()
    state = tuple(env.board)
    
    while not env.game_over():
        if env.current_player == 'X':
            available_moves = env.available_moves()
            action = agentX.choose_action(state, available_moves, play_as = 'X', playing = True)
            env.make_move(action)
        else:
            available_moves = env.available_moves()
            action = agentO.choose_action(state, available_moves, play_as = 'O', playing = True)
            env.make_move(action)
        
        state = tuple(env.board)
    
    return env.check_winner()

def simulate_reward(state, player):
    num_sim = 10
    env = TicTacToe()
    env.board = list(state)
    env.current_player = 'O' if player == 'X' else 'X'
    agentX = RandomAgent()
    agentO = RandomAgent()
    counter = 0

    for _ in range(num_sim):
        while not env.game_over():
            if env.current_player == 'X':
                available_moves = env.available_moves()
                action = agentX.choose_action(state, available_moves, play_as = 'X', playing = True)
                env.make_move(action)
            else:
                available_moves = env.available_moves()
                action = agentO.choose_action(state, available_moves, play_as = 'O', playing = True)
                env.make_move(action)
            
            state = tuple(env.board)
        
        if env.check_winner() == player:
            counter += 1
    
    if counter / num_sim < 0.4:
        return counter / num_sim if player == 'O' else - counter / num_sim
    else:
        return counter / num_sim if player == 'X' else - counter / num_sim
    

In [77]:
print('train agent')
trained_agent = train_agent(episodes=50_000)

train agent


  0%|          | 0/50000 [00:00<?, ?it/s]

In [78]:
#print('play vs agent')
#play_vs_agent(trained_agent)

In [79]:
agentX = RandomAgent()
agentO = trained_agent
num_rounds = 500_000
win = 0
for round in tqdm(range(num_rounds)):

    result = agent_vs_agent(agentX, agentO)

    if result == 'O':
        win += 1

print(f'winning rate: {win / num_rounds}')

  0%|          | 0/500000 [00:00<?, ?it/s]

winning rate: 0.772462


In [80]:
agentO= RandomAgent()
agentX = trained_agent
num_rounds = 500_000
win = 0
for round in tqdm(range(num_rounds)):

    result = agent_vs_agent(agentX, agentO)

    if result == 'X':
        win += 1

print(f'winning rate: {win / num_rounds}')

  0%|          | 0/500000 [00:00<?, ?it/s]

winning rate: 0.955484
