# Apprentissage par renforcement
## StickGame Q-Learning
This notebook show how use QLearning in a little environment like the stickGame.  
**Rules:**  
each player takes their turn, and draws between 1 and 3 sticks. the player pulling the last stick loses

## imports

In [1]:
from random import randint
from typing import Tuple
import random
import numpy as np

In [2]:
class StickGame(object):
    """
    A class that represents the Stick Game.

    Parameters:
    nb (int): Number of sticks to play with.

    Attributes:
    original_nb (int): The original number of sticks.
    nb (int): The current number of sticks in the game.
    """
    def __init__(self, nb: int):
        """
        Initializes a new StickGame object.

        Args:
        nb (int): Number of sticks to play with.
        """
        super(StickGame, self).__init__()
        self.original_nb = nb
        self.nb = nb

    def is_finished(self) -> bool:
        """
        Checks if the game is over.

        Returns:
        bool: True if the game is over, False otherwise.
        """
        if self.nb <= 0:
            return True
        return False

    def reset(self) -> int:
        """
        Resets the state of the game.

        Returns:
        int: The original number of sticks.
        """
        self.nb = self.original_nb
        return self.nb

    def display(self) -> None:
        """
        Displays the state of the game.
        """
        print(f"{self.nb} : {' '.join(['|'] * self.nb + ['_'] * (self.original_nb - self.nb))}")

    def step(self, action) -> Tuple[int, int]:
        """
        Takes an action in the environment.

        Args:
        action (int): The action to take. It can be 1, 2 or 3.

        Returns:
        tuple: A tuple consisting of the current state and the reward obtained
               by taking the action in the environment.
        """
        self.nb -= action
        if self.nb <= 0:
            return None, -1
        else:
            return self.nb, 0

In [3]:
class StickPlayer(object):
    """
    StickPlayer class representing a player in the Stick Game.
    """


    def __init__(self, is_human, size, trainable=True, name=''):
        """
        Initializes a StickPlayer object.

        Args:
        is_human (bool): True if the player is human, False otherwise.
        size (int): The number of sticks in the game.
        trainable (bool, optional): True if the player can be trained, False otherwise. Defaults to True.
        name (str, optional): The name of the player. Defaults to ''.
        """
        super(StickPlayer, self).__init__()
        self.is_human = is_human
        self.history = []
        self.V = {}
        for s in range(1, size+1):
            self.V[s] = 0.
        self.win_nb = 0.
        self.lose_nb = 0.
        self.rewards = []
        self.eps = 0.99
        self.trainable = trainable
        self.name = name

    def reset_stat(self):
        """
        Reset the statistics of the player.
        """
        self.win_nb = 0
        self.lose_nb = 0
        self.rewards = []

    def greedy_step(self, state):
        """
        Takes a greedy step based on the current state.

        Args:
        state (int): The current state.

        Returns:
        int: The action to take.
        """
        actions = [1, 2, 3]
        vmin = None
        vi = None
        for i in range(0, 3):
            a = actions[i]
            if state - a > 0 and (vmin is None or vmin > self.V[state - a]):
                vmin = self.V[state - a]
                vi = i
        return actions[vi if vi is not None else 1]

    def play(self, state):
        """
        Plays a move based on the current state.

        Args:
        state (int): The current state.

        Returns:
        int: The action to take.
        """
        if self.is_human is False:
            # Take random action
            if random.uniform(0, 1) < self.eps:
                action = randint(1, 3)
            else: # Or greedy action
                action = self.greedy_step(state)
        else:
            action = int(input(f"Player {self.name} $>"))
        return action

    def add_transition(self, n_tuple):
        """
        Adds a transition to the history of the player.

        Args:
        n_tuple (tuple): A tuple representing the transition. (s, a , r, s')
        """
        self.history.append(n_tuple)
        s, a, r, sp = n_tuple
        self.rewards.append(r)

    def train(self):
        """
        Trains the player by updating the value function.
        """
        if not self.trainable or self.is_human is True:
            return

        # Update the value function if this player is not human
        for transition in reversed(self.history):
            s, a, r, sp = transition
            if r == 0:
                self.V[s] = self.V[s] + 0.001*(self.V[sp] - self.V[s])
            else:
                self.V[s] = self.V[s] + 0.001*(r - self.V[s])

        self.history = []

In [4]:
def play(game, p1, p2, train=True):
    """
    Play a game between two players, with the option to train them.

    Args:
        game: a game instance that implements the methods `reset`, `is_finished`, `display`, `step`.
        p1, p2: two player instances that implement the method `play`.
        train: a boolean indicating whether to train the players or not. Defaults to True.

    Returns:
        None
    """
    state = game.reset() # Get the initial state of the game
    players = [p1, p2]
    random.shuffle(players) # Randomly select which player goes first
    p = 0 # Counter for turns taken
    
    # Loop until the game is finished
    while not game.is_finished():

        # If the current player is human, display the game board
        if players[p%2].is_human:
            game.display()

        # Get the action selected by the current player and the resulting state and reward
        action = players[p%2].play(state)
        n_state, reward = game.step(action)
        
        # Print out the action taken by the current player if either player is human
        if (players[p%2].is_human or players[(p+1)%2].is_human):
            print(f"\t - Player {players[p%2].name} take {action}")

        # Check if the game is over and update the players' win/loss statistics
        if (reward != 0):
            # Update stat of the current player
            players[p%2].lose_nb += 1. if reward == -1 else 0
            players[p%2].win_nb += 1. if reward == 1 else 0
            
            # Update stat of the other player
            players[(p+1)%2].lose_nb += 1. if reward == 1 else 0
            players[(p+1)%2].win_nb += 1. if reward == -1 else 0

            # Print out the winner if either player is human
            if (players[p%2].is_human or players[(p+1)%2].is_human):
                print(f"Player {players[p%2].name if reward == 1 else players[(p+1)%2].name} win! \n\n--------------------------\n")

        # Add the reversed reward and the new state to the other player
        if p != 0:
            s, a, r, sp = players[(p+1)%2].history[-1]
            players[(p+1)%2].history[-1] = (s, a, reward * -1, n_state)
        
        # Add the current state, action, and reward to the current player's transition history
        players[p%2].add_transition((state, action, reward, None))

        state = n_state # Update the current state for the next turn
        p += 1 # Increment the turn counter
    
    # Train the players if the train parameter is True
    if train:
        p1.train()
        p2.train()

In [5]:
NB_GAMES = 2

In [6]:
game = StickGame(12)

# PLayers to train
p1 = StickPlayer(is_human=False, size=12, trainable=True, name="p1")
p2 = StickPlayer(is_human=False, size=12, trainable=True, name="p2")
# Human player and random player
human = StickPlayer(is_human=True, size=12, trainable=False, name="human")
random_player = StickPlayer(is_human=False, size=12, trainable=False, name="random")

# Train the agent
for i in range(0, 10000):
    if i % 10 == 0:
        p1.eps = max(p1.eps*0.996, 0.05)
        p2.eps = max(p2.eps*0.996, 0.05)
    play(game, p1, p2)
p1.reset_stat()

# Display the value function
for key in p1.V:
    print(key, p1.V[key])
print("--------------------------")

# Play agains a random player
for _ in range(0, 1000):
    play(game, p1, random_player, train=False)
print("p1 win rate", p1.win_nb/(p1.win_nb + p1.lose_nb))
print("p1 win mean", np.mean(p1.rewards))
print("--------------------------")
# Play agains us
nb_games = NB_GAMES
while nb_games > 0:
    play(game, p1, human, train=False)
    nb_games -= 1

1 -0.9875892837196181
2 0.8855045321114904
3 0.25405435752743033
4 0.36849819774566034
5 -0.8575859356196484
6 0.6780372813101246
7 0.07550638220297146
8 0.0789650462041864
9 -0.6657310523621193
10 0.033496591371199744
11 0.023265038265168254
12 0.45105241964333675
--------------------------
p1 win rate 0.945
p1 win mean -0.0187138482477033
--------------------------
	 - Player p1 take 3
9 : | | | | | | | | | _ _ _


Player human $> 2


	 - Player human take 2
	 - Player p1 take 2
5 : | | | | | _ _ _ _ _ _ _


Player human $> 1


	 - Player human take 1
	 - Player p1 take 3
1 : | _ _ _ _ _ _ _ _ _ _ _


Player human $> 1


	 - Player human take 1
Player p1 win! 

--------------------------

12 : | | | | | | | | | | | |


Player human $> 3


	 - Player human take 3
	 - Player p1 take 1
8 : | | | | | | | | _ _ _ _


Player human $> 3


	 - Player human take 3
	 - Player p1 take 2
3 : | | | _ _ _ _ _ _ _ _ _


Player human $> 2


	 - Player human take 2
	 - Player p1 take 2
Player human win! 

--------------------------



## Explanation
La value function calcule la reward possible en fonction de l'état dans lequel on va être,
ici on cherche donc à atteindre la plus petite valeur après chaque action afin de mettre en difficulté l'adversaire.

Pour un jeux avec une taille de 12 baton, 
on observe que les états 9, 5, 1 sont des états perdants,
ainsi l'algorithme va chercher à effecter une action pour atteindre ces états.