# **Reinforcement learning : Puissance 4**

Le but de ce script est d'implémenter un agent capable de jouer au jeu Puissance 4.

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# 1) Création de l'environnement

In [None]:
class Game:
    """
    State of a single Connect Four (Puissance 4) game.

    The board is a ``nb_rows x nb_columns`` NumPy array in which 0 marks an
    empty cell, 1 a token placed on a turn other than turn 1, and -1 a token
    placed on turn 1. Row 0 is the bottom of the board: a token dropped in a
    column lands in the lowest-index empty row.
    """

    # Number of aligned tokens needed to win.
    WIN_LENGTH = 4

    def __init__(self, nb_rows=6, nb_columns=7, nb_players=2):
        self.nb_rows = nb_rows
        self.nb_columns = nb_columns
        self.nb_players = nb_players
        self.grid = np.zeros((nb_rows, nb_columns))
        self.turn = 0  # Index of the player who moves next
        self.nb_steps = 0  # Total number of tokens played so far

    def is_draw(self):
        """
        Return True when every cell of the board holds a token.

        Only the number of moves is checked, not whether somebody won;
        callers are expected to test ``won`` first.
        """
        # Use the instance dimensions rather than module-level globals.
        return self.nb_steps == self.nb_rows * self.nb_columns

    def is_legal(self, column):
        """
        Return True if a token can be dropped in ``column``.

        A column outside ``[0, nb_columns)`` or a column without any empty
        cell is not legal.
        """
        if not 0 <= column < self.nb_columns:
            return False
        return bool(np.any(self.grid[:, column] == 0))

    def set_token(self, column):
        """
        Drop the current player's token in ``column`` and pass the turn.

        Raises:
            ValueError: if the column is out of range or already full.
        """
        # Explicit checks instead of ``assert``, which is stripped under -O.
        if not 0 <= column < self.nb_columns:
            raise ValueError("The move is not legal. Column {} is out of range".format(column))
        if not self.is_legal(column):
            raise ValueError("The move is not legal. Column {} is already filled".format(column))

        # Lowest empty row of the column.
        row_index = np.flatnonzero(self.grid[:, column] == 0)[0]
        player_token = -1 if self.turn == 1 else 1
        self.grid[row_index, column] = player_token

        self.turn = (self.turn + 1) % self.nb_players
        self.nb_steps += 1

    def won(self, player_id):
        """
        Return True if player ``player_id`` has ``WIN_LENGTH`` aligned tokens.

        Player 1 owns the -1 tokens; any other ``player_id`` is matched
        against the +1 tokens. Vertical, horizontal and both diagonal
        alignments are checked.
        """
        token_player = -1 if player_id == 1 else 1
        owned = self.grid == token_player
        reach = self.WIN_LENGTH - 1

        # (row step, column step): vertical, horizontal, diagonal, anti-diagonal.
        directions = ((1, 0), (0, 1), (1, 1), (1, -1))

        for row, column in zip(*np.nonzero(owned)):
            for d_row, d_col in directions:
                last_row = row + reach * d_row
                last_col = column + reach * d_col
                # Skip alignments that would leave the board (no wrap-around).
                if last_row >= self.nb_rows or not 0 <= last_col < self.nb_columns:
                    continue
                if all(owned[row + k * d_row, column + k * d_col]
                       for k in range(1, self.WIN_LENGTH)):
                    return True

        return False

    def get_observation(self):
        """
        Return the observation that the agent makes: a copy of the board.

        A copy is returned so that an observation taken before a move is not
        modified by later moves.
        """
        return self.grid.copy()

    def render(self):
        """Display the current state of the game as a matplotlib scatter plot."""
        palette = {
            1: np.array([210, 200, 0]) / 255,
            -1: np.array([255, 0, 0]) / 255,
        }
        X = []
        Y = []
        colors = []
        # Visit every cell of the board, including the last column.
        for (row, col), token in np.ndenumerate(self.grid):
            color = palette.get(token)
            if color is not None:
                X.append(col)
                Y.append(row)
                colors.append(color)

        plt.scatter(X, Y, color=colors, s=600)
        plt.xticks(list(range(-1, self.nb_columns+2)))
        plt.yticks(list(range(-1, self.nb_rows+2)))
        plt.grid()
        plt.show()

In [None]:
# Quick visual check: play a short sequence of moves (players alternate
# automatically) and display the resulting board.
game = Game()

for _demo_column in (0, 1, 0, 0, 2, 3):
    game.set_token(_demo_column)

game.render()

In [None]:
class Env:
    """
    Class that represents the environment in which the agent is.
    The agent is going to do multiple games in the environment.

    ``actions`` maps an action index to the column played; when it is not
    given (or empty) it defaults to one action per column,
    ``np.arange(nb_columns)``.
    """
    def __init__(self, actions=None, nb_rows=6, nb_columns=7, nb_players=2):
        self.nb_rows = nb_rows
        self.nb_columns = nb_columns
        self.nb_players = nb_players
        self.game = Game(nb_rows=nb_rows, nb_columns=nb_columns, nb_players=nb_players)
        # winners[i] becomes True once player i has won the current game.
        self.winners = np.array([False, False])
        # No shared mutable default: fall back to one action per column.
        if actions is None or len(actions) == 0:
            actions = np.arange(nb_columns)
        self.actions = actions

    def step(self, action):
        """
        Play one move and report the transition.

        Args:
            action: index into ``self.actions``; the selected entry is the
                column where the token is put.

        Returns:
            ``(observation, reward, next_observation, game_finished)`` where
            ``observation`` is the board before the move and
            ``next_observation`` the board after it. The reward is 1 if
            player 0 has won, -10 if player 1 has won, -1 on a draw and 0
            otherwise.
        """
        # Snapshot the board: the game mutates its grid in place, so without
        # a copy both observations would show the board after the move.
        observation = np.copy(self.game.get_observation())
        # Use the environment's own action set, not a module-level global.
        self.game.set_token(self.actions[action])
        next_observation = np.copy(self.game.get_observation())
        reward = 0
        game_finished = False

        if self.game.won(0):
            self.winners[0] = True
            reward = 1
            game_finished = True

        elif self.game.won(1):
            self.winners[1] = True
            reward = -10
            game_finished = True

        elif self.game.is_draw():
            reward = -1
            game_finished = True

        return observation, reward, next_observation, game_finished

    def reset(self):
        """Start a fresh game with the same dimensions and clear the winners."""
        self.game = Game(nb_rows=self.nb_rows, nb_columns=self.nb_columns, nb_players=self.nb_players)
        self.winners = np.array([False, False])

In [None]:
# Board dimensions.
nb_rows = 6
nb_columns = 7

# One action per column: action i plays column actions[i].
actions = np.arange(nb_columns)
nb_actions = len(actions)

# Pass the action set explicitly so that env.actions is not left empty.
env = Env(actions=actions, nb_rows=nb_rows, nb_columns=nb_columns, nb_players=2)

# 2) On définit l'estimateur des valeurs d'actions

In [None]:
import tensorflow as tf
from tensorflow import keras
from keras import layers

In [None]:
class MyModel(keras.Model):
    """
    Convolutional estimator producing ``nb_actions`` outputs per input board.

    Pipeline applied by ``call``: add a trailing channel axis, then
    Conv2D(8, 3x3, relu) -> MaxPooling2D(2x2) -> Conv2D(8, 2x2, relu)
    -> Flatten -> Dense(32, relu) -> Dense(nb_actions).
    """

    def __init__(self, nb_rows=6, nb_columns=7, nb_actions=7):
        super().__init__()
        board_shape = (nb_rows, nb_columns, 1)

        # Convolutional feature extractors.
        self.conv1 = layers.Conv2D(
            filters=8,
            kernel_size=(3, 3),
            activation='relu',
            input_shape=board_shape,
        )
        self.conv2 = layers.Conv2D(filters=8, kernel_size=(2, 2), activation='relu')

        # Parameter-free reshaping layers.
        self.flatten = layers.Flatten()
        self.pooling = layers.MaxPooling2D(pool_size=(2, 2))

        # Fully connected head; the last layer has no activation.
        self.dense1 = layers.Dense(units=32, activation='relu')
        self.dense2 = layers.Dense(units=nb_actions)

    def call(self, x):
        """
        Run the forward pass and return the output of the last dense layer.

        NOTE(review): a channel axis is appended here and Conv2D takes 4-D
        input, so ``x`` is presumably shaped (batch, nb_rows, nb_columns) --
        confirm against the callers.
        """
        features = tf.expand_dims(x, axis=-1)
        pipeline = (
            self.conv1,
            self.pooling,
            self.conv2,
            self.flatten,
            self.dense1,
            self.dense2,
        )
        for layer in pipeline:
            features = layer(features)
        return features


model = MyModel()