In [None]:
# Installation des dépendances
%pip install numpy

import numpy as np


In [209]:

"""
Direction:
    - 1: haut
    - 2: droite
    - 3: bas
    - 4: gauche
"""

class Mdp():

    def __init__(self, gamma=0.9, epsilon=0.03, width=3, height=3, start=(0,0), goal=(2,2), swamps=[]):
        self.gamma = gamma
        self.epsilon = epsilon
        self.width = width
        self.height = height
        self.goal = goal
        self.swamps = swamps
        #self.Pmatrix = matrice_P(self, pos, aim, width, height)

    def generate_grid(self, rewards=[-1, 0, 5]):
        """
        Génère une grille avec les valeurs de rewards
        """
        self.grid = np.zeros((self.height, self.width))
        self.values = np.zeros((self.height, self.width, 2))

        # print(self.grid)

        for i in range(self.height):
            for j in range(self.width):
                self.grid[i, j] = rewards[1]

        # place the swamps (-1)
        for swamp in self.swamps:
            self.grid[swamp] = rewards[0]
        
        # place the goal (5)
        self.grid[self.goal] = rewards[2]

    def print_grid(self):
        """
        Affiche la grille
        """
        for i in range(self.height):
            for j in range(self.width):
                print(self.grid[i, j], end="\t")
            print()
        
    def possible_actions(self, pos):
        """
        Retourne les actions possibles pour une position donnée
        """
        x, y = pos
        actions = []
        if x > 0:
            actions.append(1)
        if y < self.width - 1:
            actions.append(2)
        if x < self.height - 1:
            actions.append(3)
        if y > 0:
            actions.append(4)
        return actions
        
    def rewards(self, pos):
        return self.grid[pos]

    def P_Matrix(self, pos, aim):
        """
        Retourne la matrice P 
        Si on est sur un bord, 90% de chance d'attérir à côté de la case
        Si on est au milieu, 80% de chance d'attérir à côté de la case (10% gauche 10% droite)
        aim : 
        1 vers le haut
        2 vers la droite
        3 vers le bas
        4 vers la gauche
        """
        # create a 3 x 3 matrix of zeros
        P = np.zeros((3, 3))
        width = self.width
        height = self.height
        
        x, y = pos

        # on the left going up or down
        if (y == 0 and (aim == 1 or aim == 3)):
            if aim == 1:
                P[0, 1] = 0.9
                P[0, 2] = 0.1
            else:
                P[2, 1] = 0.9
                P[2, 2] = 0.1
        # on the right going up or down
        elif (y == width - 1 and (aim == 1 or aim == 3)):
            if aim == 1:
                P[0, 2] = 0.9
                P[0, 1] = 0.1
            else:
                P[2, 2] = 0.9
                P[2, 1] = 0.1
        # on the top going left or right
        elif (x == 0 and (aim == 2 or aim == 4)):
            if aim == 2:
                P[1, 2] = 0.9
                P[2, 2] = 0.1
            else:
                P[1, 0] = 0.9
                P[2, 0] = 0.1
        # on the bottom going left or right
        elif (x == height - 1 and (aim == 2 or aim == 4)):
            if aim == 2:
                P[2, 2] = 0.9
                P[1, 2] = 0.1
            else:
                P[2, 0] = 0.9
                P[1, 0] = 0.1

        # We asume we are not on a border
        # Going up
        elif aim == 1:
            P[0, 1] = 0.8
            P[0, 0] = 0.1
            P[0, 2] = 0.1
        
        # Going right
        elif aim == 2:
            P[1, 2] = 0.8
            P[0, 2] = 0.1
            P[2, 2] = 0.1
        
        # Going down
        elif aim == 3:
            P[2, 1] = 0.8
            P[2, 2] = 0.1
            P[2, 0] = 0.1
        
        # Going left
        elif aim == 4:
            P[1, 0] = 0.8
            P[0, 0] = 0.1
            P[2, 0] = 0.1

        return P
    
    def value_calculation(self, pos, aim):
        P = self.P_Matrix(pos, aim)
        val = 0
        for i in range(3):
            for j in range(3):
                next_pos = (pos[0]+i-1, pos[1]+j-1)
                if (next_pos[0] < 0 or next_pos[0] > self.height - 1 or next_pos[1] < 0 or next_pos[1] > self.width - 1):
                    continue
                val += P[i, j] * self.values[next_pos][0]
        return val
    
    def step_up(self):
        """
        Effectue une étape de l'algorithme de value iteration
        """
        delta = 0
        for i in range(self.height):
            for j in range(self.width):
                pos = (i, j)
                old_value = self.values[pos][0]
                possible_actions = self.possible_actions(pos)
                values = {action: 0 for action in possible_actions}
                for action in possible_actions:
                    values[action] = self.value_calculation(pos, action)
                new_value = self.rewards(pos) + self.gamma * max(values.values())
                new_value_action = max(values, key=values.get)
                self.values[pos] = [new_value, new_value_action]
                delta = max(delta, abs(old_value - self.values[pos][0]))
        return delta

    def algorithm(self):
        t = 1
        delta = self.step_up()
        deltas = [delta]
        while delta > self.epsilon:
            delta = self.step_up()
            deltas.append(delta)
            t += 1
                
        return deltas



In [230]:
test_pos = (10, 10)
test_aim = 1

area_size = (5, 5)
swamps = [(1, 1),(1, 2), (1, 3), (2, 3), (3, 3)]
goal = (0, 3)
start = (4, 1)
rewards = [-1, 0, 5]

gamma = 0.4
epsilon = 0.01

mdp = Mdp(gamma=gamma, epsilon=epsilon, width=area_size[0], height=area_size[1], goal=goal, start=start, swamps=swamps)
mdp.generate_grid(rewards=rewards)
#mdp.print_grid()
#print("---")

#print(mdp.possible_actions((1, 0)))
#print(mdp.P_Matrix((1, 0), 4))
#print("---")


#actions = mdp.possible_actions(test_pos)
#print(actions)

dc = {1: "↑", 2: "→", 3: "↓", 4: "←"}

deltas = mdp.algorithm()
for i in range(mdp.height):
    for j in range(mdp.width):
        if (i, j) == start: print("S", end="")
        if (i, j) == goal: print("G", end="")
        print(dc[mdp.values[i, j, 1]], end="\t")
    print()

→	→	→	G→	←	
↑	↑	↑	↑	←	
↑	←	↑	↑	↑	
↑	↑	←	→	↑	
↑	S↑	←	←	↑	


In [None]:
array = np.zeros((1,2))

print(array)
