---
# Programmation dynamique


Fabrice Mulotti<br>

v2 2023

---

In [1]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
import time

---
## Frozen Lake

Découvrons notre environnement <br>
<br>
![ForzenLake](images/frozen_lake.gif)

https://gymnasium.farama.org/environments/toy_text/frozen_lake/

In [2]:
# déclaration de l'environnement
env = env = gym.make('FrozenLake8x8-v1',is_slippery = False,map_name="4x4", render_mode="ansi") # ,render_mode="human")

In [3]:
# affichage
env.reset()
print(env.render())


[41mS[0mFFF
FHFH
FFFH
HFFG



In [4]:
# nombre d'états
env.observation_space.n

16

In [5]:
# nombre d'actions possibles
env.action_space.n

4

In [6]:
LEFT=0
DOWN=1
RIGHT=2
UP=3

In [7]:
# tirage aléatoire de fonction
env.action_space.sample()

2

In [8]:
print(env.reset())

(0, {'prob': 1})


---
## action

https://gymnasium.farama.org/api/env/#gymnasium.Env.step
<br>
env.step retourne les infos suivantes :<br>
- observation (s')<br>
- reward (r)<br>
- termination (bool)<br>
- truncated (bool)<br>
- info <br>

In [9]:
r=env.step(0)

In [10]:
print(r)

(0, 0.0, False, False, {'prob': 1.0})


In [11]:
print(f"Récompense {r[1]}")

Récompense 0.0


## Matrice de transition

__env.P[etat][action] retourne :__<br>
Probabilité<br>
s'<br>
r<br>
état final ? <br>


In [12]:
# Matrice de transition, exemple s=4
env.unwrapped.P[4]

{0: [(1.0, 4, 0.0, False)],
 1: [(1.0, 8, 0.0, False)],
 2: [(1.0, 5, 0.0, True)],
 3: [(1.0, 0, 0.0, False)]}

Si le sol n'est pas glissant : <br>
1 action => 1 état suivant <br>

Si le sol est glissant : <br>
3 destinations possibles (33% de prob), dont une en terminaison <br>

In [13]:
# récompense
env.P[4][RIGHT][0][2]

  logger.warn(


0.0

In [14]:
# prochain état 
env.P[4][RIGHT][0][1]

5

---
## Test complet

In [15]:
# S: initial state
# F: frozen lake
# H: hole
# G: the goal

env.reset()
fin=False
print(env.render())
c=0
while not fin:
    action=env.action_space.sample()
    r=env.step(action)
    print(f"Action={action}, {r}")
    fin = r[2] or r[3]
    time.sleep(0.5)
    print(env.render())
    c+=1
    if c==10:
        fin=True


[41mS[0mFFF
FHFH
FFFH
HFFG

Action=1, (4, 0.0, False, False, {'prob': 1.0})
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG

Action=3, (0, 0.0, False, False, {'prob': 1.0})
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG

Action=3, (0, 0.0, False, False, {'prob': 1.0})
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG

Action=1, (4, 0.0, False, False, {'prob': 1.0})
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG

Action=2, (5, 0.0, True, False, {'prob': 1.0})
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG



---
# Itération sur politique


![Politique](images/politique.png)

In [16]:
theta = 0.005 # Notre limite de convergence
gamma = 0.8 # dépréciation du futur

V = np.zeros((env.observation_space.n)) # initialisation fonction de valeur
Policy = np.zeros((env.observation_space.n)) # initialisation d'une politique

loopCounter=0
while True:
    # Policy evaluation -----------------------------------
    while True:
        delta = 0
        loopCounter+=1
        for s in range(env.observation_space.n):
            v = V[s]
            action = Policy[s]
            q=0
            for destination in env.P[s][action]:
                probabilite=destination[0]
                s_prime=destination[1]
                recompense=destination[2]
                q+=probabilite*(recompense+gamma*V[s_prime])
            V[s]=q
            delta = max(delta,np.abs(v-V[s]))
        print(delta)
        if delta < theta:
            break;

    # Policy improvement --------------------------------
    policy_stable=True
    for s in range(env.observation_space.n):
        old_action=Policy[s]
        Q=[]
        for a in range(env.action_space.n):
            q=0
            for destination in env.P[s][a]:
                probabilite=destination[0]
                s_prime=destination[1]
                recompense=destination[2]
                q+=probabilite*(recompense+gamma*V[s_prime])
            Q.append(q)
        new_action=np.argmax(Q)
        if new_action!=old_action:
            policy_stable=False
            Policy[s]=new_action
    if policy_stable==True:
        break

0
1.0
0
0.8
0
0.6400000000000001
0
0.5120000000000001
0
0.40960000000000013
0
0.32768000000000014
0


In [17]:
loopCounter

13

In [18]:
Policy.reshape(4,4)

array([[1., 2., 1., 0.],
       [1., 0., 1., 0.],
       [2., 1., 1., 0.],
       [0., 2., 2., 0.]])

![ForzenLake](images/frozen_lake.gif)

In [38]:
V.reshape(4,4)

array([[0.00997014, 0.01244529, 0.02606749, 0.01445414],
       [0.02067014, 0.        , 0.05871582, 0.        ],
       [0.05304812, 0.13062298, 0.19511969, 0.        ],
       [0.        , 0.24417446, 0.54307587, 0.        ]])

# Conclusion
slippery = False , deterministe<br>
slippery = True , stocastique, choix des actions évitant le risque<br>

---
# Itération sur valeurs

![Valeur](images/iteration_valeur.png)

In [19]:
theta = 0.005 # Notre limite de convergence
gamma = 0.8 # dépréciation du futur

V = np.zeros((env.observation_space.n)) # initialisation fonction de valeur
Policy = np.zeros((env.observation_space.n)) # initialisation d'une politique

# Update value function -----------------------------------
while True:
        delta = 0
        loopCounter+=1
        for s in range(env.observation_space.n):
            v = V[s]
            qmax=0
            for action in range(env.action_space.n): # avant  Policy[s]
                q=0
                for destination in env.P[s][action]:
                    probabilite=destination[0]
                    s_prime=destination[1]
                    recompense=destination[2]
                    q+=probabilite*(recompense+gamma*V[s_prime])
                qmax=max(qmax,q)
                
            V[s]=qmax
            delta = max(delta,np.abs(v-V[s]))
        print(delta)
        if delta < theta:
            break;

            
# Policy  --------------------------------
for s in range(env.observation_space.n):
    Q=[]
    for a in range(env.action_space.n):
        q=0
        for destination in env.P[s][a]:
            probabilite=destination[0]
            s_prime=destination[1]
            recompense=destination[2]
            q+=probabilite*(recompense+gamma*V[s_prime])
        Q.append(q)
    Policy[s]=np.argmax(Q)


1.0
0.8
0.6400000000000001
0.5120000000000001
0.40960000000000013
0.32768000000000014
0


In [20]:
Policy.reshape(4,4)

array([[1., 2., 1., 0.],
       [1., 0., 1., 0.],
       [2., 1., 1., 0.],
       [0., 2., 2., 0.]])

![ForzenLake](images/frozen_lake.gif)

In [42]:
V.reshape(4,4)

array([[0.00616192, 0.0087758 , 0.02240322, 0.01076427],
       [0.0167648 , 0.        , 0.05667309, 0.        ],
       [0.04880346, 0.12684139, 0.19275183, 0.        ],
       [0.        , 0.24076724, 0.54135257, 0.        ]])