### Install Package  😈
---

In [42]:
import numpy as np
import gym
import random

### Créer la env 🛋
---
- Here we'll create the FrozenLake environment.
- OpenAI Gym is a library composed of many environments that we can use to train our agents.
- In our case we choose to use Frozen Lake.

In [43]:
env = gym.make("FrozenLake-v0")

### Créer la Q-Table 📜
---
- Now, we'll create our Q-table, to know how much rows (states) and columns (actions) we need, we need to calculate the action_size and the state_size
- OpenAI Gym provides us a way to do that: env.action_space.n and env.observation_space.n


In [44]:
# C'est cols
action_size = env.action_space.n

# C'est rows
state_size = env.observation_space.n

# Créer la Q-table:
qtable = np.zeros((state_size, action_size))
print(qtable)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


### Créer la parameters:
---


In [45]:
total_episodes = 20000        # Total episodes
learning_rate = 0.8           # Learning rate
max_steps = 99                # Max steps per episode
gamma = 0.95                  # Discounting rate

# Exploration parameters
epsilon = 1.0                 # Exploration rate
max_epsilon = 1.0             # Exploration probability at start
min_epsilon = 0.01            # Minimum exploration probability 
decay_rate = 0.001             # Exponential decay rate for exploration prob

### Créer la code de simulation:
---

<img src="image/01_g_learning_algorithm.png"> 
<br>

In [52]:
# List of rewards
rewards = []

# 2 Loop tout episodes:
for episode in range(total_episodes):
    # Reset the environment
    state = env.reset()
    step = 0
    game_over = False
    total_rewards = 0
    
    for step in range(max_steps):
        # On faire le random-number
        exploration_exploitation_flag = random.uniform(0,1)
        
        # Si la flag >  epsilon, on faire la exploitation:
        # Prendre la gros value pour cette state.
        if exploration_exploitation_flag > epsilon:
            action = np.argmax(qtable[state,:])
        # Si la flag <  epsilon, on faire la exploration:
        # Prendre la random-action
        else:
            action = env.action_space.sample()
            
        # Prendre la action, obtenir la prochain state (s), obetenir la reward (r)
        new_state , reward, done, info = env.step(action)
        
        # Update Q(s,a) = Q(s,a) + lr [ R(s,a) + gamma * max Q(s',a') - Q(s,a) ]
        qtable[state,action] = qtable[state, action] + learning_rate *(reward + gamma * np.max(qtable[new_state,:])-qtable[state,action])
        
        # [total_reward]: Mise à jour  
        total_rewards += reward
        state = new_state
        
        # Si game_over, on arrete:
        if game_over == True:
            break
    
    # réduire epsilon. (on a besoin de moin de epsilon, apres beaucoup de epsiodes)
    epsilon = min_epsilon + (max_epsilon - min_epsilon)* np.exp(-decay_rate*episode)
    rewards.append(total_rewards)

print("Score average over time: " + str(sum(rewards)/total_episodes))
print(qtable)
    
    

Score average over time: 0.39213333333333333
[[3.01761472e-01 9.33860053e-02 6.67167880e-02 9.64229354e-02]
 [1.72670875e-03 2.84671311e-03 3.19467470e-03 9.77669380e-02]
 [9.51787724e-03 1.02797602e-01 4.59375686e-03 2.92442004e-02]
 [8.05869129e-03 2.38979502e-03 5.11088320e-04 2.87209909e-02]
 [3.57377444e-01 4.03810701e-03 1.65507229e-03 1.23089050e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [3.49607125e-07 1.27072759e-06 1.43632908e-03 4.29753813e-05]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [5.17790528e-02 6.33951267e-02 7.23005310e-02 4.97126631e-01]
 [2.56116906e-02 6.37137949e-01 1.62250338e-02 3.37630919e-02]
 [1.44058513e-01 8.80709516e-03 8.67646795e-03 7.06437298e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [3.27995039e-02 7.20602990e-02 2.89110683e-01 4.07920935e-01]
 [3.61878687e-01 9.38362214e-01 2.25678009e-01 1.67927844e-01]
 [0.000000

In [56]:
# Afficher:
# left: 0, down: 1, right: 2, up: 3
env.reset()
env.render()
possible_action_in_each_postion = np.argmax(qtable,axis=1).reshape(4,4)
possible_action_in_each_postion


[41mS[0mFFF
FHFH
FFFH
HFFG


array([[0, 3, 1, 3],
       [0, 0, 2, 0],
       [3, 1, 0, 0],
       [0, 3, 1, 0]])

In [57]:
env.reset()
max_steps = 99
for episode in range(5):
    state     = env.reset()
    step      = 0
    game_over = False
    msg  = "---------------------------------------------------\n"
    msg += "Dans la episonde [%d]\n"%episode
    print(msg)
    
    for step in range(max_steps):
        
        action = np.argmax(qtable[state,:])
        
        new_state, reward, game_over, info = env.step(action)
        
        
        if game_over:
            env.render()   
            print("Number of steps ",step)
            print(info)
            break
        
        state = new_state
    
    
print("C'est fini....")
env.close()

---------------------------------------------------
Dans la episonde [0]

  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps  26
{'prob': 0.3333333333333333}
---------------------------------------------------
Dans la episonde [1]

---------------------------------------------------
Dans la episonde [2]

  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps  22
{'prob': 0.3333333333333333}
---------------------------------------------------
Dans la episonde [3]

  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps  25
{'prob': 0.3333333333333333}
---------------------------------------------------
Dans la episonde [4]

  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps  72
{'prob': 0.3333333333333333}
C'est fini....


### JA_Test
---

In [17]:
rows_size = int(10) # states
cols_size = int(4)  # actions

check_my_np = np.zeros((rows_size,cols_size))
check_my_np

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [24]:
check_my_np[9][3]

0.0

#### np.argmax Chercher la max action dans la Q-Table:
---

In [8]:
state_size  = 10
action_size = 4    # (haut, bas, gauche, droite)
qtable     = np.zeros((state_size,action_size))
qtable

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [9]:
# update state[1]
for col in range(4):
    qtable[1,col] = col
qtable

array([[0., 0., 0., 0.],
       [0., 1., 2., 3.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [17]:
qtable[9,:] = 55
qtable[2][0]=3.1
qtable[2][1]=2.7
qtable[2][2]=3.3
qtable[2][3]=0.9
qtable[3][:] = 2.1
qtable

array([[ 0. ,  0. ,  0. ,  0. ],
       [ 0. ,  1. ,  2. ,  3. ],
       [ 3.1,  2.7,  3.3,  0.9],
       [ 2.1,  2.1,  2.1,  2.1],
       [ 0. ,  0. ,  0. ,  0. ],
       [ 0. ,  0. ,  0. ,  0. ],
       [ 0. ,  0. ,  0. ,  0. ],
       [ 0. ,  0. ,  0. ,  0. ],
       [ 0. ,  0. ,  0. ,  0. ],
       [55. , 55. , 55. , 55. ]])

In [18]:
state = 1
get_max_col = np.argmax(qtable[1,:])
get_max_col

3

In [19]:
state = 2
get_max_col = np.argmax(qtable[state,:])
get_max_col

2

In [20]:
qtable

array([[ 0. ,  0. ,  0. ,  0. ],
       [ 0. ,  1. ,  2. ,  3. ],
       [ 3.1,  2.7,  3.3,  0.9],
       [ 2.1,  2.1,  2.1,  2.1],
       [ 0. ,  0. ,  0. ,  0. ],
       [ 0. ,  0. ,  0. ,  0. ],
       [ 0. ,  0. ,  0. ,  0. ],
       [ 0. ,  0. ,  0. ,  0. ],
       [ 0. ,  0. ,  0. ,  0. ],
       [55. , 55. , 55. , 55. ]])

In [21]:
qtable[2,3]  # Q(state,action)

0.9

In [22]:
qtable[3,:]  #Q (new_stat, all)

array([2.1, 2.1, 2.1, 2.1])

In [24]:
what_is_this = qtable[3,:] - qtable[2,3]
what_is_this

array([1.2, 1.2, 1.2, 1.2])

In [23]:
value = np.max(qtable[3,:] - qtable[2,3])
value

1.2000000000000002