## Step 1: Install + Import Necessary Libraries

In [1]:
!pip install gym==0.17.3

Collecting gym==0.17.3
  Downloading gym-0.17.3.tar.gz (1.6 MB)
     ---------------------------------------- 1.6/1.6 MB 6.2 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting pyglet<=1.5.0,>=1.4.0
  Downloading pyglet-1.5.0-py2.py3-none-any.whl (1.0 MB)
     ---------------------------------------- 1.0/1.0 MB 4.3 MB/s eta 0:00:00
Collecting cloudpickle<1.7.0,>=1.2.0
  Downloading cloudpickle-1.6.0-py3-none-any.whl (23 kB)
Building wheels for collected packages: gym
  Building wheel for gym (setup.py): started
  Building wheel for gym (setup.py): finished with status 'done'
  Created wheel for gym: filename=gym-0.17.3-py3-none-any.whl size=1654631 sha256=bca745c354b43fce6f70a00487a74946e26d3b9efc836eab8f92348c4491a9fa
  Stored in directory: c:\users\xavim\appdata\local\pip\cache\wheels\cc\e4\97\f9097746896a5a5595e1477b95603324bf6dde572a89e88bc0
Successfully built gym
Installing collected packages: pyglet, cl

In [2]:
import gym
import numpy as np
import random as rd
from IPython.display import clear_output

env = gym.make('FrozenLake-v0', desc=None, map_name="4x4", is_slippery=False)

## Step 2: Display the 4x4 space and run the reward table

In [3]:
env.reset()
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


In [4]:
print("Action Space {}".format(env.action_space))
print("State Space {}".format(env.observation_space))

Action Space Discrete(4)
State Space Discrete(16)


In [5]:
# Reward Table

env.P

{0: {0: [(1.0, 0, 0.0, False)],
  1: [(1.0, 4, 0.0, False)],
  2: [(1.0, 1, 0.0, False)],
  3: [(1.0, 0, 0.0, False)]},
 1: {0: [(1.0, 0, 0.0, False)],
  1: [(1.0, 5, 0.0, True)],
  2: [(1.0, 2, 0.0, False)],
  3: [(1.0, 1, 0.0, False)]},
 2: {0: [(1.0, 1, 0.0, False)],
  1: [(1.0, 6, 0.0, False)],
  2: [(1.0, 3, 0.0, False)],
  3: [(1.0, 2, 0.0, False)]},
 3: {0: [(1.0, 2, 0.0, False)],
  1: [(1.0, 7, 0.0, True)],
  2: [(1.0, 3, 0.0, False)],
  3: [(1.0, 3, 0.0, False)]},
 4: {0: [(1.0, 4, 0.0, False)],
  1: [(1.0, 8, 0.0, False)],
  2: [(1.0, 5, 0.0, True)],
  3: [(1.0, 0, 0.0, False)]},
 5: {0: [(1.0, 5, 0, True)],
  1: [(1.0, 5, 0, True)],
  2: [(1.0, 5, 0, True)],
  3: [(1.0, 5, 0, True)]},
 6: {0: [(1.0, 5, 0.0, True)],
  1: [(1.0, 10, 0.0, False)],
  2: [(1.0, 7, 0.0, True)],
  3: [(1.0, 2, 0.0, False)]},
 7: {0: [(1.0, 7, 0, True)],
  1: [(1.0, 7, 0, True)],
  2: [(1.0, 7, 0, True)],
  3: [(1.0, 7, 0, True)]},
 8: {0: [(1.0, 8, 0.0, False)],
  1: [(1.0, 12, 0.0, True)],
  2: [(

## Step 3: Define the starting position, which will be from the same place

In [6]:
state = env.s
print("State:", state) # In this case, we'll start always from the position zero

State: 0


In [7]:
# Movements from the start position

env.P[0]

{0: [(1.0, 0, 0.0, False)],
 1: [(1.0, 4, 0.0, False)],
 2: [(1.0, 1, 0.0, False)],
 3: [(1.0, 0, 0.0, False)]}

### Action Space:



*   0: Left
*   1: Down
*   2: Right
*   3: Up

### Rewards:

*   Reach goal(G): +1
*   Reach hole(H): 0
*   Reach frozen(F): 0

## Step 4: Define Q table

In [9]:
q_table = np.zeros([env.observation_space.n, env.action_space.n])

In [10]:
q_table[0]

array([0., 0., 0., 0.])

## Step 5: Define Greedy Policy

In [11]:
def greedy(epsilon,q_table,state,env):
    if rd.random() < epsilon:
        action=env.action_space.sample() #explorar
    else:
        action=np.argmax(q_table[state]) #explotar
    return action

## Step 6: Train and execute

In [12]:
# Define the Hyperparameters

alpha = 0.7 # learning rate
gamma = 0.95 # discount rate
epsilon = 1.0 # greedy policy

# We generate the empty lists to group the results
all_timestep = []
all_penalties = []

# We define the number of episodes we will carry out
episodes = 1001

for i in range(episodes):
    state = env.reset()

    timestep, penalties, reward = 0, 0, 0
    done = False

    while not done:
        action = greedy(epsilon,q_table,state,env) # apply greedy policy

        next_state, reward, done, info = env.step(action) # we take the chosen action

        old_value = q_table[state, action] # in the Q-table, we take the Q-value of the chosen action for the current state
        next_max = np.max(q_table[next_state]) # in the Q-table, we take the maximum between the Q values for the new state

        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max) # update the Q-value
        q_table[state, action] = new_value

        if reward == 0:
            penalties += 1

        state = next_state
        timestep += 1

    if i % 100 == 0:
        clear_output(wait=True)
        print(f"Episode: {i}")

print("Training finished.\n")

Episode: 1000
Training finished.



In [13]:
env.s
env.render()

  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG


In [14]:
q_table # Display the values of the Q-table

array([[0.73509189, 0.77378094, 0.77378094, 0.73509189],
       [0.73509189, 0.        , 0.81450625, 0.77378094],
       [0.77378094, 0.857375  , 0.77378094, 0.81450625],
       [0.81450625, 0.        , 0.77378094, 0.77378094],
       [0.77378094, 0.81450625, 0.        , 0.73509189],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.9025    , 0.        , 0.81450625],
       [0.        , 0.        , 0.        , 0.        ],
       [0.81450625, 0.        , 0.857375  , 0.77378094],
       [0.81450621, 0.9024329 , 0.9025    , 0.        ],
       [0.85737498, 0.95      , 0.        , 0.85737463],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.89507615, 0.94992937, 0.85736662],
       [0.90135128, 0.94999997, 1.        , 0.90249836],
       [0.        , 0.        , 0.        , 0.        ]])

In [15]:
class bcolors:
    RED= '\u001b[31m'
    GREEN= '\u001b[32m'
    RESET= '\u001b[0m'

env.s = 0
state = env.reset()
done = False

timestep, penalties, reward = 0, 0, 0
total_reward = 0

while not done:

  action = np.argmax(q_table[state])
  state, reward, done, info = env.step(action) # with "step" we carry out the chosen action

  if reward == 0:
      penalties += 1 # we add a penalty if the taxi tries to drop off the passenger when he/she is not yet on board

  timestep += 1
  total_reward += reward

  # Print each step
  clear_output(wait=True)
  env.render()
  print("")
  if reward == 0:
    print(f"Recompensa actual: {bcolors.RED}{reward}{bcolors.RESET}")
  else:
    print(f"Recompensa actual: {bcolors.GREEN}{reward}{bcolors.RESET}")
  if reward == 0:
    print(f"Recompensa total: {bcolors.RED}{total_reward}{bcolors.RESET}")
  else:
    print(f"Recompensa total: {bcolors.GREEN}{total_reward}{bcolors.RESET}")
  print("")
  print('Estado actual', state)

print("Timesteps taken: {}".format(timestep))
print("Penalties incurred: {}".format(penalties))

  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m

Recompensa actual: [32m1.0[0m
Recompensa total: [32m1.0[0m

Estado actual 15
Timesteps taken: 6
Penalties incurred: 5
