### IMPORT NECESSARY LIBRARIES

In [84]:
# Import necessary libraries
import gym
import time
import random
import numpy as np
from IPython.display import clear_output

### INTIALIZE THE ENVIRONMENT AND Q-TABLE

In [85]:
# Initialize the FrozenLake-v0 gym environment
env = gym.make("FrozenLake-v0")

# Get the size of the action and state space
n_actions = env.action_space.n
n_states = env.observation_space.n

# Initialize the Q-table with zeroes with dimension states*actions
q_table = np.zeros((n_states, n_actions))

# Print the initial Q-table
print("INITIAL Q-TABLE")
print(q_table)

INITIAL Q-TABLE
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


### SET THE ALGORITHM PARAMETERS

In [92]:
# Set the exploration rate
epsilon = 1

# Set the minimum exploration rate
eps_min = 0.01

# Set the maximum exploration rate
eps_max = 1.0

# Set the epsilon decay rate
# Set it to a low value for better results
eps_decay_rate = 0.01

# Set the learning rate to compute the q-value
learning_rate = 0.1

# Set the discount factor
gamma = 0.99

# Set the number of episodes
num_episodes = 10000

# Set the maximum number of steps for each episode
num_steps_per_epi = 100


### EPSILON GREEDY POLICY

In [93]:
# Define a function epsilon greedy that takes as parameter the q-table,
# state and episode number
def epsilon_greedy(q_table, state, episode):

  # Compute a random threshold value
  threshold = np.random.uniform(0,1)

  # Compute the epsilon value (exploration rate) based on the equation in the isntructions
  epsilon = eps_min + (eps_max-eps_min) * np.exp(-eps_decay_rate*episode)

  # Check if the threshold value is lower than the updated epsilon
  if threshold < epsilon:

      # Take a random action
      action = env.action_space.sample()

  # Else take the best action for that state based to the q-table
  else:
      action = np.argmax(q_table[state,:])

  # Return the action
  return action


### Q-LEARNING ALGORITHM

In [94]:
# List to store the rewards of each episode
total_reward_episode = []

In [95]:
# Loop over the number of episodes
for episode in range(num_episodes):

  # Set the initial state of the environment
  state = env.reset()

  # Set done as False
  done = False

  # Initialize the reward for this episode as 0
  reward_episode = 0

  # Loop over the maximum number of steps in each episode
  for step in range(num_steps_per_epi):

    # Get the action based on the epsilong greedy policy
    action = epsilon_greedy(q_table, state, episode)

    # Call the step method with the action to get the next state and reward 
    next_state, reward, done, info = env.step(action)

    # Compute the q value based on the equation in the instructions
    q_value = (q_table[state, action] * (1-learning_rate)) + (learning_rate * (reward + gamma*(q_table[next_state, action])))

    # Update the q-table for the q-value
    q_table[state, action] = q_value

    # Update the state 
    state = next_state

    # Update the reward for this episode
    reward_episode+= reward

    # End the loop for this episode if done is True
    if done==True:
      break

  # Append the reward to the reward list 
  total_reward_episode.append(reward_episode)
    

In [96]:
# Print the updated q-table
print("Q-TABLE")
print(q_table)

Q-TABLE
[[0.         0.04573663 0.0669407  0.        ]
 [0.         0.03642594 0.0670219  0.        ]
 [0.         0.04831736 0.10000783 0.        ]
 [0.         0.01502818 0.         0.        ]
 [0.         0.06022531 0.06610448 0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.08274409 0.1614667  0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.08646087 0.09637048 0.        ]
 [0.         0.19262956 0.30497255 0.        ]
 [0.         0.24276446 0.2628909  0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.24813806 0.47648134 0.19435937]
 [0.         0.57709871 0.7052317  0.48218848]
 [0.         0.         0.         0.        ]]


### TEST THE Q-TABLE

In [98]:
# Number of test episodes
num_test_episode = 3

# Loop over the number of test episodes
for episode in range(num_test_episode):

  # Get the initial state os the environment
  state = env.reset()

  # Set done as False
  done = False

  # Print episode number
  print("EPISODE: ", episode+1,"\n")
  time.sleep(1)

  # Loop over the maximum number of steps per episode
  for step in range(num_steps_per_epi):

    # Print the episode output
    clear_output(wait=True)
    env.render()
    time.sleep(0.3)

    # Get the next state based on the q-table
    action = np.argmax(q_table[state,:])

    # Get the next state given the action
    next_state, reward, done, info = env.step(action)

    # Helper code to plot the agent state and action in the environment 
    if done:
      clear_output(wait=True)
      env.render()
      if reward==1:
        print("\nGoal Reached!")
        time.sleep(3)
      else:
        print("\nFell in the hole!!")
        time.sleep(3)
      clear_output(wait=True)
      break
  
    # Update the state
    state = next_state

# Close the environment
env.close()



EPISODE:  3 

  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG

Fell in the hole!!


### MAKE CHANGES

If your q-table does not result in your agent reaching the goal, alter the algorithm parameters and try again.