In [1]:
# Importing the necessary libraries.

import numpy as np # For fast numeric / linear algebra computation.
import time        # For controling time of execution.  
import pickle      # For storing updated Q-table.
import gym         # For working with open AI frozen lake v1 environment and utilities.
import pygame      # For rendering the game and gym dependencies.

from platform import python_version
print("Python Version: ", python_version())
print("Numpy Version: ", np.__version__)
print("Gym Version: ", gym.__version__)
print("Pygame Version: ", pygame.__version__)

Python Version:  3.9.18
Numpy Version:  1.23.5
Gym Version:  0.26.1
Pygame Version:  2.5.2


In [2]:
# Hyper-Parameters

total_episodes = 10000 # Total number of iterations or episodes of training.

# A higher value of epsilon encourages more exploration, while a lower value of epsilon favors exploitation.
epsilon = 0.9 # For epsilon-gready policy, Positive real number (0 < epsilon < 1)

max_steps = 100 # Maximum number of steps that agent can take in environment

lr_rate = 0.81 # Learning Rate of convergence to global minimum

# A high discount factor means that future rewards are highly valued, while a 
# low discount factor means that immediate rewards are given greater weight
gamma = 0.96 # Discount Factor, Positive real number (0 < gamma < 1)

In [3]:
# creating environment using gym package.
# Default parameters:- gym.make('FrozenLake-v1', desc=None, map_name="4x4", is_slippery=True)
# For more information:- https://www.gymlibrary.dev/environments/toy_text/frozen_lake/

env = gym.make("FrozenLake-v1") # Using FrozenLake-v1. Since FrozenLake-v0 is depricated.

In [4]:
print("Number of observation states:- ", env.observation_space.n)
print("Number of action space :- ", env.action_space.n)


Number of observation states:-  16
Number of action space :-  4


In [5]:
# Initializing the Q-table with zeros for 2-D array of (observation_space, action_space).

Q = np.zeros((env.observation_space.n, env.action_space.n))
Q

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [6]:
def choose_action(state):
    """ 
    The choose_action() function used a Epsilon-Gready policy for Exploration and Exploitation.
    
    Exploration is done when uniform random number from 0 to 1 is less than epsilon value.
    Else, Maximum value of the state and action pair is taken which is Exploitation.

    Args:
        state (int): Gets the current state as parameter/

    Returns:
        int: Returns action to be taken in that state
    """
    action=0
    if np.random.uniform(0, 1) < epsilon: # Epsilon-Gready policy
        action = env.action_space.sample() # Exploration, Random action sample space
    else:
        action = np.argmax(Q[state, :]) # Exploitation, Maximum value is taken from (State, Action)
    return action

In [7]:
def learn(state, state2, reward, action):
    """ 
    Updates the Q-table.
    Agent learn to find a optimal policy by using bellman optimality equation.

    Args:
        state (int): Current state
        state2 (int): Future state
        reward (int): Reward if rached to goal state
        action (int): action states
    """
    predict = Q[state, action]
    target = reward + gamma * np.max(Q[state2, :])
    Q[state, action] = Q[state, action] + lr_rate * (target - predict)

In [8]:
for episode in range(total_episodes):
    state = env.reset()
    t = 0

    while t < max_steps:
        try:
            env.render()  # Add a try-except block to handle rendering issues
        except Exception as e:
            print("Rendering exception:", e)
            break  # or continue, depending on how critical rendering is to your process

        action = choose_action(state)
        state2, reward, done, info = env.step(action)
        learn(state, state2, reward, action)

        state = state2
        t += 1

        if done:
            break

        time.sleep(0.1)

Rendering exception: Something went wrong with pygame. This should never happen.
Rendering exception: Something went wrong with pygame. This should never happen.
Rendering exception: Something went wrong with pygame. This should never happen.
Rendering exception: Something went wrong with pygame. This should never happen.
Rendering exception: Something went wrong with pygame. This should never happen.
Rendering exception: Something went wrong with pygame. This should never happen.
Rendering exception: Something went wrong with pygame. This should never happen.
Rendering exception: Something went wrong with pygame. This should never happen.
Rendering exception: Something went wrong with pygame. This should never happen.
Rendering exception: Something went wrong with pygame. This should never happen.
Rendering exception: Something went wrong with pygame. This should never happen.
Rendering exception: Something went wrong with pygame. This should never happen.
Rendering exception: Somethi