In [None]:
import gymnasium as gym
import numpy as np

# Initialise the environment
from gymnasium.envs.toy_text.frozen_lake import generate_random_map
# Generate the map up front so the layout passed to the environment has a name.
lake_map = generate_random_map(size=4)    # random map of size 4

env = gym.make(
    'FrozenLake-v1',
    desc=lake_map,
    is_slippery=True,               # slippery, i.e. stochastic, environment
    # NOTE(review): presumably the chance that a move goes in the chosen
    # direction when the lake is slippery - confirm against the gymnasium docs.
    success_rate=1.0 / 3.0,
    reward_schedule=(1, -1, 0),     # reward: 1 for goal, -1 for hole, 0 for frozen
)

In [None]:
# Training: tabular Q-learning driven by an epsilon-greedy behaviour policy.

l_rate = 0.8            # learning rate
gamma = 0.95            # discount factor
epsilon = 1.0           # exploration probability; starts fully exploratory
min_epsilon = 0.01      # minimum value of epsilon
decay_ratio = 0.001     # fraction of epsilon removed after every episode
num_episodes = 2000     # total no. of episodes for training
n_states = env.observation_space.n         # no. of states
n_action = env.action_space.n              # no. of actions possible
# A Q-table containing Q(S,A) pairs defining the estimated optimal policy π*
q_table = np.zeros((n_states, n_action))

for ep in range(num_episodes):
    # Initialize state S
    state, info = env.reset()
    while True:
        # Choose A from S using the epsilon-greedy algorithm
        if np.random.rand() < epsilon:  # Exploration
            # randomly decide on an action
            action = np.random.choice(range(n_action))
        else:                           # Exploitation
            # pick an action with the max Q(s, a) for this state; when several
            # actions tie for the maximum, choose randomly among them
            best_actions = np.flatnonzero(q_table[state, :] == np.max(q_table[state, :]))
            action = np.random.choice(best_actions)

        # making observation after taking action
        next_state, reward, terminated, truncated, info = env.step(action)

        # Rows of states that end an episode are never updated (the loop stops
        # before acting from them), so they stay 0 and the bootstrap term
        # vanishes there.
        q_observed = reward + gamma * np.max(q_table[next_state, :])    # observed Q(s, a)
        q_expected = q_table[state, action]       # expected Q(s, a) from Q-Table
        tde = q_observed - q_expected             # Temporal Difference Error

        # updating Q-Table
        q_table[state, action] += l_rate * tde

        state = next_state

        if terminated or truncated:
            break

    # Decay epsilon geometrically: keep (1 - decay_ratio) of it each episode.
    # Multiplying by decay_ratio itself (0.001) would drop epsilon to the floor
    # after the very first episode and leave almost no exploration.
    # min_epsilon ensures some minimum exploration at all times.
    epsilon = max(min_epsilon, epsilon * (1 - decay_ratio))

In [None]:
# Persist the learned Q-table to disk in NumPy's .npy format
q_table_path = 'q_table.npy'
np.save(q_table_path, q_table)

In [None]:
# Testing: roll out the greedy policy read from the Q-table and tally outcomes

num_episodes = 100
n_holes = 0
n_goals = 0
total_reward = 0
for ep in range(num_episodes):
    # Start a fresh episode
    state, info = env.reset()
    while True:
        # Greedy action for the current state; ties for the highest Q-value
        # are broken uniformly at random
        q_row = q_table[state, :]
        best_actions = np.flatnonzero(q_row == np.max(q_row))
        action = np.random.choice(best_actions)

        # Act and observe the outcome
        state, reward, terminated, truncated, info = env.step(action)

        # Accumulate reward over all test episodes
        total_reward += reward

        if not (terminated or truncated):
            continue

        # Episode is over: classify it by the reward of the final step
        if reward == 1:
            n_goals += 1
        elif reward == -1:
            n_holes += 1
        break

avg_reward = total_reward / num_episodes         # mean reward per episode
success_rate = n_goals / num_episodes * 100      # percentage of episodes ending with reward 1
failure_rate = n_holes / num_episodes * 100      # percentage of episodes ending with reward -1
# Report the test results
print(f'Success Rate = {success_rate}')
print(f'Failure Rate = {failure_rate}')
print(f'Average Reward = {avg_reward}')

In [None]:
# Close the environment now that training and testing are finished
env.close()