In [1]:
import gymnasium as gym
import numpy as np
import random
import time

In [2]:
# Custom 3x3 map, one string per row.
# Tile letters follow the FrozenLake convention: S = start, F = frozen, H = hole, G = goal.
desc = ["SFF", "FHF", "FGF"]

# `desc` fully defines the map, so no `map_name` is passed: the previous
# map_name="5x5" is not a preloaded map and was ignored because `desc` was given.
# is_slippery=False makes every move deterministic.
# render_mode="human" draws each step in a window, which slows training considerably.
env = gym.make('FrozenLake-v1', desc=desc, is_slippery=False, render_mode="human")
observation, info = env.reset()

In [3]:
# For every state, the list of actions that would push the agent off the grid.
# States are numbered row-major (row * number_of_columns + column).
# Action codes as used below: 0 = left, 1 = down, 2 = right, 3 = up.
blacklistedActions = np.empty((env.observation_space.n), dtype=object)

lastRowIndex = len(desc) - 1
for rowIndex, row in enumerate(desc):
    lastColumnIndex = len(row) - 1
    for columnIndex in range(len(row)):
        blacklist = []
        # The edge checks are independent (no elif): in a one-row or one-column
        # map a cell touches two opposite edges at once and both directions
        # must be blacklisted.
        if rowIndex == 0:
            blacklist.append(3)  # top row: cannot move up
        if rowIndex == lastRowIndex:
            blacklist.append(1)  # bottom row: cannot move down
        if columnIndex == 0:
            blacklist.append(0)  # first column: cannot move left
        if columnIndex == lastColumnIndex:
            blacklist.append(2)  # last column: cannot move right
        blacklistedActions[(len(row) * rowIndex) + columnIndex] = blacklist

def is_valid_action(state, action):
    """Tell whether `action` is allowed in `state`.

    Looks up the blacklist stored for `state` in the module-level
    `blacklistedActions` array and returns False when `action` appears
    in it, True otherwise.
    """
    return action not in blacklistedActions[state]

In [4]:
# Hyperparameters
gamma = 0.9        # discount factor
alpha = 0.1        # learning rate
epsilon = 0.05     # probability of taking a random (exploratory) action
num_episodes = 20

# Q-table: one row per state, one column per action, initialised to zero
num_states = env.observation_space.n
num_actions = env.action_space.n
Q = np.zeros((num_states, num_actions))

# Pre-load every blacklisted (state, action) pair with a value of -10
for stateI, blocked in enumerate(blacklistedActions):
    for action in blocked:
        Q[stateI, action] = -10

# Q-learning algorithm
#
# Runs `num_episodes` episodes. Each step picks an action epsilon-greedily,
# applies it to `env`, reshapes the reward (-1 for ending without reward or for
# not moving) and applies the tabular Q-learning update to `Q` in place.
start_time = time.time()
for episode in range(num_episodes):
    state, info = env.reset()
    done = False

    # States visited during this episode, in order; printed when the goal is hit.
    states = []

    while not done:
        # Epsilon-greedy action selection
        if np.random.rand() < epsilon:
            # Explore: resample until the action is not blacklisted for this state.
            action = env.action_space.sample()
            while not is_valid_action(state, action):
                action = env.action_space.sample()
        else:
            # Exploit: choose uniformly among all actions tied for the highest
            # Q-value. The maximum is taken from the row itself rather than
            # compared against a fixed sentinel, so the choice stays greedy even
            # when every value in the row is <= -1.
            best_actions = np.flatnonzero(Q[state] == np.max(Q[state])).tolist()
            action = random.choice(best_actions)

        # Take action and observe next state and reward
        states.append(state)
        next_state, reward, terminated, truncated, info = env.step(action)

        # An episode is over when the environment terminates it OR when it is
        # truncated (e.g. by a step limit); stepping on without a reset after
        # either signal is not supported by the Gymnasium API.
        done = terminated or truncated

        if reward == 0 and terminated:
            # Episode ended without reaching the goal: penalise it.
            reward = -1
        elif next_state == state:
            # The action did not move the agent: penalise it.
            reward = -1
            print("hit wall?")

        # Update Q-value
        Q[state, action] += alpha * (reward + gamma * np.max(Q[next_state, :]) - Q[state, action])

        if reward == 1:
            print("\n\nHit gift! At Episode: "+str(episode +1)+"\n")
            print(Q)
            states.append(next_state)
            print("\nStates / path: ")
            print(states)
            end_time = time.time()
            elapsed_time = end_time - start_time
            print("\nElapsed time:", elapsed_time, "seconds\n\n")

        # Move to next state
        state = next_state

# Greedy policy: for each state, the index of the action with the largest Q-value
optimal_policy = Q.argmax(axis=1)

print("Optimal policy:", optimal_policy, sep="\n")




Hit gift! At Episode: 4

[[-10.    0.    0.  -10. ]
 [  0.   -0.1   0.  -10. ]
 [  0.    0.  -10.  -10. ]
 [-10.    0.   -0.1   0. ]
 [  0.    0.    0.    0. ]
 [ -0.1   0.  -10.    0. ]
 [-10.  -10.    0.    0. ]
 [  0.  -10.    0.    0. ]
 [  0.1 -10.  -10.    0. ]]

States / path: 
[0, 1, 0, 3, 6, 3, 0, 3, 6, 3, 0, 3, 0, 3, 0, 1, 0, 1, 2, 1, 2, 5, 8, 7]

Elapsed time: 9.757920265197754 seconds




Hit gift! At Episode: 5

[[-1.0e+01  0.0e+00  0.0e+00 -1.0e+01]
 [ 0.0e+00 -1.0e-01  0.0e+00 -1.0e+01]
 [ 0.0e+00  0.0e+00 -1.0e+01 -1.0e+01]
 [-1.0e+01  0.0e+00 -1.0e-01  0.0e+00]
 [ 0.0e+00  0.0e+00  0.0e+00  0.0e+00]
 [-1.0e-01  9.0e-03 -1.0e+01  0.0e+00]
 [-1.0e+01 -1.0e+01  0.0e+00  0.0e+00]
 [ 0.0e+00 -1.0e+01  0.0e+00  0.0e+00]
 [ 1.9e-01 -1.0e+01 -1.0e+01  0.0e+00]]

States / path: 
[0, 1, 0, 1, 2, 5, 2, 1, 2, 1, 2, 5, 8, 7]

Elapsed time: 13.27092456817627 seconds




Hit gift! At Episode: 6

[[-1.0e+01  0.0e+00  0.0e+00 -1.0e+01]
 [ 0.0e+00 -1.0e-01  0.0e+00 -1.0e+01]
 [ 0.0e+0



Hit gift! At Episode: 19

[[-1.00000000e+01  9.00871115e-02  0.00000000e+00 -1.00000000e+01]
 [ 0.00000000e+00 -1.00000000e-01  0.00000000e+00 -1.00000000e+01]
 [ 0.00000000e+00  8.10000000e-04 -1.00000000e+01 -1.00000000e+01]
 [-1.00000000e+01  3.06897973e-01 -1.90000000e-01  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [-1.00000000e-01  2.52000000e-02 -1.00000000e+01  0.00000000e+00]
 [-1.00000000e+01 -1.00000000e+01  7.17570464e-01  0.00000000e+00]
 [ 0.00000000e+00 -1.00000000e+01  0.00000000e+00  0.00000000e+00]
 [ 2.71000000e-01 -1.00000000e+01 -1.00000000e+01  0.00000000e+00]]

States / path: 
[0, 3, 6, 7]

Elapsed time: 33.085387229919434 seconds




Hit gift! At Episode: 20

[[-1.00000000e+01  1.08699218e-01  0.00000000e+00 -1.00000000e+01]
 [ 0.00000000e+00 -1.00000000e-01  0.00000000e+00 -1.00000000e+01]
 [ 0.00000000e+00  8.10000000e-04 -1.00000000e+01 -1.00000000e+01]
 [-1.00000000e+01  3.40789518e-01 -1.90000000e-01  0.00000000e+00