In [1]:
import gymnasium as gym
import numpy as np
import random

In [2]:
# Custom 3x3 FrozenLake layout, read row by row (states are numbered 0-8 in
# that order): S = start, F = frozen/safe tile, H = hole, G = goal.
desc = ["SFF", "FHF", "FGF"]

# `desc` alone defines the grid. `map_name` is only consulted when no `desc`
# is supplied, so the previous map_name="5x5" (not a built-in map, and not the
# size of this grid) was silently ignored; it is omitted to avoid implying a
# 5x5 board. is_slippery=False makes every move deterministic.
env = gym.make('FrozenLake-v1', desc=desc, is_slippery=False, render_mode="human")
observation, info = env.reset()

In [15]:
# Define parameters
gamma = 0.9  # Discount factor
alpha = 0.1  # Learning rate
epsilon = 0.05  # Epsilon-greedy parameter (probability of a random action)
num_episodes = 10  # Number of training episodes
seed = 0  # Single seed shared by every random source used during training

# Training draws random numbers from three independent generators: NumPy's
# global RNG (the epsilon test), the action space's own RNG (random actions)
# and Python's `random` module (tie-breaking between equally good actions).
# Seeding all three here makes re-running this cell reproduce the same run.
random.seed(seed)
np.random.seed(seed)
env.action_space.seed(seed)

# Initialize Q-values: one row per state, one column per action, all zero
num_states = env.observation_space.n
num_actions = env.action_space.n
Q = np.zeros((num_states, num_actions))

# Q-learning algorithm
# Tabular Q-learning with an epsilon-greedy behaviour policy. On top of the
# environment's own reward, two outcomes are penalised with -1: an episode
# that terminates with zero reward (fell into a hole) and a move that leaves
# the agent in the same state (bumped into the edge of the grid).
for episode in range(num_episodes):
    state, _ = env.reset()
    done = False

    steps = []  # Actions taken this episode; printed when the goal is reached

    while not done:
        # Epsilon-greedy action selection
        if np.random.rand() < epsilon:
            action = env.action_space.sample()  # Random action
        else:
            # Greedy action with uniform random tie-breaking. The candidates
            # are compared against the row's actual maximum rather than a
            # fixed sentinel, so the choice stays greedy even when the -1
            # penalties have pushed every value in the row below -1.
            q_row = Q[state]
            best_value = np.max(q_row)
            best_actions = [a for a, q in enumerate(q_row) if q == best_value]
            action = random.choice(best_actions)

        # Take action and observe next state and reward
        steps.append(action)
        next_state, reward, terminated, truncated, _ = env.step(action)
        # The episode is over when a terminal state is reached OR when the
        # environment cuts it off (e.g. a step limit). Ignoring `truncated`
        # would keep stepping an environment that has already been stopped.
        done = terminated or truncated

        # Reward shaping. Only a genuine termination with zero reward counts
        # as falling into a hole; a truncated episode is not a hole.
        if reward == 0 and terminated:
            print("hit lake :C")
            reward = -1
        elif next_state == state:
            print("ran into wall")
            reward = -1

        # Update Q-value. A terminal state has no future return, so the
        # bootstrap term is dropped there instead of relying on that row of
        # Q never having been written to.
        future_value = 0.0 if terminated else np.max(Q[next_state, :])
        Q[state, action] += alpha * (reward + gamma * future_value - Q[state, action])

        # A reward of exactly 1 can only be the environment's goal reward,
        # because the shaping above only ever assigns -1.
        if reward == 1:
            print("Hit gift! At Episode: " + str(episode + 1))
            print(Q)
            print("Steps: ")
            print(steps)

        # Move to next state
        state = next_state

# Greedy policy read off the learned table: for every state (row of Q) take
# the index of the highest-valued action. argmax returns the first index on
# ties, so a state whose row is all equal maps to action 0.
optimal_policy = Q.argmax(axis=1)

print("Optimal policy:", optimal_policy, sep="\n")


ran into wall
ran into wall
hit lake :C
ran into wall
ran into wall
ran into wall
Hit gift! At Episode: 2
[[-0.1  0.   0.  -0.1]
 [ 0.  -0.1  0.  -0.1]
 [ 0.   0.   0.   0. ]
 [-0.1  0.   0.   0. ]
 [ 0.   0.   0.   0. ]
 [ 0.   0.   0.   0. ]
 [ 0.  -0.1  0.1  0. ]
 [ 0.   0.   0.   0. ]
 [ 0.   0.   0.   0. ]]
Steps: 
[1, 3, 2, 0, 1, 0, 3, 2, 3, 0, 1, 1, 1, 3, 1, 2]
Hit gift! At Episode: 3
[[-0.1    0.     0.    -0.1  ]
 [ 0.    -0.1    0.    -0.1  ]
 [ 0.     0.     0.     0.   ]
 [-0.1    0.009  0.     0.   ]
 [ 0.     0.     0.     0.   ]
 [ 0.     0.     0.     0.   ]
 [ 0.    -0.1    0.19   0.   ]
 [ 0.     0.     0.     0.   ]
 [ 0.     0.     0.     0.   ]]
Steps: 
[1, 1, 2]
Hit gift! At Episode: 4
[[-0.1      0.00081  0.      -0.1    ]
 [ 0.      -0.1      0.      -0.1    ]
 [ 0.       0.       0.       0.     ]
 [-0.1      0.0252   0.       0.     ]
 [ 0.       0.       0.       0.     ]
 [ 0.       0.       0.       0.     ]
 [ 0.      -0.1      0.271    0.     ]
 [ 0.     