In [None]:
import gymnasium as gym
import numpy as np
import random
# --- Environment -------------------------------------------------------------
# 4x4 FrozenLake layout, one string per row. Per the Gymnasium FrozenLake docs:
# S = start, F = frozen (walkable), H = hole (episode ends), G = goal.
custom_map = [
    "SFFF",
    "FHFH",
    "FFFH",
    "HFFG"
]
env = gym.make("FrozenLake-v1", desc=custom_map, is_slippery=True)

# --- Reproducibility ---------------------------------------------------------
# Training is stochastic in three places; seed each one so that
# Restart Kernel -> Run All yields the same Q-table and the same reports.
SEED = 42
random.seed(SEED)            # epsilon-greedy coin flips (random.uniform)
env.action_space.seed(SEED)  # exploratory draws from env.action_space.sample()
env.reset(seed=SEED)         # slippery transitions; later unseeded reset() calls
                             # keep using this generator instead of reseeding

state_space_size = env.observation_space.n
action_space_size = env.action_space.n

print("State space:", state_space_size)
print("Action space:", action_space_size)

# One row per state, one column per action; every value starts at zero.
q_table = np.zeros((state_space_size, action_space_size))

# --- Training budget ---------------------------------------------------------
episodes = 30000
# Upper bound on the inner step loops in this notebook.
# NOTE(review): gym.make usually wraps FrozenLake-v1 in a time limit that
# truncates episodes well before 2000 steps - confirm against the env spec.
max_steps_per_episode = 2000

# --- Q-learning hyperparameters ----------------------------------------------
learning_rate = 0.8      # step size applied to the temporal-difference error
discount_factor = 0.95   # weight of the bootstrapped next-state value

# --- Exploration schedule ----------------------------------------------------
# The training loop uses epsilon = max(epsilon_end, epsilon_start * epsilon_decay ** episode).
epsilon_start = 1.0
epsilon_end = 0.001
epsilon_decay = 0.999

# Total reward collected in each training episode, in episode order.
rewards_all_episodes = []


State space: 16
Action space: 4


In [None]:
# Tabular Q-learning with an epsilon-greedy behaviour policy.
#
# Start from a clean slate so this cell is idempotent: re-running it retrains
# from scratch instead of appending to the previous run's reward history and
# continuing from an already-trained table (which would leave the per-episode
# report looking at stale data).
q_table = np.zeros((state_space_size, action_space_size))
rewards_all_episodes = []

for episode in range(episodes):
    state, _ = env.reset()
    total_rewards = 0

    # Exploration rate decays geometrically with the episode index and is
    # floored at epsilon_end.
    epsilon = max(epsilon_end, epsilon_start * (epsilon_decay ** episode))

    for step in range(max_steps_per_episode):
        # Explore with probability epsilon, otherwise act greedily on the
        # current estimates (np.argmax picks the lowest index on ties).
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state, :])

        new_state, reward, terminated, truncated, _ = env.step(action)

        # Q-learning update: move Q(s, a) towards r + gamma * max_a' Q(s', a').
        # Rows belonging to terminal states are never written (the loop breaks
        # before acting from them), so they stay at zero and the bootstrap
        # term vanishes on termination without an explicit mask.
        td_target = reward + discount_factor * np.max(q_table[new_state, :])
        q_table[state, action] = q_table[state, action] + learning_rate * (
            td_target - q_table[state, action]
        )

        state = new_state
        total_rewards += reward

        # Stop on a terminal state or when the environment cuts the episode off.
        if terminated or truncated:
            break

    rewards_all_episodes.append(total_rewards)


# Training results

In [None]:
# Learning curve: mean episode reward over consecutive windows of episodes.
# Iterates over the rewards actually recorded (not the configured `episodes`),
# so an interrupted or shortened run never averages an empty slice, and the
# upper bound printed for the final window matches the data it covers.
report_window = 1000

print(f"Average reward per {report_window} episodes:")
for i in range(0, len(rewards_all_episodes), report_window):
    window_rewards = rewards_all_episodes[i:i + report_window]
    avg_reward = np.mean(window_rewards)
    print(f"{i} - {i + len(window_rewards)}: {avg_reward}")


Average reward per 1000 episodes:
0 - 1000: 0.028
1000 - 2000: 0.088
2000 - 3000: 0.186
3000 - 4000: 0.38
4000 - 5000: 0.459
5000 - 6000: 0.534
6000 - 7000: 0.681
7000 - 8000: 0.7
8000 - 9000: 0.645
9000 - 10000: 0.694
10000 - 11000: 0.667
11000 - 12000: 0.703
12000 - 13000: 0.664
13000 - 14000: 0.661
14000 - 15000: 0.68
15000 - 16000: 0.698
16000 - 17000: 0.67
17000 - 18000: 0.691
18000 - 19000: 0.691
19000 - 20000: 0.707
20000 - 21000: 0.676
21000 - 22000: 0.643
22000 - 23000: 0.687
23000 - 24000: 0.697
24000 - 25000: 0.683
25000 - 26000: 0.616
26000 - 27000: 0.637
27000 - 28000: 0.655
28000 - 29000: 0.621
29000 - 30000: 0.691


# Testing

In [None]:
# Evaluate the learned table by always taking the greedy action (no exploration,
# no further updates) and counting how many episodes end with a reward of 1.
test_episodes = 100
success_count = 0

for episode in range(test_episodes):
    state, _ = env.reset()

    for step in range(max_steps_per_episode):
        action = np.argmax(q_table[state, :])
        new_state, reward, terminated, truncated, _ = env.step(action)
        state = new_state

        if terminated or truncated:
            # Only the final transition's reward is inspected; per the
            # FrozenLake docs a reward of 1 is given for reaching the goal.
            if reward == 1:
                success_count += 1
            break

# Report a real percentage: the raw count only equals the percentage when
# test_episodes happens to be 100. Guard against an empty evaluation run.
success_rate = success_count / test_episodes if test_episodes else 0.0
print(f"Success rate over {test_episodes} episodes: {success_rate:.1%}")

# Greedy action index for each state, in state order.
print("Optimal policy (action per state):")
print(np.argmax(q_table, axis=1))


Success rate over 100 episodes: 74%
Optimal policy (action per state):
[0 3 3 3 0 0 0 0 3 1 0 0 0 2 1 0]
