In [1]:
import gym
import numpy as np
import random

# Initialize the FrozenLake environment (created with is_slippery=False)
env = gym.make("FrozenLake-v1", is_slippery=False)

# Seed every random source this notebook draws from so that
# "Restart Kernel -> Run All" reproduces the same run:
#   * `random` drives the epsilon-greedy coin flip,
#   * the action space's own RNG drives `env.action_space.sample()`,
#   * NumPy's global RNG is seeded too in case later cells draw from it.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
env.action_space.seed(SEED)

# Q-learning parameters
alpha = 0.1  # Learning rate
gamma = 0.99  # Discount factor
epsilon = 1.0  # Exploration rate (starts fully exploratory)
epsilon_min = 0.1  # Minimum exploration rate
epsilon_decay = 0.995  # Multiplicative per-episode decay of the exploration rate

# Initialize the Q-table: one row per state, one column per action, all zeros
q_table = np.zeros((env.observation_space.n, env.action_space.n))

# Training parameters
num_episodes = 1000
max_steps_per_episode = 100  # Hard cap on steps so an episode cannot run forever

# Q-learning algorithm (tabular, epsilon-greedy behaviour policy)
for episode in range(num_episodes):
    # Newer Gym/Gymnasium releases return (observation, info) from reset();
    # older releases return the observation alone. Accept both.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    done = False
    step = 0
    total_reward = 0

    while not done and step < max_steps_per_episode:
        # Exploration-exploitation tradeoff
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()  # Explore
        else:
            # Exploit, breaking ties uniformly at random. np.argmax always
            # returns the first maximal index, so on a row whose values are
            # all equal (e.g. the initial all-zero table) it would pick
            # action 0 every time and the greedy branch would never vary.
            q_row = q_table[state, :]
            best_actions = np.flatnonzero(q_row == q_row.max())
            action = int(best_actions[random.randrange(best_actions.size)])

        # Take the action and observe the outcome. The step result is a
        # 4-tuple (obs, reward, done, info) in the legacy API and a 5-tuple
        # (obs, reward, terminated, truncated, info) in the newer one.
        step_result = env.step(action)
        if len(step_result) == 5:
            next_state, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            next_state, reward, done, _ = step_result

        # Update the Q-table: blend the old estimate with the one-step
        # bootstrapped target reward + gamma * max_a' Q(next_state, a').
        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state, :])
        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state, action] = new_value

        state = next_state
        step += 1
        total_reward += reward

    # Decay the exploration rate, clamped so it never drops below epsilon_min
    # (an unclamped multiply can undershoot the floor on its final step).
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

    # Periodic progress report every 100 episodes
    if (episode + 1) % 100 == 0:
        print(f'Episode {episode + 1}/{num_episodes} - Total reward: {total_reward} - Epsilon: {epsilon}')
        print(f'Q-table snapshot:\n{q_table}')

# Evaluate the agent: run the purely greedy policy (no exploration) and
# report the mean per-episode reward.
num_eval_episodes = 100
total_rewards = 0

for episode in range(num_eval_episodes):
    # Newer Gym/Gymnasium releases return (observation, info) from reset();
    # older releases return the observation alone. Accept both.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    done = False
    step = 0
    episode_reward = 0

    while not done and step < max_steps_per_episode:
        action = np.argmax(q_table[state, :])  # Always exploit during evaluation

        # Legacy API yields (obs, reward, done, info); the newer API yields
        # (obs, reward, terminated, truncated, info).
        step_result = env.step(action)
        if len(step_result) == 5:
            next_state, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            next_state, reward, done, _ = step_result

        episode_reward += reward
        state = next_state
        step += 1

    total_rewards += episode_reward

average_reward = total_rewards / num_eval_episodes
print(f'Average reward over {num_eval_episodes} evaluation episodes: {average_reward}')

# Release the environment's resources now that training and evaluation are done
env.close()


  deprecation(
  deprecation(
  if not isinstance(terminated, (bool, np.bool8)):


Episode 100/1000 - Total reward: 0.0 - Epsilon: 0.6057704364907278
Q-table snapshot:
[[0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.1 0. ]
 [0.  0.  0.  0. ]]
Episode 200/1000 - Total reward: 0.0 - Epsilon: 0.3669578217261671
Q-table snapshot:
[[0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.1 0. ]
 [0.  0.  0.  0. ]]
Episode 300/1000 - Total reward: 0.0 - Epsilon: 0.22229219984074702
Q-table snapshot:
[[0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 