In [1]:
import numpy as np
import gym

In [2]:
# Create the FrozenLake environment
env = gym.make('FrozenLake-v1')

# Q-learning parameters
num_episodes = 1000
learning_rate = 0.8
discount_factor = 0.95
epsilon = 0.2

In [3]:
# Initialize Q-table with zeros
num_states = env.observation_space.n
num_actions = env.action_space.n
Q_table = np.zeros((num_states, num_actions))
print(Q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [28]:
# Q-learning algorithm
for episode in range(num_episodes):
    state = env.reset() # a tuple with an int and a probability.
    state = state[0] 
    done = False

    while not done:
        # Choose an action using epsilon-greedy policy
        if np.random.rand() < epsilon:
            action = env.action_space.sample()  # Explore
        else:
            action = np.argmax(Q_table[state, :])  # Exploit
        # Take the chosen action and observe the new state and reward
        #next_state, reward, done, _ = env.step(action)
        next_state, reward, done, _, _ = env.step(action)
        # Update Q-value using the Q-learning update rule
        Q_table[state, action] = (1 - learning_rate) * Q_table[state, action] + \
                                 learning_rate * (reward + discount_factor * np.max(Q_table[next_state, :]))

        # Move to the next state
        state = next_state


In [29]:
print(Q_table)

[[1.72684307e-01 1.91805771e-01 2.02082609e-01 1.96969094e-01]
 [7.73077241e-03 6.82028104e-06 3.37133157e-03 3.81720859e-01]
 [8.27150842e-02 8.64136178e-02 4.32101721e-01 1.55793956e-01]
 [6.27907743e-02 2.94899038e-01 6.26203255e-02 7.25060483e-02]
 [1.15017821e-01 2.51613841e-02 3.13363856e-02 8.39105777e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.36680099e-01 8.71533733e-03 4.87738330e-01 1.84347423e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [9.68730351e-02 8.93720453e-02 4.07737453e-03 9.39308304e-02]
 [1.81978682e-02 1.31244196e-01 6.54023988e-02 2.37262063e-02]
 [6.99171402e-01 6.24885011e-03 8.58719381e-03 3.28397731e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [8.45073181e-02 1.76639825e-02 7.83859683e-01 1.50195964e-01]
 [2.58081935e-01 6.80487325e-01 7.26876701e-01 9.94440097e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.000000

In [31]:
# Evaluate the trained agent
total_reward = 0
num_episodes_eval = 100

for _ in range(num_episodes_eval):
    state = env.reset()
    state = state[0] 
    done = False

    while not done:
        action = np.argmax(Q_table[state, :])
        state, reward, done, _, _ = env.step(action)
        total_reward += reward

average_reward = total_reward / num_episodes_eval
print(f"Average reward over {num_episodes_eval} episodes: {average_reward}")

# Close the environment
env.close()

Average reward over 100 episodes: 0.04
