In [1]:
import gym
import numpy as np
import random

In [2]:
# Build the FrozenLake-v0 environment that every later cell trains on and evaluates against.
env = gym.make('FrozenLake-v0')

In [3]:
# Initialize the Q-value table to all zeros (not random values):
# one row per state and one column per action.
# This table is only a preview; `q_table` is rebound later to the trained
# table returned by q_learning().
q_table = np.zeros((env.observation_space.n, env.action_space.n))
print(q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [4]:
# Hyperparameters
# Discount factor applied to the best next-state Q-value in the update target.
gamma = 0.99
# Step size used to blend the old Q-value with the new target.
learning_rate = 0.1
# Exploration probability at episode 0.
max_epsilon = 1.0
# Floor that the exploration probability decays toward.
min_epsilon = 0.01
# Rate of the per-episode exponential decay: epsilon = min + (max - min) * exp(-rate * episode).
epsilon_decay_rate = 0.005

# Number of training episodes.
num_episodes = 20000
# Upper bound on the number of steps taken within a single training episode.
num_steps_per_episode = 100

In [5]:
def q_learning(env, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate):
    """Train a tabular Q-learning agent with an epsilon-greedy behaviour policy.

    Args:
        env: Environment exposing ``observation_space.n``, ``action_space.n``,
            ``action_space.sample()``, ``reset()`` returning a state index and
            ``step(action)`` returning ``(next_state, reward, done, info)``.
        num_episodes: Number of training episodes. Zero is allowed and yields
            an all-zero table and an empty reward list.
        num_steps_per_episode: Maximum number of steps taken in one episode.
        learning_rate: Weight given to the new target in each Q-value update.
        gamma: Discount factor applied to the best next-state Q-value.
        max_epsilon: Exploration probability used for episode 0.
        min_epsilon: Value the exploration probability decays toward.
        epsilon_decay_rate: Rate of the exponential decay of epsilon per episode.

    Returns:
        A ``(q_table, rewards_all)`` tuple: the learned table of shape
        ``(n_states, n_actions)`` and a list holding the summed reward of
        every episode, in order.
    """
    q_table = np.zeros((env.observation_space.n, env.action_space.n))
    rewards_all = []

    for episode in range(num_episodes):
        state = env.reset()
        reward_episode = 0.0

        # Epsilon is fixed for the whole episode and decays exponentially
        # from max_epsilon toward min_epsilon as training progresses.
        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-epsilon_decay_rate * episode)

        for step in range(num_steps_per_episode):
            # Epsilon-greedy: explore with probability epsilon, otherwise
            # take the action with the highest current Q-value.
            if random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(q_table[state, :])

            next_state, reward, done, info = env.step(action)

            # NOTE(review): the target bootstraps from next_state even when
            # `done` is True; this relies on terminal-state rows never being
            # updated away from zero -- confirm before reusing elsewhere.
            q_table[state, action] = q_table[state, action] * (1 - learning_rate) + learning_rate * (reward + gamma * np.max(q_table[next_state, :]))

            reward_episode += reward
            state = next_state

            if done:
                break
        rewards_all.append(reward_episode)

    # Report the index of the last episode once training ends. Guarded so
    # that num_episodes == 0 no longer reads an unbound loop variable.
    if rewards_all:
        print(f'Episode {len(rewards_all) - 1} finished')
    return q_table, rewards_all

In [6]:
# Train the agent; this rebinds `q_table` to the learned table and collects
# the summed reward of every training episode in `rewards_all`.
q_table, rewards_all = q_learning(env, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate)

Episode 19999 finished


In [7]:
# Inspect the learned Q-values: rows are states, columns are actions.
q_table

array([[0.54303001, 0.48565555, 0.50847772, 0.49612016],
       [0.28253878, 0.22377806, 0.29024775, 0.48784149],
       [0.41348612, 0.39878291, 0.40212058, 0.45441835],
       [0.32363326, 0.3044394 , 0.23765193, 0.43259167],
       [0.5633247 , 0.45369887, 0.31788333, 0.36327417],
       [0.        , 0.        , 0.        , 0.        ],
       [0.35207806, 0.1666047 , 0.22547417, 0.11624194],
       [0.        , 0.        , 0.        , 0.        ],
       [0.25688238, 0.36907284, 0.42013482, 0.60666639],
       [0.4145851 , 0.67364368, 0.40529242, 0.39631236],
       [0.64286009, 0.33931153, 0.38491545, 0.34725435],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.58814287, 0.50112774, 0.75174252, 0.59399597],
       [0.73219525, 0.84259856, 0.79429268, 0.73380703],
       [0.        , 0.        , 0.        , 0.        ]])

In [8]:
# Total reward collected over all training episodes.
sum(rewards_all)

9214.0

In [9]:
# Reward collected during training episodes 0-999 (the earliest, most exploratory episodes).
sum(rewards_all[0:1000])

105.0

In [10]:
# Reward collected during training episodes 1000-1999.
sum(rewards_all[1000:2000])

204.0

In [11]:
# Reward collected during training episodes 2000-2999.
sum(rewards_all[2000:3000])

228.0

In [12]:
# Reward collected during training episodes 9000-9999, for comparison with the early windows.
sum(rewards_all[9000:10000])

486.0

In [13]:
def play(env, q_table, render=False):
    """Run one episode that always takes the greedy action from ``q_table``.

    Args:
        env: Environment whose ``reset()`` returns a state index and whose
            ``step(action)`` returns ``(next_state, reward, done, info)``.
        q_table: Array indexed as ``q_table[state, action]``.
        render: When True, draw the environment after every step, pause
            0.2 seconds, and clear the previous frame between steps.

    Returns:
        A ``(total_reward, steps)`` tuple for the episode.
    """
    if render:
        # The notebook's import cell provides neither `time` nor `display`,
        # so bring them into scope here instead of failing with NameError.
        import time
        try:
            from IPython import display
        except ImportError:
            # Outside IPython there is no frame to clear; just keep drawing.
            display = None

    state = env.reset()
    total_reward = 0
    steps = 0
    done = False
    # The loop ends only when the environment reports `done`.
    while not done:
        action = np.argmax(q_table[state, :])
        next_state, reward, done, info = env.step(action)
        total_reward += reward
        steps += 1
        if render:
            env.render()
            time.sleep(0.2)
            # Keep the final frame visible; clear only between steps.
            if not done and display is not None:
                display.clear_output(wait=True)
        state = next_state

    return (total_reward, steps)

In [14]:
def play_multiple_times(env, q_table, max_episodes):
    """Play ``max_episodes`` greedy episodes and print summary statistics.

    An episode counts as a success when its total reward is positive. The
    step average is taken over successful episodes only.

    Args:
        env: Environment passed through to ``play``.
        q_table: Q-value table passed through to ``play``.
        max_episodes: Number of evaluation episodes to run.
    """
    success = 0
    list_of_steps = []
    for _ in range(max_episodes):
        total_reward, steps = play(env, q_table)

        if total_reward > 0:
            success += 1
            list_of_steps.append(steps)

    # With no successful episode np.mean([]) would emit a RuntimeWarning;
    # report nan directly so the printed text stays the same without it.
    average_steps = np.mean(list_of_steps) if list_of_steps else float('nan')

    print(f'Number of successes: {success}/{max_episodes}')
    print(f'Average number of steps: {average_steps}')

In [15]:
# Evaluate the greedy policy derived from the trained Q-table over 1000 episodes.
play_multiple_times(env, q_table, 1000)

Number of successes: 750/1000
Average number of steps: 39.56
