In [1]:
# import gym
import gym.envs.toy_text.frozen_lake as fl
import numpy as np

In [4]:
def load_frozen_lake(desc=None, map_name=None, is_slippery=False):
    if desc is None and map_name is None:
        desc = fl.generate_random_map(size=8)
    env = fl.FrozenLakeEnv(desc=desc, map_name=map_name, is_slippery=is_slippery)
    return env

In [6]:
# load_frozen_lake = __import__('0-load_env').load_frozen_lake
import numpy as np

np.random.seed(0)
env = load_frozen_lake()
print(env.desc)
print(env.P[0][0])
env = load_frozen_lake(is_slippery=True)
print(env.desc)
print(env.P[0][0])
desc = [['S', 'F', 'F'], ['F', 'H', 'H'], ['F', 'F', 'G']]
env = load_frozen_lake(desc=desc)
print(env.desc)
env = load_frozen_lake(map_name='4x4')
print(env.desc)

[[b'S' b'F' b'F' b'F' b'F' b'F' b'F' b'H']
 [b'H' b'F' b'F' b'F' b'F' b'H' b'F' b'F']
 [b'F' b'H' b'F' b'H' b'H' b'F' b'F' b'F']
 [b'F' b'F' b'F' b'H' b'F' b'F' b'F' b'F']
 [b'F' b'F' b'F' b'F' b'F' b'F' b'H' b'F']
 [b'F' b'F' b'F' b'F' b'F' b'F' b'F' b'F']
 [b'F' b'F' b'F' b'F' b'H' b'F' b'F' b'F']
 [b'F' b'F' b'F' b'F' b'F' b'F' b'F' b'G']]
[(1.0, 0, 0.0, False)]
[[b'S' b'F' b'H' b'F' b'H' b'F' b'H' b'F']
 [b'H' b'F' b'F' b'F' b'F' b'F' b'F' b'F']
 [b'F' b'F' b'F' b'F' b'F' b'F' b'F' b'F']
 [b'F' b'H' b'F' b'F' b'F' b'F' b'F' b'F']
 [b'F' b'F' b'H' b'F' b'F' b'F' b'F' b'H']
 [b'F' b'F' b'F' b'F' b'F' b'H' b'F' b'H']
 [b'F' b'F' b'H' b'F' b'H' b'F' b'H' b'F']
 [b'F' b'F' b'H' b'F' b'F' b'F' b'F' b'G']]
[(0.3333333333333333, 0, 0.0, False), (0.3333333333333333, 0, 0.0, False), (0.3333333333333333, 8, 0.0, True)]
[[b'S' b'F' b'F']
 [b'F' b'H' b'H']
 [b'F' b'F' b'G']]
[[b'S' b'F' b'F' b'F']
 [b'F' b'H' b'F' b'H']
 [b'F' b'F' b'F' b'H']
 [b'H' b'F' b'F' b'G']]


In [5]:
def q_init(env):
    # The Q-table has a row for each state (env.observation_space.n)
    # and a column for each action (env.action_space.n)
    q_table = np.zeros([env.observation_space.n, env.action_space.n])
    return q_table

In [13]:
# load_frozen_lake = __import__('0-load_env').load_frozen_lake
# q_init = __import__('1-q_init').q_init

env = load_frozen_lake()
Q = q_init(env)
print(Q.shape)
env = load_frozen_lake(is_slippery=True)
Q = q_init(env)
print(Q.shape)
desc = [['S', 'F', 'F'], ['F', 'H', 'H'], ['F', 'F', 'G']]
env = load_frozen_lake(desc=desc)
Q = q_init(env)
print(Q.shape)
env = load_frozen_lake(map_name='4x4')
Q = q_init(env)
print(Q.shape)

(64, 4)
(64, 4)
(9, 4)
(16, 4)


In [6]:
"""Epsilon Greedy"""
def epsilon_greedy(Q, state, epsilon):
    """Uses epsilon-greedy to determine the next action:
    Q is a numpy.ndarray containing the q-table
    state is the current state
    epsilon is the epsilon to use for the calculation
    Sample p with numpy.random.uniformn to determine if
        algorithm should explore or exploit.
    If exploring, pick the next action with numpy.random.randint
        from all possible actions.
    Returns: the next action index."""
    if np.random.uniform(0, 1) < epsilon:
        action_index = np.random.randint(Q.shape[1])
    else:
        action_index = np.argmax(Q[state, :])

    return action_index

In [7]:
# load_frozen_lake = __import__('0-load_env').load_frozen_lake
# q_init = __import__('1-q_init').q_init
# epsilon_greedy = __import__('2-epsilon_greedy').epsilon_greedy
import numpy as np

desc = [['S', 'F', 'F'], ['F', 'H', 'H'], ['F', 'F', 'G']]
env = load_frozen_lake(desc=desc)
Q = q_init(env)
Q[7] = np.array([0.5, 0.7, 1, -1])
np.random.seed(0)
print(epsilon_greedy(Q, 7, 0.5))
np.random.seed(1)
print(epsilon_greedy(Q, 7, 0.5))

2
0


In [36]:
def train(env, Q, episodes=5000, max_steps=100, alpha=0.1, gamma=0.99, epsilon=1, min_epsilon=0.1, epsilon_decay=0.05):
    total_rewards = []
    for episode in range(episodes):
        state = env.reset()
        episode_reward = 0

        for step in range(max_steps):
            if np.random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(Q[state, :])

            result = env.step(action)
            next_state, reward, done, _ = result[:4]

            Q[state, action] = Q[
                state, action] + alpha * (
                    reward + gamma * np.max(
                        Q[next_state, :]) - Q[
                            state, action])

            state = next_state
            episode_reward += reward

            if done:
                break

        total_rewards.append(episode_reward)

        epsilon = max(min_epsilon, epsilon - epsilon_decay)

    return Q, total_rewards

In [37]:
# load_frozen_lake = __import__('0-load_env').load_frozen_lake
# q_init = __import__('1-q_init').q_init
# train = __import__('3-q_learning').train

np.random.seed(0)
desc = [['S', 'F', 'F'], ['F', 'H', 'H'], ['F', 'F', 'G']]
env = load_frozen_lake(desc=desc)
Q = q_init(env)

Q, total_rewards  = train(env, Q)
print(Q)
split_rewards = np.split(np.array(total_rewards), 10)
for i, rewards in enumerate(split_rewards):
    print((i+1) * 500, ':', np.mean(rewards))

[[0.96059554 0.970299   0.95085698 0.96059579]
 [0.96058252 0.         0.03326856 0.25881872]
 [0.27480916 0.         0.         0.        ]
 [0.97029891 0.9801     0.         0.96059496]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.98009965 0.98009984 0.99       0.97029876]
 [0.98009741 0.98999927 1.         0.        ]
 [0.         0.         0.         0.        ]]
500 : 0.856
1000 : 0.934
1500 : 0.938
2000 : 0.932
2500 : 0.928
3000 : 0.948
3500 : 0.924
4000 : 0.92
4500 : 0.946
5000 : 0.948


In [38]:
def play(env, Q, max_steps=100):
    state = env.reset()
    total_rewards = 0
    for step in range(max_steps):
        action = np.argmax(Q[state, :])
        state, reward, done, _ = env.step(action)
        total_rewards += reward
        env.render()
        if done:
            break
    return total_rewards

In [39]:
# load_frozen_lake = __import__('0-load_env').load_frozen_lake
# q_init = __import__('1-q_init').q_init
# train = __import__('3-q_learning').train
# play = __import__('4-play').play

import numpy as np

np.random.seed(0)
desc = [['S', 'F', 'F'], ['F', 'H', 'H'], ['F', 'F', 'G']]
env = load_frozen_lake(desc=desc)
Q = q_init(env)

Q, total_rewards  = train(env, Q)
print(play(env, Q))

ValueError: too many values to unpack (expected 4)