<a href="https://colab.research.google.com/github/wilstermanz/holbertonschool-machine_learning/blob/main/reinforcement_learning/q_learning/q_learning" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import gym
import numpy as np

# 0. Load the Environment

Write a function ```def load_frozen_lake(desc=None, map_name=None, is_slippery=False):``` that loads the pre-made FrozenLakeEnv evnironment from OpenAI's gym:

    desc is either None or a list of lists containing a custom description of the map to load for the environment
    map_name is either None or a string containing the pre-made map to load
    Note: If both desc and map_name are None, the environment will load a randomly generated 8x8 map
    is_slippery is a boolean to determine if the ice is slippery
    Returns: the environment


In [74]:
def load_frozen_lake(desc=None, map_name=None, is_slippery=False):
    return gym.make('FrozenLake-v1', desc=desc, map_name=map_name,
                    is_slippery=is_slippery, render_mode='ansi')

In [70]:
np.random.seed(0)
env = load_frozen_lake()
print(env.desc)
print(env.P[0][0])
env = load_frozen_lake(is_slippery=True)
print(env.desc)
print(env.P[0][0])
desc = [['S', 'F', 'F'], ['F', 'H', 'H'], ['F', 'F', 'G']]
env = load_frozen_lake(desc=desc)
print(env.desc)
env = load_frozen_lake(map_name='4x4')
print(env.desc)

[[b'S' b'F' b'F' b'F' b'F' b'F' b'F' b'H']
 [b'H' b'F' b'F' b'F' b'F' b'H' b'F' b'F']
 [b'F' b'H' b'F' b'H' b'H' b'F' b'F' b'F']
 [b'F' b'F' b'F' b'H' b'F' b'F' b'F' b'F']
 [b'F' b'F' b'F' b'F' b'F' b'F' b'H' b'F']
 [b'F' b'F' b'F' b'F' b'F' b'F' b'F' b'F']
 [b'F' b'F' b'F' b'F' b'H' b'F' b'F' b'F']
 [b'F' b'F' b'F' b'F' b'F' b'F' b'F' b'G']]
[(1.0, 0, 0.0, False)]
[[b'S' b'F' b'H' b'F' b'H' b'F' b'H' b'F']
 [b'H' b'F' b'F' b'F' b'F' b'F' b'F' b'F']
 [b'F' b'F' b'F' b'F' b'F' b'F' b'F' b'F']
 [b'F' b'H' b'F' b'F' b'F' b'F' b'F' b'F']
 [b'F' b'F' b'H' b'F' b'F' b'F' b'F' b'H']
 [b'F' b'F' b'F' b'F' b'F' b'H' b'F' b'H']
 [b'F' b'F' b'H' b'F' b'H' b'F' b'H' b'F']
 [b'F' b'F' b'H' b'F' b'F' b'F' b'F' b'G']]
[(0.3333333333333333, 0, 0.0, False), (0.3333333333333333, 0, 0.0, False), (0.3333333333333333, 8, 0.0, True)]
[[b'S' b'F' b'F']
 [b'F' b'H' b'H']
 [b'F' b'F' b'G']]
[[b'S' b'F' b'F' b'F']
 [b'F' b'H' b'F' b'H']
 [b'F' b'F' b'F' b'H']
 [b'H' b'F' b'F' b'G']]


# 1. Initialize Q-table

Write a function ```def q_init(env):``` that initializes the Q-table:

    env is the FrozenLakeEnv instance
    Returns: the Q-table as a numpy.ndarray of zeros


In [61]:
def q_init(env):
    return np.zeros((env.observation_space.n, env.action_space.n))

In [62]:
env = load_frozen_lake()
Q = q_init(env)
print(Q.shape)
env = load_frozen_lake(is_slippery=True)
Q = q_init(env)
print(Q.shape)
desc = [['S', 'F', 'F'], ['F', 'H', 'H'], ['F', 'F', 'G']]
env = load_frozen_lake(desc=desc)
Q = q_init(env)
print(Q.shape)
env = load_frozen_lake(map_name='4x4')
Q = q_init(env)
print(Q.shape)

(64, 4)
(64, 4)
(9, 4)
(16, 4)


# 2. Epsilon Greedy

Write a function ```def epsilon_greedy(Q, state, epsilon):``` that uses epsilon-greedy to determine the next action:

    Q is a numpy.ndarray containing the q-table
    state is the current state
    epsilon is the epsilon to use for the calculation
    You should sample p with numpy.random.uniform to determine if your algorithm should explore or exploit
    If exploring, you should pick the next action with numpy.random.randint from all possible actions
    Returns: the next action index


In [63]:
def epsilon_greedy(Q, state, epsilon):
    # exploration
    if np.random.uniform() < epsilon:
        return np.random.choice(Q.shape[1])
    # exploitation
    else:
        return np.argmax(Q[state, :])

In [64]:
desc = [['S', 'F', 'F'], ['F', 'H', 'H'], ['F', 'F', 'G']]
env = load_frozen_lake(desc=desc)
Q = q_init(env)
Q[7] = np.array([0.5, 0.7, 1, -1])
np.random.seed(0)
print(epsilon_greedy(Q, 7, 0.5))
np.random.seed(1)
print(epsilon_greedy(Q, 7, 0.5))

2
0


# 3. Q-learning

Write the function ```def train(env, Q, episodes=5000, max_steps=100, alpha=0.1, gamma=0.99, epsilon=1, min_epsilon=0.1, epsilon_decay=0.05):``` that performs Q-learning:

    env is the FrozenLakeEnv instance
    Q is a numpy.ndarray containing the Q-table
    episodes is the total number of episodes to train over
    max_steps is the maximum number of steps per episode
    alpha is the learning rate
    gamma is the discount rate
    epsilon is the initial threshold for epsilon greedy
    min_epsilon is the minimum value that epsilon should decay to
    epsilon_decay is the decay rate for updating epsilon between episodes
    When the agent falls in a hole, the reward should be updated to be -1
    Returns: Q, total_rewards
        Q is the updated Q-table
        total_rewards is a list containing the rewards per episode


In [128]:
def train(env, Q, episodes=5000, max_steps=100, alpha=0.1, gamma=0.99,
          epsilon=1, min_epsilon=0.1, epsilon_decay=0.05):

    total_rewards = []
    for episode in range(episodes):
        state = env.reset()
        done = False
        episode_rewards = 0

        for step in range(max_steps):
            action = epsilon_greedy(Q, state, epsilon)
            new_state, reward, done, _ = env.step(action)

            if reward == 0 and done:
                episode_rewards += -1
            else:
                episode_rewards += reward

            Q[state, action] = Q[state, action] * (1 - alpha) + \
                alpha * (reward + gamma * np.max(Q[new_state, :]))

            state = new_state

            if done:
                break

        epsilon = min_epsilon + (1 - min_epsilon) * (np.exp(-epsilon_decay * episode))

        total_rewards.append(episode_rewards)

    return Q, total_rewards

In [129]:
np.random.seed(0)
desc = [['S', 'F', 'F'], ['F', 'H', 'H'], ['F', 'F', 'G']]
env = load_frozen_lake(desc=desc)
Q = q_init(env)

Q, total_rewards  = train(env, Q)
print(Q)
split_rewards = np.split(np.array(total_rewards), 10)
for i, rewards in enumerate(split_rewards):
    print((i+1) * 500, ':', np.mean(rewards))

[[0.96059593 0.970299   0.95098488 0.96059396]
 [0.96059557 0.         0.0094072  0.37627228]
 [0.18061285 0.         0.         0.        ]
 [0.97029877 0.9801     0.         0.96059583]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.98009763 0.98009933 0.99       0.9702983 ]
 [0.98009922 0.98999782 1.         0.        ]
 [0.         0.         0.         0.        ]]
500 : 0.812
1000 : 0.88
1500 : 0.9
2000 : 0.9
2500 : 0.88
3000 : 0.844
3500 : 0.892
4000 : 0.896
4500 : 0.852
5000 : 0.928


# 4. Play

Write a function ```def play(env, Q, max_steps=100):``` that has the trained agent play an episode:

    env is the FrozenLakeEnv instance
    Q is a numpy.ndarray containing the Q-table
    max_steps is the maximum number of steps in the episode
    Each state of the board should be displayed via the console
    You should always exploit the Q-table
    Returns: the total rewards for the episode



In [114]:
def play(env, Q, max_steps=100):

    state = env.reset()
    episode_rewards = 0

    def print_render(input):
        for line in input:
            print(line, end='')

    print_render(env.render())


    for step in range(max_steps):
        action = np.argmax(Q[state, :])
        new_state, reward, done, _ = env.step(action)
        episode_rewards += reward
        state = new_state

        print_render(env.render())


        if done:
            break

    return episode_rewards

In [115]:
np.random.seed(0)
desc = [['S', 'F', 'F'], ['F', 'H', 'H'], ['F', 'F', 'G']]
env = load_frozen_lake(desc=desc)
Q = q_init(env)

Q, total_rewards  = train(env, Q)
print(play(env, Q))


[41mS[0mFF
FHH
FFG
  (Down)
SFF
[41mF[0mHH
FFG
  (Down)
SFF
FHH
[41mF[0mFG
  (Right)
SFF
FHH
F[41mF[0mG
  (Right)
SFF
FHH
FF[41mG[0m
1.0
