In [1]:
import numpy as np

"""Monte Carlo function"""

def monte_carlo(env, V, policy, episodes=5000, max_steps=100,
                alpha=0.1, gamma=0.99):
    """Estimate state values with every-visit Monte Carlo.

    Each episode is rolled out under ``policy`` for at most ``max_steps``
    steps. The visited states are then swept from last to first and every
    visit is nudged toward the discounted return observed from it.

    Args:
        env: environment exposing ``reset()`` (returns the start state) and
            ``step(action)`` (returns ``(next_state, reward, done, info)``).
        V: numpy.ndarray of shape (s,) holding the value estimate; it is
            modified in place.
        policy: callable mapping a state to the next action to take.
        episodes: total number of episodes to train over.
        max_steps: maximum number of steps per episode.
        alpha: learning rate.
        gamma: discount rate.

    Returns:
        V, the updated value estimate (the same object that was passed in).
    """
    for _episode in range(episodes):
        current = env.reset()
        trajectory = []  # (state, reward) pairs in the order they occurred

        for _step in range(max_steps):
            observed, reward, finished, _info = env.step(policy(current))
            trajectory.append((current, reward))
            current = observed
            if finished:
                break

        # Walk the episode backwards so the return can be accumulated
        # incrementally instead of being re-summed for every time step.
        discounted_return = 0
        for visited, reward in reversed(trajectory):
            discounted_return = gamma * discounted_return + reward
            V[visited] += alpha * (discounted_return - V[visited])
    return V


In [4]:
import gym
import numpy as np
# monte_carlo = __import__('0-monte_carlo').monte_carlo

# Seed NumPy's global RNG; policy() below draws from it via
# np.random.uniform().
np.random.seed(0)

# Environment used for the Monte Carlo run.
env = gym.make('FrozenLake8x8-v1')
# Integer action codes returned by policy().
LEFT, DOWN, RIGHT, UP = 0, 1, 2, 3

def policy(s):
    """Randomised policy that steers away from holes on the 8x8 grid.

    A single uniform draw picks one of two direction rankings:
    RIGHT, DOWN, UP (falling back to LEFT) when the draw exceeds 0.5,
    otherwise DOWN, RIGHT, LEFT (falling back to UP). The first ranked
    direction whose neighbouring tile is on the grid and is not ``b'H'``
    in ``env.desc`` is returned; the fallback is returned unchecked.

    Args:
        s: flat state index; row is ``s // 8`` and column is ``s % 8``.

    Returns:
        One of the action codes LEFT, DOWN, RIGHT, UP.
    """
    prefer_right = np.random.uniform() > 0.5
    row, col = divmod(s, 8)
    tiles = env.desc

    # Every probe tests the grid edge first, so the neighbour lookup is
    # skipped when moving that way would leave the grid.
    def open_right():
        return col != 7 and tiles[row, col + 1] != b'H'

    def open_down():
        return row != 7 and tiles[row + 1, col] != b'H'

    def open_up():
        return row != 0 and tiles[row - 1, col] != b'H'

    def open_left():
        return col != 0 and tiles[row, col - 1] != b'H'

    if prefer_right:
        ranked = ((open_right, RIGHT), (open_down, DOWN), (open_up, UP))
        fallback = LEFT
    else:
        ranked = ((open_down, DOWN), (open_right, RIGHT), (open_left, LEFT))
        fallback = UP

    for is_open, move in ranked:
        if is_open():
            return move
    return fallback

# Initial value estimate: -1 where env.desc holds b'H', +1 everywhere else,
# flattened to one float64 entry per state.
V = np.where(env.desc == b'H', -1, 1).reshape(64).astype('float64')
np.set_printoptions(precision=2)
# Seed the environment itself, separately from NumPy's global RNG.
env.seed(0)
# Print the learned values laid out as the 8x8 grid.
print(monte_carlo(env, V, policy).reshape((8, 8)))


[[ 8.78e-02  6.08e-02  3.38e-02  4.68e-02  8.12e-02  3.29e-05  6.10e-02
   1.43e-02]
 [ 4.69e-02  3.84e-02  4.29e-13  5.64e-03  9.84e-02  3.46e-02  6.94e-02
   4.35e-02]
 [ 4.38e-44  3.62e-44  6.37e-49 -1.00e+00  1.23e-01  1.30e-01  2.42e-01
   1.39e-01]
 [ 2.14e-26  4.49e-35  6.77e-27  4.16e-12  2.98e-07 -1.00e+00  2.35e-01
   9.48e-02]
 [ 3.07e-20  1.75e-94  3.42e-71 -1.00e+00  7.46e-04  4.11e-04  1.61e-01
   1.72e-01]
 [ 5.20e-23 -1.00e+00 -1.00e+00  9.00e-01  2.49e-02  3.94e-02 -1.00e+00
   2.58e-01]
 [ 4.16e-17 -1.00e+00  1.67e-01  6.56e-01 -1.00e+00  3.26e-01 -1.00e+00
   2.83e-01]
 [ 1.88e-07  4.05e-05  7.07e-03 -1.00e+00  1.00e+00  7.45e-01  8.29e-01
   1.00e+00]]


In [4]:
"""TD(λ) function"""

import numpy as np


def td_lambtha(env, V, policy, lambtha, episodes=5000,
               max_steps=100, alpha=0.1, gamma=0.99):
    """Estimate state values with TD(λ) using accumulating traces.

    After every transition the one-step TD error is applied to all states
    in proportion to their eligibility trace, and the traces are then
    decayed by ``gamma * lambtha``. Traces are reset at the start of each
    episode.

    Args:
        env: environment exposing ``reset()`` (returns the start state) and
            ``step(action)`` (returns ``(next_state, reward, done, info)``).
        V: numpy.ndarray of shape (s,) holding the value estimate; it is
            modified in place.
        policy: callable mapping a state to the next action to take.
        lambtha: eligibility trace factor.
        episodes: total number of episodes to train over.
        max_steps: maximum number of steps per episode.
        alpha: learning rate.
        gamma: discount rate.

    Returns:
        V, the updated value estimate (the same object that was passed in).
    """
    for _episode in range(episodes):
        current = env.reset()
        traces = np.zeros_like(V)

        for _step in range(max_steps):
            observed, reward, finished, _info = env.step(policy(current))

            # The bootstrap term is multiplied by False (0) on the
            # transition that ends the episode.
            bootstrap = gamma * V[observed] * (not finished)
            td_error = reward + bootstrap - V[current]

            traces[current] += 1.0
            V += alpha * td_error * traces
            traces *= gamma * lambtha

            if finished:
                break
            current = observed
    return V


In [5]:
#!/usr/bin/env python3

import gym
import numpy as np
# td_lambtha = __import__('1-td_lambtha').td_lambtha

# Seed NumPy's global RNG; policy() below draws from it via
# np.random.uniform().
np.random.seed(0)

# Environment used for the TD(λ) run.
env = gym.make('FrozenLake8x8-v1')
# Integer action codes returned by policy().
LEFT, DOWN, RIGHT, UP = 0, 1, 2, 3

def policy(s):
    """Randomised policy that steers away from holes on the 8x8 grid.

    A single uniform draw picks one of two direction rankings:
    RIGHT, DOWN, UP (falling back to LEFT) when the draw exceeds 0.5,
    otherwise DOWN, RIGHT, LEFT (falling back to UP). The first ranked
    direction whose neighbouring tile is on the grid and is not ``b'H'``
    in ``env.desc`` is returned; the fallback is returned unchecked.

    Args:
        s: flat state index; row is ``s // 8`` and column is ``s % 8``.

    Returns:
        One of the action codes LEFT, DOWN, RIGHT, UP.
    """
    prefer_right = np.random.uniform() > 0.5
    row, col = divmod(s, 8)
    tiles = env.desc

    # Every probe tests the grid edge first, so the neighbour lookup is
    # skipped when moving that way would leave the grid.
    def open_right():
        return col != 7 and tiles[row, col + 1] != b'H'

    def open_down():
        return row != 7 and tiles[row + 1, col] != b'H'

    def open_up():
        return row != 0 and tiles[row - 1, col] != b'H'

    def open_left():
        return col != 0 and tiles[row, col - 1] != b'H'

    if prefer_right:
        ranked = ((open_right, RIGHT), (open_down, DOWN), (open_up, UP))
        fallback = LEFT
    else:
        ranked = ((open_down, DOWN), (open_right, RIGHT), (open_left, LEFT))
        fallback = UP

    for is_open, move in ranked:
        if is_open():
            return move
    return fallback

# Initial value estimate: -1 where env.desc holds b'H', +1 everywhere else,
# flattened to one float64 entry per state.
V = np.where(env.desc == b'H', -1, 1).reshape(64).astype('float64')
np.set_printoptions(precision=4)
# Seed the environment as well as NumPy, matching the Monte Carlo cell:
# np.random.seed() alone does not seed the environment's own transitions,
# so without this the printed table changes from run to run.
env.seed(0)
# Print the learned values laid out as the 8x8 grid.
print(td_lambtha(env, V, policy, 0.9).reshape((8, 8)))


[[ 7.7129e-03  1.3165e-02  2.0520e-02  3.4019e-02  3.8642e-02  6.9335e-02
   6.3121e-02  7.1144e-02]
 [ 3.5873e-03  6.1755e-03  7.4042e-03  2.8975e-02  3.9403e-02  8.4272e-02
   7.3803e-02  8.4687e-02]
 [ 1.7886e-03  2.1251e-03  1.8724e-03 -1.0000e+00  2.8866e-02  7.7807e-02
   6.0959e-02  9.9125e-02]
 [ 7.5919e-04  6.5023e-04  1.3745e-03  3.0894e-03  6.5835e-03 -1.0000e+00
   3.6495e-02  5.7337e-02]
 [ 1.1376e-03  2.3054e-04  2.5522e-04 -1.0000e+00  1.6921e-02  6.5772e-02
   5.9346e-02  1.0212e-01]
 [ 8.7576e-04 -1.0000e+00 -1.0000e+00  6.1363e-01  6.4190e-02  1.0289e-01
  -1.0000e+00  2.8864e-01]
 [ 1.4933e-03 -1.0000e+00  9.8777e-02  3.4497e-01 -1.0000e+00  3.5278e-01
  -1.0000e+00  3.1137e-01]
 [ 2.6897e-03  8.1614e-03  2.4684e-02 -1.0000e+00  1.0000e+00  5.9940e-01
   9.2261e-01  1.0000e+00]]


In [6]:
"""SARSA(λ) Function"""

def sarsa_lambtha(env, Q, lambtha, episodes=5000, max_steps=100, alpha=0.1,
                  gamma=0.99, epsilon=1, min_epsilon=0.1, epsilon_decay=0.05):
    """Learn a Q table with SARSA(λ) using accumulating traces.

    Actions are chosen with ``epsilon_greedy``. After every transition the
    on-policy TD error is applied to all state-action pairs in proportion
    to their eligibility trace, and the traces are then decayed by
    ``gamma * lambtha``. Traces are reset at the start of each episode.

    Args:
        env: environment exposing ``action_space.n``, ``reset()`` (returns
            the start state) and ``step(action)`` (returns
            ``(next_state, reward, done, info)``).
        Q: numpy.ndarray of shape (s, a) holding the Q table; it is
            modified in place.
        lambtha: eligibility trace factor.
        episodes: total number of episodes to train over.
        max_steps: maximum number of steps per episode.
        alpha: learning rate.
        gamma: discount rate.
        epsilon: initial exploration probability for epsilon-greedy.
        min_epsilon: floor below which epsilon is never reduced.
        epsilon_decay: amount subtracted from epsilon after every episode.

    Returns:
        Q, the updated Q table (the same object that was passed in).
    """
    action_count = env.action_space.n

    for _episode in range(episodes):
        current = env.reset()
        chosen = epsilon_greedy(Q, current, action_count, epsilon)
        traces = np.zeros_like(Q)

        for _step in range(max_steps):
            observed, reward, finished, _info = env.step(chosen)
            upcoming = epsilon_greedy(Q, observed, action_count, epsilon)

            # The bootstrap term is multiplied by False (0) on the
            # transition that ends the episode.
            bootstrap = gamma * Q[observed, upcoming] * (not finished)
            td_error = reward + bootstrap - Q[current, chosen]

            traces[current, chosen] += 1.0
            Q += alpha * td_error * traces
            traces *= gamma * lambtha

            if finished:
                break
            current, chosen = observed, upcoming

        # Linear schedule: step epsilon down once per episode, never
        # going below min_epsilon.
        epsilon = max(epsilon - epsilon_decay, min_epsilon)
    return Q

def epsilon_greedy(Q, state, n_actions, epsilon):
    """Choose an action for ``state`` with the epsilon-greedy rule.

    One uniform sample from NumPy's global RNG decides whether to explore.
    When it is below ``epsilon`` a uniformly random action index is drawn;
    otherwise the index of the largest entry in ``Q[state]`` is used.

    Args:
        Q: Q table indexed as ``Q[state]`` -> per-action values.
        state: index of the current state.
        n_actions: number of available actions.
        epsilon: probability of taking a random action.

    Returns:
        The selected action index.
    """
    explore = np.random.rand() < epsilon
    if explore:
        return np.random.randint(n_actions)
    return np.argmax(Q[state])

  and should_run_async(code)


In [7]:
#!/usr/bin/env python3

import gym
import numpy as np
# sarsa_lambtha = __import__('2-sarsa_lambtha').sarsa_lambtha

# Seed NumPy's global RNG: it drives both the random Q initialisation below
# and the epsilon-greedy draws made during training.
np.random.seed(0)
env = gym.make('FrozenLake8x8-v1')
# Seed the environment as well, matching the Monte Carlo cell:
# np.random.seed() alone does not seed the environment's own transitions,
# so without this the printed Q table changes from run to run.
env.seed(0)
# Random initial Q table: one row per state, one column per action.
Q = np.random.uniform(size=(64, 4))
np.set_printoptions(precision=4)
print(sarsa_lambtha(env, Q, 0.9))


  deprecation(
  deprecation(


[[0.008  0.0087 0.0084 0.0074]
 [0.0093 0.0086 0.0087 0.0093]
 [0.0099 0.0117 0.0075 0.01  ]
 [0.0101 0.0119 0.0135 0.0109]
 [0.0154 0.0154 0.0146 0.0153]
 [0.0195 0.0193 0.0152 0.0194]
 [0.0206 0.0216 0.0237 0.0233]
 [0.0208 0.0243 0.0206 0.0222]
 [0.0081 0.0085 0.0078 0.0086]
 [0.0081 0.0087 0.0081 0.0085]
 [0.0094 0.0091 0.0099 0.0097]
 [0.0086 0.0097 0.0103 0.0118]
 [0.0147 0.0138 0.0142 0.0148]
 [0.018  0.018  0.0163 0.0184]
 [0.0215 0.0223 0.0226 0.0191]
 [0.024  0.0238 0.022  0.0216]
 [0.0072 0.0069 0.0071 0.0071]
 [0.0066 0.007  0.0069 0.0068]
 [0.0066 0.0067 0.0064 0.0072]
 [0.2828 0.1202 0.2961 0.1187]
 [0.0109 0.0136 0.0155 0.0135]
 [0.0182 0.0152 0.0137 0.0179]
 [0.0237 0.0249 0.026  0.0222]
 [0.0262 0.0235 0.0245 0.0238]
 [0.0068 0.0064 0.0069 0.0063]
 [0.0034 0.0057 0.0061 0.0062]
 [0.0056 0.0058 0.0061 0.006 ]
 [0.0021 0.0079 0.0026 0.0039]
 [0.0139 0.0123 0.0131 0.0061]
 [0.8811 0.5813 0.8817 0.6925]
 [0.0195 0.0289 0.0314 0.0247]
 [0.0331 0.03   0.0318 0.0194]
 [0.0046