<a href="https://colab.research.google.com/github/wilstermanz/holbertonschool-machine_learning/blob/main/reinforcement_learning/temporal_difference/temporal_difference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import tensorflow as tf
if tf.__version__!='2.11.0':
  !pip install tensorflow==2.11.0 --quiet
  os.kill(os.getpid(), 9)

In [2]:
!pip install gym==0.7.0 --quiet
import gym
try:
    if gym.__version__ != '0.7.0':
        !pip uninstall gym --quiet
        !pip install gym==0.7.0 --quiet
        os.kill(os.getpid(), 9)
except Exception:
    if gym.version.VERSION != '0.7.0':
        !pip uninstall gym --quiet
        !pip install gym==0.7.0 --quiet
        os.kill(os.getpid(), 9)
print(gym.version.VERSION)

0.7.0


In [3]:
!pip install keras-rl2 --quiet
!pip install atari-py --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.1/52.1 kB[0m [31m915.5 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m540.6/540.6 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for atari-py (setup.py) ... [?25l[?25hdone



# 0. Monte Carlo

Write the function ```def monte_carlo(env, V, policy, episodes=5000, max_steps=100, alpha=0.1, gamma=0.99):``` that performs the Monte Carlo algorithm:

    env is the openAI environment instance
    V is a numpy.ndarray of shape (s,) containing the value estimate
    policy is a function that takes in a state and returns the next action to take
    episodes is the total number of episodes to train over
    max_steps is the maximum number of steps per episode
    alpha is the learning rate
    gamma is the discount rate
    Returns: V, the updated value estimate


In [4]:
def monte_carlo(env, V, policy, episodes=5000, max_steps=100, alpha=0.1, gamma=0.99):

    # iterate through episodes
    for episode in range(episodes):
        cumulative_reward = 0
        state = env.reset()
        episode_results = []

        # perform one episode
        for step in range(max_steps):
            action = policy(state)
            observation, reward, done, info = env.step(action)
            episode_results.append([state, reward])
            if done:
                break
            state = observation

        # perform Monte Carlo algorithm for episode
        episode_results = np.array(episode_results, dtype=int)
        for time in range(len(episode_results), 0, -1):
            state, reward = episode_results[time - 1]
            cumulative_reward = gamma * cumulative_reward + reward
            if state not in episode_results[:episode, 0]:
                V[state] = V[state] + alpha * (cumulative_reward - V[state])


    return V


In [5]:
import gym
import numpy as np
np.random.seed(0)

env = gym.make('FrozenLake8x8-v0')
LEFT, DOWN, RIGHT, UP = 0, 1, 2, 3

def policy(s):
    p = np.random.uniform()
    if p > 0.5:
        if s % 8 != 7 and env.desc[s // 8, s % 8 + 1] != b'H':
            return RIGHT
        elif s // 8 != 7 and env.desc[s // 8 + 1, s % 8] != b'H':
            return DOWN
        elif s // 8 != 0 and env.desc[s // 8 - 1, s % 8] != b'H':
            return UP
        else:
            return LEFT
    else:
        if s // 8 != 7 and env.desc[s // 8 + 1, s % 8] != b'H':
            return DOWN
        elif s % 8 != 7 and env.desc[s // 8, s % 8 + 1] != b'H':
            return RIGHT
        elif s % 8 != 0 and env.desc[s // 8, s % 8 - 1] != b'H':
            return LEFT
        else:
            return UP

V = np.where(env.desc == b'H', -1, 1).reshape(64).astype('float64')
np.set_printoptions(precision=4)
env.seed(0)
print(monte_carlo(env, V, policy).reshape((8, 8)))

INFO:gym.envs.registration:Making new env: FrozenLake8x8-v0
[2023-09-26 14:25:52,559] Making new env: FrozenLake8x8-v0
  result = entry_point.load(False)


[[ 0.81    0.9     0.4783  0.4305  0.3874  0.4305  0.6561  0.9   ]
 [ 0.9     0.729   0.5905  0.4783  0.5905  0.2824  0.2824  0.3874]
 [ 1.      0.5314  0.729  -1.      1.      0.3874  0.2824  0.4305]
 [ 1.      0.5905  0.81    0.9     1.     -1.      0.3874  0.6561]
 [ 1.      0.6561  0.81   -1.      1.      1.      0.729   0.5314]
 [ 1.     -1.     -1.      1.      1.      1.     -1.      0.9   ]
 [ 1.     -1.      1.      1.     -1.      1.     -1.      1.    ]
 [ 1.      1.      1.     -1.      1.      1.      1.      1.    ]]


# 1. TD(λ)

Write the function `def td_lambtha(env, V, policy, lambtha, episodes=5000, max_steps=100, alpha=0.1, gamma=0.99):` that performs the TD(λ) algorithm:

    env is the openAI environment instance
    V is a numpy.ndarray of shape (s,) containing the value estimate
    policy is a function that takes in a state and returns the next action to take
    lambtha is the eligibility trace factor
    episodes is the total number of episodes to train over
    max_steps is the maximum number of steps per episode
    alpha is the learning rate
    gamma is the discount rate
    Returns: V, the updated value estimate


In [6]:
def td_lambtha(env, V, policy, lambtha, episodes=5000, max_steps=100, alpha=0.1, gamma=0.99):
    # iterate through episodes
    for episode in range(episodes):
        state = env.reset()

        # e_traces start at zero for the episode
        e_trace = np.zeros_like(V)

        # perform one episode
        for step in range(max_steps):
            # act according to policy
            action = policy(state)
            observation, reward, done, info = env.step(action)

            if done:
                break

            # update eligibility
            e_trace[state] += 1
            e_trace *= lambtha * gamma

            # get the td-error and update every state's value estimate
            # according to their eligibilities
            delta = reward + gamma * V[observation] - V[state]
            V += alpha * delta * e_trace

            state = observation

    return V

In [7]:
np.random.seed(0)

env = gym.make('FrozenLake8x8-v0')
LEFT, DOWN, RIGHT, UP = 0, 1, 2, 3

def policy(s):
    p = np.random.uniform()
    if p > 0.5:
        if s % 8 != 7 and env.desc[s // 8, s % 8 + 1] != b'H':
            return RIGHT
        elif s // 8 != 7 and env.desc[s // 8 + 1, s % 8] != b'H':
            return DOWN
        elif s // 8 != 0 and env.desc[s // 8 - 1, s % 8] != b'H':
            return UP
        else:
            return LEFT
    else:
        if s // 8 != 7 and env.desc[s // 8 + 1, s % 8] != b'H':
            return DOWN
        elif s % 8 != 7 and env.desc[s // 8, s % 8 + 1] != b'H':
            return RIGHT
        elif s % 8 != 0 and env.desc[s // 8, s % 8 - 1] != b'H':
            return LEFT
        else:
            return UP

V = np.where(env.desc == b'H', -1, 1).reshape(64).astype('float64')
np.set_printoptions(precision=4)
env.seed(0)
print(td_lambtha(env, V, policy, 0.9).reshape((8, 8)))

INFO:gym.envs.registration:Making new env: FrozenLake8x8-v0
[2023-09-26 14:25:58,647] Making new env: FrozenLake8x8-v0


[[ 0.6863  0.7055  0.7161  0.7252  0.7357  0.7294  0.7228  0.7185]
 [ 0.7021  0.7154  0.7303  0.7472  0.7732  0.7525  0.7657  0.7523]
 [ 0.7242  0.7284  0.7605 -1.      0.8042  0.7763  0.7984  0.7836]
 [ 0.7524  0.7324  0.7877  0.8485  0.8616 -1.      0.8188  0.8359]
 [ 0.775   0.7378  0.7653 -1.      0.8792  0.8932  0.8273  0.884 ]
 [ 0.8604 -1.     -1.      0.9983  0.9032  0.9409 -1.      0.9261]
 [ 0.8695 -1.      0.9649  0.9922 -1.      0.9832 -1.      0.9433]
 [ 0.8819  0.9196  0.9298 -1.      1.      0.9867  0.9991  1.    ]]


# 2. SARSA(λ)

Write the function `def sarsa_lambtha(env, Q, lambtha, episodes=5000, max_steps=100, alpha=0.1, gamma=0.99, epsilon=1, min_epsilon=0.1, epsilon_decay=0.05):` that performs SARSA(λ):

    env is the openAI environment instance
    Q is a numpy.ndarray of shape (s,a) containing the Q table
    lambtha is the eligibility trace factor
    episodes is the total number of episodes to train over
    max_steps is the maximum number of steps per episode
    alpha is the learning rate
    gamma is the discount rate
    epsilon is the initial threshold for epsilon greedy
    min_epsilon is the minimum value that epsilon should decay to
    epsilon_decay is the decay rate for updating epsilon between episodes
    Returns: Q, the updated Q table


In [18]:
def sarsa_lambtha(env, Q, lambtha, episodes=5000, max_steps=100, alpha=0.1,
                  gamma=0.99, epsilon=1, min_epsilon=0.1, epsilon_decay=0.05):

    def epsilon_greedy(Q, state, epsilon):
        # exploration
        if np.random.uniform() < epsilon:
            return np.random.choice(Q.shape[1])
        # exploitation
        else:
            return np.argmax(Q[state, :])

    # play episodes
    for episode in range(episodes):

        # reset env state at the beginning of each episode
        state = env.reset()

        # reset eligibility to 0 for each episode
        e_trace = np.zeros_like(Q)

        # find first action according to policy
        action = epsilon_greedy(Q, state, epsilon)

        for step in range(max_steps):
            # decay epsilon
            epsilon = min_epsilon + (1 - min_epsilon) * (
                np.exp(-epsilon_decay * episode))

            # take first action, observe r, s'
            observation, reward, done, info = env.step(action)

            # choose a' from s' using policy
            next_action = epsilon_greedy(Q, observation, epsilon)

            # find delta
            delta = reward + (gamma * Q[observation, next_action]) \
                - Q[state, action]

            # update eligibility
            e_trace[state, action] += 1

            # for all s, a:
            Q += alpha * delta * e_trace
            e_trace *= gamma * lambtha

            # check if done
            if done:
                break

            action = next_action
            state = observation

    return Q

In [24]:
np.random.seed(0)
env = gym.make('FrozenLake8x8-v0')
Q = np.random.uniform(size=(64, 4))
np.set_printoptions(precision=4)
print(sarsa_lambtha(env, Q, 0.9))

INFO:gym.envs.registration:Making new env: FrozenLake8x8-v0
[2023-09-26 14:54:13,854] Making new env: FrozenLake8x8-v0


[[0.6751 0.5399 0.5991 0.6146]
 [0.569  0.5506 0.6948 0.5634]
 [0.5421 0.6697 0.5076 0.5551]
 [0.6799 0.563  0.5342 0.5503]
 [0.5702 0.5168 0.5433 0.562 ]
 [0.5698 0.5841 0.5518 0.5655]
 [0.6185 0.6161 0.6514 0.6225]
 [0.6692 0.6142 0.6416 0.6112]
 [0.598  0.6946 0.608  0.5809]
 [0.7055 0.6087 0.6112 0.5702]
 [0.682  0.5616 0.6079 0.5397]
 [0.3986 0.4996 0.4892 0.678 ]
 [0.5955 0.5288 0.5465 0.5586]
 [0.6176 0.6244 0.6354 0.6411]
 [0.7083 0.6466 0.6574 0.6469]
 [0.6182 0.6071 0.6169 0.6584]
 [0.6566 0.7563 0.6364 0.6397]
 [0.5981 0.7691 0.6528 0.5993]
 [0.7591 0.5113 0.4666 0.5392]
 [0.2828 0.1202 0.2961 0.1187]
 [0.5112 0.5743 0.4602 0.497 ]
 [0.6011 0.6412 0.7813 0.6693]
 [0.7781 0.6965 0.717  0.7144]
 [0.6717 0.7463 0.5937 0.6758]
 [0.681  0.6884 0.8111 0.689 ]
 [0.5859 0.8456 0.716  0.7061]
 [0.7085 0.8558 0.6916 0.6272]
 [0.6685 0.8575 0.6714 0.5628]
 [0.6877 0.8238 0.7181 0.6877]
 [0.8811 0.5813 0.8817 0.6925]
 [0.7418 0.8189 0.7575 0.7514]
 [0.7623 0.7458 0.6196 0.7127]
 [0.633 