In [17]:
#  _       __                 _            
# | |     / /___ __________  (_)___  ____ _
# | | /| / / __ `/ ___/ __ \/ / __ \/ __ `/
# | |/ |/ / /_/ / /  / / / / / / / / /_/ / 
# |__/|__/\__,_/_/  /_/ /_/_/_/ /_/\__, /  
#                                 /____/   
# set up a new python enviroment with python 3.8 and use it to run this code.
# it will install all required packages
# some code is based on the tutorial below:
# https://www.sliceofexperiments.com/p/an-actually-runnable-march-2023-tutorial


#run the following commands in terminal to set up the env. 
# conda create -n dsan_rl python=3.8
#conda activate dsan_rl


In [18]:
import gymnasium as gym
from gymnasium.wrappers import FlattenObservation

In [19]:
import gymnasium as gym
env = gym.make("LunarLander-v2", render_mode="human")
observation, info = env.reset()

for _ in range(100):
    action = env.action_space.sample()  # agent policy that uses the observation and info
    observation, reward, terminated, truncated, info = env.step(action)

    if terminated or truncated:
        observation, info = env.reset()

env.close()

In [20]:
env = gym.make("LunarLander-v2")
print(env.observation_space.shape)
print(env.action_space)

wrapped_env = FlattenObservation(env)
print(wrapped_env.observation_space)
print(wrapped_env.observation_space.shape)

(8,)
Discrete(4)
Box([-1.5       -1.5       -5.        -5.        -3.1415927 -5.
 -0.        -0.       ], [1.5       1.5       5.        5.        3.1415927 5.        1.
 1.       ], (8,), float32)
(8,)


In [21]:
print(wrapped_env)
print(wrapped_env.unwrapped)

<FlattenObservation<TimeLimit<OrderEnforcing<PassiveEnvChecker<LunarLander<LunarLander-v2>>>>>>
<LunarLander<LunarLander-v2>>


In [37]:
from __future__ import annotations

from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from matplotlib.patches import Patch
from tqdm import tqdm

import gymnasium as gym

In [38]:
env = gym.make("LunarLander-v2")

In [39]:
done = False
observation, info = env.reset()

print(observation)
print(info)

[-0.00717926  1.4079839  -0.72719353 -0.13051201  0.00832572  0.16472018
  0.          0.        ]
{}


In [46]:
class LunarLader:
    def __init__(
            self,
            learning_rate,#float
            init_eps,#float
            decay_eps,#float
            final_eps,#float
            discount = 0.95#float
    ):
        # self.q_values = defaultdict(lambda: np.zeros(env.action.space.n))
        # self.q_values = defaultdict(lambda: np.zeros(env.action_space.n))
        self.q_values = defaultdict(lambda: np.zeros(env.action_space.n))

        self.lr = learning_rate
        self.discount = discount
        self.eps = init_eps
        self.decay_eps = decay_eps
        self.final_eps = final_eps

        self.training_error = []

    def get_action(self, obs):
        if np.random.random() < self.eps:
            return env.action_space.sample()
        else:
            return int(np.argmax(self.q_values[obs]))
        
    # def get_action(self, obs):
    #     if np.random.random() < self.eps:
    #         return env.action_space.sample()
    #     else:
    #         return int(np.argmax(self.q_values[obs]))

        
    # def update(
    #         self,
    #         obs,
    #         action,
    #         reward,
    #         terminated,
    #         next_obs
    # ):
    #     future_q_value = (not terminated) * np.max(self.q_values[tuple(next_obs)])
    #     temporal_difference = (
    #         reward + self.discount * future_q_value - self.q_values[obs][action]
    #     )

    #     self.q_values[obs][action] = (self.q_values + self.lr * temporal_difference)

    #     self.training_error.append(temporal_difference)

    def update(self, obs, action, reward, terminated, next_obs):
        future_q_value = (not terminated) * np.max(self.q_values[tuple(next_obs)])
        temporal_difference = (
            reward + self.discount * future_q_value - self.q_values[tuple(obs)][action]
    )

        self.q_values[tuple(obs)][action] += self.lr * temporal_difference  # Fix this line

        self.training_error.append(temporal_difference)


    def decay_epsilon(self):
        self.eps = max(self.final_eps, self.eps - self.decay_eps)



In [47]:
learning_rate = 0.01
n_episodes = 100_000
start_epsilon = 1.0
epsilon_decay = start_epsilon / (n_episodes / 2)  # reduce the exploration over time
final_epsilon = 0.1

agent = LunarLader(
    learning_rate=learning_rate,
    init_eps=start_epsilon,
    decay_eps=epsilon_decay,
    final_eps=final_epsilon,
)

In [50]:
env = gym.wrappers.RecordEpisodeStatistics(env, deque_size=n_episodes)
for episode in tqdm(range(n_episodes)):
    obs, info = env.reset()
    obs = obs
    done = False

    # play one episode
    while not done:
        action = agent.get_action(obs)
        next_obs, reward, terminated, truncated, info = env.step(action)

        # update the agent
        agent.update(obs, action, reward, terminated, next_obs)

        # update if the environment is done and the current obs
        done = terminated or truncated
        obs = next_obs

    agent.decay_epsilon()

  0%|          | 0/100000 [00:00<?, ?it/s]




ValueError: Attempted to add episode stats when they already exist

In [36]:
obs, info = env.reset()

print(type(obs))

<class 'tuple'>


In [32]:
from __future__ import annotations

from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from matplotlib.patches import Patch
from tqdm import tqdm

import gymnasium as gym


# Let's start by creating the blackjack environment.
# Note: We are going to follow the rules from Sutton & Barto.
# Other versions of the game can be found below for you to experiment.

env = gym.make("Blackjack-v1", sab=True)

In [33]:
# reset the environment to get the first observation
done = False
observation, info = env.reset()

# observation = (16, 9, False)

In [34]:
class BlackjackAgent:
    def __init__(
        self,
        learning_rate: float,
        initial_epsilon: float,
        epsilon_decay: float,
        final_epsilon: float,
        discount_factor: float = 0.95,
    ):

        self.q_values = defaultdict(lambda: np.zeros(env.action_space.n))

        self.lr = learning_rate
        self.discount_factor = discount_factor

        self.epsilon = initial_epsilon
        self.epsilon_decay = epsilon_decay
        self.final_epsilon = final_epsilon

        self.training_error = []

    def get_action(self, obs: tuple[int, int, bool]) -> int:

        # with probability epsilon return a random action to explore the environment
        if np.random.random() < self.epsilon:
            return env.action_space.sample()

        # with probability (1 - epsilon) act greedily (exploit)
        else:
            return int(np.argmax(self.q_values[obs]))

    def update(
        self,
        obs: tuple[int, int, bool],
        action: int,
        reward: float,
        terminated: bool,
        next_obs: tuple[int, int, bool],
    ):
        future_q_value = (not terminated) * np.max(self.q_values[next_obs])
        temporal_difference = (
            reward + self.discount_factor * future_q_value - self.q_values[obs][action]
        )

        self.q_values[obs][action] = (
            self.q_values[obs][action] + self.lr * temporal_difference
        )
        self.training_error.append(temporal_difference)

    def decay_epsilon(self):
        self.epsilon = max(self.final_epsilon, self.epsilon - epsilon_decay)

# hyperparameters
learning_rate = 0.01
n_episodes = 100_000
start_epsilon = 1.0
epsilon_decay = start_epsilon / (n_episodes / 2)  # reduce the exploration over time
final_epsilon = 0.1

agent = BlackjackAgent(
    learning_rate=learning_rate,
    initial_epsilon=start_epsilon,
    epsilon_decay=epsilon_decay,
    final_epsilon=final_epsilon,
)

In [35]:
env = gym.wrappers.RecordEpisodeStatistics(env, deque_size=n_episodes)
for episode in tqdm(range(n_episodes)):
    obs, info = env.reset()
    done = False

    # play one episode
    while not done:
        action = agent.get_action(obs)
        next_obs, reward, terminated, truncated, info = env.step(action)

        # update the agent
        agent.update(obs, action, reward, terminated, next_obs)

        # update if the environment is done and the current obs
        done = terminated or truncated
        obs = next_obs

    agent.decay_epsilon()

100%|██████████| 100000/100000 [00:14<00:00, 7104.87it/s]
