In [1]:
#  _       __                 _            
# | |     / /___ __________  (_)___  ____ _
# | | /| / / __ `/ ___/ __ \/ / __ \/ __ `/
# | |/ |/ / /_/ / /  / / / / / / / / /_/ / 
# |__/|__/\__,_/_/  /_/ /_/_/_/ /_/\__, /  
#                                 /____/   
# set up a new python enviroment with python 3.8 and use it to run this code.
# it will install all required packages
# some code is based on the tutorial below:
# https://www.sliceofexperiments.com/p/an-actually-runnable-march-2023-tutorial


#run the following commands in terminal to set up the env. 
# conda create -n dsan_rl python=3.8
#conda activate dsan_rl


Collecting ray
  Downloading ray-2.10.0-cp38-cp38-manylinux2014_x86_64.whl.metadata (13 kB)
Collecting click>=7.0 (from ray)
  Using cached click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting filelock (from ray)
  Downloading filelock-3.13.4-py3-none-any.whl.metadata (2.8 kB)
Collecting jsonschema (from ray)
  Using cached jsonschema-4.21.1-py3-none-any.whl.metadata (7.8 kB)
Collecting msgpack<2.0.0,>=1.0.0 (from ray)
  Downloading msgpack-1.0.8-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.1 kB)
Collecting protobuf!=3.19.5,>=3.15.3 (from ray)
  Downloading protobuf-5.26.1-cp37-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
Collecting pyyaml (from ray)
  Downloading PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.1 kB)
Collecting aiosignal (from ray)
  Using cached aiosignal-1.3.1-py3-none-any.whl.metadata (4.0 kB)
Collecting frozenlist (from ray)
  Downloading frozenlist-1.4.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_

In [5]:
import gymnasium as gym
from gymnasium.wrappers import FlattenObservation

In [6]:
import gymnasium as gym
env = gym.make("LunarLander-v2", render_mode="human")
observation, info = env.reset()

for _ in range(100):
    action = env.action_space.sample()  # agent policy that uses the observation and info
    observation, reward, terminated, truncated, info = env.step(action)

    if terminated or truncated:
        observation, info = env.reset()

env.close()

In [8]:
env = gym.make("LunarLander-v2")
print(env.observation_space.shape)
print(env.action_space)

wrapped_env = FlattenObservation(env)
print(wrapped_env.observation_space)
print(wrapped_env.observation_space.shape)

(8,)
Discrete(4)
Box([-1.5       -1.5       -5.        -5.        -3.1415927 -5.
 -0.        -0.       ], [1.5       1.5       5.        5.        3.1415927 5.        1.
 1.       ], (8,), float32)
(8,)


In [9]:
print(wrapped_env)
print(wrapped_env.unwrapped)

<FlattenObservation<TimeLimit<OrderEnforcing<PassiveEnvChecker<LunarLander<LunarLander-v2>>>>>>
<LunarLander<LunarLander-v2>>


In [1]:
from __future__ import annotations

from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from matplotlib.patches import Patch
from tqdm import tqdm

import gymnasium as gym

In [2]:
env = gym.make("LunarLander-v2")

In [3]:
done = False
observation, info = env.reset()

print(observation)
print(info)

[ 0.00423479  1.4111595   0.42892367  0.01062951 -0.00490029 -0.09715758
  0.          0.        ]
{}


In [7]:
class LunarLader:
    def __init__(
            self,
            learning_rate,#float
            init_eps,#float
            decay_eps,#float
            final_eps,#float
            discount = 0.95#float
    ):
        # self.q_values = defaultdict(lambda: np.zeros(env.action.space.n))
        # self.q_values = defaultdict(lambda: np.zeros(env.action_space.n))
        self.q_values = defaultdict(lambda: np.zeros(env.action_space.n))

        self.lr = learning_rate
        self.discount = discount
        self.eps = init_eps
        self.decay_eps = decay_eps
        self.final_eps = final_eps

        self.training_error = []

    def get_action(self, obs):
        if np.random.random() < self.eps:
            return env.action_space.sample()
        else:
            return int(np.argmax(self.q_values[obs]))
        
    # def get_action(self, obs):
    #     if np.random.random() < self.eps:
    #         return env.action_space.sample()
    #     else:
    #         return int(np.argmax(self.q_values[obs]))

        
    # def update(
    #         self,
    #         obs,
    #         action,
    #         reward,
    #         terminated,
    #         next_obs
    # ):
    #     future_q_value = (not terminated) * np.max(self.q_values[tuple(next_obs)])
    #     temporal_difference = (
    #         reward + self.discount * future_q_value - self.q_values[obs][action]
    #     )

    #     self.q_values[obs][action] = (self.q_values + self.lr * temporal_difference)

    #     self.training_error.append(temporal_difference)

    def update(self, obs, action, reward, terminated, next_obs):
        future_q_value = (not terminated) * np.max(self.q_values[tuple(next_obs)])
        temporal_difference = (
            reward + self.discount * future_q_value - self.q_values[obs][action]
    )

        self.q_values[obs][action] += self.lr * temporal_difference  # Fix this line

        self.training_error.append(temporal_difference)


    def decay_epsilon(self):
        self.eps = max(self.final_eps, self.eps - self.decay_eps)



In [5]:
learning_rate = 0.01
n_episodes = 100_000
start_epsilon = 1.0
epsilon_decay = start_epsilon / (n_episodes / 2)  # reduce the exploration over time
final_epsilon = 0.1

agent = LunarLader(
    learning_rate=learning_rate,
    init_eps=start_epsilon,
    decay_eps=epsilon_decay,
    final_eps=final_epsilon,
)

In [9]:
env = gym.wrappers.RecordEpisodeStatistics(env, deque_size=n_episodes)
for episode in tqdm(range(n_episodes)):
    obs, info = env.reset()
    done = False

    # play one episode
    while not done:
        action = agent.get_action(obs)
        next_obs, reward, terminated, truncated, info = env.step(action)

        # update the agent
        agent.update(obs, action, reward, terminated, next_obs)

        # update if the environment is done and the current obs
        done = terminated or truncated
        obs = next_obs

    agent.decay_epsilon()

  0%|          | 0/100000 [00:00<?, ?it/s]


TypeError: unhashable type: 'numpy.ndarray'