The code below is extracted from https://huggingface.co/learn/deep-rl-course/unit1/hands-on

Dependencies required for Colab are not used here, since this notebook is designed to be run offline.
Hugging Face-specific modules are also not imported in this notebook.

# Import all modules required for this unit

In [None]:
import gymnasium
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor

from PIL import Image

## An example evaluation loop running a random policy

In [None]:
import gymnasium as gym

# Build the LunarLander-v2 environment and put it into its initial state.
env = gym.make("LunarLander-v2")
observation, info = env.reset()

# Drive the environment with 20 randomly sampled actions.
for _ in range(20):
    action = env.action_space.sample()
    print("Action taken:", action)

    # step() applies the action and reports the next observation, the reward,
    # and two separate end-of-episode flags (terminated / truncated).
    observation, reward, terminated, truncated, info = env.step(action)

    # Nothing more to do while the episode is still running.
    if not (terminated or truncated):
        continue

    # The episode ended (landed/crashed, or timed out): start a new one.
    print("Environment is reset")
    observation, info = env.reset()

env.close()

## Examining the observation space and action space

In [None]:
# Create the environment with gym.make("<name_of_the_environment>") purely to
# inspect its observation and action spaces.
env = gym.make("LunarLander-v2")
env.reset()

print("_____OBSERVATION SPACE_____ \n")
print("Observation Space Shape", env.observation_space.shape)
# NOTE: sample() draws a random point from within the space's bounds; it is
# not a state produced by the simulator.
print("Sample observation", env.observation_space.sample())

print("\n _____ACTION SPACE_____ \n")
# For a discrete action space, `n` is the number of available actions.
print("Action Space Shape", env.action_space.n)
print("Action Space Sample", env.action_space.sample())  # Take a random action

# Release the inspection environment; nothing below reuses this instance.
env.close()

### Importance of checking the observation space

The Hugging Face tutorial states that the last two observations are Booleans. However, sampling the observation space gives values anywhere between 0 and 1, rather than exactly 0 or 1.
Can you explain the reason for this?

## Creating your first environment and defining your policy

In [None]:
# Instantiate the environment that the policy defined below is built on.
env_id = "LunarLander-v2"
env = gym.make(env_id)

In [None]:
# Hyperparameters for the PPO policy, gathered in one place so they are easy
# to tweak between runs.
ppo_hyperparameters = {
    "n_steps": 1024,
    "batch_size": 64,
    "n_epochs": 4,
    "gamma": 0.999,
    "gae_lambda": 0.98,
    "ent_coef": 0.01,
    "verbose": 1,
}

# Build the PPO agent with an MLP policy on the environment created above.
model = PPO(policy="MlpPolicy", env=env, **ppo_hyperparameters)

## Train your policy with just a single line of code

In [None]:
# Training budget, in environment timesteps.
# NOTE(review): the original line carried a trailing "#000", which suggests
# the budget was reduced from 1000000 for a quick run -- confirm before
# relying on the trained model's quality.
training_timesteps = 1000
model.learn(total_timesteps=training_timesteps)

# Persist the trained model under a descriptive name.
model_name = "ppo-LunarLander-v2"
model.save(model_name)

## Evaluate your model after training

In [None]:
# Evaluate on a separate environment instance wrapped in Monitor.
eval_env = Monitor(gym.make("LunarLander-v2"))

# Run 10 evaluation episodes with deterministic action selection.
evaluation_result = evaluate_policy(
    model,
    eval_env,
    n_eval_episodes=10,
    deterministic=True,
)
mean_reward, std_reward = evaluation_result

print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

## Visualization

In [None]:
import base64

def show_gif(fname):
    """Return an IPython HTML object that shows the GIF stored at *fname*.

    The file is read in binary mode and embedded in an ``<img>`` tag as a
    base64 ``data:`` URI, so the markup does not reference the file path.
    """
    # Imported lazily so IPython is only needed when a GIF is displayed.
    from IPython.display import HTML

    with open(fname, 'rb') as gif_file:
        raw_bytes = gif_file.read()

    b64 = base64.b64encode(raw_bytes).decode('ascii')
    markup = f'<img src="data:image/gif;base64,{b64}" />'
    return HTML(markup)

In [None]:
# Roll out one episode with the trained model, capturing every rendered frame
# so the episode can be saved and displayed as an animated GIF.
images = []
terminated = False
truncated = False

param_filename = 'LunarLander'

# render_mode="rgb_array" makes render() return the frame as an array, which
# is then converted to a PIL image.
render_env = gym.make("LunarLander-v2", render_mode="rgb_array")

state, info = render_env.reset()
img = render_env.render()
images.append(Image.fromarray(img))

# Stop as soon as the episode ends for EITHER reason. The parentheses matter:
# `not terminated or truncated` parses as `(not terminated) or truncated`,
# which keeps stepping the environment after it has been truncated.
while not (terminated or truncated):
    # predict() returns an (action, state) pair; only the action is needed.
    action, _states = model.predict(state)
    state, reward, terminated, truncated, info = render_env.step(action)
    img = render_env.render()
    images.append(Image.fromarray(img))

# All frames are captured; release the rendering environment.
render_env.close()

# Save your gif (loop=0 makes the animation repeat forever)
images[0].save(f"{param_filename}.gif", save_all=True, append_images=images[1:], loop=0)

show_gif(f"{param_filename}.gif")