In [None]:
! apt install swig cmake -q
! pip install stable-baselines3==2.0.0a5 swig gymnasium[box2d] huggingface_sb3 -q
! sudo apt-get update -q
! apt install python3-opengl ffmpeg xvfb -q
! pip3 install pyvirtualdisplay -q

In [None]:
from pyvirtualdisplay import Display

# Headless (visible=0) virtual display of 1400x900 pixels. It is started here
# and never stopped in this notebook, so it stays up for the later cells.
virtual_display = Display(
    visible=0,
    size=(1400, 900),
)
virtual_display.start()

In [None]:
import os

import gymnasium as gym

from huggingface_hub import notebook_login
from huggingface_sb3 import load_from_hub, package_to_hub
from stable_baselines3 import PPO, DQN, A2C
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv

In [None]:
# Authenticate with the Hugging Face Hub; the package_to_hub calls later in this
# notebook push to repositories under a user namespace.
notebook_login()

In [None]:
# Policy identifier shared by the PPO, DQN and A2C models built below.
POLICY = "MlpPolicy"

## Proximal Policy Optimization (PPO)

In [None]:
# Training budget and environment for the PPO run.
total_timesteps = 1_000_000
env = gym.make("LunarLander-v2")

# PPO hyperparameters gathered in one place, then unpacked into the constructor.
ppo_hyperparams = dict(
    n_steps=1024,
    batch_size=32,
    n_epochs=5,
    gamma=0.999,
    gae_lambda=0.98,
    ent_coef=0.01,
)

model = PPO(policy=POLICY, env=env, verbose=0, **ppo_hyperparams)

In [None]:
# Train the PPO agent for the configured budget, showing a progress bar.
model.learn(
    total_timesteps=total_timesteps,
    progress_bar=True,
)

# Persist the trained agent; `model_name` is reused by the upload cell.
model_name = "/content/ppo-LunarLander-v2"
model.save(model_name)

In [None]:
# Score the trained agent on a fresh Monitor-wrapped environment:
# 10 episodes, deterministic actions.
eval_env = Monitor(gym.make("LunarLander-v2"))
mean_reward, std_reward = evaluate_policy(
    model,
    eval_env,
    n_eval_episodes=10,
    deterministic=True,
)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

In [None]:
# Metadata describing the PPO run that is being published.
env_id = "LunarLander-v2"
model_architecture = "PPO"
repo_id = "zypchn/ppo-Lunar-Lander"
commit_message = f"Upload {env_id} with {model_architecture} trained agent"

# `model_name` was set to an absolute save path ("/content/...") by the training
# cell. Only its final component is passed on as the artifact name.
# NOTE(review): package_to_hub appears to join model_name onto its own staging
# directory, which an absolute path would escape -- confirm against huggingface_sb3.
hub_model_name = os.path.basename(model_name)

# Single-environment vectorized wrapper, created with rgb_array rendering.
eval_env = DummyVecEnv([lambda: Monitor(gym.make(env_id, render_mode="rgb_array"))])

package_to_hub(
    model=model,
    model_name=hub_model_name,
    model_architecture=model_architecture,
    env_id=env_id,
    eval_env=eval_env,
    repo_id=repo_id,
    commit_message=commit_message,
)

## Deep Q-Network (DQN)

In [None]:
# Training budget and environment for the DQN run.
total_timesteps = 1_000_000
env = gym.make("LunarLander-v2")

In [None]:
# DQN hyperparameters gathered in one place, then unpacked into the constructor.
dqn_hyperparams = dict(
    batch_size=128,
    gamma=0.999,
    learning_rate=3e-4,
    learning_starts=1_000,
    buffer_size=200_000,
    exploration_final_eps=0.1,
    exploration_fraction=0.3,
    policy_kwargs=dict(net_arch=[256, 256]),
)

model = DQN(policy=POLICY, env=env, verbose=0, **dqn_hyperparams)

In [None]:
# Train the DQN agent for the configured budget, showing a progress bar.
model.learn(
    total_timesteps=total_timesteps,
    progress_bar=True,
)

# Persist the trained agent; `model_name` is reused by the upload cell.
model_name = "/content/dqn-LunarLander-v2"
model.save(model_name)

In [None]:
# Score the trained agent on a fresh Monitor-wrapped environment:
# 10 episodes, deterministic actions.
eval_env = Monitor(gym.make("LunarLander-v2"))
mean_reward, std_reward = evaluate_policy(
    model,
    eval_env,
    n_eval_episodes=10,
    deterministic=True,
)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

In [None]:
# Metadata describing the DQN run that is being published.
env_id = "LunarLander-v2"
model_architecture = "DQN"
repo_id = "zypchn/dqn-Lunar-Lander"
commit_message = f"Upload {env_id} with {model_architecture} trained agent"

# `model_name` was set to an absolute save path ("/content/...") by the training
# cell. Only its final component is passed on as the artifact name.
# NOTE(review): package_to_hub appears to join model_name onto its own staging
# directory, which an absolute path would escape -- confirm against huggingface_sb3.
hub_model_name = os.path.basename(model_name)

# Single-environment vectorized wrapper, created with rgb_array rendering.
eval_env = DummyVecEnv([lambda: Monitor(gym.make(env_id, render_mode="rgb_array"))])

package_to_hub(
    model=model,
    model_name=hub_model_name,
    model_architecture=model_architecture,
    env_id=env_id,
    eval_env=eval_env,
    repo_id=repo_id,
    commit_message=commit_message,
)

## Advantage Actor-Critic (A2C)

In [None]:
# Training budget for the A2C run, and a vectorized environment made of
# 8 LunarLander copies.
total_timesteps = 1_000_000
env = make_vec_env("LunarLander-v2", n_envs=8)

In [None]:
# A2C hyperparameters gathered in one place, then unpacked into the constructor.
a2c_hyperparams = dict(
    gamma=0.999,
    gae_lambda=0.95,
    n_steps=16,
    ent_coef=0.05,
    vf_coef=0.25,
    max_grad_norm=0.5,
    policy_kwargs=dict(net_arch=[256, 256]),
)

model = A2C(policy=POLICY, env=env, verbose=0, **a2c_hyperparams)

In [None]:
# Train the A2C agent for the configured budget, showing a progress bar.
model.learn(
    total_timesteps=total_timesteps,
    progress_bar=True,
)

# Persist the trained agent; `model_name` is reused by the upload cell.
model_name = "/content/a2c-LunarLander-v2"
model.save(model_name)

In [None]:
# Score the trained agent on a fresh, single Monitor-wrapped environment:
# 10 episodes, deterministic actions.
eval_env = Monitor(gym.make("LunarLander-v2"))
mean_reward, std_reward = evaluate_policy(
    model,
    eval_env,
    n_eval_episodes=10,
    deterministic=True,
)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

In [None]:
# Metadata describing the A2C run that is being published.
env_id = "LunarLander-v2"
model_architecture = "A2C"
repo_id = "zypchn/a2c-Lunar-Lander"
commit_message = f"Upload {env_id} with {model_architecture} trained agent"

# `model_name` was set to an absolute save path ("/content/...") by the training
# cell. Only its final component is passed on as the artifact name.
# NOTE(review): package_to_hub appears to join model_name onto its own staging
# directory, which an absolute path would escape -- confirm against huggingface_sb3.
hub_model_name = os.path.basename(model_name)

# Single-environment vectorized wrapper, created with rgb_array rendering.
eval_env = DummyVecEnv([lambda: Monitor(gym.make(env_id, render_mode="rgb_array"))])

package_to_hub(
    model=model,
    model_name=hub_model_name,
    model_architecture=model_architecture,
    env_id=env_id,
    eval_env=eval_env,
    repo_id=repo_id,
    commit_message=commit_message,
)