Install and load all dependencies (first time only) \
NOTE: you may need to restart the runtime afterwards (CTRL+M .).

In [None]:
!apt-get install -y \
    libgl1-mesa-dev \
    libgl1-mesa-glx \
    libglew-dev \
    libosmesa6-dev \
    software-properties-common

!apt-get install -y patchelf

!pip install gym
!pip install free-mujoco-py
!pip install stable-baselines3[extra]

Set up the custom Hopper environment



1.   Upload `classes.zip` to the current session's file storage
2.   Unzip it by running the cell below


In [None]:
!unzip classes.zip



---



\

**Train an RL agent on the OpenAI Gym Hopper environment using PPO or SAC (Stable-Baselines3)**

\


TASK 2 and 3: interleave data collection with policy updates

In [None]:
import gym
from env.custom_hopper import *
import os
from stable_baselines3 import PPO, SAC
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback

In [None]:
# ---- Run configuration ----

# Which agent to train and on which environment.
algorithm = 'PPO'  # choices=['PPO', 'SAC']
env_id = 'CustomHopper-source-v0'  # choices=['CustomHopper-source-v0', 'CustomHopper-target-v0']
device = 'cpu'

# Training budget. NOTE: despite its name, this value is handed to
# model.learn(total_timesteps=...) in the training cell, so it is a budget
# in environment timesteps.
n_episodes = 300_000
# Reporting interval; not referenced by the other cells in this section.
print_every = 1_000

# Output locations for saved models and logs.
save_path = './models/'
log_path = './logs/'

In [None]:
# Instantiate the training environment selected in the configuration cell.
env = gym.make(env_id)

# Report the spaces and the dynamics parameters exposed by the environment.
print(f'Action space: {env.action_space}')
print(f'State space: {env.observation_space}')
print(f'Dynamics parameters: {env.get_parameters()}')

# Make sure the output directories exist (no error if they already do).
for output_dir in (save_path, log_path):
    os.makedirs(output_dir, exist_ok=True)

In [None]:
# Training: build the selected agent, attach checkpoint and evaluation
# callbacks, run the learning loop, and save the final model.

# Hyperparameters forwarded to the agent constructor below.
model_params = {
    'n_steps': 2048,      # Number of steps to run for each environment per update
    'batch_size': 64,
    'gamma': 0.99,        # Discount factor
    'learning_rate': 3e-4
}

if algorithm == 'PPO':
    model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path,
                device=device, **model_params)
elif algorithm == 'SAC':
    # SAC's constructor has no `n_steps` argument, so forward only the
    # hyperparameters it accepts.
    sac_params = {key: value for key, value in model_params.items()
                  if key != 'n_steps'}
    model = SAC('MlpPolicy', env, verbose=1, tensorboard_log=log_path,
                device=device, **sac_params)
else:
    raise ValueError("Algorithm not supported")

# Periodically snapshot the model into save_path.
checkpoint_callback = CheckpointCallback(save_freq=10000, save_path=save_path,
                                         name_prefix='rl_model')

# Evaluate on a separate instance of the same environment id and keep the
# best-performing model in save_path.
eval_env = gym.make(env_id)
eval_callback = EvalCallback(eval_env, best_model_save_path=save_path,
                             log_path=log_path, eval_freq=5000,
                             deterministic=True, render=False)

# `n_episodes` is used here as the total number of environment timesteps.
model.learn(total_timesteps=n_episodes, callback=[checkpoint_callback, eval_callback])
model.save(os.path.join(save_path, f"{algorithm}_final_model"))