Install and load all dependencies (first time only) \
NOTE: you may need to restart the runtime afterwards (CTRL+M .).

In [None]:
!apt-get install -y \
    libgl1-mesa-dev \
    libgl1-mesa-glx \
    libglew-dev \
    libosmesa6-dev \
    software-properties-common

!apt-get install -y patchelf

!pip install gym
!pip install free-mujoco-py
!pip install stable-baselines3
!pip install cma

Set up the custom Hopper environment



1.   Upload `classes.zip` to the current session's file storage
2.   Unzip it by running the cell below


In [None]:
!unzip classes.zip



---



\

**Train an RL agent on the OpenAI Gym Hopper environment using PPO (Stable-Baselines3)**

\


TASK 2 and 3: interleave data collection with policy updates

In [None]:
import gym
import os
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback
from env.custom_hopper import *
from stable_baselines3.common.evaluation import evaluate_policy

In [None]:
# --- Run configuration -------------------------------------------------------

# Training budget. It is passed to `learn(total_timesteps=...)` further down,
# so despite the name it is counted in environment steps.
n_episodes = 300_000
print_every = 1_000

device = "cpu"
algorithm = "PPO"  # choices=['PPO']

# choices=['CustomHopper-source-v0', 'CustomHopper-target-v0', 'CustomHopper-sudr-v0']
env_id = "CustomHopper-sudr-v0"

# Output locations for saved models and logs.
save_path = "./models/"
log_path = "./logs/"

In [None]:
# Create the model and log directories up front; already-existing ones are fine.
for output_dir in (save_path, log_path):
    os.makedirs(output_dir, exist_ok=True)

In [None]:
# Repeatedly: collect data on the target env, fit the source env's parameters to
# it, train PPO on the fitted source env, and score the policy on the target env.
# The best-scoring policy is saved as "ppo_custom_hopper_best".

# Parameters
num_iterations = 3  # Number of times to repeat the optimization and training process
best_mean_reward = -float('inf')
best_model = None

# Previously saved PPO policy; it is handed to collect_real_data() on the target env.
human = PPO.load("PPO_300k_target_2")

for i in range(num_iterations):
    print(f"Iteration {i + 1}/{num_iterations}")

    # Step 1: Collect rewards from the target environment
    env_target = gym.make('CustomHopper-target-v0')
    real_actions, real_rewards = env_target.unwrapped.collect_real_data(human)

    # Step 2: Optimize parameters based on the known torso mass difference
    env_source = gym.make('CustomHopper-source-v0')
    optimized_params = env_source.unwrapped.random_search_optimization(real_actions, real_rewards)
    print(optimized_params)

    # Step 3: Train policy using optimized parameters.
    # A single call on the unwrapped env is sufficient: the extra wrapper-level
    # call that used to precede it resolved (via gym's attribute forwarding) to
    # this very same method.
    env_source.unwrapped.set_parameters(optimized_params)

    # EvalCallback needs its own environment instance. Evaluating on the env
    # that PPO is currently collecting rollouts from would reset it mid-rollout
    # and desynchronise the observations PPO trains on.
    eval_env = gym.make('CustomHopper-source-v0')
    eval_env.unwrapped.set_parameters(optimized_params)

    # Per-iteration output directory. Built in a local variable so the global
    # `save_path` is never modified (an interrupted run used to leave it altered).
    iteration_save_path = save_path + '_' + str(i)
    # exist_ok=True already tolerates an existing directory; any other failure
    # (e.g. permissions) is a real error and must not be silenced.
    os.makedirs(iteration_save_path, exist_ok=True)

    trained_model = PPO('MlpPolicy', env_source, verbose=1, tensorboard_log=log_path)

    checkpoint_callback = CheckpointCallback(save_freq=10000, save_path=iteration_save_path,
                                             name_prefix='rl_model' + str(i))

    eval_callback = EvalCallback(eval_env, best_model_save_path=iteration_save_path,
                                 log_path=log_path, eval_freq=5000,
                                 deterministic=True, render=False)

    # NOTE: `n_episodes` is consumed here as a number of environment timesteps.
    trained_model.learn(total_timesteps=n_episodes, callback=[checkpoint_callback, eval_callback])
    trained_model.save(os.path.join(iteration_save_path, f"{algorithm}_final_model"))

    # Step 4: Evaluate the policy in the target environment
    print("Evaluating in target environment (target)...")
    mean_reward, std_reward = evaluate_policy(trained_model, env_target, n_eval_episodes=10)
    print(f"Target Environment - Mean Reward: {mean_reward}, Std Reward: {std_reward}")

    # Save the model if it's the best one
    if mean_reward > best_mean_reward:
        best_mean_reward = mean_reward
        best_model = trained_model
        best_model.save("ppo_custom_hopper_best")
        best_model.save("ppo_custom_hopper_best" + str(i))

    # Release this iteration's environments; fresh ones are created next time.
    for used_env in (env_source, eval_env, env_target):
        used_env.close()

# Load and test the best model on a freshly created target environment, rather
# than on whatever environment object the last loop iteration left behind.
best_model = PPO.load("ppo_custom_hopper_best")
env_target = gym.make('CustomHopper-target-v0')
mean_reward, std_reward = evaluate_policy(best_model, env_target, n_eval_episodes=10)
env_target.close()
print(f"Best Model - Mean Reward: {mean_reward}, Std Reward: {std_reward}")