In [1]:
import gym
from stable_baselines3 import PPO, SAC, A2C
from stable_baselines3.common.evaluation import evaluate_policy

env_id = "BipedalWalker-v3"
timesteps = 200_000
eval_episodes = 10

algorithms_dict = {
    "PPO": PPO,
    "SAC": SAC,
    "A2C": A2C
}
results = {}



for name, algo in algorithms_dict.items():
    print(f"Training {name}")

    env = gym.make(env_id)
    model = algo("MlpPolicy", env, verbose=1)
    model.learn(total_timesteps=timesteps)
    mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=eval_episodes)
    results[name] = (mean_reward, std_reward)
    print(f"{name} - Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")
    env.close()

Training PPO
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




---------------------------------
| rollout/           |          |
|    ep_len_mean     | 70.4     |
|    ep_rew_mean     | -108     |
| time/              |          |
|    fps             | 705      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 166          |
|    ep_rew_mean          | -113         |
| time/                   |              |
|    fps                  | 608          |
|    iterations           | 2            |
|    time_elapsed         | 6            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0084170075 |
|    clip_fraction        | 0.0697       |
|    clip_range           | 0.2          |
|    entropy_loss         | -5.68        |
|    explained_variance   | -0.0123      |
|    learning_r



---------------------------------
| rollout/           |          |
|    ep_len_mean     | 469      |
|    ep_rew_mean     | -112     |
| time/              |          |
|    episodes        | 4        |
|    fps             | 45       |
|    time_elapsed    | 41       |
|    total_timesteps | 1877     |
| train/             |          |
|    actor_loss      | -14.8    |
|    critic_loss     | 1.33     |
|    ent_coef        | 0.588    |
|    ent_coef_loss   | -3.5     |
|    learning_rate   | 0.0003   |
|    n_updates       | 1776     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 464      |
|    ep_rew_mean     | -106     |
| time/              |          |
|    episodes        | 8        |
|    fps             | 43       |
|    time_elapsed    | 84       |
|    total_timesteps | 3708     |
| train/             |          |
|    actor_loss      | -20.8    |
|    critic_loss     | 1.05     |
|    ent_coef 

In [3]:
for name, (mean_reward, std_reward) in results.items():
    print(f"{name} - Mean reward: {mean_reward}")

PPO - Mean reward: 230.3684687
SAC - Mean reward: -81.8897189
A2C - Mean reward: -113.7345415


In [None]:
class JointPenaltyWrapper(gym.RewardWrapper):
    def __init__(self, env, penalty_weight=0.1):
        super().__init__(env)
        self.penalty_weight = penalty_weight
        self.prev_joint_angles = None

    def reset(self, **kwargs):
        result = self.env.reset(**kwargs)
        if isinstance(result, tuple):
            obs = result[0]
        else:
            obs = result
        self.prev_joint_angles = obs[4:8].copy()
        return result

    def step(self, action):
        result = self.env.step(action)
        if len(result) == 5:
            obs, reward, terminated, truncated, info = result
            joint_angles = obs[4:8]
            delta = np.abs(joint_angles - self.prev_joint_angles).sum()
            reward -= self.penalty_weight * delta
            self.prev_joint_angles = joint_angles.copy()
            return obs, reward, terminated, truncated, info
        else:
            obs, reward, done, info = result
            joint_angles = obs[4:8]
            delta = np.abs(joint_angles - self.prev_joint_angles).sum()
            reward -= self.penalty_weight * delta
            self.prev_joint_angles = joint_angles.copy()
            return obs, reward, done, info

In [14]:


penalty_weight = 0.1
results_penalized = {}

for name, algo in algorithms_dict.items():
    print(f"Training {name} with joint penalty")

    env = JointPenaltyWrapper(gym.make(env_id), penalty_weight=penalty_weight)
    model = algo("MlpPolicy", env, verbose=1)
    model.learn(total_timesteps=timesteps)
    mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=eval_episodes)
    results_penalized[name] = (mean_reward, std_reward)
    print(f"{name} (penalized) - Mean reward: {mean_reward}")
    env.close()

Training PPO with joint penalty
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 842      |
|    ep_rew_mean     | -210     |
| time/              |          |
|    fps             | 894      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 838         |
|    ep_rew_mean          | -213        |
| time/                   |             |
|    fps                  | 688         |
|    iterations           | 2           |
|    time_elapsed         | 5           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009140572 |
|    clip_fraction        | 0.0887      |
|    clip_range           | 0.

In [15]:
for name, (mean_reward, std_reward) in results.items():
    print(f"{name} - Mean reward: {mean_reward}")

PPO - Mean reward: 230.3684687
SAC - Mean reward: -81.8897189
A2C - Mean reward: -113.7345415


### Unexpeted Output
**In the [15] you have printed the previous reward without the penlized rewards**

### Suggestions
- You can create a graph to show the reward differences between the default and penalized reward
- - Creating GIFs of robot simulations with different reward types in the same environment, shown side by side, can **enhance your understanding** of what actually changed — whether the robot **truly learned something new** after the reward modification.