In [2]:
import gym
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback


class CustomHalfCheetahEnv(gym.Wrapper):
    """Wrap HalfCheetah-v4 and replace each step's reward with a shaped one.

    The shaped reward is a weighted sum of a forward-velocity bonus,
    penalties on height deviation, one joint angle, angular velocities and
    action magnitude, plus a down-weighted share of the environment's own
    reward. Every weight defaults to the value that used to be hard-coded,
    so ``CustomHalfCheetahEnv()`` produces the same rewards as before.

    Observation indices follow the HalfCheetah-v4 layout as documented by
    Gym/Gymnasium (17 entries: 0-7 positions/angles, 8-16 velocities).
    Re-check them if the environment id or version changes.

    Args:
        env: Environment to wrap. When ``None`` (the default), a fresh
            ``HalfCheetah-v4`` is created with ``gym.make``.
        forward_weight: Multiplier on ``observation[8]`` (x velocity).
        height_weight: Penalty per unit of ``|observation[0] - target_height|``.
        target_height: Reference value for ``observation[0]``.
        rotation_weight: Penalty per unit of ``|observation[2]|``.
        energy_weight: Penalty per unit of summed ``|observation[10:17]|``.
        control_weight: Penalty per unit of summed squared action.
        original_reward_weight: Share of the wrapped env's reward that is kept.
    """

    def __init__(
        self,
        env=None,
        *,
        forward_weight=1.0,
        height_weight=0.05,
        target_height=0.5,
        rotation_weight=0.1,
        energy_weight=0.001,
        control_weight=0.01,
        original_reward_weight=0.5,
    ):
        if env is None:
            env = gym.make("HalfCheetah-v4")
        super().__init__(env)

        self.forward_weight = forward_weight
        self.height_weight = height_weight
        self.target_height = target_height
        self.rotation_weight = rotation_weight
        self.energy_weight = energy_weight
        self.control_weight = control_weight
        self.original_reward_weight = original_reward_weight

    def step(self, action):
        """Step the wrapped env and substitute the shaped reward.

        Returns the wrapped environment's 5-tuple
        ``(observation, reward, terminated, truncated, info)`` with only the
        reward replaced.
        """
        observation, original_reward, terminated, truncated, info = self.env.step(action)
        reward = self.modified_reward_function(observation, action, original_reward)
        return observation, reward, terminated, truncated, info

    def modified_reward_function(self, observation, action, original_reward):
        """Compute the shaped reward for a single transition.

        Args:
            observation: Observation returned by the wrapped env's ``step``.
            action: Action that was applied on this step.
            original_reward: Reward returned by the wrapped env's ``step``.

        Returns:
            The weighted sum of all shaping terms and the scaled original
            reward.
        """
        # Forward progress: x velocity (index 8).
        forward_reward = self.forward_weight * observation[8]

        # Keep the z coordinate (index 0) close to the target height.
        # NOTE(review): confirm that the default target of 0.5 matches the
        # scale/offset of observation[0] in this environment.
        height_penalty = -self.height_weight * abs(observation[0] - self.target_height)

        # Penalise the angle at index 2 ("second rotor" in the env docs).
        # NOTE(review): the docs list index 1 as the body (front tip) angle;
        # if body pitch stability was the goal, index 1 may have been
        # intended. Behaviour is left unchanged here.
        rotation_penalty = -self.rotation_weight * abs(observation[2])

        # Energy proxy: total absolute angular velocity (indices 10-16),
        # computed in one vectorized call instead of a Python-level loop.
        energy_penalty = -self.energy_weight * np.sum(np.abs(observation[10:17]))

        # Penalise action magnitude (sum of squares). This does not look at
        # the previous action, so it is not a penalty on action *changes*.
        control_penalty = -self.control_weight * np.sum(np.square(action))

        # NOTE(review): per the env docs the original reward already contains
        # a forward-velocity term and a control cost, so those effects are
        # counted both here and in the shaping terms above.
        return (
            forward_reward
            + height_penalty
            + rotation_penalty
            + energy_penalty
            + control_penalty
            + self.original_reward_weight * original_reward
        )


# Reward-shaped environment; the captured log output shows SB3 adding the
# Monitor and DummyVecEnv wrappers around it on its own.
env = CustomHalfCheetahEnv()

# PPO with the library's default hyperparameters and an MLP policy.
model = PPO("MlpPolicy", env, verbose=1)

# Periodic snapshots of the model while training runs.
# NOTE(review): per the SB3 docs save_freq counts callback calls (one per
# env step with a single env) - confirm if vectorized envs are introduced.
checkpoint_settings = {
    "save_freq": 100000,
    "save_path": './logs/',
    "name_prefix": 'ppo_halfcheetah',
}
checkpoint_callback = CheckpointCallback(**checkpoint_settings)

model.learn(callback=checkpoint_callback, total_timesteps=1000000)

print("Training completed!")



Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | -330     |
| time/              |          |
|    fps             | 745      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1e+03       |
|    ep_rew_mean          | -387        |
| time/                   |             |
|    fps                  | 577         |
|    iterations           | 2           |
|    time_elapsed         | 7           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009496237 |
|    clip_fraction        | 0.1         |
|    clip_range           | 0.2         |
|    entropy_loss  