In [None]:
import gymnasium as gym
import numpy as np
import os
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import BaseCallback

class EnhancedTrainingLogger(BaseCallback):
    """Callback that checkpoints the model and writes a per-episode CSV log.

    On every step it accumulates the reward and the absolute value of the
    ``reward_ctrl`` info entry; when an episode ends it appends one CSV row
    ``timestep,reward,distance,control_cost`` to ``log_filepath`` and resets
    the accumulators. It also saves the model each time ``save_freq`` more
    timesteps have elapsed since the previous checkpoint.

    Only index 0 of the ``rewards`` / ``infos`` / ``dones`` entries is read,
    so with several parallel environments only one of them is logged
    (presumably the first sub-environment -- confirm against the vec-env
    layout in use).

    Args:
        log_filepath: CSV file that is truncated at training start and then
            appended to once per finished episode.
        save_dir: Directory receiving the periodic checkpoints; created if
            it does not exist.
        save_freq: Minimum number of timesteps between two checkpoints.
        model_prefix: File-name prefix for checkpoints; the current timestep
            count is appended after an underscore.
    """

    def __init__(self, log_filepath="anti_inversion_ppo_training2.txt", save_dir="model_checkpoints",
                 save_freq=50000, model_prefix="anti_inversion_ppo"):
        super().__init__(verbose=0)
        self.log_filepath = log_filepath
        self.save_dir = save_dir
        self.save_freq = save_freq
        self.model_prefix = model_prefix
        # Running totals for the episode currently in progress.
        self.episode_total_reward = 0
        self.control_costs = 0
        # Timestep count at which the last checkpoint was written.
        self.last_save = 0

        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(save_dir, exist_ok=True)

    def _on_training_start(self):
        """Start a fresh log file containing only the CSV header."""
        with open(self.log_filepath, "w", encoding="utf-8") as log_file:
            log_file.write("timestep,reward,distance,control_cost\n")

    def _on_step(self):
        """Checkpoint if due, update episode totals, and log finished episodes.

        Returns:
            Always ``True``; this callback never asks for training to stop.
        """
        # Save model at regular intervals.
        if self.num_timesteps >= self.last_save + self.save_freq:
            model_path = os.path.join(self.save_dir, f"{self.model_prefix}_{self.num_timesteps}")
            self.model.save(model_path)
            print(f"Model saved at {model_path}")
            self.last_save = self.num_timesteps

        # Look each entry up once. Compare against None explicitly because
        # these may be numpy arrays, whose truth value is ambiguous.
        rewards = self.locals.get("rewards")
        infos = self.locals.get("infos")
        dones = self.locals.get("dones")

        current_reward = rewards[0] if rewards is not None else 0
        environment_info = infos[0] if infos is not None else {}
        episode_done = dones[0] if dones is not None else False

        self.episode_total_reward += current_reward

        # Track control costs as a magnitude, whatever the sign of the term.
        if "reward_ctrl" in environment_info:
            self.control_costs += abs(environment_info["reward_ctrl"])

        if episode_done:
            distance_traveled = environment_info.get("x_position", 0)
            with open(self.log_filepath, "a", encoding="utf-8") as log_file:
                log_file.write(
                    f"{self.num_timesteps},{self.episode_total_reward:.4f},"
                    f"{distance_traveled:.4f},{self.control_costs:.4f}\n"
                )
            # Reset the accumulators for the next episode.
            self.episode_total_reward = 0
            self.control_costs = 0
        return True

class AntiInversionCheetahEnv(gym.Wrapper):
    """HalfCheetah-v4 wrapper whose reward penalizes an inverted torso.

    The wrapped environment's reward is kept and a non-positive posture term
    derived from ``observation[1]`` is added to it. Observations, termination
    flags and the info dict are passed through unchanged.
    """

    def __init__(self):
        super().__init__(gym.make("HalfCheetah-v4"))

    def step(self, action):
        """Step the wrapped environment and substitute the shaped reward.

        Args:
            action: Action forwarded unchanged to the wrapped environment.

        Returns:
            The wrapped environment's ``(observation, reward, terminated,
            truncated, info)`` tuple with ``reward`` replaced by the output
            of ``modified_reward_function``.
        """
        observation, original_reward, terminated, truncated, info = self.env.step(action)
        modified_reward = self.modified_reward_function(observation, action, original_reward)
        return observation, modified_reward, terminated, truncated, info

    def modified_reward_function(self, observation, action, original_reward):
        """Return the original reward plus a posture penalty.

        Args:
            observation: Observation from the wrapped environment; index 1 is
                read as the torso angle in radians (rooty).
            action: Action that was taken; currently unused.
            original_reward: Reward reported by the wrapped environment.

        Returns:
            ``original_reward`` when the cosine of the torso angle is
            non-negative, otherwise ``original_reward`` reduced by up to 1.
        """
        # Get TRUE torso angle (using the correct index)
        torso_angle = observation[1]  # rooty - actual torso angle

        # cos(angle) is negative once the torso is rotated past 90 degrees
        # (assuming angle 0 is the upright pose -- TODO confirm). Clamping at
        # zero leaves upright poses untouched and yields a term in [-1, 0].
        # The previous version multiplied this by -1.0, which turned the
        # penalty into a bonus for being upside down.
        posture_penalty = min(0.0, np.cos(torso_angle))

        # Original reward weight
        original_reward_weight = 1.0

        # Combined reward
        reward = posture_penalty + original_reward_weight * original_reward

        return reward

def train_anti_inversion_with_ppo():
    """Train PPO on the anti-inversion cheetah environment and save the result.

    Builds the environment, a default-hyperparameter ``MlpPolicy`` PPO model
    and an ``EnhancedTrainingLogger`` callback, trains for 1,000,000
    timesteps, and saves the final model as ``anti_inversion_ppo_final2``.
    The environment is closed when training finishes or fails.
    """
    # Setup environment
    print("Creating anti-inversion cheetah environment...")
    cheetah_environment = AntiInversionCheetahEnv()

    try:
        # Initialize PPO model with default hyperparameters
        print("Initializing PPO model...")
        training_model = PPO(
            policy="MlpPolicy",
            env=cheetah_environment,
            verbose=1
        )

        # Setup logger with checkpoint saving capability
        print("Setting up training logger...")
        progress_logger = EnhancedTrainingLogger(
            log_filepath="anti_inversion_ppo_training2.txt",
            save_dir="model_checkpoints",
            save_freq=50000,
            model_prefix="anti_inversion_ppo2"
        )

        # Train the model
        print("Starting training for 1,000,000 timesteps...")
        training_model.learn(total_timesteps=1000000, callback=progress_logger)

        # Save the final model
        print("Saving final trained model...")
        training_model.save("anti_inversion_ppo_final2")
    finally:
        # Release the environment's resources even if setup, training or
        # saving raised; previously the environment was never closed.
        cheetah_environment.close()

    print("Training completed!")
    print(f"Training logs saved to {progress_logger.log_filepath}")
    print(f"Model checkpoints saved in {progress_logger.save_dir}")

# Run the full training pipeline only when executed as a script (or notebook
# cell), not when this module is imported.
if __name__ == "__main__":
    train_anti_inversion_with_ppo()

Creating anti-inversion cheetah environment...


  logger.deprecation(


Initializing PPO model...
Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




Setting up training logger...
Starting training for 1,000,000 timesteps...
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | 142      |
| time/              |          |
|    fps             | 966      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1e+03       |
|    ep_rew_mean          | 105         |
| time/                   |             |
|    fps                  | 681         |
|    iterations           | 2           |
|    time_elapsed         | 6           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009235426 |
|    clip_fraction        | 0.0921      |
|    clip_range           | 0.2         |
|    entropy_loss         | -8.5       