In [None]:
import gymnasium as gym
import numpy as np
import os
from stable_baselines3 import DDPG
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.noise import NormalActionNoise

class EnhancedTrainingLogger(BaseCallback):
    def __init__(self, log_filepath="anti_inversion_ddpg_training2.txt", save_dir="model_checkpoints", 
                 save_freq=50000, model_prefix="anti_inversion_ddpg2"):
        super().__init__(verbose=0)
        self.log_filepath = log_filepath
        self.save_dir = save_dir
        self.save_freq = save_freq
        self.model_prefix = model_prefix
        self.episode_total_reward = 0
        self.control_costs = 0
        self.last_save = 0
        
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        
    def _on_step(self):

        if self.num_timesteps >= self.last_save + self.save_freq:
            model_path = os.path.join(self.save_dir, f"{self.model_prefix}_{self.num_timesteps}")
            self.model.save(model_path)
            print(f"Model saved at {model_path}")
            self.last_save = self.num_timesteps
        

        current_reward = self.locals.get("rewards")[0] if self.locals.get("rewards") is not None else 0
        self.episode_total_reward += current_reward
        
        environment_info = self.locals.get("infos")[0] if self.locals.get("infos") is not None else {}
        episode_done = self.locals.get("dones")[0] if self.locals.get("dones") is not None else False
        

        if "reward_ctrl" in environment_info:
            self.control_costs += abs(environment_info["reward_ctrl"])
        
        if episode_done:
            distance_traveled = environment_info.get("x_position", 0)
            with open(self.log_filepath, "a") as log_file:
                log_file.write(f"{self.num_timesteps},{self.episode_total_reward:.4f},{distance_traveled:.4f},{self.control_costs:.4f}\n")
            self.episode_total_reward = 0
            self.control_costs = 0
        return True
    
    def _on_training_start(self):
        with open(self.log_filepath, "w") as log_file:
            log_file.write("timestep,reward,distance,control_cost\n")

class AntiInversionCheetahEnv(gym.Wrapper):
    def __init__(self):
        super().__init__(gym.make("HalfCheetah-v4"))
        
    def step(self, action):
        observation, original_reward, terminated, truncated, info = self.env.step(action)
        modified_reward = self.modified_reward_function(observation, action, original_reward)
        return observation, modified_reward, terminated, truncated, info
    
    def modified_reward_function(self, observation, action, original_reward):
        # Get TRUE torso angle (using the correct index)
        torso_angle = observation[1]  # rooty - actual torso angle
        
        # Simple posture penalty based on cosine of the angle
        posture_penalty = -1.0 * min(0, np.cos(torso_angle))
        
        # Original reward weight
        original_reward_weight = 1.0
        
        # Combined reward
        reward = posture_penalty + original_reward_weight * original_reward
        
        return reward

def train_anti_inversion_with_ddpg():
    # Setup environment
    print("Creating anti-inversion cheetah environment...")
    cheetah_environment = AntiInversionCheetahEnv()

    # Action noise for exploration
    n_actions = cheetah_environment.action_space.shape[0]
    action_noise = NormalActionNoise(
        mean=np.zeros(n_actions),
        sigma=0.1 * np.ones(n_actions)
    )

    # Initialize DDPG model
    print("Initializing DDPG model...")
    training_model = DDPG(
        policy="MlpPolicy",
        env=cheetah_environment,
        action_noise=action_noise,
        buffer_size=100000,
        learning_rate=1e-3,
        batch_size=256,
        gamma=0.99,
        verbose=1
    )

    # Setup logger with checkpoint saving capability
    print("Setting up training logger...")
    progress_logger = EnhancedTrainingLogger(
        log_filepath="anti_inversion_ddpg_training2.txt",
        save_dir="model_checkpoints",
        save_freq=50000,
        model_prefix="anti_inversion_ddpg2"
    )

    # Train the model
    print("Starting training for 1,000,000 timesteps...")
    training_model.learn(total_timesteps=1000000, callback=progress_logger)

    # Save the final model
    print("Saving final trained model...")
    training_model.save("anti_inversion_ddpg_final2")

    print("Training completed!")
    print(f"Training logs saved to {progress_logger.log_filepath}")
    print(f"Model checkpoints saved in {progress_logger.save_dir}")

if __name__ == "__main__":
    train_anti_inversion_with_ddpg()

Creating anti-inversion cheetah environment...


  logger.deprecation(


Initializing DDPG model...
Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Setting up training logger...
Starting training for 1,000,000 timesteps...
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | -37.8    |
| time/              |          |
|    episodes        | 4        |
|    fps             | 14       |
|    time_elapsed    | 284      |
|    total_timesteps | 4000     |
| train/             |          |
|    actor_loss      | -6.44    |
|    critic_loss     | 0.031    |
|    learning_rate   | 0.001    |
|    n_updates       | 3899     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | 47.8     |
| time/              |          |
|    episodes        | 8        |
|    fps             | 13       |
|    time_elapsed    | 576      |
|    total_timesteps 

In [1]:
import gymnasium as gym
import numpy as np
import os
from stable_baselines3 import DDPG
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.noise import NormalActionNoise

class EnhancedTrainingLogger(BaseCallback):
    def __init__(self, log_filepath="anti_inversion_ddpg_training3.txt", save_dir="model_checkpoints", 
                 save_freq=50000, model_prefix="anti_inversion_ddpg3"):
        super().__init__(verbose=0)
        self.log_filepath = log_filepath
        self.save_dir = save_dir
        self.save_freq = save_freq
        self.model_prefix = model_prefix
        self.episode_total_reward = 0
        self.control_costs = 0
        self.last_save = 0
        
        # Create save directory if it doesn't exist
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        
    def _on_step(self):
        # Save model at regular intervals
        if self.num_timesteps >= self.last_save + self.save_freq:
            model_path = os.path.join(self.save_dir, f"{self.model_prefix}_{self.num_timesteps}")
            self.model.save(model_path)
            print(f"Model saved at {model_path}")
            self.last_save = self.num_timesteps
        
        # Log rewards and metrics
        current_reward = self.locals.get("rewards")[0] if self.locals.get("rewards") is not None else 0
        self.episode_total_reward += current_reward
        
        environment_info = self.locals.get("infos")[0] if self.locals.get("infos") is not None else {}
        episode_done = self.locals.get("dones")[0] if self.locals.get("dones") is not None else False
        
        # Track control costs
        if "reward_ctrl" in environment_info:
            self.control_costs += abs(environment_info["reward_ctrl"])
        
        if episode_done:
            distance_traveled = environment_info.get("x_position", 0)
            with open(self.log_filepath, "a") as log_file:
                log_file.write(f"{self.num_timesteps},{self.episode_total_reward:.4f},{distance_traveled:.4f},{self.control_costs:.4f}\n")
            self.episode_total_reward = 0
            self.control_costs = 0
        return True
    
    def _on_training_start(self):
        with open(self.log_filepath, "w") as log_file:
            log_file.write("timestep,reward,distance,control_cost\n")

class AntiInversionCheetahEnv(gym.Wrapper):
    def __init__(self):
        super().__init__(gym.make("HalfCheetah-v4"))
        
    def step(self, action):
        observation, original_reward, terminated, truncated, info = self.env.step(action)
        modified_reward = self.modified_reward_function(observation, action, original_reward)
        return observation, modified_reward, terminated, truncated, info
    
    def modified_reward_function(self, observation, action, original_reward):
        # Get TRUE torso angle (using the correct index)
        torso_angle = observation[1]  # rooty - actual torso angle
        
        # Simple posture penalty based on cosine of the angle
        posture_penalty = -1.0 * min(0, np.cos(torso_angle))
        
        # Original reward weight
        original_reward_weight = 1.0
        
        # Combined reward
        reward = posture_penalty + original_reward_weight * original_reward
        
        return reward

def train_anti_inversion_with_ddpg():
    # Setup environment
    print("Creating anti-inversion cheetah environment...")
    cheetah_environment = AntiInversionCheetahEnv()

    # Action noise for exploration
    n_actions = cheetah_environment.action_space.shape[0]
    action_noise = NormalActionNoise(
        mean=np.zeros(n_actions),
        sigma=0.1 * np.ones(n_actions)
    )

    # Initialize DDPG model
    print("Initializing DDPG model...")
    training_model = DDPG(
        policy="MlpPolicy",
        env=cheetah_environment,
        action_noise=action_noise,
        buffer_size=100000,
        learning_rate=1e-3,
        batch_size=256,
        gamma=0.99,
        verbose=1
    )

    # Setup logger with checkpoint saving capability
    print("Setting up training logger...")
    progress_logger = EnhancedTrainingLogger(
        log_filepath="anti_inversion_ddpg_training3.txt",
        save_dir="model_checkpoints",
        save_freq=50000,
        model_prefix="anti_inversion_ddpg3"
    )

    # Train the model
    print("Starting training for 1,000,000 timesteps...")
    training_model.learn(total_timesteps=1000000, callback=progress_logger)

    # Save the final model
    print("Saving final trained model...")
    training_model.save("anti_inversion_ddpg_final3")

    print("Training completed!")
    print(f"Training logs saved to {progress_logger.log_filepath}")
    print(f"Model checkpoints saved in {progress_logger.save_dir}")

if __name__ == "__main__":
    train_anti_inversion_with_ddpg()

Creating anti-inversion cheetah environment...


  logger.deprecation(


Initializing DDPG model...
Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Setting up training logger...
Starting training for 1,000,000 timesteps...
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | 94.3     |
| time/              |          |
|    episodes        | 4        |
|    fps             | 104      |
|    time_elapsed    | 38       |
|    total_timesteps | 4000     |
| train/             |          |
|    actor_loss      | -6.06    |
|    critic_loss     | 0.0675   |
|    learning_rate   | 0.001    |
|    n_updates       | 3899     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | 274      |
| time/              |          |
|    episodes        | 8        |
|    fps             | 99       |
|    time_elapsed    | 80       |
|    total_timesteps 