In [None]:
import gymnasium as gym
import numpy as np
import os
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import BaseCallback

class EnhancedTrainingLogger(BaseCallback):
    """Training callback that checkpoints the model and logs per-episode metrics.

    Every ``save_freq`` timesteps the current model is written to ``save_dir``.
    At the end of each episode one CSV row is appended to ``log_filepath`` with
    the columns ``timestep,reward,distance,control_cost``.

    Only the first entry of the ``rewards`` / ``infos`` / ``dones`` sequences is
    inspected, so the metrics describe a single environment.

    Args:
        log_filepath: Path of the CSV log; it is truncated when training starts.
        save_dir: Directory that receives the checkpoints (created if missing).
        save_freq: Minimum number of timesteps between two checkpoints.
        model_prefix: File-name prefix of each checkpoint; the current timestep
            count is appended to it.
    """

    def __init__(self, log_filepath="gait_optimization_ppo_training2.txt", save_dir="model_checkpoints",
                 save_freq=50000, model_prefix="gait_optimization_ppo2"):
        super().__init__(verbose=0)
        self.log_filepath = log_filepath
        self.save_dir = save_dir
        self.save_freq = save_freq
        self.model_prefix = model_prefix
        # Running totals for the episode currently in progress.
        self.episode_total_reward = 0
        self.control_costs = 0
        # Timestep at which the last checkpoint attempt happened.
        self.last_save = 0

        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(save_dir, exist_ok=True)

    def _on_training_start(self):
        """Start a fresh log file containing only the CSV header."""
        with open(self.log_filepath, "w") as log_file:
            log_file.write("timestep,reward,distance,control_cost\n")

    def _on_step(self):
        """Checkpoint if due, accumulate step metrics, and log finished episodes.

        Returns:
            Always ``True``, so this callback never asks for training to stop.
        """
        self._maybe_save_checkpoint()

        # Fetch each entry once; a missing key falls back to a neutral value.
        rewards = self.locals.get("rewards")
        infos = self.locals.get("infos")
        dones = self.locals.get("dones")

        current_reward = rewards[0] if rewards is not None else 0
        environment_info = infos[0] if infos is not None else {}
        episode_done = dones[0] if dones is not None else False

        self.episode_total_reward += current_reward

        # Track control costs as a positive magnitude.
        if "reward_ctrl" in environment_info:
            self.control_costs += abs(environment_info["reward_ctrl"])

        if episode_done:
            self._write_episode_row(environment_info.get("x_position", 0))
            self.episode_total_reward = 0
            self.control_costs = 0
        return True

    def _maybe_save_checkpoint(self):
        """Save the model once ``save_freq`` timesteps have passed since the last attempt."""
        if self.num_timesteps < self.last_save + self.save_freq:
            return
        model_path = os.path.join(self.save_dir,
                                  f"{self.model_prefix}_{self.num_timesteps}")
        try:
            self.model.save(model_path)
            print(f"Model saved at {model_path}")
        except Exception as e:
            # Best effort: a failed checkpoint must not abort training.
            print(f"Error saving model at timestep {self.num_timesteps}: {e}")
        # Advance even after a failure; otherwise the save would be retried
        # (and the error printed) on every subsequent step.
        self.last_save = self.num_timesteps

    def _write_episode_row(self, distance_traveled):
        """Append one CSV row describing the episode that just ended."""
        with open(self.log_filepath, "a") as log_file:
            log_file.write(f"{self.num_timesteps},{self.episode_total_reward:.4f},{distance_traveled:.4f},{self.control_costs:.4f}\n")

class GaitOptimizationCheetahEnv(gym.Wrapper):
    """HalfCheetah-v4 wrapper that adds gait-shaping bonuses to the reward.

    The reward returned by ``step`` is the original environment reward plus
    four bounded bonus terms: movement smoothness (at most 0.2), spring-like
    joint motion (at most 0.15), out-of-phase thigh movement (at most 0.1) and
    forward-progress-per-control-cost efficiency (at most 0.2).

    The observation indices below follow the layout documented for
    Gymnasium's HalfCheetah: 2-7 are the six joint angles, 10 is the angular
    velocity of the root body and 11-16 are the six joint angular velocities,
    in the same joint order as the angles.
    """

    # Slices into the observation vector.
    _JOINT_ANGLES = slice(2, 8)
    _JOINT_VELOCITIES = slice(11, 17)
    # Root angular velocity plus the joint velocities, used for smoothness.
    _ANGULAR_VELOCITIES = slice(10, 17)
    _BACK_THIGH_VELOCITY = 11
    _FRONT_THIGH_VELOCITY = 14

    def __init__(self):
        super().__init__(gym.make("HalfCheetah-v4"))
        # Keep track of the previous observation to measure smoothness
        self.prev_obs = None

    def reset(self, **kwargs):
        """Reset the wrapped environment and remember the initial observation."""
        obs, info = self.env.reset(**kwargs)
        self.prev_obs = obs
        return obs, info

    def step(self, action):
        """Step the wrapped environment and substitute the shaped reward.

        Returns:
            The usual ``(observation, reward, terminated, truncated, info)``
            tuple; only ``reward`` differs from the wrapped environment.
        """
        observation, original_reward, terminated, truncated, info = self.env.step(action)
        modified_reward = self.gait_optimization_reward(observation, action, original_reward, info)

        # The reward above needed the old value; update it only afterwards.
        self.prev_obs = observation

        return observation, modified_reward, terminated, truncated, info

    def gait_optimization_reward(self, observation, action, original_reward, info):
        """Combine the original reward with the four gait-shaping bonuses.

        Args:
            observation: Observation produced by the current step.
            action: Action that was applied (not used by the shaping terms).
            original_reward: Reward reported by the wrapped environment.
            info: Step info dict; ``reward_run`` and ``reward_ctrl`` are read
                from it when present.

        Returns:
            The shaped reward as a plain ``float``.
        """
        # 1. Original reward component
        original_reward_weight = 1.0

        # 2. Movement smoothness: penalize large step-to-step velocity changes
        # (jerky movements). Without a previous observation there is no bonus.
        if self.prev_obs is not None:
            velocity_change = np.abs(
                observation[self._ANGULAR_VELOCITIES] - self.prev_obs[self._ANGULAR_VELOCITIES]
            )
            smoothness_reward = 0.2 * np.exp(-0.5 * np.mean(velocity_change))
        else:
            smoothness_reward = 0

        # 3. Energy efficiency through spring-like behavior.
        # A joint behaves like a spring when it moves back toward its neutral
        # position: positive angle with negative velocity, or the reverse, i.e.
        # when -angle * velocity is positive. Each angle must be paired with the
        # velocity of the *same* joint, hence the 11:17 slice (index 10 is the
        # root body's angular velocity, not a joint).
        joint_angles = observation[self._JOINT_ANGLES]
        joint_velocities = observation[self._JOINT_VELOCITIES]
        spring_score = np.sum(np.maximum(0.0, -joint_angles * joint_velocities))
        spring_reward = 0.15 * min(1.0, spring_score / 3.0)

        # 4. Gait symmetry: reward thighs moving out of phase (velocities of
        # opposite sign give a non-positive product and the full bonus).
        phase_score = observation[self._BACK_THIGH_VELOCITY] * observation[self._FRONT_THIGH_VELOCITY]
        symmetry_reward = 0.1 * np.exp(-2.0 * max(0, phase_score))

        # 5. Power-to-speed efficiency
        forward_reward = info.get("reward_run", 0)
        control_cost = abs(info.get("reward_ctrl", 0))

        # Only rewarded while making forward progress with non-zero effort.
        if forward_reward > 0 and control_cost > 0:
            # Higher ratio means more forward movement for energy expended;
            # the 0.1 offset keeps the ratio bounded for tiny control costs.
            efficiency_ratio = forward_reward / (control_cost + 0.1)
            efficiency_reward = 0.2 * min(1.0, efficiency_ratio / 10.0)
        else:
            efficiency_reward = 0

        # Combined reward
        reward = (original_reward_weight * original_reward +
                  smoothness_reward + spring_reward +
                  symmetry_reward + efficiency_reward)

        # Return a plain Python float rather than a numpy scalar.
        return float(reward)

def train_gait_optimization_with_ppo(total_timesteps=1_000_000):
    """Train a PPO agent on the gait-optimization cheetah environment.

    Builds the environment, a PPO model with default hyperparameters and an
    ``EnhancedTrainingLogger`` callback, trains the model and saves the final
    result to ``gait_optimization_ppo_final2``.

    Any exception raised during setup or training is reported on stdout and
    not re-raised. The environment is closed in every case.

    Args:
        total_timesteps: Number of environment steps to train for.
    """
    # Bound up front so the ``finally`` clause can tell whether creation succeeded.
    cheetah_environment = None
    try:
        # Setup environment
        print("Creating gait optimization cheetah environment...")
        cheetah_environment = GaitOptimizationCheetahEnv()

        # Initialize PPO model with default hyperparameters
        print("Initializing PPO model...")
        training_model = PPO(
            policy="MlpPolicy",
            env=cheetah_environment,
            verbose=1
        )

        # Setup logger with checkpoint saving capability
        print("Setting up training logger...")
        progress_logger = EnhancedTrainingLogger(
            log_filepath="gait_optimization_ppo_training2.txt",
            save_dir="model_checkpoints",
            save_freq=50000,
            model_prefix="gait_optimization_ppo2"
        )

        # Train the model
        print(f"Starting training for {total_timesteps:,} timesteps...")
        training_model.learn(total_timesteps=total_timesteps, callback=progress_logger)

        # Save the final model
        print("Saving final trained model...")
        training_model.save("gait_optimization_ppo_final2")

        print("Training completed successfully!")
        print(f"Training logs saved to {progress_logger.log_filepath}")
        print(f"Model checkpoints saved in {progress_logger.save_dir}")

    except Exception as e:
        # Deliberate best effort: report the failure instead of crashing, since
        # checkpoints written by the callback may still be usable.
        print(f"Training error occurred: {str(e)}")
        print("Training was interrupted, but intermediate models should be saved.")
    finally:
        # Release the simulator whether training succeeded or failed.
        if cheetah_environment is not None:
            cheetah_environment.close()

# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    train_gait_optimization_with_ppo()

Creating gait optimization cheetah environment...


  logger.deprecation(


Initializing PPO model...
Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




Setting up training logger...
Starting training for 1,000,000 timesteps...
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | -118     |
| time/              |          |
|    fps             | 453      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1e+03       |
|    ep_rew_mean          | -151        |
| time/                   |             |
|    fps                  | 351         |
|    iterations           | 2           |
|    time_elapsed         | 11          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.011106249 |
|    clip_fraction        | 0.121       |
|    clip_range           | 0.2         |
|    entropy_loss         | -8.51      