In [None]:
from google.colab import drive

# Attach Google Drive under /content/drive; force_remount=True is passed so an
# existing mount is remounted rather than reused.
drive.mount(
    '/content/drive',
    force_remount=True,
)

In [None]:
!pip install gymnasium

In [None]:
!pip install gymnasium[mujoco]

In [None]:
# Create the PPO_HALF_CHEETAH folder on Google Drive and make it the working
# directory, so files written with relative paths (e.g. the training CSV) land
# there.

import os

# Single definition of the output location, so the create and chdir steps
# cannot drift apart.
output_dir = '/content/drive/My Drive/PPO_HALF_CHEETAH'

# exist_ok=True makes this a no-op when the directory is already present and
# avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

# Change the current working directory
os.chdir(output_dir)

# Verify the change (optional)
print(os.getcwd())

In [None]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import csv

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Actor-Critic Neural Network
class ActorCritic(nn.Module):
    """Actor-critic network with a shared trunk and a diagonal Gaussian policy.

    The trunk is a stack of ``Linear`` + ``activation()`` layers, one pair per
    entry of ``hidden_sizes``. Two linear heads sit on top of it: one produces
    the action mean, the other a scalar state value. The standard deviation is
    a state-independent learnable parameter stored in log space.

    Args:
        state_dim: Size of the observation vector.
        action_dim: Size of the action vector.
        hidden_sizes: Widths of the trunk's hidden layers. May be empty, in
            which case the heads read the raw state.
        activation: Zero-argument callable returning an activation module
            (e.g. ``nn.ReLU``); it is called once per hidden layer.
    """

    def __init__(self, state_dim, action_dim, hidden_sizes, activation):
        super().__init__()
        # Shared layers
        layers = []
        input_dim = state_dim
        for hidden_size in hidden_sizes:
            layers.append(nn.Linear(input_dim, hidden_size))
            layers.append(activation())
            input_dim = hidden_size
        self.shared = nn.Sequential(*layers)

        # After the loop ``input_dim`` is the trunk's output width (or
        # ``state_dim`` when there are no hidden layers), so the heads do not
        # need to index ``hidden_sizes[-1]`` and an empty list is supported.

        # Policy head
        self.policy_mean = nn.Linear(input_dim, action_dim)
        # ``forward`` exponentiates this parameter, so it holds log(std).
        # Initialise it to log(0.1) so the initial std really is 0.1; storing
        # 0.1 directly would give std = exp(0.1) ~= 1.105.
        self.policy_std = nn.Parameter(torch.full((action_dim,), 0.1).log())

        # Value head
        self.value = nn.Linear(input_dim, 1)

    def forward(self, state):
        """Return ``(mean, std, value)`` for ``state``.

        ``mean`` and ``value`` come from the two heads applied to the trunk
        output; ``std`` is ``exp(policy_std)`` and does not depend on the state.
        """
        x = self.shared(state)
        mean = self.policy_mean(x)
        std = torch.exp(self.policy_std)  # exp keeps the std strictly positive
        value = self.value(x)
        return mean, std, value

    def act(self, state):
        """Sample an action for ``state``.

        Returns:
            ``(action, log_prob)`` where ``log_prob`` is the log-density of the
            sampled action summed over the last (action) dimension.
        """
        mean, std, _ = self.forward(state)
        dist = torch.distributions.Normal(mean, std)
        action = dist.sample()
        action_log_prob = dist.log_prob(action).sum(dim=-1)
        return action, action_log_prob

    def evaluate(self, states, actions):
        """Score given ``actions`` under the current policy.

        Returns:
            ``(log_probs, values, entropy)``: per-sample log-density and
            entropy, each summed over the last dimension, plus the value head
            output (which keeps its trailing size-1 dimension).
        """
        mean, std, values = self.forward(states)
        dist = torch.distributions.Normal(mean, std)
        action_log_probs = dist.log_prob(actions).sum(dim=-1)
        entropy = dist.entropy().sum(dim=-1)
        return action_log_probs, values, entropy


# PPO Agent
class PPOAgent:
    """PPO agent with a clipped surrogate objective and GAE advantages.

    Transitions are accumulated in ``self.buffer`` via ``store_transition`` and
    consumed (and cleared) by ``train``.

    Args:
        state_dim, action_dim, hidden_sizes, activation: Forwarded to
            ``ActorCritic``.
        lr: Adam learning rate.
        gamma: Discount factor.
        lam: GAE lambda.
        clip_eps: Clipping range for the probability ratio.
        update_epochs: Number of passes over the buffer per ``train`` call.
        batch_size: Minibatch size used inside each pass.
    """

    def __init__(self, state_dim, action_dim, hidden_sizes, activation, lr, gamma, lam, clip_eps, update_epochs, batch_size):
        self.gamma = gamma
        self.lam = lam
        self.clip_eps = clip_eps
        self.update_epochs = update_epochs
        self.batch_size = batch_size

        self.actor_critic = ActorCritic(state_dim, action_dim, hidden_sizes, activation).to(device)
        self.optimizer = optim.Adam(self.actor_critic.parameters(), lr=lr)

        self.buffer = []

    def store_transition(self, transition):
        """Append one ``(state, action, reward, done, log_prob, value)`` tuple."""
        self.buffer.append(transition)

    def compute_gae(self, rewards, values, dones):
        """Compute GAE advantages and the matching value targets.

        Args:
            rewards: Per-step rewards.
            values: Per-step value estimates; any trailing size-1 dimensions
                are flattened away.
            dones: Per-step flags, 1 (or True) where the step ended an episode.

        Returns:
            ``(advantages, returns)`` as two flat lists of Python floats, where
            ``returns[t] = advantages[t] + values[t]``.

        The value after the final step is taken to be 0 (no bootstrap).
        """
        # Coerce everything to flat 1-D arrays. Values recorded straight from
        # the critic have shape (1,), which would otherwise yield (N, 1)
        # advantages and silently broadcast to (B, B) in the loss.
        rewards = np.asarray(rewards, dtype=np.float64).reshape(-1)
        values = np.asarray(values, dtype=np.float64).reshape(-1)
        dones = np.asarray(dones, dtype=np.float64).reshape(-1)

        advantages = []
        returns = []
        gae = 0.0
        next_value = 0.0
        for step in reversed(range(len(rewards))):
            not_done = 1.0 - dones[step]
            delta = rewards[step] + self.gamma * next_value * not_done - values[step]
            gae = delta + self.gamma * self.lam * gae * not_done
            # Append and reverse afterwards: insert(0, ...) is O(n) per call.
            advantages.append(float(gae))
            returns.append(float(gae + values[step]))
            next_value = values[step]
        advantages.reverse()
        returns.reverse()
        return advantages, returns

    def train(self):
        """Run PPO updates on the buffered transitions, then leave the buffer empty.

        Does nothing when the buffer is empty. Rewards are standardised over
        the buffer before GAE. Each epoch walks the buffer in contiguous
        (unshuffled) slices of ``batch_size``. The loss is
        ``policy_loss + 0.5 * value_loss - 0.01 * mean_entropy`` and gradients
        are clipped to a global norm of 0.5.
        """
        if not self.buffer:
            return

        states, actions, rewards, dones, log_probs, values = zip(*self.buffer)
        self.buffer = []

        states = torch.FloatTensor(np.array(states)).to(device)
        actions = torch.FloatTensor(np.array(actions)).to(device)
        rewards = torch.FloatTensor(np.array(rewards, dtype=np.float32)).to(device)
        # Flatten so per-sample quantities are all shape (N,).
        old_log_probs = torch.FloatTensor(np.array(log_probs, dtype=np.float32).reshape(-1)).to(device)
        dones = np.asarray(dones, dtype=np.float32).reshape(-1)
        values = np.asarray(values, dtype=np.float32).reshape(-1)

        # Normalize rewards. With a single sample the (unbiased) std is NaN,
        # which would poison every parameter, so skip normalisation then.
        if rewards.numel() > 1:
            rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-8)
        advantages, returns = self.compute_gae(rewards.cpu().numpy(), values, dones)
        advantages = torch.FloatTensor(advantages).to(device)
        returns = torch.FloatTensor(returns).to(device)

        for _ in range(self.update_epochs):
            for idx in range(0, len(states), self.batch_size):
                batch_indices = slice(idx, idx + self.batch_size)
                batch_states = states[batch_indices]
                batch_actions = actions[batch_indices]
                batch_old_log_probs = old_log_probs[batch_indices]
                batch_advantages = advantages[batch_indices]
                batch_returns = returns[batch_indices]

                new_log_probs, new_values, entropy = self.actor_critic.evaluate(batch_states, batch_actions)
                ratios = torch.exp(new_log_probs - batch_old_log_probs)
                surr1 = ratios * batch_advantages
                surr2 = torch.clamp(ratios, 1 - self.clip_eps, 1 + self.clip_eps) * batch_advantages
                policy_loss = -torch.min(surr1, surr2).mean()

                # squeeze(-1) drops only the value head's trailing dimension,
                # so a batch of one keeps shape (1,) and matches the targets.
                value_loss = nn.MSELoss()(new_values.squeeze(-1), batch_returns)
                loss = policy_loss + 0.5 * value_loss - 0.01 * entropy.mean()

                self.optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.actor_critic.parameters(), max_norm=0.5)  # Gradient clipping
                self.optimizer.step()


# Training loop
def train_ppo(env, agent, max_steps, log_interval):
    """Collect experience in ``env`` and train ``agent`` until ``max_steps``.

    Whole episodes are rolled out one at a time. Once at least
    ``log_interval`` environment steps have accumulated since the previous
    update, ``agent.train()`` is called at the next episode boundary and a row
    is printed and appended to ``ppo_training_results.csv`` in the current
    working directory.

    Args:
        env: Environment whose ``reset()`` returns ``(obs, info)`` and whose
            ``step()`` returns ``(obs, reward, done, truncated, info)``.
        agent: Object exposing ``actor_critic.act``, ``actor_critic.forward``,
            ``store_transition`` and ``train``.
        max_steps: Collection stops at the first episode boundary at or past
            this many total environment steps.
        log_interval: Minimum number of environment steps between updates.

    Returns:
        ``(timesteps, avg_returns, std_errors)``: parallel lists with one entry
        per update. The average covers the last 100 episodes (or all of them
        if fewer); the standard error is reported as 0 until 100 episodes
        exist.
    """
    total_steps = 0
    steps_at_last_update = 0
    timesteps = []
    avg_returns = []
    std_errors = []
    episode_rewards = []

    csv_file = "ppo_training_results.csv"
    with open(csv_file, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Timesteps", "Average Return", "Standard Error"])

        while total_steps < max_steps:
            state, info = env.reset()
            state = torch.FloatTensor(state).to(device)
            episode_reward = 0

            while True:
                # Rollout needs no gradients; skip building autograd graphs
                # that would be discarded immediately.
                with torch.no_grad():
                    action, log_prob = agent.actor_critic.act(state)
                    value = agent.actor_critic.forward(state)[2]
                next_state, reward, done, truncated, info = env.step(action.cpu().numpy())
                next_state = torch.FloatTensor(next_state).to(device)

                # Record truncation as an episode boundary too. Otherwise the
                # GAE recursion would carry advantages (and bootstrap values)
                # from the next episode's first state into this episode.
                episode_over = bool(done or truncated)

                agent.store_transition((state.detach().cpu().numpy(),
                                        action.detach().cpu().numpy(),
                                        reward, episode_over,
                                        log_prob.detach().cpu().numpy(),
                                        value.detach().cpu().numpy()))
                state = next_state
                episode_reward += reward
                total_steps += 1

                if episode_over:
                    episode_rewards.append(episode_reward)
                    break

            # Threshold on steps since the last update rather than
            # ``total_steps % log_interval == 0``: the modulo test only fires
            # when episode lengths happen to divide the interval.
            if total_steps - steps_at_last_update >= log_interval:
                steps_at_last_update = total_steps
                agent.train()

                recent = episode_rewards[-100:]
                avg_return = np.mean(recent)
                std_error = np.std(recent) / np.sqrt(len(recent)) if len(episode_rewards) >= 100 else 0
                timesteps.append(total_steps)
                avg_returns.append(avg_return)
                std_errors.append(std_error)

                print(f"Steps: {total_steps}, Average Return: {avg_return:.2f}, Std Error: {std_error:.2f}")
                writer.writerow([total_steps, avg_return, std_error])

    return timesteps, avg_returns, std_errors


if __name__ == "__main__":
    # Fixed hyperparameters
    MAX_STEPS = 1000000      # total environment steps to collect
    BATCH_SIZE = 256         # minibatch size inside each PPO epoch
    LR = 1e-4                # Adam learning rate
    GAMMA = 0.99             # discount factor
    LAM = 0.95               # GAE lambda
    CLIP_EPS = 0.2           # PPO ratio clipping range
    UPDATE_EPOCHS = 10       # passes over the buffer per update
    HIDDEN_SIZES = [128, 128]
    LOG_INTERVAL = 2000      # environment steps between updates / log rows

    # Environment setup
    env = gym.make("HalfCheetah-v4")
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]

    # Create PPO agent
    agent = PPOAgent(
        state_dim=state_dim,
        action_dim=action_dim,
        hidden_sizes=HIDDEN_SIZES,
        activation=nn.ReLU,
        lr=LR,
        gamma=GAMMA,
        lam=LAM,
        clip_eps=CLIP_EPS,
        update_epochs=UPDATE_EPOCHS,
        batch_size=BATCH_SIZE,
    )

    # Train PPO. The constant is MAX_STEPS (not MAX_STEP), and train_ppo also
    # requires the log interval as its fourth argument.
    timesteps, avg_returns, std_errors = train_ppo(env, agent, MAX_STEPS, LOG_INTERVAL)
