In [3]:
import math
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import gymnasium as gym
import os

# ----------------------------
# Custom Reward Wrapper
# ----------------------------
class CustomRewardWrapper(gym.Wrapper):
    def __init__(self, env):
        super().__init__(env)

    def step(self, action):
        observation, original_reward, terminated, truncated, info = self.env.step(action)
        reward = self.modified_reward_function(observation, action, original_reward)
        return observation, reward, terminated, truncated, info

    def modified_reward_function(self, observation, action, original_reward):
        # Custom reward components:
        # 1. Forward velocity reward (using x-coordinate velocity, index 8)
        forward_reward = 1.0 * observation[8]

        # 2. Penalize excessive vertical movement (z-coordinate, index 0)
        height_penalty = -0.05 * abs(observation[0] - 0.5)

        # 3. Penalize excessive rotations for stability (angle of second rotor, index 2)
        rotation_penalty = -0.1 * abs(observation[2])

        # 4. Energy efficiency - penalize excessive joint movements (angular velocities indices 10-16)
        energy_penalty = -0.001 * sum(abs(observation[i]) for i in range(10, 17))

        # 5. Smooth control - penalize large action changes
        control_penalty = -0.01 * np.sum(np.square(action))

        # Balance the original reward with custom components (weight of original reward: 0.5)
        original_reward_weight = 0.5
        reward = (
            forward_reward +
            height_penalty +
            rotation_penalty +
            energy_penalty +
            control_penalty +
            original_reward_weight * original_reward
        )

        return reward

# ----------------------------
# Create the Environment
# ----------------------------
# Create the HalfCheetah environment (using v5 here, adjust if needed)
env = gym.make("HalfCheetah-v5", render_mode=None)
# Wrap the environment to override the reward function
env = CustomRewardWrapper(env)

# Get state and action dimensions from the environment
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

# Set up device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ----------------------------
# Define the Networks
# ----------------------------
# Critic network (QNet)
class QNet(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim=64):
        super().__init__()
        # Input dimension: state_dim + action_dim
        self.hidden = nn.Linear(state_dim + action_dim, hidden_dim)
        self.output = nn.Linear(hidden_dim, 1)

    def forward(self, s, a):
        x = torch.cat((s, a), dim=-1)
        x = self.hidden(x)
        x = F.relu(x)
        return self.output(x)

# Actor network (PolicyNet)
class PolicyNet(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim=64):
        super().__init__()
        # Input dimension: state_dim; output dimension: action_dim
        self.hidden = nn.Linear(state_dim, hidden_dim)
        self.output = nn.Linear(hidden_dim, action_dim)

    def forward(self, s):
        x = self.hidden(s)
        x = F.relu(x)
        x = self.output(x)
        return torch.tanh(x)

# ----------------------------
# Initialize Networks and Targets
# ----------------------------
q_origin_model = QNet(state_dim, action_dim).to(device)
q_target_model = QNet(state_dim, action_dim).to(device)
_ = q_target_model.requires_grad_(False)

mu_origin_model = PolicyNet(state_dim, action_dim).to(device)
mu_target_model = PolicyNet(state_dim, action_dim).to(device)
_ = mu_target_model.requires_grad_(False)

# ----------------------------
# Hyperparameters and Optimizers
# ----------------------------
gamma = 0.99
opt_q = torch.optim.AdamW(q_origin_model.parameters(), lr=0.0005)
opt_mu = torch.optim.AdamW(mu_origin_model.parameters(), lr=0.0005)

def optimize(states, actions, rewards, next_states, dones):
    # Convert lists to tensors.
    states = torch.tensor(np.stack(states), dtype=torch.float).to(device)
    actions = torch.tensor(actions, dtype=torch.float).to(device)
    rewards = torch.tensor(rewards, dtype=torch.float).unsqueeze(dim=1).to(device)
    next_states = torch.tensor(next_states, dtype=torch.float).to(device)
    dones = torch.tensor(dones, dtype=torch.float).unsqueeze(dim=1).to(device)

    # Compute target for the critic network.
    with torch.no_grad():
        next_actions = mu_target_model(next_states)
        target = rewards + (1 - dones) * gamma * q_target_model(next_states, next_actions)

    # Compute Critic loss.
    q_value = q_origin_model(states, actions)
    q_loss = F.mse_loss(q_value, target.detach())
    opt_q.zero_grad()
    q_loss.backward()
    opt_q.step()

    # Compute Actor loss: maximize Q(s, mu(s)) -> minimize -Q(s, mu(s))
    mu_value = mu_origin_model(states)
    q_value_for_mu = q_origin_model(states, mu_value)
    mu_loss = -q_value_for_mu.mean()
    opt_mu.zero_grad()
    mu_loss.backward()
    opt_mu.step()

    # Ensure gradients remain enabled.
    for p in q_origin_model.parameters():
        p.requires_grad = True

    return q_loss.item(), mu_loss.item()

tau = 0.002
def update_target():
    # Soft update target networks.
    for var, var_target in zip(q_origin_model.parameters(), q_target_model.parameters()):
        var_target.data = tau * var.data + (1.0 - tau) * var_target.data
    for var, var_target in zip(mu_origin_model.parameters(), mu_target_model.parameters()):
        var_target.data = tau * var.data + (1.0 - tau) * var_target.data

# ----------------------------
# Replay Buffer
# ----------------------------
class ReplayBuffer:
    def __init__(self, buffer_size: int):
        self.buffer_size = buffer_size
        self.buffer = []

    def add(self, item):
        if len(self.buffer) == self.buffer_size:
            self.buffer.pop(0)
        self.buffer.append(item)

    def sample(self, batch_size):
        items = random.sample(self.buffer, batch_size)
        states   = [i[0] for i in items]
        actions  = [i[1] for i in items]
        rewards  = [i[2] for i in items]
        n_states = [i[3] for i in items]
        dones    = [i[4] for i in items]
        return states, actions, rewards, n_states, dones

    def length(self):
        return len(self.buffer)

buffer = ReplayBuffer(buffer_size=20000)

# ----------------------------
# Ornstein-Uhlenbeck Noise for Exploration
# ----------------------------
class OrnsteinUhlenbeckActionNoise:
    def __init__(self, mu, sigma, theta=0.15, dt=1e-2, x0=None):
        self.theta = theta
        self.mu = mu
        self.sigma = sigma
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + \
            self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        self.x_prev = x
        return x

    def reset(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)

ou_action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dim), sigma=np.ones(action_dim) * 0.05)

def pick_sample(s):
    with torch.no_grad():
        s_np = np.array(s)
        s_batch = np.expand_dims(s_np, axis=0)
        s_batch = torch.tensor(s_batch, dtype=torch.float).to(device)
        action_det = mu_origin_model(s_batch).squeeze(0)
        noise = ou_action_noise()
        action = action_det.cpu().numpy() + noise
        action = np.clip(action, -1.0, 1.0)
        return action

# ----------------------------
# Training Loop Parameters and Logging
# ----------------------------
batch_size = 250
num_episodes = 500

log_file_path = "training_log.txt"
log_file = open(log_file_path, "w")
log_file.write("episode,reward,avg_q_loss,avg_mu_loss\n")

if __name__ == '__main__':
    reward_records = []
    for ep in range(num_episodes):
        s, _ = env.reset()
        done = False
        cum_reward = 0
        episode_q_loss = 0
        episode_mu_loss = 0
        training_steps = 0

        while not done:
            a = pick_sample(s)
            s_next, r, term, trunc, _ = env.step(a)
            done = term or trunc
            buffer.add([s, a, r, s_next, float(term)])
            cum_reward += r

            if buffer.length() >= batch_size:
                states, actions, rewards, n_states, dones = buffer.sample(batch_size)
                q_loss_val, mu_loss_val = optimize(states, actions, rewards, n_states, dones)
                update_target()
                episode_q_loss += q_loss_val
                episode_mu_loss += mu_loss_val
                training_steps += 1

            s = s_next

        if training_steps > 0:
            avg_q_loss = episode_q_loss / training_steps
            avg_mu_loss = episode_mu_loss / training_steps
        else:
            avg_q_loss = 0.0
            avg_mu_loss = 0.0

        reward_records.append(cum_reward)
        log_line = f"{ep+1},{cum_reward}\n"
        log_file.write(log_line)
        log_file.flush()
        print(f"Episode {ep+1}: Reward = {cum_reward:.2f}, Avg Q Loss = {avg_q_loss:.6f}, Avg Mu Loss = {avg_mu_loss:.6f}")

    os.makedirs("saved_models", exist_ok=True)
    torch.save(mu_origin_model.state_dict(), os.path.join("saved_models", "mu_origin_model.pth"))
    torch.save(q_origin_model.state_dict(), os.path.join("saved_models", "q_origin_model.pth"))
    print("Training Done. Models and log saved.")

    log_file.close()


  actions = torch.tensor(actions, dtype=torch.float).to(device)


Episode 1: Reward = -322.59, Avg Q Loss = 0.063979, Avg Mu Loss = 0.279278
Episode 2: Reward = -401.36, Avg Q Loss = 0.036174, Avg Mu Loss = 0.027727
Episode 3: Reward = -445.02, Avg Q Loss = 0.041199, Avg Mu Loss = -0.326236
Episode 4: Reward = -523.70, Avg Q Loss = 0.088967, Avg Mu Loss = -0.469968
Episode 5: Reward = -352.16, Avg Q Loss = 0.135112, Avg Mu Loss = -1.046863
Episode 6: Reward = -329.09, Avg Q Loss = 0.171037, Avg Mu Loss = -1.538507
Episode 7: Reward = -231.84, Avg Q Loss = 0.202725, Avg Mu Loss = -1.999841
Episode 8: Reward = 237.54, Avg Q Loss = 0.308300, Avg Mu Loss = -2.257400
Episode 9: Reward = 200.15, Avg Q Loss = 0.516101, Avg Mu Loss = -2.721211
Episode 10: Reward = 250.21, Avg Q Loss = 0.609636, Avg Mu Loss = -3.388768
Episode 11: Reward = 442.36, Avg Q Loss = 0.638714, Avg Mu Loss = -4.121386
Episode 12: Reward = 687.12, Avg Q Loss = 0.624729, Avg Mu Loss = -5.174935
Episode 13: Reward = 757.36, Avg Q Loss = 0.636483, Avg Mu Loss = -6.449167
Episode 14: Rewa

In [2]:
!pip install "gymnasium[mujoco]"

Collecting mujoco>=2.1.5 (from gymnasium[mujoco])
  Downloading mujoco-3.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting glfw (from mujoco>=2.1.5->gymnasium[mujoco])
  Downloading glfw-2.8.0-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38.p39.p310.p311.p312.p313-none-manylinux_2_28_x86_64.whl.metadata (5.4 kB)
Downloading mujoco-3.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m80.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading glfw-2.8.0-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38.p39.p310.p311.p312.p313-none-manylinux_2_28_x86_64.whl (243 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m243.4/243.4 kB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packag