In [1]:
!pip install gymnasium[mujoco]
!apt install -y libgl1-mesa-glx libosmesa6 libglfw3 patchelf

import gymnasium as gym

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.distributions as D
import copy
import numpy as np
import random

from torch.utils.tensorboard import SummaryWriter
import matplotlib.pyplot as plt

torch.random.manual_seed(0)
np.random.seed(0)

Collecting mujoco>=2.1.5 (from gymnasium[mujoco])
  Downloading mujoco-3.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m726.2 kB/s[0m eta [36m0:00:00[0m
Collecting glfw (from mujoco>=2.1.5->gymnasium[mujoco])
  Downloading glfw-2.8.0-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38.p39.p310.p311.p312.p313-none-manylinux_2_28_x86_64.whl.metadata (5.4 kB)
Downloading mujoco-3.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading glfw-2.8.0-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38.p39.p310.p311.p312.p313-none-manylinux_2_28_x86_64.whl (243 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m243.4/243.4 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packa

In [2]:
# @title Visualization code. Used later.
import os
from gym.wrappers import RecordVideo
from IPython.display import Video, display, clear_output

# Force MuJoCo to use EGL for rendering (important for Colab)
os.environ["MUJOCO_GL"] = "egl"

def visualize(agent):
    """Visualize agent with a custom camera angle."""

    # Create environment in rgb_array mode
    env = gym.make("HalfCheetah-v5", render_mode="rgb_array", reset_noise_scale=0.2)

    # Apply video recording wrapper
    env = RecordVideo(env, video_folder="./", episode_trigger=lambda x: True)

    obs, _ = env.reset()

    # Access the viewer object through mujoco_py
    viewer = env.unwrapped.mujoco_renderer.viewer  # Access viewer
    viewer.cam.distance = 3.0     # Set camera distance
    viewer.cam.azimuth = 90       # Rotate camera around pendulum
    viewer.cam.elevation = 0   # Tilt the camera up/down


    for t in range(512):
        with torch.no_grad():
            actions = agent.get_action(obs)
        obs, _, done, _ = env.step(actions)
        if done:
            break

    env.render()
    env.close()

    # Display the latest video
    clear_output(wait=True)
    display(Video("./rl-video-episode-0.mp4", embed=True))


In [3]:
class ReplayBuffer:
    def __init__(self, capacity=100000):
        self.capacity = capacity
        self.buffer = []
        self.position = 0
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def add(self, state, action, reward, next_state, done):
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)

        self.buffer[self.position] = (
            torch.FloatTensor(state).to(self.device),
            torch.FloatTensor(action).to(self.device),
            torch.FloatTensor([reward]).to(self.device),
            torch.FloatTensor(next_state).to(self.device),
            torch.FloatTensor([done]).to(self.device),
        )

        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = zip(*batch)

        return (
            torch.stack(state).to(self.device),
            torch.stack(action).to(self.device),
            torch.stack(reward).to(self.device),
            torch.stack(next_state).to(self.device),
            torch.stack(done).to(self.device),
        )

    def __len__(self):
        return len(self.buffer)


In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import copy
import numpy as np

# Q-Network Definition
class QNetwork(nn.Module):
    def __init__(self, state_dim, action_dim):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(state_dim + action_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, 1)
        )

    def forward(self, state, action):
        # Concatenate state and action, then produce a single Q-value
        return self.fc(torch.cat([state, action], dim=1))

# Policy Network Definition
class PolicyNetwork(nn.Module):
    def __init__(self, state_dim, action_dim):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(state_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, action_dim)
        )

    def forward(self, state):
        # Returns actions in [-1, 1]
        return 1.0 * torch.tanh(self.fc(state))

# CQL Agent (twin Q-networks, random+policy actions in CQL penalty)
class CQLAgent:
    def __init__(
        self,
        state_dim,
        action_dim,
        lr=3e-4,
        gamma=0.99,
        tau=0.005,
        alpha=3.0,
        num_random_actions=10
    ):
        # Detect GPU or CPU
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.state_dim = state_dim
        self.action_dim = action_dim
        self.gamma = gamma
        self.tau = tau

        # CQL penalty coefficient
        self.alpha = alpha

        # Number of random actions for the conservative penalty
        self.num_random_actions = num_random_actions

        # -------------------------------
        #  Create twin Q networks
        # -------------------------------
        self.q1 = QNetwork(state_dim, action_dim).to(self.device)
        self.q2 = QNetwork(state_dim, action_dim).to(self.device)
        self.target_q1 = copy.deepcopy(self.q1).to(self.device)
        self.target_q2 = copy.deepcopy(self.q2).to(self.device)

        # For backward compatibility with main.py calls:
        self.q_net = self.q1
        self.target_q_net = self.target_q1

        # Create policy network
        self.policy = PolicyNetwork(state_dim, action_dim).to(self.device)
        self.target_policy = copy.deepcopy(self.policy).to(self.device)

        # Initialize optimizers
        self.q1_optim = optim.Adam(self.q1.parameters(), lr=lr)
        self.q2_optim = optim.Adam(self.q2.parameters(), lr=lr)
        self.policy_optim = optim.Adam(self.policy.parameters(), lr=lr / 10)

        # Logging
        self.q_loss = 0.0
        self.policy_loss = 0.0
        self.cql_loss = 0.0  # We'll store sum of CQL penalties from Q1 and Q2

    def get_action(self, state, deterministic=False):
        """Select action from current policy; add noise if not deterministic."""
        with torch.no_grad():
            # Convert to Tensor on self.device
            state_t = torch.FloatTensor(state).unsqueeze(0).to(self.device)
            action_t = self.policy(state_t)

            if not deterministic:
                noise = 0.1 * torch.randn_like(action_t).to(self.device)
                action_t = torch.clamp(action_t + noise, -1.0, 1.0)

            # Return numpy array on CPU
            return action_t.squeeze(0).cpu().numpy()

    def get_q_loss(self, states, actions, rewards, next_states, dones):
        """
        Q loss for both networks:
          L_Q = MSE(Q1 - target) + MSE(Q2 - target)
                + alpha * (CQL_penalty1 + CQL_penalty2).

        Each "CQL_penalty" is:
          E[logsumexp(Q(s,a)) - Q(s,a_in_batch)],
        sampling random + policy actions for the logsumexp.
        """
        # Move to device
        states = states.to(self.device)
        actions = actions.to(self.device)
        rewards = rewards.to(self.device)
        next_states = next_states.to(self.device)
        dones = dones.to(self.device)

        # -------------------------------------------
        # Standard Bellman backups
        # -------------------------------------------
        with torch.no_grad():
            next_actions = self.target_policy(next_states)
            next_q1 = self.target_q1(next_states, next_actions)
            next_q2 = self.target_q2(next_states, next_actions)
            # Use min for the target
            target_q = rewards + self.gamma * torch.min(next_q1, next_q2) * (1 - dones)

        q1_vals = self.q1(states, actions)
        q2_vals = self.q2(states, actions)

        bellman_loss = F.mse_loss(q1_vals, target_q) + F.mse_loss(q2_vals, target_q)

        # -------------------------------------------
        # CQL: sample random + policy actions
        # -------------------------------------------
        batch_size = states.shape[0]

        # 1) Random actions in [-1, 1]
        random_actions = torch.FloatTensor(
            np.random.uniform(-1, 1, (batch_size, self.num_random_actions, self.action_dim))
        ).to(self.device)

        # 2) Current policy actions
        with torch.no_grad():
            policy_actions = self.policy(states)  # shape (B, action_dim)

        # Combine random + policy => shape (B, N+1, a_dim)
        policy_actions = policy_actions.unsqueeze(1)
        all_actions = torch.cat([random_actions, policy_actions], dim=1)


        q1_vals_all = []
        q2_vals_all = []
        for i in range(all_actions.shape[1]):
            q1_i = self.q1(states, all_actions[:, i, :])
            q2_i = self.q2(states, all_actions[:, i, :])
            q1_vals_all.append(q1_i)
            q2_vals_all.append(q2_i)

        q1_vals_all = torch.cat(q1_vals_all, dim=1)  # shape (B, N+1)
        q2_vals_all = torch.cat(q2_vals_all, dim=1)  # shape (B, N+1)

        # logsumexp across those actions (then subtract log(# actions))
        logsumexp_q1 = torch.logsumexp(q1_vals_all, dim=1, keepdim=True) - np.log(q1_vals_all.shape[1])
        logsumexp_q2 = torch.logsumexp(q2_vals_all, dim=1, keepdim=True) - np.log(q2_vals_all.shape[1])

        # CQL penalty for Q1 and Q2
        cql_penalty1 = (logsumexp_q1 - q1_vals).mean()
        cql_penalty2 = (logsumexp_q2 - q2_vals).mean()
        cql_penalty = cql_penalty1 + cql_penalty2

        total_q_loss = bellman_loss + self.alpha * cql_penalty

        # Logging
        self.cql_loss = cql_penalty.item()
        return total_q_loss

    def get_policy_loss(self, states):
        """Policy update: minimize -E[min(Q1, Q2)]."""
        states = states.to(self.device)
        actions_pi = self.policy(states)
        q1_pi = self.q1(states, actions_pi)
        q2_pi = self.q2(states, actions_pi)
        q_min = torch.min(q1_pi, q2_pi)
        policy_loss = -q_min.mean()
        return policy_loss

    def update(self, replay_buffer, batch_size=256):
        """Sample from replay buffer and update Q networks + policy."""
        if len(replay_buffer) < batch_size:
            return

        states, actions, rewards, next_states, dones = replay_buffer.sample(batch_size)

        # 1) Update Q-functions
        self.q1_optim.zero_grad()
        self.q2_optim.zero_grad()
        q_loss = self.get_q_loss(states, actions, rewards, next_states, dones)
        q_loss.backward()
        self.q1_optim.step()
        self.q2_optim.step()
        self.q_loss = q_loss.item()

        # 2) Update Policy
        self.policy_optim.zero_grad()
        policy_loss = self.get_policy_loss(states)
        policy_loss.backward()
        self.policy_optim.step()
        self.policy_loss = policy_loss.item()

        # 3) Soft-update target networks
        with torch.no_grad():
            for param, target_param in zip(self.q1.parameters(), self.target_q1.parameters()):
                target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
            for param, target_param in zip(self.q2.parameters(), self.target_q2.parameters()):
                target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
            for param, target_param in zip(self.policy.parameters(), self.target_policy.parameters()):
                target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)


In [None]:
def evaluate_policy(agent, env, num_eval_episodes=5):
    """
    Runs 'num_eval_episodes' episodes in the given 'env' using the agent's
    policy. Returns the average total reward across these episodes.
    """
    returns = []
    for ep in range(num_eval_episodes):
        # Reset environment
        state, _ = env.reset()
        done = False
        ep_return = 0.0

        while not done:
            # Get the action from the agent's policy, in deterministic mode
            action = agent.get_action(state, deterministic=True)

            # Step the environment with that action
            next_state, reward, terminated, truncated, _ = env.step(action)
            ep_return += reward
            done = terminated or truncated
            state = next_state

        returns.append(ep_return)
    return np.mean(returns)

BATCH_SIZE = 128
ONLINE_EPISODES = 250   # how many episodes to collect online
MAX_STEPS = 1000
BUFFER_SIZE = 100000
LOG_INTERVAL = 10
REWARD_MULTIPLIER = 1
LEARNING_RATE = 3e-4  # Match the CQL model's LR
RANDOM_EPISODES = 30  # how many random episodes to add
OFFLINE_TRAIN_STEPS = 1000  # how many offline training iterations

# Initialize environment
env = gym.make("InvertedPendulum-v5", max_episode_steps=MAX_STEPS)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# TensorBoard logging
writer = SummaryWriter()

# Initialize CQL agent and replay buffer
agent = CQLAgent(state_dim, action_dim, lr=LEARNING_RATE, alpha=0.0)
replay_buffer = ReplayBuffer(BUFFER_SIZE)

# Tracking losses and metrics
policy_losses = []
q_losses = []
cql_losses = []
cumulative_rewards = []

# ------------------------------------------------------------------------------
# (A) Short Online Training Phase
# ------------------------------------------------------------------------------
print("=== Collecting Online Episodes & Updating Agent Online ===")
evaluation_returns = []

# Online Training Loop
for ep in range(ONLINE_EPISODES):
    state, _ = env.reset()
    done = False
    ep_return = 0.0
    step_count = 0

    while not done and step_count < MAX_STEPS:
        action = agent.get_action(state, deterministic=False)
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        ep_return += reward
        step_count += 1

        # Store in replay buffer
        replay_buffer.add(state, action, reward * REWARD_MULTIPLIER, next_state, float(done))
        state = next_state

        # Online training update
        agent.update(replay_buffer, BATCH_SIZE)

    # Evaluate every few episodes
    if ep % 2 == 0:
        eval_ret = evaluate_policy(agent, env, num_eval_episodes=2)
        evaluation_returns.append(eval_ret)  # Track evaluation returns
        print(f"Online Episode {ep}, Return={ep_return:.1f}, Eval={eval_ret:.1f}")

# ------------------------------------------------------------------------------
# (B) Add random data to the buffer (optional)
# ------------------------------------------------------------------------------
print("\n=== Collecting Random Episodes ===")
for _ in range(RANDOM_EPISODES):
    state, _ = env.reset()
    done = False
    while not done:
        action = env.action_space.sample()
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        replay_buffer.add(
            state,
            action,
            reward * REWARD_MULTIPLIER,
            next_state,
            float(done)
        )
        state = next_state

print(f"Replay buffer size after random data: {len(replay_buffer)}")

# ------------------------------------------------------------------------------
# (C) Offline Training Loop (NO new environment interaction)
# ------------------------------------------------------------------------------
print("\n=== Offline Training Only ===")
# Offline Training Loop
for step in range(OFFLINE_TRAIN_STEPS):
    agent.update(replay_buffer, BATCH_SIZE)

    # Track batch reward sum
    if len(replay_buffer) >= BATCH_SIZE:
        states, actions, rewards, next_states, dones = replay_buffer.sample(BATCH_SIZE)
        batch_reward_sum = rewards.sum().item()
    else:
        batch_reward_sum = 0.0

    q_losses.append(agent.q_loss)
    policy_losses.append(agent.policy_loss)
    cql_losses.append(agent.cql_loss)
    cumulative_rewards.append(batch_reward_sum)

    # Evaluate agent performance every LOG_INTERVAL steps
    if step % LOG_INTERVAL == 0:
        eval_ret = evaluate_policy(agent, env, num_eval_episodes=5)
        evaluation_returns.append(eval_ret)  # Track evaluation returns
        print(f"Offline Step {step}, Eval Return: {eval_ret:.2f}")

    # Logging to TensorBoard
    writer.add_scalar("Reward/BatchSum", batch_reward_sum, step)
    writer.add_scalar("Loss/Q", agent.q_loss, step)
    writer.add_scalar("Loss/Policy", agent.policy_loss, step)
    writer.add_scalar("Loss/CQL", agent.cql_loss, step)

# ------------------------------------------------------------------------------
# Final Evaluation
# ------------------------------------------------------------------------------
eval_episodes = 5
avg_return = evaluate_policy(agent, env, num_eval_episodes=eval_episodes)
print(f"\nEvaluated policy over {eval_episodes} episodes. Average return: {avg_return:.2f}")

# Plot training metrics
plt.figure(figsize=(15, 10))

# Plot losses
plt.subplot(1, 2, 1)
plt.plot(q_losses, label="Q Loss")
plt.plot(policy_losses, label="Policy Loss")
if cql_losses:
    plt.plot(cql_losses, label="CQL Loss")
plt.xlabel("Offline Update Step")
plt.ylabel("Loss")
plt.title("Training Losses")
plt.legend()

# Plot evaluation returns over time
plt.figure(figsize=(12, 6))
plt.plot(evaluation_returns, label="Evaluation Return")
plt.xlabel("Evaluation Step (Online + Offline)")
plt.ylabel("Average Return")
plt.title("Evaluation Return Over Time")
plt.legend()
plt.grid()
plt.show()

plt.tight_layout()
plt.show()

# Cleanup
writer.close()
env.close()

'''
def evaluate_policy(agent, env, num_eval_episodes=5):
    """
    Runs 'num_eval_episodes' episodes in the given 'env' using the agent's
    policy. Returns the average total reward across these episodes.
    """
    returns = []
    for ep in range(num_eval_episodes):
        # Reset environment
        state, _ = env.reset()
        done = False
        ep_return = 0.0

        while not done:
            # Get the action from the agent's policy, in deterministic mode
            action = agent.get_action(state, deterministic=True)

            # Step the environment with that action
            next_state, reward, terminated, truncated, _ = env.step(action)
            ep_return += reward
            done = terminated or truncated
            state = next_state

        returns.append(ep_return)
    return np.mean(returns)



# Hyperparameters
BATCH_SIZE = 256
MAX_EPISODES = 300
MAX_STEPS = 1000
BUFFER_SIZE = 100000
LOG_INTERVAL = 10
REWARD_MULTIPLIER = 1
LEARNING_RATE = 3e-4  # Match the CQL model's LR

# Initialize environment
env = gym.make("HalfCheetah-v5", max_episode_steps=MAX_STEPS)

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

# Setup device for training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Initialize logging and agent
writer = SummaryWriter()
agent = CQLAgent(state_dim, action_dim, lr=LEARNING_RATE, alpha=1.0)

# Move agent model components to GPU
# agent.q1.to(device)
# agent.target_q_net.to(device)
# agent.policy.to(device)
# agent.target_policy.to(device)

# Initialize replay buffer
replay_buffer = ReplayBuffer(BUFFER_SIZE)  # No device needed in the replay buffer

# Tracking losses and metrics
policy_losses = []
q_losses = []
cql_losses = []
episode_lengths = []
cumulative_rewards = []

for _ in range(500):
    state = env.reset()[0]
    done = False
    while not done:
        action = env.action_space.sample()
        next_state, reward, terminated, truncated, _ = env.step(action)
        reward_to_go = reward * REWARD_MULTIPLIER
        print(reward_to_go)
        done = terminated or truncated
        replay_buffer.add(state, action, reward_to_go, next_state, done)
        state = next_state

# Training loop
for episode in range(MAX_EPISODES):
    episode_reward = 0

    if len(replay_buffer) > BATCH_SIZE:
        batch = replay_buffer.sample(BATCH_SIZE)
        states, actions, rewards, next_states, dones = batch
        agent.update(replay_buffer, BATCH_SIZE)
        # 3) Log the sum of rewards in this batch (not necessarily a real 'episode' reward)
        episode_reward = rewards.sum().item()

        # Collect the float losses from the agent
        q_losses.append(agent.q_loss)
        policy_losses.append(agent.policy_loss)
        cql_losses.append(agent.cql_loss)

        # Print debug info
        print(f"Episode {episode}, Batch Reward Sum: {episode_reward:.2f}")
        print(f"  Q-Loss: {agent.q_loss:.4f}, Policy Loss: {agent.policy_loss:.4f}, CQL Loss: {agent.cql_loss:.4f}")


    # Track episode length and cumulative reward
    cumulative_rewards.append(episode_reward)

    # Logging to TensorBoard
    if episode % LOG_INTERVAL == 0:
        writer.add_scalar("Reward/Episode", episode_reward, episode)
        if q_losses:
            writer.add_scalar("Loss/Q", q_losses[-1], episode)
        if policy_losses:
            writer.add_scalar("Loss/Policy", policy_losses[-1], episode)
        if cql_losses:
            writer.add_scalar("Loss/CQL", cql_losses[-1], episode)

eval_episodes = 5
avg_return = evaluate_policy(agent, env, num_eval_episodes=eval_episodes)
print(f"\nEvaluated policy over {eval_episodes} episodes. Average return: {avg_return:.2f}")

# Plot training metrics
plt.figure(figsize=(15, 10))

# Plot losses
plt.subplot(1, 2, 1)
plt.plot(q_losses, label="Q Loss")
plt.plot(policy_losses, label="Policy Loss")
if cql_losses:
    plt.plot(cql_losses, label="CQL Loss")
plt.xlabel("Update Step")
plt.ylabel("Loss")
plt.title("Training Losses")
plt.legend()

# Plot cumulative rewards
plt.subplot(1, 2, 2)
plt.plot(cumulative_rewards)
plt.xlabel("Episode")
plt.ylabel("Cumulative Reward")
plt.title("Training Rewards")

plt.tight_layout()
plt.show()

# Cleanup
writer.close()
env.close()
'''

Using device: cuda
=== Collecting Online Episodes & Updating Agent Online ===
Online Episode 0, Return=9.0, Eval=9.0
Online Episode 2, Return=11.0, Eval=9.5
Online Episode 4, Return=9.0, Eval=9.5
Online Episode 6, Return=12.0, Eval=10.0
Online Episode 8, Return=9.0, Eval=10.0
Online Episode 10, Return=10.0, Eval=9.5
Online Episode 12, Return=11.0, Eval=11.0
Online Episode 14, Return=9.0, Eval=8.0
Online Episode 16, Return=8.0, Eval=6.5
Online Episode 18, Return=6.0, Eval=6.0
Online Episode 20, Return=4.0, Eval=5.0
Online Episode 22, Return=5.0, Eval=5.0
Online Episode 24, Return=4.0, Eval=5.0
Online Episode 26, Return=4.0, Eval=4.0
Online Episode 28, Return=5.0, Eval=4.0
Online Episode 30, Return=4.0, Eval=4.0
Online Episode 32, Return=4.0, Eval=4.0
Online Episode 34, Return=4.0, Eval=4.0
Online Episode 36, Return=4.0, Eval=4.0
Online Episode 38, Return=4.0, Eval=4.0
Online Episode 40, Return=4.0, Eval=4.0
Online Episode 42, Return=4.0, Eval=4.0
Online Episode 44, Return=4.0, Eval=4.0


In [6]:
visualize(agent)

  deprecation(
  logger.warn(


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x17 and 4x256)