<a href="https://colab.research.google.com/github/velpulakaran/reinforcement-learning/blob/main/rlml_lab_8_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#2303A51587

KARAN

Batch:- 09

Implementing an Actor-Critic method (e.g., A2C) for continuous action space problems.

In [None]:
%pip install gymnasium



In [None]:
# a2c_continuous.py
import math
import time
import numpy as np
import gymnasium as gym # Changed from gym to gymnasium
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Normal

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class ActorCritic(nn.Module):
    """Gaussian policy and state-value estimator on a shared MLP trunk.

    The trunk is two ReLU-activated linear layers of width ``hidden``.
    A linear head produces the action mean, another produces a scalar
    state value, and a single learnable ``log_std`` vector (initialised
    to zeros, independent of the observation) provides the standard
    deviation for every action dimension.
    """

    def __init__(self, obs_dim, act_dim, hidden=256):
        """Build the trunk, the two heads and the log-std parameter.

        Args:
            obs_dim: size of the last dimension of the observation input.
            act_dim: number of action dimensions.
            hidden: width of both trunk layers.
        """
        super().__init__()
        trunk_layers = [
            nn.Linear(obs_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
        ]
        self.shared = nn.Sequential(*trunk_layers)
        self.mu_head = nn.Linear(hidden, act_dim)
        # One log-std per action dimension, shared across all observations.
        self.log_std = nn.Parameter(torch.zeros(act_dim))
        self.value_head = nn.Linear(hidden, 1)

    def forward(self, x):
        """Return ``(mu, std, value)`` for the observations ``x``.

        ``mu`` keeps the leading dimensions of ``x`` with ``act_dim`` as
        the last one, ``std`` has shape ``(act_dim,)``, and ``value`` has
        the trailing size-1 dimension of the value head squeezed away.
        """
        features = self.shared(x)
        return (
            self.mu_head(features),
            self.log_std.exp(),
            self.value_head(features).squeeze(-1),
        )

def get_env(name="Pendulum-v1", seed=0):
    """Create the Gymnasium environment ``name`` and seed every RNG in play.

    The environment is reset once with ``seed``; the same seed is then
    applied to the environment's action space, NumPy's global generator
    and PyTorch's global generator.

    Returns:
        The created environment.
    """
    environment = gym.make(name)
    # Gymnasium takes the environment seed through reset() rather than a
    # separate seed() method.
    environment.reset(seed=seed)
    seeders = (
        environment.action_space.seed,
        np.random.seed,
        torch.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    return environment

def compute_returns(rewards, masks, values, gamma, next_value):
    """Compute discounted bootstrapped returns for one rollout.

    Working backwards from the end of the rollout, the recursion is
    ``R_t = rewards[t] + gamma * R_{t+1} * masks[t]`` with ``R_T``
    initialised to ``next_value``.

    Args:
        rewards: per-step rewards, length T. May be a 1-D tensor, a list
            of 0-dim tensors, or a list of Python floats.
        masks: per-step continuation flags, length T (1 if the episode
            continues after step t, 0 if it ended there). A 0 stops the
            bootstrap from leaking across an episode boundary.
        values: unused; kept so existing callers do not break.
        gamma: discount factor.
        next_value: value estimate for the state that follows the last
            rollout step (tensor or Python float).

    Returns:
        A float32 tensor of shape ``(T,)`` holding the returns. It lives
        on the device of the inputs (``next_value`` first, then
        ``rewards``), falling back to the CPU for pure-Python inputs.
    """
    steps = len(rewards)

    # Take the device from the data instead of a module-level global so
    # the function stays correct wherever the rollout tensors live.
    if torch.is_tensor(next_value):
        target_device = next_value.device
    elif torch.is_tensor(rewards):
        target_device = rewards.device
    elif steps and torch.is_tensor(rewards[0]):
        target_device = rewards[0].device
    else:
        target_device = torch.device("cpu")

    returns = torch.zeros(steps, dtype=torch.float32, device=target_device)
    running = torch.as_tensor(
        next_value, dtype=torch.float32, device=target_device
    )
    for t in reversed(range(steps)):
        running = rewards[t] + gamma * running * masks[t]
        returns[t] = running
    return returns

def train_a2c(env_name="Pendulum-v1",
              total_frames=2_000,
              rollout_len=5,
              gamma=0.99,
              lr=3e-4,
              value_coef=0.5,
              entropy_coef=0.01,
              max_grad_norm=0.5,
              log_interval=200,
              seed=42):
    """Train an advantage actor-critic agent on a continuous-action task.

    The loop alternates between collecting up to ``rollout_len``
    environment steps with the current Gaussian policy and taking one
    Adam step on the combined loss
    ``policy_loss + value_coef * value_loss - entropy_coef * entropy``.

    Args:
        env_name: Gymnasium environment id passed to ``get_env``.
        total_frames: number of environment steps to collect in total.
        rollout_len: maximum number of steps per update; must be >= 1.
        gamma: discount factor.
        lr: Adam learning rate.
        value_coef: weight of the value (critic) loss.
        entropy_coef: weight of the entropy bonus.
        max_grad_norm: gradient-norm clipping threshold.
        log_interval: approximate number of frames between progress lines.
        seed: seed forwarded to ``get_env``.

    Returns:
        The trained ``ActorCritic`` model.

    Raises:
        ValueError: if ``rollout_len`` is smaller than 1.
    """
    if rollout_len < 1:
        raise ValueError("rollout_len must be at least 1")

    env = get_env(env_name, seed)
    try:
        # A single reset gives both the observation size and the first state.
        state, _ = env.reset()
        obs_dim = state.shape[0]
        act_dim = env.action_space.shape[0]
        state = torch.tensor(state, dtype=torch.float32, device=device)

        model = ActorCritic(obs_dim, act_dim).to(device)
        optimizer = optim.Adam(model.parameters(), lr=lr)

        # The action bounds never change, so build the tensors once.
        action_low = torch.tensor(env.action_space.low, device=device)
        action_high = torch.tensor(env.action_space.high, device=device)

        frame_count = 0
        episode_reward = 0.0
        episode_count = 0
        rewards_log = []
        t_start = time.time()

        while frame_count < total_frames:
            log_probs = []
            rewards = []
            masks = []
            values = []
            entropies = []

            # Collect up to rollout_len steps with the current policy.
            for _ in range(rollout_len):
                mu, std, value = model(state.unsqueeze(0))
                dist = Normal(mu.squeeze(0), std)
                action = dist.sample()
                # The environment receives the clipped action, while the
                # log-probability is taken of the unclipped sample.
                action_clipped = action.clamp(action_low, action_high)

                next_state, reward, terminated, truncated, _ = env.step(
                    action_clipped.cpu().numpy()
                )
                done = terminated or truncated
                next_state = torch.tensor(
                    next_state, dtype=torch.float32, device=device
                )

                reward_t = torch.tensor(
                    reward, dtype=torch.float32, device=device
                )
                if truncated and not terminated:
                    # A time-limit cut-off is not a real terminal state:
                    # the task would have continued. Fold the discounted
                    # value of the final observation into the reward so
                    # the return target is still bootstrapped, while the
                    # zero mask below stops the return from leaking into
                    # the next episode.
                    with torch.no_grad():
                        _, _, cutoff_value = model(next_state.unsqueeze(0))
                    reward_t = reward_t + gamma * cutoff_value.squeeze(0)

                log_probs.append(dist.log_prob(action).sum())
                entropies.append(dist.entropy().sum(-1))
                rewards.append(reward_t)
                masks.append(torch.tensor(0.0 if done else 1.0,
                                          dtype=torch.float32, device=device))
                values.append(value.squeeze(0))

                state = next_state
                # Episode statistics use the raw environment reward only.
                episode_reward += float(reward)
                frame_count += 1

                if done:
                    rewards_log.append(episode_reward)
                    episode_reward = 0.0
                    episode_count += 1
                    state, _ = env.reset()
                    state = torch.tensor(
                        state, dtype=torch.float32, device=device
                    )

                if frame_count >= total_frames:
                    break

            # Bootstrap from the value of the state following the rollout.
            with torch.no_grad():
                _, _, next_value = model(state.unsqueeze(0))
                next_value = next_value.squeeze(0)

            log_probs = torch.stack(log_probs)
            values = torch.stack(values)
            rewards = torch.stack(rewards)
            masks = torch.stack(masks)

            returns = compute_returns(rewards, masks, values, gamma, next_value)
            advantages = returns - values

            # The actor sees the advantage as a constant; only the critic
            # loss backpropagates through the value estimates.
            policy_loss = -(log_probs * advantages.detach()).mean()
            value_loss = advantages.pow(2).mean()
            # Entropy bonus (summed over action dims, averaged over steps),
            # taken from the rollout distributions to avoid a second
            # forward pass.
            entropy = torch.stack(entropies).mean()

            loss = policy_loss + value_coef * value_loss - entropy_coef * entropy

            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()

            # Log on the first update after each log_interval boundary.
            if frame_count % log_interval < rollout_len:
                avg_return = np.mean(rewards_log[-10:]) if len(rewards_log) > 0 else 0.0
                fps = frame_count / (time.time() - t_start + 1e-8)
                print(f"Frames: {frame_count:7d} | Episodes: {episode_count:4d} | AvgReturn(10): {avg_return:7.2f} | Loss: {loss.item():.4f} | FPS: {fps:.1f}")
    finally:
        # Release the environment even if training raised.
        env.close()

    print("Training complete")
    return model

if __name__ == "__main__":
    # Hyperparameters for the Pendulum demo run, gathered in one place.
    run_config = {
        "env_name": "Pendulum-v1",
        "total_frames": 2000,
        "rollout_len": 5,
        "gamma": 0.99,
        "lr": 3e-4,
        "value_coef": 0.5,
        "entropy_coef": 0.01,
        "max_grad_norm": 0.5,
        "log_interval": 200,
        "seed": 0,
    }
    trained = train_a2c(**run_config)

    # Persist only the learned weights, not the whole module object.
    weights = trained.state_dict()
    torch.save(weights, "a2c_pendulum.pt")
    print("Saved model to a2c_pendulum.pt")

Frames:     200 | Episodes:    1 | AvgReturn(10): -1423.28 | Loss: 290.5755 | FPS: 515.5
Frames:     400 | Episodes:    2 | AvgReturn(10): -1424.33 | Loss: 74.4050 | FPS: 499.0
Frames:     600 | Episodes:    3 | AvgReturn(10): -1340.16 | Loss: 40.8703 | FPS: 501.5
Frames:     800 | Episodes:    4 | AvgReturn(10): -1371.58 | Loss: 112.3845 | FPS: 504.4
Frames:    1000 | Episodes:    5 | AvgReturn(10): -1415.79 | Loss: 61.4958 | FPS: 500.7
Frames:    1200 | Episodes:    6 | AvgReturn(10): -1413.73 | Loss: 113.4399 | FPS: 500.5
Frames:    1400 | Episodes:    7 | AvgReturn(10): -1439.66 | Loss: 303.8105 | FPS: 497.5
Frames:    1600 | Episodes:    8 | AvgReturn(10): -1406.50 | Loss: 761.2897 | FPS: 497.9
Frames:    1800 | Episodes:    9 | AvgReturn(10): -1452.90 | Loss: 971.7520 | FPS: 496.9
Frames:    2000 | Episodes:   10 | AvgReturn(10): -1430.42 | Loss: 6228.3228 | FPS: 495.5
Training complete
Saved model to a2c_pendulum.pt
