In [7]:
# DreamerV3-style RL agent for FetchReach-v3 with dense reward shaping

import os
import gymnasium as gym
import gymnasium_robotics
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import deque
import random

# Global hyperparameters; read directly by the classes and functions below.
config = {
    "env_name": "FetchReach-v3",  # id passed to gym.make()
    "device": "cuda" if torch.cuda.is_available() else "cpu",  # where models/tensors live
    "steps": 20_000,  # online environment/training steps after prefill
    "hidden_dim": 256,  # width of encoder, GRU state and MLP hidden layers
    "log_every": 1_000,  # print losses every this many online steps
    "imag_horizon": 15,  # number of imagined steps per rollout
    "batch_size": 32,  # transitions sampled per update
    "buffer_size": 100_000,  # replay buffer capacity
    "prefill_steps": 1_000,  # random-action steps collected before training
    "gamma": 0.99,  # discount used for imagined returns
    "learning_rate": 3e-4,  # Adam lr shared by world model, actor and critic
    "action_scale": 0.05,  # multiplier applied to the actor's tanh output
}

class ReplayBuffer:
    """Fixed-capacity FIFO store of single environment transitions."""

    def __init__(self, capacity):
        """Create an empty buffer holding at most ``capacity`` transitions."""
        # A bounded deque drops its oldest entry once it is full.
        self.buffer = deque(maxlen=capacity)

    def push(self, obs, action, reward, next_obs, done):
        """Append one ``(obs, action, reward, next_obs, done)`` transition."""
        transition = (obs, action, reward, next_obs, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` stored transitions at random, without replacement.

        Returns:
            A 5-tuple of float32 tensors on ``config["device"]`` ordered as
            ``(obs, action, reward, next_obs, done)``. ``reward`` and ``done``
            get an extra dimension inserted at index 1.
        """
        target = config["device"]

        def as_float(array):
            return torch.tensor(array, dtype=torch.float32)

        picked = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into one stacked array per field.
        columns = [np.stack(column) for column in zip(*picked)]
        obs, act, rew, next_obs, done = columns
        return (
            as_float(obs).to(target),
            as_float(act).to(target),
            as_float(rew).unsqueeze(1).to(target),
            as_float(next_obs).to(target),
            as_float(done).unsqueeze(1).to(target),
        )

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

class WorldModel(nn.Module):
    """Observation encoder, GRU-cell dynamics and a linear reward predictor."""

    def __init__(self, obs_dim, act_dim):
        """Build the sub-modules.

        Args:
            obs_dim: Size of the observation vector fed to the encoder.
            act_dim: Size of the action vector concatenated to the embedding.
        """
        super().__init__()
        width = config["hidden_dim"]
        # Creation order (encoder, dynamics, reward head) is kept as-is so that
        # seeded parameter initialisation is unchanged.
        self.encoder = nn.Sequential(nn.Linear(obs_dim, width), nn.ReLU())
        self.dynamics = nn.GRUCell(width + act_dim, width)
        self.reward_head = nn.Linear(width, 1)

    def forward(self, obs, action, hidden):
        """Encode ``obs`` and advance the recurrent state by one step.

        Returns:
            ``(next_hidden, predicted_reward)``.
        """
        return self.forward_through_dynamics(self.encoder(obs), action, hidden)

    def forward_through_dynamics(self, emb, action, hidden):
        """Advance the GRU from an already-computed embedding.

        Returns:
            ``(next_hidden, predicted_reward)`` where the reward is read off the
            new hidden state.
        """
        gru_input = torch.cat([emb, action], dim=-1)
        next_hidden = self.dynamics(gru_input, hidden)
        return next_hidden, self.reward_head(next_hidden)

class Actor(nn.Module):
    """Deterministic policy head mapping a feature vector to a bounded action.

    The output is ``config["action_scale"] * tanh(MLP(feat))``.
    """

    def __init__(self, feat_dim, act_dim):
        """Build the two-layer MLP.

        Args:
            feat_dim: Size of the input feature vector.
            act_dim: Number of action components produced.
        """
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(feat_dim, config["hidden_dim"]),
            nn.ReLU(),
            nn.Linear(config["hidden_dim"], act_dim),
            nn.Tanh()
        )
        # Registered as a non-persistent buffer: Module.to()/cuda() now moves the
        # scale together with the weights (no per-call device transfer in
        # forward), while state_dict() keeps exactly the same keys as before.
        self.register_buffer(
            "action_scale", torch.tensor(config["action_scale"]), persistent=False
        )

    def forward(self, feat):
        """Return the scaled action for ``feat``."""
        return self.action_scale * self.net(feat)

class Critic(nn.Module):
    """Two-layer MLP producing one scalar value per feature vector."""

    def __init__(self, feat_dim):
        """Build the value network for inputs of size ``feat_dim``."""
        super().__init__()
        width = config["hidden_dim"]
        layers = [nn.Linear(feat_dim, width), nn.ReLU(), nn.Linear(width, 1)]
        self.net = nn.Sequential(*layers)

    def forward(self, feat):
        """Return the value estimate; the last dimension has size 1."""
        value = self.net(feat)
        return value

def imagine_rollout(actor, world_model, start_hidden, horizon):
    """Roll the world model forward under the actor, without real observations.

    Args:
        actor: Callable mapping the current hidden state to an action.
        world_model: Object exposing ``forward_through_dynamics(emb, action, hidden)``.
        start_hidden: Hidden state the rollout starts from.
        horizon: Number of imagined steps.

    Returns:
        A list with ``horizon`` entries of ``(hidden_after_step, predicted_reward)``.
    """
    state = start_hidden
    trajectory = []
    for _step in range(horizon):
        chosen_action = actor(state)
        # No observation exists in imagination, so an all-zero embedding shaped
        # like the hidden state is fed in its place.
        blank_embedding = torch.zeros_like(state)
        state, predicted_reward = world_model.forward_through_dynamics(
            blank_embedding, chosen_action, state
        )
        trajectory.append((state, predicted_reward))
    return trajectory

def make_env():
    """Create the environment named by ``config["env_name"]`` and seed it.

    Returns:
        The environment after a single ``reset(seed=42)`` call.
    """
    environment = gym.make(config["env_name"])
    # Only the seeding matters here; the observation returned by this reset is
    # discarded.
    environment.reset(seed=42)
    return environment

def train():
    """Prefill the replay buffer with random actions, then train online.

    Every online step acts once in the environment with the current actor and
    then performs one update each of the world model (reward regression on a
    sampled batch), the critic (regression onto discounted imagined returns)
    and the actor (maximise the critic's value along the imagined rollout).
    Losses are printed every ``config["log_every"]`` steps.

    The environment is always closed, including when the run is interrupted.
    """
    device = config["device"]
    hidden_dim = config["hidden_dim"]
    batch_size = config["batch_size"]

    env = make_env()
    try:
        obs_dim = env.observation_space["observation"].shape[0]
        act_dim = env.action_space.shape[0]

        buffer = ReplayBuffer(config["buffer_size"])
        wm = WorldModel(obs_dim, act_dim).to(device)
        actor = Actor(hidden_dim, act_dim).to(device)
        critic = Critic(hidden_dim).to(device)

        optim_wm = torch.optim.Adam(wm.parameters(), lr=config["learning_rate"])
        optim_actor = torch.optim.Adam(actor.parameters(), lr=config["learning_rate"])
        optim_critic = torch.optim.Adam(critic.parameters(), lr=config["learning_rate"])

        obs_dict, _ = env.reset()
        obs = obs_dict["observation"]
        total_reward = 0

        # Prefill: uniformly random actions, rewarded with the dense
        # negative goal distance.
        for _ in range(config["prefill_steps"]):
            action = env.action_space.sample()
            next_obs_dict, _, done, truncated, _ = env.step(action)
            next_obs = next_obs_dict["observation"]
            reward = -np.linalg.norm(next_obs_dict["achieved_goal"] - next_obs_dict["desired_goal"])
            buffer.push(obs, action, reward, next_obs, done or truncated)
            obs = next_obs if not done and not truncated else env.reset()[0]["observation"]

        obs_dict, _ = env.reset()
        obs = obs_dict["observation"]

        # Constant, never-mutated inputs, allocated once instead of per step.
        # The acting state and the training-batch state used to share the name
        # `hidden`: it was allocated with batch_size rows, sliced to one row for
        # acting, and overwritten with zeros by every update. Acting therefore
        # always started from a zero state; that is now explicit and matches the
        # zero state the world model is trained from.
        # NOTE(review): if a state carried across steps is wanted instead, the
        # world model would also need multi-step training — confirm intent.
        act_state = torch.zeros(1, hidden_dim, device=device)
        no_action = torch.zeros(1, act_dim, device=device)
        train_state = torch.zeros(batch_size, hidden_dim, device=device)

        for step in range(config["steps"]):
            obs_tensor = torch.tensor(obs, dtype=torch.float32, device=device).unsqueeze(0)
            with torch.no_grad():
                emb = wm.encoder(obs_tensor)
                feat = wm.dynamics(torch.cat([emb, no_action], dim=-1), act_state)
                action = actor(feat).squeeze(0).cpu().numpy()

            next_obs_dict, _, done, truncated, _ = env.step(action)
            next_obs = next_obs_dict["observation"]
            reward = -np.linalg.norm(next_obs_dict["achieved_goal"] - next_obs_dict["desired_goal"])
            buffer.push(obs, action, reward, next_obs, done or truncated)
            total_reward += reward

            obs = next_obs if not done and not truncated else env.reset()[0]["observation"]

            if len(buffer) < batch_size:
                continue

            # --- world model: one-step reward regression ---
            # next_obs / done are sampled but not used by any loss below.
            obs_b, act_b, rew_b, _, _ = buffer.sample(batch_size)
            next_hidden, pred_reward = wm(obs_b, act_b, train_state)
            loss_wm = F.mse_loss(pred_reward, rew_b)
            optim_wm.zero_grad()
            loss_wm.backward()
            optim_wm.step()

            # --- imagination from the (detached) posterior state ---
            rollout = imagine_rollout(actor, wm, next_hidden.detach(), config["imag_horizon"])
            feats, rewards = zip(*rollout)
            feats = torch.stack(feats)
            rewards = torch.stack(rewards)

            # Discounted return-to-go over the imagined horizon (no bootstrap).
            returns = torch.zeros_like(rewards)
            for t in reversed(range(len(rewards))):
                returns[t] = rewards[t] + config["gamma"] * (returns[t + 1] if t + 1 < len(rewards) else 0)

            # --- critic: regress values onto the detached returns ---
            val_preds = critic(feats.detach())
            loss_critic = F.mse_loss(val_preds.squeeze(-1), returns.detach().squeeze(-1))
            optim_critic.zero_grad()
            loss_critic.backward()
            optim_critic.step()

            # --- actor: maximise the critic's value; gradients reach the actor
            # through the non-detached imagined features ---
            val_preds = critic(feats)
            loss_actor = -val_preds.mean()
            optim_actor.zero_grad()
            loss_actor.backward()
            optim_actor.step()

            if (step + 1) % config["log_every"] == 0:
                avg_pred_reward = pred_reward.mean().item()
                print(f"Step {step+1}: Total Reward = {total_reward:.1f}")
                print(f"  Loss_WM: {loss_wm.item():.4f} | Loss_Critic: {loss_critic.item():.4f} | Loss_Actor: {loss_actor.item():.4f} | Pred_Reward_Mean: {avg_pred_reward:.4f}")
                total_reward = 0
    finally:
        # Runs on normal completion, exceptions and KeyboardInterrupt alike.
        env.close()

# Entry point: start training when this code is executed as the main program.
if __name__ == "__main__":
    train()


Step 1000: Total Reward = -572.6
  Loss_WM: 0.0075 | Loss_Critic: 0.0534 | Loss_Actor: 1.0472 | Pred_Reward_Mean: -0.5491
Step 2000: Total Reward = -582.0
  Loss_WM: 0.0130 | Loss_Critic: 0.0307 | Loss_Actor: 1.0616 | Pred_Reward_Mean: -0.5435
Step 3000: Total Reward = -591.1
  Loss_WM: 0.0077 | Loss_Critic: 0.0224 | Loss_Actor: 1.1358 | Pred_Reward_Mean: -0.5623
Step 4000: Total Reward = -582.6
  Loss_WM: 0.0084 | Loss_Critic: 0.0169 | Loss_Actor: 1.1578 | Pred_Reward_Mean: -0.5591
Step 5000: Total Reward = -584.2
  Loss_WM: 0.0098 | Loss_Critic: 0.0138 | Loss_Actor: 1.2585 | Pred_Reward_Mean: -0.5931
Step 6000: Total Reward = -575.8
  Loss_WM: 0.0084 | Loss_Critic: 0.0127 | Loss_Actor: 1.3253 | Pred_Reward_Mean: -0.5773
Step 7000: Total Reward = -583.8
  Loss_WM: 0.0066 | Loss_Critic: 0.0125 | Loss_Actor: 1.3996 | Pred_Reward_Mean: -0.5866
Step 8000: Total Reward = -598.6
  Loss_WM: 0.0056 | Loss_Critic: 0.0130 | Loss_Actor: 1.4358 | Pred_Reward_Mean: -0.5762
Step 9000: Total Reward 

In [None]:
# DreamerV3-style RL agent for FetchReach-v3 with dense reward shaping

import os
import gymnasium as gym
import gymnasium_robotics
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import deque
import random

# Global hyperparameters; read directly by the classes and functions below.
config = {
    "env_name": "FetchReach-v3",  # id passed to gym.make()
    "device": "cuda" if torch.cuda.is_available() else "cpu",  # where models/tensors live
    "steps": 20_000,  # online environment/training steps after prefill
    "hidden_dim": 256,  # width of encoder, GRU state and MLP hidden layers
    "log_every": 1_000,  # print losses every this many online steps
    "imag_horizon": 15,  # number of imagined steps per rollout
    "batch_size": 64,  # transitions sampled per update
    "buffer_size": 100_000,  # replay buffer capacity
    "prefill_steps": 1_000,  # random-action steps collected before training
    "gamma": 0.99,  # discount used for imagined returns
    "learning_rate": 2e-4,  # Adam lr shared by world model, actor and critic
    "action_scale": 0.05,  # multiplier applied to the actor's tanh output
}

class ReplayBuffer:
    """Fixed-capacity FIFO store of single environment transitions."""

    def __init__(self, capacity):
        """Create an empty buffer holding at most ``capacity`` transitions."""
        # A bounded deque drops its oldest entry once it is full.
        self.buffer = deque(maxlen=capacity)

    def push(self, obs, action, reward, next_obs, done):
        """Append one ``(obs, action, reward, next_obs, done)`` transition."""
        transition = (obs, action, reward, next_obs, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` stored transitions at random, without replacement.

        Returns:
            A 5-tuple of float32 tensors on ``config["device"]`` ordered as
            ``(obs, action, reward, next_obs, done)``. ``reward`` and ``done``
            get an extra dimension inserted at index 1.
        """
        target = config["device"]

        def as_float(array):
            return torch.tensor(array, dtype=torch.float32)

        picked = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into one stacked array per field.
        columns = [np.stack(column) for column in zip(*picked)]
        obs, act, rew, next_obs, done = columns
        return (
            as_float(obs).to(target),
            as_float(act).to(target),
            as_float(rew).unsqueeze(1).to(target),
            as_float(next_obs).to(target),
            as_float(done).unsqueeze(1).to(target),
        )

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

class WorldModel(nn.Module):
    """Observation encoder, GRU-cell dynamics and a linear reward predictor."""

    def __init__(self, obs_dim, act_dim):
        """Build the sub-modules.

        Args:
            obs_dim: Size of the observation vector fed to the encoder.
            act_dim: Size of the action vector concatenated to the embedding.
        """
        super().__init__()
        width = config["hidden_dim"]
        # Creation order (encoder, dynamics, reward head) is kept as-is so that
        # seeded parameter initialisation is unchanged.
        self.encoder = nn.Sequential(nn.Linear(obs_dim, width), nn.ReLU())
        self.dynamics = nn.GRUCell(width + act_dim, width)
        self.reward_head = nn.Linear(width, 1)

    def forward(self, obs, action, hidden):
        """Encode ``obs`` and advance the recurrent state by one step.

        Returns:
            ``(next_hidden, predicted_reward)``.
        """
        return self.forward_through_dynamics(self.encoder(obs), action, hidden)

    def forward_through_dynamics(self, emb, action, hidden):
        """Advance the GRU from an already-computed embedding.

        Returns:
            ``(next_hidden, predicted_reward)`` where the reward is read off the
            new hidden state.
        """
        gru_input = torch.cat([emb, action], dim=-1)
        next_hidden = self.dynamics(gru_input, hidden)
        return next_hidden, self.reward_head(next_hidden)

class Actor(nn.Module):
    """Deterministic policy head mapping a feature vector to a bounded action.

    The output is ``config["action_scale"] * tanh(MLP(feat))``.
    """

    def __init__(self, feat_dim, act_dim):
        """Build the two-layer MLP.

        Args:
            feat_dim: Size of the input feature vector.
            act_dim: Number of action components produced.
        """
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(feat_dim, config["hidden_dim"]),
            nn.ReLU(),
            nn.Linear(config["hidden_dim"], act_dim),
            nn.Tanh()
        )
        # Registered as a non-persistent buffer: Module.to()/cuda() now moves the
        # scale together with the weights (no per-call device transfer in
        # forward), while state_dict() keeps exactly the same keys as before.
        self.register_buffer(
            "action_scale", torch.tensor(config["action_scale"]), persistent=False
        )

    def forward(self, feat):
        """Return the scaled action for ``feat``."""
        return self.action_scale * self.net(feat)

class Critic(nn.Module):
    """Two-layer MLP producing one scalar value per feature vector."""

    def __init__(self, feat_dim):
        """Build the value network for inputs of size ``feat_dim``."""
        super().__init__()
        width = config["hidden_dim"]
        layers = [nn.Linear(feat_dim, width), nn.ReLU(), nn.Linear(width, 1)]
        self.net = nn.Sequential(*layers)

    def forward(self, feat):
        """Return the value estimate; the last dimension has size 1."""
        value = self.net(feat)
        return value

def imagine_rollout(actor, world_model, start_hidden, horizon):
    """Roll the world model forward under the actor, without real observations.

    Args:
        actor: Callable mapping the current hidden state to an action.
        world_model: Object exposing ``forward_through_dynamics(emb, action, hidden)``.
        start_hidden: Hidden state the rollout starts from.
        horizon: Number of imagined steps.

    Returns:
        A list with ``horizon`` entries of ``(hidden_after_step, predicted_reward)``.
    """
    state = start_hidden
    trajectory = []
    for _step in range(horizon):
        chosen_action = actor(state)
        # No observation exists in imagination, so an all-zero embedding shaped
        # like the hidden state is fed in its place.
        blank_embedding = torch.zeros_like(state)
        state, predicted_reward = world_model.forward_through_dynamics(
            blank_embedding, chosen_action, state
        )
        trajectory.append((state, predicted_reward))
    return trajectory

def make_env():
    """Create the environment named by ``config["env_name"]`` and seed it.

    Returns:
        The environment after a single ``reset(seed=42)`` call.
    """
    environment = gym.make(config["env_name"])
    # Only the seeding matters here; the observation returned by this reset is
    # discarded.
    environment.reset(seed=42)
    return environment

def train():
    """Prefill the replay buffer with random actions, then train online.

    Every online step acts once in the environment with the current actor and
    then performs one update each of the world model (reward regression on a
    sampled batch), the critic (regression onto discounted imagined returns)
    and the actor (maximise the critic's value along the imagined rollout).
    Losses are printed every ``config["log_every"]`` steps.

    The environment is always closed, including when the run is interrupted.
    """
    device = config["device"]
    hidden_dim = config["hidden_dim"]
    batch_size = config["batch_size"]

    env = make_env()
    try:
        obs_dim = env.observation_space["observation"].shape[0]
        act_dim = env.action_space.shape[0]

        buffer = ReplayBuffer(config["buffer_size"])
        wm = WorldModel(obs_dim, act_dim).to(device)
        actor = Actor(hidden_dim, act_dim).to(device)
        critic = Critic(hidden_dim).to(device)

        optim_wm = torch.optim.Adam(wm.parameters(), lr=config["learning_rate"])
        optim_actor = torch.optim.Adam(actor.parameters(), lr=config["learning_rate"])
        optim_critic = torch.optim.Adam(critic.parameters(), lr=config["learning_rate"])

        obs_dict, _ = env.reset()
        obs = obs_dict["observation"]
        total_reward = 0

        # Prefill: uniformly random actions, rewarded with the dense
        # negative goal distance.
        for _ in range(config["prefill_steps"]):
            action = env.action_space.sample()
            next_obs_dict, _, done, truncated, _ = env.step(action)
            next_obs = next_obs_dict["observation"]
            reward = -np.linalg.norm(next_obs_dict["achieved_goal"] - next_obs_dict["desired_goal"])
            buffer.push(obs, action, reward, next_obs, done or truncated)
            obs = next_obs if not done and not truncated else env.reset()[0]["observation"]

        obs_dict, _ = env.reset()
        obs = obs_dict["observation"]

        # Constant, never-mutated inputs, allocated once instead of per step.
        # The acting state and the training-batch state used to share the name
        # `hidden`: it was allocated with batch_size rows, sliced to one row for
        # acting, and overwritten with zeros by every update. Acting therefore
        # always started from a zero state; that is now explicit and matches the
        # zero state the world model is trained from.
        # NOTE(review): if a state carried across steps is wanted instead, the
        # world model would also need multi-step training — confirm intent.
        act_state = torch.zeros(1, hidden_dim, device=device)
        no_action = torch.zeros(1, act_dim, device=device)
        train_state = torch.zeros(batch_size, hidden_dim, device=device)

        for step in range(config["steps"]):
            obs_tensor = torch.tensor(obs, dtype=torch.float32, device=device).unsqueeze(0)
            with torch.no_grad():
                emb = wm.encoder(obs_tensor)
                feat = wm.dynamics(torch.cat([emb, no_action], dim=-1), act_state)
                action = actor(feat).squeeze(0).cpu().numpy()

            next_obs_dict, _, done, truncated, _ = env.step(action)
            next_obs = next_obs_dict["observation"]
            reward = -np.linalg.norm(next_obs_dict["achieved_goal"] - next_obs_dict["desired_goal"])
            buffer.push(obs, action, reward, next_obs, done or truncated)
            total_reward += reward

            obs = next_obs if not done and not truncated else env.reset()[0]["observation"]

            if len(buffer) < batch_size:
                continue

            # --- world model: one-step reward regression ---
            # next_obs / done are sampled but not used by any loss below.
            obs_b, act_b, rew_b, _, _ = buffer.sample(batch_size)
            next_hidden, pred_reward = wm(obs_b, act_b, train_state)
            loss_wm = F.mse_loss(pred_reward, rew_b)
            optim_wm.zero_grad()
            loss_wm.backward()
            optim_wm.step()

            # --- imagination from the (detached) posterior state ---
            rollout = imagine_rollout(actor, wm, next_hidden.detach(), config["imag_horizon"])
            feats, rewards = zip(*rollout)
            feats = torch.stack(feats)
            rewards = torch.stack(rewards)

            # Discounted return-to-go over the imagined horizon (no bootstrap).
            returns = torch.zeros_like(rewards)
            for t in reversed(range(len(rewards))):
                returns[t] = rewards[t] + config["gamma"] * (returns[t + 1] if t + 1 < len(rewards) else 0)

            # --- critic: regress values onto the detached returns ---
            val_preds = critic(feats.detach())
            loss_critic = F.mse_loss(val_preds.squeeze(-1), returns.detach().squeeze(-1))
            optim_critic.zero_grad()
            loss_critic.backward()
            optim_critic.step()

            # --- actor: maximise the critic's value; gradients reach the actor
            # through the non-detached imagined features ---
            val_preds = critic(feats)
            loss_actor = -val_preds.mean()
            optim_actor.zero_grad()
            loss_actor.backward()
            optim_actor.step()

            if (step + 1) % config["log_every"] == 0:
                avg_pred_reward = pred_reward.mean().item()
                print(f"Step {step+1}: Total Reward = {total_reward:.1f}")
                print(f"  Loss_WM: {loss_wm.item():.4f} | Loss_Critic: {loss_critic.item():.4f} | Loss_Actor: {loss_actor.item():.4f} | Pred_Reward_Mean: {avg_pred_reward:.4f}")
                total_reward = 0
    finally:
        # Runs on normal completion, exceptions and KeyboardInterrupt alike.
        env.close()

# Entry point: start training when this code is executed as the main program.
if __name__ == "__main__":
    train()


Step 1000: Total Reward = -506.3
  Loss_WM: 0.0078 | Loss_Critic: 0.0392 | Loss_Actor: 0.8847 | Pred_Reward_Mean: -0.5283
Step 2000: Total Reward = -499.2
  Loss_WM: 0.0089 | Loss_Critic: 0.0285 | Loss_Actor: 0.9284 | Pred_Reward_Mean: -0.5278
Step 3000: Total Reward = -508.8
  Loss_WM: 0.0102 | Loss_Critic: 0.0188 | Loss_Actor: 0.9448 | Pred_Reward_Mean: -0.5416
Step 4000: Total Reward = -500.9
  Loss_WM: 0.0095 | Loss_Critic: 0.0149 | Loss_Actor: 0.9642 | Pred_Reward_Mean: -0.4995
Step 5000: Total Reward = -501.3
  Loss_WM: 0.0083 | Loss_Critic: 0.0154 | Loss_Actor: 1.0388 | Pred_Reward_Mean: -0.5331
Step 6000: Total Reward = -493.5
  Loss_WM: 0.0073 | Loss_Critic: 0.0125 | Loss_Actor: 1.0401 | Pred_Reward_Mean: -0.5019
Step 7000: Total Reward = -501.4
  Loss_WM: 0.0064 | Loss_Critic: 0.0099 | Loss_Actor: 1.0578 | Pred_Reward_Mean: -0.5189
Step 8000: Total Reward = -515.7
  Loss_WM: 0.0079 | Loss_Critic: 0.0085 | Loss_Actor: 1.0878 | Pred_Reward_Mean: -0.5187
Step 9000: Total Reward 

NEW FINE-TUNED MODEL:

In [18]:
# [Same imports as before]
# Add these if needed
import os
import gymnasium as gym
import gymnasium_robotics
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import deque
import random
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

# Global hyperparameters for the goal-aware HER variant; read directly by the
# classes and functions below.
config = {
    "env_name": "FetchReach-v3",  # id passed to gym.make()
    "device": "cuda" if torch.cuda.is_available() else "cpu",  # where models/tensors live
    "steps": 100_000,  # online environment/training steps after prefill
    "hidden_dim": 256,  # width of encoder, GRU state and MLP hidden layers
    "log_every": 500,  # log losses every this many online steps
    "imag_horizon": 5,  # number of imagined steps per rollout
    "batch_size": 128,  # transitions sampled per update
    "buffer_size": 100_000,  # replay buffer capacity
    "prefill_steps": 3_000,  # random-action steps collected before training
    "gamma": 0.95,  # discount used for imagined returns
    "learning_rate": 1e-4,  # Adam lr shared by world model, actor and critic
    "action_scale": 0.1,  # multiplier applied to the actor's tanh output
    "log_dir": "logs/dreamer_fetchreach_her_goalaware",  # TensorBoard root directory
    "her_k": 4,  # hindsight-relabelled copies stored per real transition
}

class HERReplayBuffer:
    """Replay buffer that adds hindsight-relabelled copies of each transition.

    Transitions are collected per episode; when a transition with a truthy
    ``done`` arrives, the whole episode is written to the bounded buffer
    together with ``config["her_k"]`` relabelled copies of every step.
    """

    def __init__(self, capacity):
        """Create an empty buffer holding at most ``capacity`` stored tuples."""
        self.buffer = deque(maxlen=capacity)
        self.episode = []  # transitions of the episode currently in progress

    def push(self, obs, action, reward, next_obs, done, achieved_goal, desired_goal):
        """Record one step; flush the episode into the buffer when ``done``."""
        step_record = (obs, action, reward, next_obs, done, achieved_goal, desired_goal)
        self.episode.append(step_record)
        if not done:
            return
        self.add_episode(self.episode)
        self.episode = []

    def add_episode(self, episode):
        """Store every step of ``episode`` plus its hindsight-relabelled copies.

        For each step the original transition is stored with its desired goal.
        Then ``config["her_k"]`` copies are stored whose goal is the achieved
        goal of a randomly chosen step from the current one onwards. The
        relabelled reward is the negative distance between this step's achieved
        goal and the new goal, plus 1.0 when that value is above -0.05.
        """
        for index, step_record in enumerate(episode):
            obs, action, reward, next_obs, done, ag, dg = step_record
            self.buffer.append((obs, action, reward, next_obs, done, dg))
            remaining = episode[index:]
            for _ in range(config["her_k"]):
                _o, _a, _r, _n, _d, future_ag, _g = random.choice(remaining)
                her_goal = future_ag
                her_reward = -np.linalg.norm(ag - her_goal)
                if her_reward > -0.05:
                    her_reward += 1.0
                relabelled = (obs, action, her_reward, next_obs, done, her_goal)
                self.buffer.append(relabelled)

    def sample(self, batch_size):
        """Draw ``batch_size`` stored tuples at random, without replacement.

        Returns:
            Float32 tensors on ``config["device"]``:
            ``(obs concatenated with goal, action, reward, goal)``; reward gets
            an extra dimension inserted at index 1.
        """
        target = config["device"]

        def as_float(array):
            return torch.tensor(array, dtype=torch.float32)

        picked = random.sample(self.buffer, batch_size)
        columns = [np.stack(column) for column in zip(*picked)]
        obs, act, rew, _next_obs, _done, goal = columns
        relabeled_obs = np.concatenate([obs, goal], axis=-1)
        return (
            as_float(relabeled_obs).to(target),
            as_float(act).to(target),
            as_float(rew).unsqueeze(1).to(target),
            as_float(goal).to(target),
        )

    def __len__(self):
        """Return the number of tuples in the buffer (unflushed steps excluded)."""
        return len(self.buffer)

# Models — now accept goal
class WorldModel(nn.Module):
    """Encoder over observation+goal, GRU-cell dynamics and a linear reward head."""

    def __init__(self, input_dim, act_dim):
        """Build the sub-modules.

        Args:
            input_dim: Size of the vector fed to the encoder.
            act_dim: Size of the action vector concatenated to the embedding.
        """
        super().__init__()
        width = config["hidden_dim"]
        # Creation order (encoder, dynamics, reward head) is kept as-is so that
        # seeded parameter initialisation is unchanged.
        self.encoder = nn.Sequential(nn.Linear(input_dim, width), nn.ReLU())
        self.dynamics = nn.GRUCell(width + act_dim, width)
        self.reward_head = nn.Linear(width, 1)

    def forward(self, obs_goal, action, hidden):
        """Encode ``obs_goal`` and advance the recurrent state by one step.

        Returns:
            ``(next_hidden, predicted_reward)``.
        """
        return self.forward_through_dynamics(self.encoder(obs_goal), action, hidden)

    def forward_through_dynamics(self, emb, action, hidden):
        """Advance the GRU from an already-computed embedding.

        Returns:
            ``(next_hidden, predicted_reward)`` where the reward is read off the
            new hidden state.
        """
        gru_input = torch.cat([emb, action], dim=-1)
        next_hidden = self.dynamics(gru_input, hidden)
        return next_hidden, self.reward_head(next_hidden)

class Actor(nn.Module):
    """Deterministic policy head mapping a feature vector to a bounded action.

    The output is ``config["action_scale"] * tanh(MLP(feat))``.
    """

    def __init__(self, feat_dim, act_dim):
        """Build the two-layer MLP.

        Args:
            feat_dim: Size of the input feature vector.
            act_dim: Number of action components produced.
        """
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(feat_dim, config["hidden_dim"]),
            nn.ReLU(),
            nn.Linear(config["hidden_dim"], act_dim),
            nn.Tanh()
        )
        # Registered as a non-persistent buffer: Module.to()/cuda() now moves the
        # scale together with the weights (no per-call device transfer in
        # forward), while state_dict() keeps exactly the same keys as before.
        self.register_buffer(
            "action_scale", torch.tensor(config["action_scale"]), persistent=False
        )

    def forward(self, feat):
        """Return the scaled action for ``feat``."""
        return self.action_scale * self.net(feat)

class Critic(nn.Module):
    """Two-layer MLP producing one scalar value per feature vector."""

    def __init__(self, feat_dim):
        """Build the value network for inputs of size ``feat_dim``."""
        super().__init__()
        width = config["hidden_dim"]
        layers = [nn.Linear(feat_dim, width), nn.ReLU(), nn.Linear(width, 1)]
        self.net = nn.Sequential(*layers)

    def forward(self, feat):
        """Return the value estimate; the last dimension has size 1."""
        value = self.net(feat)
        return value

def imagine_rollout(actor, wm, start_hidden, goal_tensor, horizon):
    """Roll the world model forward under a goal-conditioned actor.

    Args:
        actor: Callable mapping ``cat([hidden, goal])`` to an action.
        wm: Object exposing ``forward_through_dynamics(emb, action, hidden)``.
        start_hidden: Hidden state the rollout starts from.
        goal_tensor: Goal concatenated to the hidden state at every step.
        horizon: Number of imagined steps.

    Returns:
        A list with ``horizon`` entries of ``(actor_input, predicted_reward)``.
        ``actor_input`` is the hidden+goal feature *before* the step, and the
        reward is predicted from the hidden state *after* it.
    """
    state = start_hidden
    trajectory = []
    for _step in range(horizon):
        policy_features = torch.cat([state, goal_tensor], dim=-1)
        chosen_action = actor(policy_features)
        # No observation exists in imagination, so an all-zero embedding shaped
        # like the hidden state is fed in its place.
        blank_embedding = torch.zeros_like(state)
        state, predicted_reward = wm.forward_through_dynamics(
            blank_embedding, chosen_action, state
        )
        trajectory.append((policy_features, predicted_reward))
    return trajectory

def make_env():
    """Create the environment named by ``config["env_name"]`` and seed it.

    Returns:
        The environment after a single ``reset(seed=42)`` call.
    """
    environment = gym.make(config["env_name"])
    # Only the seeding matters here; the observation returned by this reset is
    # discarded.
    environment.reset(seed=42)
    return environment

def train():
    """Prefill the HER buffer with random actions, then train online.

    Every online step acts once with the goal-conditioned actor and then
    performs one update each of the world model (reward regression on a sampled
    batch), the critic (regression onto discounted imagined returns) and the
    actor (maximise the critic's value along the imagined rollout). Episode
    statistics and losses are written to TensorBoard under
    ``config["log_dir"]``; losses are also printed every ``config["log_every"]``
    steps.

    The environment and the TensorBoard writer are always closed, including
    when the run is interrupted.
    """
    # Local import so this fix does not depend on the file's import block.
    from contextlib import closing

    device = config["device"]
    hidden_dim = config["hidden_dim"]
    batch_size = config["batch_size"]
    log_path = os.path.join(config["log_dir"], datetime.now().strftime("%Y%m%d-%H%M%S"))

    # closing() calls .close() on exit, so an interrupted run still flushes the
    # event file and releases the simulator.
    with closing(make_env()) as env, closing(SummaryWriter(log_dir=log_path)) as writer:
        obs_dim = env.observation_space["observation"].shape[0]
        goal_dim = env.observation_space["desired_goal"].shape[0]
        act_dim = env.action_space.shape[0]
        input_dim = obs_dim + goal_dim

        buffer = HERReplayBuffer(config["buffer_size"])
        wm = WorldModel(input_dim, act_dim).to(device)
        actor = Actor(hidden_dim + goal_dim, act_dim).to(device)
        critic = Critic(hidden_dim + goal_dim).to(device)

        optim_wm = torch.optim.Adam(wm.parameters(), lr=config["learning_rate"])
        optim_actor = torch.optim.Adam(actor.parameters(), lr=config["learning_rate"])
        optim_critic = torch.optim.Adam(critic.parameters(), lr=config["learning_rate"])

        obs_dict, _ = env.reset()
        obs = obs_dict["observation"]
        dg = obs_dict["desired_goal"]
        episode_reward = 0
        episode_success = 0

        # Prefill with uniformly random actions.
        for _ in range(config["prefill_steps"]):
            action = env.action_space.sample()
            next_obs_dict, _, done, truncated, _ = env.step(action)
            next_obs = next_obs_dict["observation"]
            ag = next_obs_dict["achieved_goal"]
            reward = -np.linalg.norm(ag - dg)
            if reward > -0.05:
                reward += 1.0
            episode_over = done or truncated
            buffer.push(obs, action, reward, next_obs, episode_over, ag, dg)
            if episode_over:
                # Take the observation AND the goal from the reset. Previously
                # the goal came from the pre-reset step dict, so the first step
                # of each new episode was stored against a stale goal whenever
                # reset() changed it.
                obs_dict, _ = env.reset()
                obs = obs_dict["observation"]
                dg = obs_dict["desired_goal"]
            else:
                obs = next_obs
                dg = next_obs_dict["desired_goal"]

        # Drop any unfinished prefill episode so its steps are not spliced onto
        # the first online episode (which starts from a fresh reset below) and
        # relabelled with goals from across the discontinuity.
        buffer.episode = []

        obs_dict, _ = env.reset()
        obs = obs_dict["observation"]
        dg = obs_dict["desired_goal"]

        # Constant, never-mutated inputs, allocated once instead of per step.
        no_action = torch.zeros(1, act_dim, device=device)
        act_state = torch.zeros(1, hidden_dim, device=device)
        train_state = torch.zeros(batch_size, hidden_dim, device=device)

        for step in range(config["steps"]):
            relabeled_obs = np.concatenate([obs, dg])
            obs_tensor = torch.tensor(relabeled_obs, dtype=torch.float32, device=device).unsqueeze(0)
            goal_tensor = torch.tensor(dg, dtype=torch.float32, device=device).unsqueeze(0)
            with torch.no_grad():
                emb = wm.encoder(obs_tensor)
                hidden = wm.dynamics(torch.cat([emb, no_action], dim=-1), act_state)
                actor_input = torch.cat([hidden, goal_tensor], dim=-1)
                action = actor(actor_input).squeeze(0).cpu().numpy()

            next_obs_dict, _, done, truncated, _ = env.step(action)
            next_obs = next_obs_dict["observation"]
            ag = next_obs_dict["achieved_goal"]
            reward = -np.linalg.norm(ag - dg)
            success = reward > -0.05
            if success:
                reward += 1.0

            episode_over = done or truncated
            buffer.push(obs, action, reward, next_obs, episode_over, ag, dg)
            episode_reward += reward
            episode_success += float(success)

            if episode_over:
                writer.add_scalar("Episode/Reward", episode_reward, step)
                writer.add_scalar("Episode/Success", episode_success, step)
                episode_reward = 0
                episode_success = 0
                # Same stale-goal fix as in the prefill loop.
                obs_dict, _ = env.reset()
                obs = obs_dict["observation"]
                dg = obs_dict["desired_goal"]
            else:
                obs = next_obs
                dg = next_obs_dict["desired_goal"]

            if len(buffer) < batch_size:
                continue

            # --- world model: one-step reward regression ---
            obs_b, act_b, rew_b, goal_b = buffer.sample(batch_size)
            next_hidden, pred_reward = wm(obs_b, act_b, train_state)
            loss_wm = F.mse_loss(pred_reward, rew_b)
            optim_wm.zero_grad()
            loss_wm.backward()
            optim_wm.step()

            # --- imagination from the (detached) posterior state ---
            rollout = imagine_rollout(actor, wm, next_hidden.detach(), goal_b, config["imag_horizon"])
            feats, rewards = zip(*rollout)
            feats = torch.stack(feats)
            rewards = torch.stack(rewards)

            # Discounted return-to-go over the imagined horizon (no bootstrap).
            returns = torch.zeros_like(rewards)
            for t in reversed(range(len(rewards))):
                returns[t] = rewards[t] + config["gamma"] * (returns[t + 1] if t + 1 < len(rewards) else 0)

            # --- critic: regress values onto the detached returns ---
            val_preds = critic(feats.detach()).squeeze(-1)
            loss_critic = F.mse_loss(val_preds, returns.detach().squeeze(-1))
            optim_critic.zero_grad()
            loss_critic.backward()
            optim_critic.step()

            # --- actor: maximise the critic's value; gradients reach the actor
            # through the non-detached imagined features ---
            val_preds = critic(feats).squeeze(-1)
            loss_actor = -val_preds.mean()
            optim_actor.zero_grad()
            loss_actor.backward()
            optim_actor.step()

            if (step + 1) % config["log_every"] == 0:
                writer.add_scalar("Loss/WorldModel", loss_wm.item(), step)
                writer.add_scalar("Loss/Critic", loss_critic.item(), step)
                writer.add_scalar("Loss/Actor", loss_actor.item(), step)
                print(f"[Step {step+1}] WM: {loss_wm.item():.4f}, Critic: {loss_critic.item():.4f}, Actor: {loss_actor.item():.4f}")

# Entry point: start training when this code is executed as the main program.
if __name__ == "__main__":
    train()


[Step 500] WM: 0.1831, Critic: 0.0005, Actor: -0.5003
[Step 1000] WM: 0.1298, Critic: 0.0003, Actor: -0.5673
[Step 1500] WM: 0.1445, Critic: 0.0001, Actor: -0.5745


KeyboardInterrupt: 

PICK AND PLACE IMPLEMENTATION DOWN BELOW:

In [10]:
# Set the MUJOCO_GL environment variable to "egl" for this process.
# NOTE(review): presumably this selects MuJoCo's rendering backend and must run
# before MuJoCo is first imported to take effect; it also relies on `os` having
# been imported by an earlier cell — confirm the execution order.
os.environ["MUJOCO_GL"] = "egl"

In [24]:
import os
import gymnasium as gym
import gymnasium_robotics
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import deque
import random
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

# Global hyperparameters for the debug run; read directly by the classes and
# functions below.
config = {
    "env_name": "FetchReach-v3",  # id passed to gym.make()
    "device": "cuda" if torch.cuda.is_available() else "cpu",  # where models/tensors live
    "steps": 100000,  # online environment/training steps after prefill
    "hidden_dim": 256,  # width of encoder, GRU state and MLP hidden layers
    "log_every": 250,
    "imag_horizon": 5,
    "batch_size": 128,
    "buffer_size": 100000,  # replay buffer capacity
    "prefill_steps": 10000,  # random-action steps collected before training
    "gamma": 0.95,
    "learning_rate": 1e-4,  # Adam lr of the world model optimizer
    # NOTE(review): the optimizer setup in train() hard-codes lr=1e-3 (actor)
    # and lr=3e-3 (critic) rather than reading the two keys below — confirm
    # which values are intended.
    "actor_lr": 1e-3,
    "critic_lr": 1e-3,
    "action_scale": 0.3,  # multiplier applied to the actor's tanh output
    "log_dir": "logs/debug_dreamer_fetchreach",  # TensorBoard root directory
    "her_k": 4,  # hindsight-relabelled copies stored per real transition
}

class HERReplayBuffer:
    """Replay buffer that adds hindsight-relabelled copies of each transition.

    Transitions are collected per episode; when a transition with a truthy
    ``done`` arrives, the whole episode is written to the bounded buffer
    together with ``config["her_k"]`` relabelled copies of every step.
    """

    def __init__(self, capacity):
        """Create an empty buffer holding at most ``capacity`` stored tuples."""
        self.buffer = deque(maxlen=capacity)
        self.episode = []  # transitions of the episode currently in progress

    def push(self, obs, action, reward, next_obs, done, achieved_goal, desired_goal):
        """Record one step; flush the episode into the buffer when ``done``."""
        step_record = (obs, action, reward, next_obs, done, achieved_goal, desired_goal)
        self.episode.append(step_record)
        if not done:
            return
        self.add_episode(self.episode)
        self.episode = []

    def add_episode(self, episode):
        """Store every step of ``episode`` plus its hindsight-relabelled copies.

        For each step the original transition is stored with its desired goal.
        Then ``config["her_k"]`` copies are stored whose goal is the achieved
        goal of a randomly chosen step from the current one onwards. The
        relabelled reward is the negative distance between this step's achieved
        goal and the new goal, plus 1.0 when that value is above -0.05.
        """
        for index, step_record in enumerate(episode):
            obs, action, reward, next_obs, done, ag, dg = step_record
            self.buffer.append((obs, action, reward, next_obs, done, dg))
            remaining = episode[index:]
            for _ in range(config["her_k"]):
                _o, _a, _r, _n, _d, future_ag, _g = random.choice(remaining)
                her_goal = future_ag
                her_reward = -np.linalg.norm(ag - her_goal)
                if her_reward > -0.05:
                    her_reward += 1.0
                relabelled = (obs, action, her_reward, next_obs, done, her_goal)
                self.buffer.append(relabelled)

    def sample(self, batch_size):
        """Draw ``batch_size`` stored tuples at random, without replacement.

        About 1% of calls also print the batch's mean reward as a debug aid.

        Returns:
            Float32 tensors on ``config["device"]``:
            ``(obs concatenated with goal, action, reward, goal)``; reward gets
            an extra dimension inserted at index 1.
        """
        target = config["device"]

        def as_float(array):
            return torch.tensor(array, dtype=torch.float32)

        picked = random.sample(self.buffer, batch_size)
        columns = [np.stack(column) for column in zip(*picked)]
        obs, act, rew, _next_obs, _done, goal = columns
        relabeled_obs = np.concatenate([obs, goal], axis=-1)
        should_report = random.random() < 0.01
        if should_report:
            print(f"[HER Sample] Mean reward in batch: {rew.mean():.3f}")
        return (
            as_float(relabeled_obs).to(target),
            as_float(act).to(target),
            as_float(rew).unsqueeze(1).to(target),
            as_float(goal).to(target),
        )

    def __len__(self):
        """Return the number of tuples in the buffer (unflushed steps excluded)."""
        return len(self.buffer)

class WorldModel(nn.Module):
    """Encoder over observation+goal, GRU-cell dynamics and a linear reward head."""

    def __init__(self, input_dim, act_dim):
        """Build the sub-modules.

        Args:
            input_dim: Size of the vector fed to the encoder.
            act_dim: Size of the action vector concatenated to the embedding.
        """
        super().__init__()
        width = config["hidden_dim"]
        # Creation order (encoder, dynamics, reward head) is kept as-is so that
        # seeded parameter initialisation is unchanged.
        self.encoder = nn.Sequential(nn.Linear(input_dim, width), nn.ReLU())
        self.dynamics = nn.GRUCell(width + act_dim, width)
        self.reward_head = nn.Linear(width, 1)

    def forward(self, obs_goal, action, hidden):
        """Encode ``obs_goal`` and advance the recurrent state by one step.

        Returns:
            ``(next_hidden, predicted_reward)``.
        """
        return self.forward_through_dynamics(self.encoder(obs_goal), action, hidden)

    def forward_through_dynamics(self, emb, action, hidden):
        """Advance the GRU from an already-computed embedding.

        Returns:
            ``(next_hidden, predicted_reward)`` where the reward is read off the
            new hidden state.
        """
        gru_input = torch.cat([emb, action], dim=-1)
        next_hidden = self.dynamics(gru_input, hidden)
        return next_hidden, self.reward_head(next_hidden)

class Actor(nn.Module):
    """Deterministic policy head mapping a feature vector to a bounded action.

    The output is ``config["action_scale"] * tanh(MLP(feat))``.
    """

    def __init__(self, feat_dim, act_dim):
        """Build the two-layer MLP.

        Args:
            feat_dim: Size of the input feature vector.
            act_dim: Number of action components produced.
        """
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(feat_dim, config["hidden_dim"]),
            nn.ReLU(),
            nn.Linear(config["hidden_dim"], act_dim),
            nn.Tanh()
        )
        # Registered as a non-persistent buffer: Module.to()/cuda() now moves the
        # scale together with the weights (no per-call device transfer in
        # forward), while state_dict() keeps exactly the same keys as before.
        self.register_buffer(
            "action_scale", torch.tensor(config["action_scale"]), persistent=False
        )

    def forward(self, feat):
        """Return the scaled action for ``feat``."""
        return self.action_scale * self.net(feat)

class Critic(nn.Module):
    """Two-layer MLP producing one scalar value per feature vector."""

    def __init__(self, feat_dim):
        """Build the value network for inputs of size ``feat_dim``."""
        super().__init__()
        width = config["hidden_dim"]
        layers = [nn.Linear(feat_dim, width), nn.ReLU(), nn.Linear(width, 1)]
        self.net = nn.Sequential(*layers)

    def forward(self, feat):
        """Return the value estimate; the last dimension has size 1."""
        value = self.net(feat)
        return value

def imagine_rollout(actor, wm, start_hidden, goal_tensor, horizon, noise_std=0.3):
    """Roll the world model forward under a noisy, goal-conditioned actor.

    Args:
        actor: Callable mapping ``cat([hidden, goal])`` to an action.
        wm: Object exposing ``forward_through_dynamics(emb, action, hidden)``.
        start_hidden: Hidden state the rollout starts from.
        goal_tensor: Goal concatenated to the hidden state at every step.
        horizon: Number of imagined steps.
        noise_std: Standard deviation of the Gaussian noise added to each
            action. Defaults to 0.3, the value that used to be hard-coded;
            0 disables the noise.

    Returns:
        A list with ``horizon`` entries of ``(actor_input, predicted_reward)``.
        ``actor_input`` is the hidden+goal feature *before* the step, and the
        reward is predicted from the hidden state *after* it.

    Raises:
        ValueError: If ``noise_std`` is negative.
    """
    if noise_std < 0:
        raise ValueError(f"noise_std must be non-negative, got {noise_std}")

    imag_feats = []
    hidden = start_hidden
    for _ in range(horizon):
        actor_input = torch.cat([hidden, goal_tensor], dim=-1)
        action = actor(actor_input)
        if noise_std > 0:
            # Out-of-place add: the previous `action += ...` mutated whatever
            # tensor the actor returned, which corrupts the caller's data if
            # that tensor is shared or cached.
            action = action + noise_std * torch.randn_like(action)
        # No observation exists in imagination, so an all-zero embedding shaped
        # like the hidden state is fed in its place.
        emb_dummy = torch.zeros_like(hidden)
        hidden, reward = wm.forward_through_dynamics(emb_dummy, action, hidden)
        imag_feats.append((actor_input, reward))
    return imag_feats

def make_env():
    """Create the environment named by ``config["env_name"]`` and seed it.

    Returns:
        The environment after a single ``reset(seed=42)`` call.
    """
    environment = gym.make(config["env_name"])
    # Only the seeding matters here; the observation returned by this reset is
    # discarded.
    environment.reset(seed=42)
    return environment

def train():
    """Run the full training loop: prefill, interaction and model updates.

    Steps performed:
      1. Build the environment, TensorBoard writer, replay buffer, world
         model, actor and critic from ``config``.
      2. Prefill the buffer with ``config["prefill_steps"]`` random-action
         transitions.
      3. For ``config["steps"]`` environment steps: act with the noisy actor,
         store the shaped-reward transition, then update the world model
         (reward regression), the critic (regression onto imagined discounted
         returns) and the actor (maximise the critic's value of the imagined
         states).

    Rewards are the negative achieved-to-desired goal distance, plus 1.0 when
    the distance is below 0.05. The main loop additionally adds an exponential
    shaping bonus otherwise; the prefill loop does not.

    Relies on ``config``, ``HERReplayBuffer``, ``SummaryWriter``, ``datetime``
    and the model classes being defined elsewhere in the file.
    """
    env = make_env()
    writer = SummaryWriter(log_dir=os.path.join(config["log_dir"], datetime.now().strftime("%Y%m%d-%H%M%S")))

    obs_dim = env.observation_space["observation"].shape[0]
    goal_dim = env.observation_space["desired_goal"].shape[0]
    act_dim = env.action_space.shape[0]
    input_dim = obs_dim + goal_dim

    buffer = HERReplayBuffer(config["buffer_size"])
    wm = WorldModel(input_dim, act_dim).to(config["device"])
    actor = Actor(config["hidden_dim"] + goal_dim, act_dim).to(config["device"])
    critic = Critic(config["hidden_dim"] + goal_dim).to(config["device"])

    optim_wm = torch.optim.Adam(wm.parameters(), lr=config["learning_rate"])
    optim_actor = torch.optim.Adam(actor.parameters(), lr=1e-3)
    optim_critic = torch.optim.Adam(critic.parameters(), lr=3e-3)

    obs_dict, _ = env.reset()
    obs = obs_dict["observation"]
    dg = obs_dict["desired_goal"]
    episode_reward, episode_success = 0, 0
    success_history = deque(maxlen=50)

    # Prefill the buffer with random-action transitions.
    for step in range(config["prefill_steps"]):
        action = env.action_space.sample()
        next_obs_dict, _, done, truncated, _ = env.step(action)
        next_obs = next_obs_dict["observation"]
        ag = next_obs_dict["achieved_goal"]

        dist_to_goal = np.linalg.norm(ag - dg)
        reward = -dist_to_goal
        print(f"[Step {step}] Distance to goal: {dist_to_goal:.4f}")

        if reward > -0.05:
            reward += 1.0
        buffer.push(obs, action, reward, next_obs, done or truncated, ag, dg)

        if done or truncated:
            # Take BOTH the observation and the goal from the reset so the new
            # episode never starts with the previous episode's goal.
            obs_dict, _ = env.reset()
            obs = obs_dict["observation"]
            dg = obs_dict["desired_goal"]
        else:
            obs = next_obs
            dg = next_obs_dict["desired_goal"]

    obs_dict, _ = env.reset()
    obs = obs_dict["observation"]
    dg = obs_dict["desired_goal"]

    for step in range(config["steps"]):
        relabeled_obs = np.concatenate([obs, dg])
        obs_tensor = torch.tensor(relabeled_obs, dtype=torch.float32).unsqueeze(0).to(config["device"])
        goal_tensor = torch.tensor(dg, dtype=torch.float32).unsqueeze(0).to(config["device"])
        with torch.no_grad():
            emb = wm.encoder(obs_tensor)
            # The recurrent state is rebuilt from zeros every step: a zero
            # previous action and a zero previous hidden state.
            hidden = wm.dynamics(
                torch.cat([emb, torch.zeros((1, act_dim)).to(config["device"])], dim=-1),
                torch.zeros((1, config["hidden_dim"])).to(config["device"])
            )
            actor_input = torch.cat([hidden, goal_tensor], dim=-1)
            action = actor(actor_input).squeeze(0).cpu().numpy()
            # Add Gaussian noise for exploration
            noise = np.random.normal(0, 0.1, size=action.shape)
            action = np.clip(action + noise, -1.0, 1.0)

        next_obs_dict, _, done, truncated, _ = env.step(action)
        next_obs = next_obs_dict["observation"]
        ag = next_obs_dict["achieved_goal"]

        # Distance to goal logging
        dist_to_goal = np.linalg.norm(ag - dg)
        reward = -dist_to_goal
        if reward > -0.05:
            reward += 1.0
        else:
            reward += 0.2 * np.exp(-10 * abs(reward))  # shaped reward bonus

        # Only the +1.0 bonus branch can push the reward above 0.9.
        success = reward > 0.9
        buffer.push(obs, action, reward, next_obs, done or truncated, ag, dg)
        episode_reward += reward
        episode_success += float(success)

        if done or truncated:
            # Reset and read the freshly sampled goal together with the
            # observation; reusing the old goal would mis-score the first
            # step of the next episode.
            obs_dict, _ = env.reset()
            obs = obs_dict["observation"]
            dg = obs_dict["desired_goal"]

            success_history.append(episode_success)
            writer.add_scalar("Episode/Reward", episode_reward, step)
            writer.add_scalar("Episode/Success", episode_success, step)
            print(f"[Episode Done @ Step {step}] Reward: {episode_reward:.3f}, Success: {episode_success:.1f}, MA(50): {np.mean(success_history):.2f}")
            episode_reward, episode_success = 0, 0
        else:
            obs = next_obs
            dg = next_obs_dict["desired_goal"]

        if len(buffer) < config["batch_size"]:
            continue

        # --- World Model Update (reward regression) ---
        obs_b, act_b, rew_b, goal_b = buffer.sample(config["batch_size"])
        hidden_b = torch.zeros(config["batch_size"], config["hidden_dim"]).to(config["device"])
        next_hidden, pred_reward = wm(obs_b, act_b, hidden_b)
        loss_wm = F.mse_loss(pred_reward, rew_b)
        optim_wm.zero_grad()
        loss_wm.backward()
        optim_wm.step()

        # --- Imagination ---
        rollout = imagine_rollout(actor, wm, next_hidden.detach(), goal_b, config["imag_horizon"])
        feats, rewards = zip(*rollout)
        feats = torch.stack(feats)
        rewards = torch.stack(rewards)

        # Discounted reward-to-go over the imagined horizon (no bootstrap
        # value after the last step). Used purely as a regression target, so
        # it is computed without tracking gradients.
        with torch.no_grad():
            returns = torch.zeros_like(rewards)
            running = torch.zeros_like(rewards[0])
            for t in reversed(range(len(rewards))):
                running = rewards[t] + config["gamma"] * running
                returns[t] = running

        feats_flat = feats.view(-1, feats.shape[-1])
        returns_flat = returns.view(-1)

        # --- Critic Update ---
        # Detach the features: the critic loss must only train the critic,
        # and the imagination graph has to stay intact for the actor update.
        val_preds = critic(feats_flat.detach()).squeeze(-1)
        loss_critic = F.mse_loss(val_preds, returns_flat)
        optim_critic.zero_grad()
        loss_critic.backward()
        optim_critic.step()

        # --- Actor Update ---
        # Features are NOT detached here, so the gradient flows back through
        # the world-model dynamics into the actor. Gradients that land on the
        # critic and world model as a side effect are discarded by their own
        # ``zero_grad()`` calls before their next backward pass.
        val_preds_actor = critic(feats_flat).squeeze(-1)
        loss_actor = -val_preds_actor.mean()
        optim_actor.zero_grad()
        loss_actor.backward()
        optim_actor.step()

        if step % config["log_every"] == 0:
            print(f"[Step {step}] WM: {loss_wm.item():.4f}, Critic: {loss_critic.item():.4f}, Actor: {loss_actor.item():.4f}")
            print(f"Returns mean: {returns.mean().item():.4f}, ValPreds mean: {val_preds.mean().item():.4f}")
            writer.add_scalar("Loss/WorldModel", loss_wm.item(), step)
            writer.add_scalar("Loss/Critic", loss_critic.item(), step)
            writer.add_scalar("Loss/Actor", loss_actor.item(), step)
            writer.add_scalar("Debug/GoalDistance", dist_to_goal, step)

    env.close()
    writer.close()


# Script entry point: start training only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    train()


[Step 0] Distance to goal: 0.3889
[Step 1] Distance to goal: 0.4234
[Step 2] Distance to goal: 0.4516
[Step 3] Distance to goal: 0.4573
[Step 4] Distance to goal: 0.4541
[Step 5] Distance to goal: 0.4800
[Step 6] Distance to goal: 0.5031
[Step 7] Distance to goal: 0.4895
[Step 8] Distance to goal: 0.4827
[Step 9] Distance to goal: 0.4750
[Step 10] Distance to goal: 0.4733
[Step 11] Distance to goal: 0.4862
[Step 12] Distance to goal: 0.4965
[Step 13] Distance to goal: 0.5062
[Step 14] Distance to goal: 0.5110
[Step 15] Distance to goal: 0.5301
[Step 16] Distance to goal: 0.5558
[Step 17] Distance to goal: 0.5611
[Step 18] Distance to goal: 0.5445
[Step 19] Distance to goal: 0.5750
[Step 20] Distance to goal: 0.5823
[Step 21] Distance to goal: 0.5858
[Step 22] Distance to goal: 0.5887
[Step 23] Distance to goal: 0.5938
[Step 24] Distance to goal: 0.6322
[Step 25] Distance to goal: 0.6091
[Step 26] Distance to goal: 0.6060
[Step 27] Distance to goal: 0.6343
[Step 28] Distance to goal: 0.