In [None]:
import gymnasium as gym
import numpy as np
import torch
from collections import deque
import torch.nn as nn
from time import sleep, time
import os
import random
import torchrl
import gymnasium as gym
from gymnasium import Wrapper
from mlagents_envs.registry import default_registry
from gymnasium import envs


# Pick the compute device once; every model and batch below is moved to DEVICE.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if DEVICE.type == "cuda":
    # Let cuDNN benchmark and cache kernels (only touched when CUDA is available).
    torch.backends.cudnn.benchmark = True

# List every environment id registered with gymnasium.
print(envs.registry.keys())

dict_keys(['CartPole-v0', 'CartPole-v1', 'MountainCar-v0', 'MountainCarContinuous-v0', 'Pendulum-v1', 'Acrobot-v1', 'phys2d/CartPole-v0', 'phys2d/CartPole-v1', 'phys2d/Pendulum-v0', 'LunarLander-v3', 'LunarLanderContinuous-v3', 'BipedalWalker-v3', 'BipedalWalkerHardcore-v3', 'CarRacing-v3', 'Blackjack-v1', 'FrozenLake-v1', 'FrozenLake8x8-v1', 'CliffWalking-v1', 'CliffWalkingSlippery-v1', 'Taxi-v3', 'tabular/Blackjack-v0', 'tabular/CliffWalking-v0', 'Reacher-v2', 'Reacher-v4', 'Reacher-v5', 'Pusher-v2', 'Pusher-v4', 'Pusher-v5', 'InvertedPendulum-v2', 'InvertedPendulum-v4', 'InvertedPendulum-v5', 'InvertedDoublePendulum-v2', 'InvertedDoublePendulum-v4', 'InvertedDoublePendulum-v5', 'HalfCheetah-v2', 'HalfCheetah-v3', 'HalfCheetah-v4', 'HalfCheetah-v5', 'Hopper-v2', 'Hopper-v3', 'Hopper-v4', 'Hopper-v5', 'Swimmer-v2', 'Swimmer-v3', 'Swimmer-v4', 'Swimmer-v5', 'Walker2d-v2', 'Walker2d-v3', 'Walker2d-v4', 'Walker2d-v5', 'Ant-v2', 'Ant-v3', 'Ant-v4', 'Ant-v5', 'Humanoid-v2', 'Humanoid-v3', 

In [None]:
# --- Optimisation -----------------------------------------------------------
LEARNING_RATE = 3e-4        # step size shared by the actor, critic and alpha optimizers
DISCOUNT_FACTOR = 0.99      # default gamma handed to the agent
BATCH_SIZE = 256            # transitions sampled from the replay buffer per update
POLICY_DELAY = 3            # the actor is updated on steps where step % POLICY_DELAY == 0

# --- Data collection --------------------------------------------------------
STEPS_PER_EPOCH = 6000      # passed to run() as steps_per_epoch
REPLAY_SIZE = 1000000       # capacity of the replay storage
WARM_UP_STEPS = 5000        # no gradient updates until this many global steps have elapsed

# --- Not referenced by the code visible in this notebook ---------------------
RESOLUTION = (84, 84)
FRAME_REPEAT = 4
SCALE = 100

In [None]:
from pathlib import Path
from mlagents_envs.base_env import ActionTuple

# Checkpoint layout:
#   ./checkpoints/unity/          observation statistics (obs_mean.npy / obs_std.npy)
#   ./checkpoints/unity/best/     weights of the best-scoring epoch
#   ./checkpoints/unity/latest/   most recent periodic snapshot
model_savefile = Path("./checkpoints/unity")
model_savefile_latest = model_savefile / "latest"

# Create the whole tree up front so later save calls never hit a missing folder.
model_savefile.mkdir(parents=True, exist_ok=True)
(model_savefile / "best").mkdir(exist_ok=True)
model_savefile_latest.mkdir(exist_ok=True)
def compute_obs_stats(env, behavior_name, n_samples=5000):
    """Estimate per-feature observation mean/std by driving ``env`` with random actions.

    The environment is reset, then stepped ``n_samples`` times. On every step
    each agent that requests a decision contributes one flattened observation
    and receives a uniform random continuous action in ``[-1, 1]``.

    Args:
        env: Environment exposing ``reset``, ``get_steps``, ``behavior_specs``,
            ``set_action_for_agent`` and ``step``.
        behavior_name: Behavior whose agents are sampled.
        n_samples: Number of environment steps to run (not the number of
            observations; each step yields one observation per deciding agent).

    Returns:
        ``(obs_mean, obs_std)`` arrays over the flattened observation vector;
        ``obs_std`` has ``1e-6`` added so it is safe to divide by.

    Raises:
        ValueError: If no agent requested a decision during sampling. Without
            this check the statistics would silently come out as NaN.
    """
    collected = []
    env.reset()
    # Loop-invariant: the action size does not change between steps.
    action_dim = env.behavior_specs[behavior_name].action_spec.continuous_size

    for _ in range(n_samples):
        decision_steps, _ = env.get_steps(behavior_name)
        for agent_id in decision_steps.agent_id:
            obs = decision_steps[agent_id].obs
            collected.append(np.concatenate([o.flatten() for o in obs], axis=-1))

            # One random action per agent, shaped (1, action_dim).
            random_action = np.random.uniform(-1, 1, size=(1, action_dim))
            env.set_action_for_agent(
                behavior_name, agent_id, ActionTuple(continuous=random_action)
            )
        env.step()

    if not collected:
        raise ValueError(
            f"No observations collected for behavior {behavior_name!r} "
            f"in {n_samples} steps; cannot compute normalisation statistics."
        )

    stacked = np.array(collected)
    obs_mean = stacked.mean(axis=0)
    obs_std = stacked.std(axis=0) + 1e-6  # epsilon keeps constant features divisible
    return obs_mean, obs_std




def run(game, agent, memory, num_epochs, steps_per_epoch=2000, behavior_name=None, load_model = False):
    """Train ``agent`` in a Unity ML-Agents environment, checkpointing as it goes.

    Each epoch resets the environment and runs ``steps_per_epoch`` environment
    steps. On every step each deciding agent gets an action from
    ``agent.get_action``, the resulting transition is written to ``memory``,
    and once the warm-up is over ``agent.update`` is called once.

    Args:
        game: Unity environment. It is always closed before this function
            returns or raises.
        agent: Object providing ``get_action(obs)``, ``update(step)`` and
            ``save_model(path)``.
        memory: Replay buffer providing ``add(dict)`` and ``len()``.
        num_epochs: Number of epochs to run.
        steps_per_epoch: Environment steps per epoch.
        behavior_name: Behavior to train; defaults to the first one in
            ``game.behavior_specs``.
        load_model: If True, observation statistics are loaded from
            ``model_savefile``; otherwise they are estimated with
            ``compute_obs_stats`` and saved there.

    Returns:
        ``(agent, game)``; ``game`` has been closed.
    """
    # Imported locally under a private alias so timing keeps working even if a
    # notebook cell rebinds the global name `time` (e.g. via `import time`),
    # which would otherwise make `time()` raise TypeError.
    from time import time as _now

    start_time = _now()

    try:
        # If behavior_name is not provided, pick the first one.
        if behavior_name is None:
            behavior_name = list(game.behavior_specs.keys())[0]
        best_mean_score = -np.inf

        if load_model:
            obs_mean = np.load(model_savefile / "obs_mean.npy")
            obs_std = np.load(model_savefile / "obs_std.npy")
            print("obs_mean and obs_std loaded")
        else:
            obs_mean, obs_std = compute_obs_stats(game, behavior_name)
            np.save(model_savefile / "obs_mean.npy", obs_mean)
            np.save(model_savefile / "obs_std.npy", obs_std)
        print(f"Obs mean and obs std computed as {obs_mean} and {obs_std}")
        print(f"Training behavior: {behavior_name}")

        def _normalise(obs_list):
            """Flatten and concatenate an agent's observations, then standardise."""
            flat = np.concatenate([o.flatten() for o in obs_list], axis=-1)
            return (flat - obs_mean) / obs_std

        full_global_step = 0
        for epoch in range(num_epochs):
            global_step = 0
            episode_rewards = {}
            game.reset()
            train_scores = []

            # Observation/action each agent used on its latest decision.
            last_obs = {}
            last_action = {}
            while global_step < steps_per_epoch:
                decision_steps, _ = game.get_steps(behavior_name)

                for agent_id in decision_steps.agent_id:
                    obs_norm = _normalise(decision_steps[agent_id].obs)

                    action = agent.get_action(obs_norm)
                    action = np.expand_dims(action, axis=0)  # shape (1, act_dim)
                    game.set_action_for_agent(
                        behavior_name, agent_id, ActionTuple(continuous=action)
                    )

                    last_obs[agent_id] = obs_norm
                    last_action[agent_id] = action

                game.step()
                next_decisions, next_terminals = game.get_steps(behavior_name)

                for agent_id in decision_steps.agent_id:
                    # Check terminals FIRST: an agent whose episode just ended
                    # can be reported in both sets (terminal step plus the
                    # first decision of its next episode). Reading the decision
                    # entry would store a non-terminal transition into the new
                    # episode's first observation and lose the final reward.
                    if agent_id in next_terminals.agent_id:
                        step_result = next_terminals[agent_id]
                        done = True
                    elif agent_id in next_decisions.agent_id:
                        step_result = next_decisions[agent_id]
                        done = False
                    else:
                        # No result for this agent yet; keep its pending obs/action.
                        continue

                    reward = step_result.reward
                    n_obs_norm = _normalise(step_result.obs)

                    memory.add({
                        "obs": torch.tensor(last_obs[agent_id], dtype=torch.float32).unsqueeze(0),
                        "action": torch.tensor(last_action[agent_id], dtype=torch.float32),  # already (1, act_dim)
                        "reward": torch.tensor(reward, dtype=torch.float32).unsqueeze(0),
                        "next_obs": torch.tensor(n_obs_norm, dtype=torch.float32).unsqueeze(0),
                        "done": torch.tensor(done, dtype=torch.bool).unsqueeze(0)
                    })

                    # Episode tracking
                    episode_rewards.setdefault(agent_id, 0)
                    episode_rewards[agent_id] += reward
                    if done:
                        train_scores.append(episode_rewards[agent_id])
                        episode_rewards[agent_id] = 0
                        last_obs.pop(agent_id, None)
                        last_action.pop(agent_id, None)

                # Gradient update: one per environment step once warm-up is over.
                if len(memory) > BATCH_SIZE and full_global_step > WARM_UP_STEPS:
                    metrics = agent.update(full_global_step)
                    if full_global_step % 300 == 0:
                        print(f"Step {full_global_step} | Alpha: {metrics['alpha']:.3f} | "
                              f"Actor Loss: {metrics['actor_loss']:.3f} | "
                              f"Critic Loss: {metrics['critic_loss']:.3f}")
                global_step += 1
                full_global_step += 1

            # Save the best model separately.
            if len(train_scores) > 0 and np.mean(train_scores) > best_mean_score:
                agent.save_model(model_savefile / "best")
                best_mean_score = np.mean(train_scores)
                print(f"New best model saved at epoch {epoch} (mean={best_mean_score:.2f})")

            # Snapshot the latest model every second epoch.
            if epoch % 2 == 0:
                agent.save_model(model_savefile / "latest")

            # Logging
            if len(train_scores) > 0:
                print(
                    f"[{behavior_name}] Epoch {epoch} | "
                    f"Mean: {np.mean(train_scores):.1f} +/- {np.std(train_scores):.1f} | "
                    f"Min: {np.min(train_scores):.1f} | Max: {np.max(train_scores):.1f}"
                )

            # Measured from the start of run(), i.e. cumulative across epochs.
            elapsed = (_now() - start_time) / 60.0
            print(f"Total elapsed time for epoch {epoch}: {elapsed:.2f} minutes\n")
    finally:
        # Always release the Unity process, including on errors and on
        # KeyboardInterrupt; otherwise it lingers and keeps its worker_id busy.
        game.close()

    return agent, game



In [None]:
from torchrl.data import PrioritizedReplayBuffer, LazyTensorStorage, ReplayBuffer
import torch.nn.functional as F   
# Prioritized-replay hyper-parameters. The buffer built below is a plain
# ReplayBuffer given only a storage, so these two values are not passed to it.
alpha = 0.7  # prioritization exponent
beta = 0.5   # importance sampling exponent

# Lazily allocated tensor storage holding up to REPLAY_SIZE transitions.
storage = LazyTensorStorage(REPLAY_SIZE)
memoryBuffer = ReplayBuffer(storage=storage)

In [None]:
class Actor(nn.Module):
    """Gaussian policy network producing tanh-squashed actions.

    A four-layer MLP trunk feeds two linear heads: one for the mean and one
    for the log standard deviation of a diagonal Normal over pre-squash actions.
    """

    def __init__(self, in_channels, action_dim):
        """Build the trunk (in_channels -> 512 -> 512 -> 256 -> 256) and both heads."""
        super().__init__()
        trunk_layers = [
            nn.Linear(in_channels, 512), nn.ReLU(),
            nn.Linear(512, 512), nn.ReLU(),
            nn.Linear(512, 256), nn.ReLU(),
            nn.Linear(256, 256),
        ]
        self.linear = nn.Sequential(*trunk_layers)
        self.mean_layer = nn.Linear(256, action_dim)
        self.log_std_layer = nn.Linear(256, action_dim)

    def forward(self, x):
        """Return ``(mean, log_std)`` of the pre-squash Gaussian for input ``x``."""
        hidden = self.linear(x)
        mean = self.mean_layer(hidden)
        # Bound log_std to [-20, 2] for numerical stability.
        log_std = self.log_std_layer(hidden).clamp(min=-20, max=2)
        return mean, log_std

    def sample(self, state):
        """Draw a reparameterised action for ``state``.

        Returns:
            ``(action, log_prob, z)``: ``z`` is the Gaussian sample,
            ``action = tanh(z)``, and ``log_prob`` is the log-density of
            ``action`` summed over the last dimension (kept as size 1).
        """
        mean, log_std = self.forward(state)
        gaussian = torch.distributions.Normal(mean, log_std.exp())
        # rsample keeps the draw differentiable w.r.t. mean and std.
        pre_tanh = gaussian.rsample()
        squashed = torch.tanh(pre_tanh)
        # Change-of-variables term for the tanh squashing; 1e-6 avoids log(0).
        squash_correction = torch.log(1 - squashed.pow(2) + 1e-6)
        per_dim_log_prob = gaussian.log_prob(pre_tanh) - squash_correction
        log_prob = per_dim_log_prob.sum(dim=-1, keepdim=True)
        return squashed, log_prob, pre_tanh





In [None]:
class CrossQCritic(nn.Module):
    """Twin Q-network with a shared frozen random projection and BatchNorm heads.

    The concatenated (state, action) vector goes through a fixed,
    orthogonally initialised linear layer with ReLU; the resulting 512-d
    features feed two independent heads that each output one Q-value.
    """

    def __init__(self, state_dim, action_dim):
        """Create the frozen projection and the two trainable Q heads."""
        super().__init__()
        self.random_proj = nn.Linear(state_dim + action_dim, 512)
        nn.init.orthogonal_(self.random_proj.weight)
        # The projection (weight and bias) is never trained.
        for frozen in self.random_proj.parameters():
            frozen.requires_grad = False

        self.head1 = self._make_head()
        self.head2 = self._make_head()

    @staticmethod
    def _make_head():
        """Build one Q head: 512 -> 2048 -> 2048 -> 1 with BatchNorm after each ReLU."""
        return nn.Sequential(
            nn.Linear(512, 2048), nn.ReLU(),
            nn.BatchNorm1d(num_features=2048, momentum=0.01, track_running_stats=True),
            nn.Linear(2048, 2048), nn.ReLU(),
            nn.BatchNorm1d(num_features=2048, momentum=0.01, track_running_stats=True),
            nn.Linear(2048, 1),
        )

    def forward(self, state, action, head= None):
        """Evaluate Q(state, action).

        Args:
            state: Tensor whose first dimension is the batch; the rest is flattened.
            action: Tensor whose first dimension is the batch; the rest is flattened.
            head: ``1`` or ``2`` to evaluate a single head; any other value
                (default ``None``) evaluates both.

        Returns:
            One head's output when ``head`` is 1 or 2, otherwise the tuple
            ``(q1, q2)``.
        """
        flat_state = state.view(state.size(0), -1)
        flat_action = action.view(action.size(0), -1)
        joint = torch.cat([flat_state, flat_action], dim=1)
        features = torch.relu(self.random_proj(joint))

        if head == 1:
            return self.head1(features)
        if head == 2:
            return self.head2(features)
        return self.head1(features), self.head2(features)
        
    
    


In [None]:
class CrossQ(nn.Module):
    """Soft actor-critic style agent without target networks.

    Current and next state-action pairs are pushed through the BatchNorm
    critic in one joint forward pass, and the bootstrap target is read from
    that same pass. The entropy temperature ``alpha`` is learned through
    ``log_alpha``.

    NOTE(review): the critic forward inside the actor update runs in whatever
    mode the critic is currently in (train mode during training), so
    BatchNorm uses that batch's statistics there — confirm this is intended.
    """

    def __init__(self, action_dim, in_channels, buffer, batch_size = BATCH_SIZE, discount_factor = DISCOUNT_FACTOR):
        """Build networks, optimizers and the learnable temperature.

        Args:
            action_dim: Size of the continuous action vector.
            in_channels: Size of the flattened observation vector.
            buffer: Replay buffer; ``update`` calls ``buffer.sample(batch_size)``.
            batch_size: Transitions per update.
            discount_factor: Discount applied to the bootstrap value.
        """
        super().__init__()
        self.buffer = buffer
        self.Actor = Actor(in_channels, action_dim)
        self.Critic = CrossQCritic(in_channels, action_dim)
        self.batch_size = batch_size
        self.gamma = discount_factor
        self.tau = 0.005   # not used by any method of this class
        self.lr = 3e-4     # not used; the optimizers below read LEARNING_RATE
        self.Actor.to(DEVICE)
        self.Critic.to(DEVICE)
        self.criticOptim = torch.optim.Adam(self.Critic.parameters(), lr = LEARNING_RATE, betas=(0.5, 0.999))
        self.ActorOptim = torch.optim.Adam(self.Actor.parameters(), lr = LEARNING_RATE, betas=(0.5, 0.999))
        self.target_entropy = -action_dim
        # Plain tensor (not an nn.Parameter), so it is not part of state_dict()
        # and is saved/loaded separately. alpha_optimizer holds a reference to
        # this exact tensor object.
        self.log_alpha = torch.zeros(1, requires_grad=True, device=DEVICE)
        self.alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=LEARNING_RATE)

    def get_action(self, state):
        """Sample one action for a single observation.

        Args:
            state: 1-D array-like observation.

        Returns:
            NumPy array with the sampled tanh-squashed action.
        """
        state = torch.as_tensor(state, dtype=torch.float32, device=DEVICE).unsqueeze(0)
        # Acting needs no gradients; skip building an autograd graph every step.
        with torch.no_grad():
            action, _, _ = self.Actor.sample(state)
        return action.cpu().numpy()[0]

    @property
    def alpha(self):
        """Current entropy temperature, ``exp(log_alpha)``."""
        return self.log_alpha.exp()

    def update(self, global_step):
        """Run one gradient step on alpha and the critic, and on the actor every
        ``POLICY_DELAY``-th step.

        Args:
            global_step: Global step counter; the actor is updated when
                ``global_step % POLICY_DELAY == 0``.

        Returns:
            Dict with ``alpha``, ``actor_loss`` (0.0 when the actor step was
            skipped) and ``critic_loss`` as Python floats.
        """
        batch = self.buffer.sample(self.batch_size)
        observations = batch["obs"].to(DEVICE)
        actions = batch["action"].to(DEVICE)
        rewards = batch["reward"].to(DEVICE)
        next_observations = batch["next_obs"].to(DEVICE)
        dones = batch["done"].to(DEVICE)

        n = observations.size(0)
        observations = observations.reshape(n, -1)
        next_observations = next_observations.reshape(n, -1)
        actions = actions.reshape(n, -1)
        # Force (B, 1) so the target cannot silently broadcast to (B, B)
        # against the (B, 1) critic outputs.
        rewards = rewards.reshape(n, 1)
        dones = dones.reshape(n, 1)

        # The next action only feeds the bootstrap target, so sample it without
        # grad; otherwise the critic loss would backpropagate into the actor.
        with torch.no_grad():
            next_action, next_log_prob, _ = self.Actor.sample(next_observations)

        # Joint forward pass over current and next pairs.
        combined_states = torch.cat([observations, next_observations], dim=0)
        combined_actions = torch.cat([actions, next_action], dim=0)
        q1_combined, q2_combined = self.Critic(combined_states, combined_actions, head = None)

        q1_current, q1_next = q1_combined[:n], q1_combined[n:]
        q2_current, q2_next = q2_combined[:n], q2_combined[n:]

        # Bootstrap target from one randomly chosen head.
        with torch.no_grad():
            random_head = torch.randint(low=0, high=2, size=(1,)).item()
            q_next = q1_next if random_head == 0 else q2_next
            next_v = q_next - self.alpha * next_log_prob
            target_q = rewards + (1 - dones.float()) * self.gamma * next_v

        new_action, log_prob, _ = self.Actor.sample(observations)

        # Temperature loss: log_prob is detached, so only log_alpha gets a gradient.
        alpha_loss = -(self.log_alpha * (log_prob + self.target_entropy).detach()).mean()
        self.alpha_optimizer.zero_grad()
        alpha_loss.backward()
        self.alpha_optimizer.step()

        # Critic loss
        critic_loss = F.mse_loss(q1_current, target_q) + F.mse_loss(q2_current, target_q)
        self.criticOptim.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.Critic.parameters(), max_norm=1.0)
        self.criticOptim.step()

        # Delayed actor update
        actor_loss = None
        if global_step % POLICY_DELAY == 0:
            q1_new, q2_new = self.Critic(observations, new_action, head=None)
            q_new_action = torch.min(q1_new, q2_new)
            actor_loss = (self.alpha * log_prob - q_new_action).mean()
            self.ActorOptim.zero_grad()
            actor_loss.backward()
            torch.nn.utils.clip_grad_norm_(self.Actor.parameters(), max_norm=1.0)
            self.ActorOptim.step()

        return {
            'alpha': self.alpha.item(),
            'actor_loss': actor_loss.item() if actor_loss is not None else 0.0,
            'critic_loss': critic_loss.item(),
        }

    def save_model(self, model_path=None):
        """Write networks, ``log_alpha`` and optimizer states into ``model_path``.

        Args:
            model_path: Target directory (str or Path); defaults to
                ``model_savefile``. Created if it does not exist.
        """
        if model_path is None:
            model_path = model_savefile
        model_path = Path(model_path)
        model_path.mkdir(parents=True, exist_ok=True)
        torch.save(self.Actor.state_dict(), model_path / "Actor.pth")
        torch.save(self.log_alpha, model_path / "log_alpha.pth")
        torch.save(self.Critic.state_dict(), model_path / "Critic.pth")
        torch.save(self.ActorOptim.state_dict(), model_path / "ActorOptim.pth")
        torch.save(self.criticOptim.state_dict(), model_path / "CriticOptim.pth")
        torch.save(self.alpha_optimizer.state_dict(), model_path / "AlphaOptim.pth")

    def load_model(self, model_path=None):
        """Restore networks, ``log_alpha`` and optimizer states from ``model_path``.

        Args:
            model_path: Source directory (str or Path); defaults to
                ``model_savefile_latest``.
        """
        if model_path is None:
            model_path = model_savefile_latest
        model_path = Path(model_path)
        self.Actor.load_state_dict(torch.load(model_path / "Actor.pth", map_location=DEVICE))
        # Copy in place instead of rebinding self.log_alpha: alpha_optimizer
        # references the tensor created in __init__, so rebinding would leave
        # the optimizer updating a tensor nobody reads and freeze alpha.
        loaded_log_alpha = torch.load(model_path / "log_alpha.pth", map_location=DEVICE)
        with torch.no_grad():
            self.log_alpha.copy_(loaded_log_alpha)
        self.Critic.load_state_dict(torch.load(model_path / "Critic.pth", map_location=DEVICE))
        self.ActorOptim.load_state_dict(torch.load(model_path / "ActorOptim.pth", map_location=DEVICE))
        self.criticOptim.load_state_dict(torch.load(model_path / "CriticOptim.pth", map_location=DEVICE))
        self.alpha_optimizer.load_state_dict(torch.load(model_path / "AlphaOptim.pth", map_location=DEVICE))



In [29]:
import gc
# Import only `sleep`. A bare `import time` here would rebind the global name
# `time` from the function (bound earlier by `from time import sleep, time`)
# to the module, and run() would then fail on `time()` with
# "TypeError: 'module' object is not callable".
from time import sleep

# Force garbage collection to clean up any lingering environment references
gc.collect()
sleep(2)  # Give it a moment to clean up

print("Ready to create new environment")

Ready to create new environment


In [None]:
import gymnasium as gym
from gymnasium import Wrapper
from mlagents_envs.registry import default_registry


# Run-mode switches for this cell.
load_model = False      # True: restore the agent from the "latest" checkpoint before training
skip_learning = False   # True: build the environment and agent but do not call run()

# Launch the Unity build from the ML-Agents registry, headless.
env_id = "Crawler"
env = default_registry[env_id].make(worker_id=16, no_graphics=True)  # worker_id must be unique per running environment
env.reset()

# Read action/observation sizes from the first registered behavior.
behavior_name = list(env.behavior_specs.keys())[0]
spec = env.behavior_specs[behavior_name]
action_dim = spec.action_spec.continuous_size
# Total length of the flattened observation vector across all observation specs.
in_channels = sum([np.prod(obs_spec.shape) for obs_spec in spec.observation_specs])

agent = CrossQ(action_dim, in_channels, memoryBuffer)
agent.to(DEVICE)
if load_model:
    agent.load_model()

if not skip_learning:
    agent, game = run(
        env,
        agent,
        memoryBuffer,
        num_epochs=50,
        steps_per_epoch=STEPS_PER_EPOCH,
        behavior_name=behavior_name,
        load_model=load_model,
    )

    print("======================================")
    print("Training finished.")


no boot config - using default values
[UnityMemory] Configuration Parameters - Can be set up in boot.config
    "memorysetup-bucket-allocator-granularity=16"
    "memorysetup-bucket-allocator-bucket-count=8"
    "memorysetup-bucket-allocator-block-size=4194304"
    "memorysetup-bucket-allocator-block-count=1"
    "memorysetup-main-allocator-block-size=16777216"
    "memorysetup-thread-allocator-block-size=16777216"
    "memorysetup-gfx-main-allocator-block-size=16777216"
    "memorysetup-gfx-thread-allocator-block-size=16777216"
    "memorysetup-cache-allocator-block-size=4194304"
    "memorysetup-typetree-allocator-block-size=2097152"
    "memorysetup-profiler-bucket-allocator-granularity=16"
    "memorysetup-profiler-bucket-allocator-bucket-count=8"
    "memorysetup-profiler-bucket-allocator-block-size=4194304"
    "memorysetup-profiler-bucket-allocator-block-count=1"
    "memorysetup-profiler-allocator-block-size=16777216"
    "memorysetup-profiler-editor-allocator-block-size=104857

UnityEnvironmentException: Environment shut down with return code 1.

In [None]:
# Roll out the best saved policy and report per-agent reward totals.
episodes_to_watch = 5
env_id = "Crawler"
env = default_registry[env_id].make(worker_id=7)
try:
    env.reset()
    behavior_name = list(env.behavior_specs)[0]
    spec = env.behavior_specs[behavior_name]
    action_dim = spec.action_spec.continuous_size
    in_channels = sum(np.prod(o.shape) for o in spec.observation_specs)

    # Create agent and load weights
    agent = CrossQ(action_dim, in_channels, memoryBuffer)
    print("Loading saved model weights...")
    agent.Actor.load_state_dict(torch.load(model_savefile / "best" / "Actor.pth", map_location=DEVICE))
    agent.Critic.load_state_dict(torch.load(model_savefile / "best" / "Critic.pth", map_location=DEVICE))
    agent.log_alpha = torch.load(model_savefile / "best" / "log_alpha.pth", map_location=DEVICE)

    # Set to eval mode for inference
    agent.Actor.eval()
    agent.Critic.eval()

    print("Model weights loaded successfully.")

    # Load the observation statistics saved next to the checkpoints
    obs_mean = np.load(model_savefile / "obs_mean.npy")
    obs_std = np.load(model_savefile / "obs_std.npy")

    max_steps = 3000
    for i in range(episodes_to_watch):
        env.reset()
        agent_rewards = {}
        for step in range(max_steps):
            decision_steps, terminal_steps = env.get_steps(behavior_name)
            for agent_id in decision_steps.agent_id:
                obs = decision_steps[agent_id].obs
                obs_flat = np.concatenate([o.flatten() for o in obs])
                obs_norm = (obs_flat - obs_mean) / obs_std

                action = agent.get_action(obs_norm)
                action = np.expand_dims(action, axis=0)
                action = ActionTuple(continuous=action)
                env.set_action_for_agent(behavior_name, agent_id, action)
                agent_rewards.setdefault(agent_id, 0)
                reward = decision_steps[agent_id].reward
                agent_rewards[agent_id] += reward

            for agent_id in terminal_steps:
                agent_rewards.setdefault(agent_id, 0)
                agent_rewards[agent_id] += terminal_steps[agent_id].reward

            env.step()

            # Stop early when no agent reported anything on this step.
            if len(decision_steps) == 0 and len(terminal_steps) == 0:
                break

        rewards = list(agent_rewards.values())
        if rewards:
            print(f"Episode {i} | Mean: {np.mean(rewards):.1f} | "
                  f"Max: {np.max(rewards):.1f} | Min: {np.min(rewards):.1f}")
        else:
            # np.max / np.min raise on an empty list; report instead of crashing.
            print(f"Episode {i} | no rewards recorded")

    print("Watching ended")
finally:
    # Always shut the Unity process down, including on KeyboardInterrupt, so
    # it does not linger and keep its worker_id occupied.
    env.close()

[UnityMemory] Configuration Parameters - Can be set up in boot.config
    "memorysetup-bucket-allocator-granularity=16"
    "memorysetup-bucket-allocator-bucket-count=8"
    "memorysetup-bucket-allocator-block-size=4194304"
    "memorysetup-bucket-allocator-block-count=1"
    "memorysetup-main-allocator-block-size=16777216"
    "memorysetup-thread-allocator-block-size=16777216"
    "memorysetup-gfx-main-allocator-block-size=16777216"
    "memorysetup-gfx-thread-allocator-block-size=16777216"
    "memorysetup-cache-allocator-block-size=4194304"
    "memorysetup-typetree-allocator-block-size=2097152"
    "memorysetup-profiler-bucket-allocator-granularity=16"
    "memorysetup-profiler-bucket-allocator-bucket-count=8"
    "memorysetup-profiler-bucket-allocator-block-size=4194304"
    "memorysetup-profiler-bucket-allocator-block-count=1"
    "memorysetup-profiler-allocator-block-size=16777216"
    "memorysetup-profiler-editor-allocator-block-size=1048576"
    "memorysetup-temp-allocator-siz

KeyboardInterrupt: 