In [41]:
#Imports & hyperparameters
"""
This cell imports the libraries needed for data loading and model training, 
sets up the compute device, specifies the path to the expert data, 
and defines key hyperparameters for the behavior cloning pipeline
"""

import pickle
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader

# Training device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Path to expert trajectories (simple_push)
data_path = "Expert_data/expert_data_rllib_simple_push.pickle"

# BC hyperparameters
batch_size = 64
lr = 1e-3
epochs = 50
hidden_dim = 64

print("Device:", device)
print("Data path:", data_path)

Device: cpu
Data path: Expert_data/expert_data_rllib_simple_push.pickle


In [42]:
# Load & inspect expert_data
"""
This cell:
1. Opens and loads the pickled expert trajectory data from disk.
2. Prints a sanity check showing the data type and list of agents.
3. Iterates over each agent’s data to display:
   - The total number of (state, action) transitions collected.
   - The shape of one example state and action for verification.
"""

# Load the pickled expert trajectories
with open(data_path, "rb") as f:
    expert_data = pickle.load(f)

# Sanity check
print("expert_data type:", type(expert_data))
print("Agents available:", list(expert_data.keys()))

# For each agent, print number of transitions and example shapes
for agent, data in expert_data.items():
    n_states  = len(data["states"])
    n_actions = len(data["actions"])
    # Peek at shapes
    state_shape  = np.array(data["states"][0]).shape
    action_shape = np.array(data["actions"][0]).shape
    print(f"  {agent}: {n_states} transitions, state shape={state_shape}, action shape={action_shape}")

expert_data type: <class 'dict'>
Agents available: ['adversary_0', 'agent_0']
  adversary_0: 1200 transitions, state shape=(8,), action shape=()
  agent_0: 1200 transitions, state shape=(19,), action shape=()


In [43]:
#Define Dataset & DataLoader for Each Agent

"""
This cell:
1. Defines a PyTorch Dataset (`SingleAgentExpertDataset`) that wraps a single agent’s expert trajectories:
   - Converts lists of NumPy state and action arrays into Torch tensors.
   - Implements `__len__` and `__getitem__` for DataLoader compatibility.
2. Creates one dataset and DataLoader per agent in `expert_data`, using the specified batch size and shuffling.
3. Prints the shape of a sample batch (states and actions) for each agent to verify correct batching.
"""
class SingleAgentExpertDataset(Dataset):
    def __init__(self, states, actions):
        # Convert lists of numpy arrays into torch tensors
        self.states  = torch.from_numpy(np.array(states)).float()
        self.actions = torch.from_numpy(np.array(actions)).long()

    def __len__(self):
        return len(self.states)

    def __getitem__(self, idx):
        return self.states[idx], self.actions[idx]

# Instantiate datasets and loaders
datasets = {}
loaders  = {}

for agent, data in expert_data.items():
    ds     = SingleAgentExpertDataset(data["states"], data["actions"])
    loader = DataLoader(ds, batch_size=batch_size, shuffle=True)
    datasets[agent] = ds
    loaders[agent]  = loader

    # Print a batch shape for sanity
    s_batch, a_batch = next(iter(loader))
    print(f"{agent} batch shapes → states: {s_batch.shape}, actions: {a_batch.shape}")

adversary_0 batch shapes → states: torch.Size([64, 8]), actions: torch.Size([64])
agent_0 batch shapes → states: torch.Size([64, 19]), actions: torch.Size([64])


In [44]:
# Behavior Cloning Policy Network Definition & Instantiation
"""
This cell:
1. Imports PyTorch’s neural network module (`nn`) for layer definitions.
2. Defines the `BCPolicy` class as a multi-layer perceptron with:
   - An input layer matching the observation dimension (`obs_dim`).
   - Two hidden layers of size `hidden_dim` with ReLU activations.
   - An output layer producing raw logits for each discrete action (`act_dim`).
3. Iterates over each agent’s dataset (`datasets`):
   a. Reads the observation dimension from `ds.states.shape[1]`.
   b. Computes the number of actions as `max(action) + 1`.
   c. Instantiates a `BCPolicy` with those dimensions and moves it to the compute device.
   d. Stores each policy in the `policies` dictionary for subsequent training.
4. Prints each agent’s network architecture and parameter dimensions for verification.
"""

import torch.nn as nn

class BCPolicy(nn.Module):
    def __init__(self, obs_dim, act_dim, hidden_dim=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, act_dim)
        )

    def forward(self, x):
        return self.net(x)

# Instantiate one BCPolicy per agent
policies = {}
for agent, ds in datasets.items():
    obs_dim = ds.states.shape[1]
    # assume actions are 0...act_dim-1
    act_dim = int(ds.actions.max().item()) + 1
    policy = BCPolicy(obs_dim, act_dim, hidden_dim).to(device)
    policies[agent] = policy
    print(f"{agent} policy → obs_dim: {obs_dim}, act_dim: {act_dim}")
    print(policy)

adversary_0 policy → obs_dim: 8, act_dim: 5
BCPolicy(
  (net): Sequential(
    (0): Linear(in_features=8, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=5, bias=True)
  )
)
agent_0 policy → obs_dim: 19, act_dim: 5
BCPolicy(
  (net): Sequential(
    (0): Linear(in_features=19, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=5, bias=True)
  )
)


In [45]:
#Supervised Training Loop for Behavior Cloning Policies

"""
This cell:
1. Imports PyTorch’s `nn` module for loss definition.
2. Creates an Adam optimizer for each agent’s policy network with the specified learning rate.
3. Defines the cross-entropy loss function to train the policy to match expert actions.
4. Iterates through each agent:
   a. Prints a header indicating which agent is being trained.
   b. Runs a training loop over the specified number of epochs.
   c. For each batch of expert states and actions:
      - Moves data to the correct device.
      - Computes action logits via the agent’s BCPolicy.
      - Calculates the cross-entropy loss between logits and expert actions.
      - Performs a backward pass and optimizer step.
      - Accumulates batch losses.
   d. Computes and prints the average loss at epoch 1 and every 10 epochs thereafter for monitoring.
"""
import torch.nn as nn

# Set up optimizers and loss
optimizers = {
    agent: torch.optim.Adam(policy.parameters(), lr=lr)
    for agent, policy in policies.items()
}
loss_fn = nn.CrossEntropyLoss()

# Training loop
for agent in policies:
    print(f"\n=== Training BC policy for {agent} ===")
    policy    = policies[agent]
    optimizer = optimizers[agent]
    loader    = loaders[agent]
    
    for epoch in range(1, epochs+1):
        total_loss = 0.0
        for states, actions in loader:
            states, actions = states.to(device), actions.to(device)
            logits = policy(states)
            loss   = loss_fn(logits, actions)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_loss = total_loss / len(loader)
        # Print every 10 epochs (and first epoch)
        if epoch == 1 or epoch % 10 == 0:
            print(f"{agent}  Epoch {epoch:>2}/{epochs}: loss = {avg_loss:.4f}")


=== Training BC policy for adversary_0 ===
adversary_0  Epoch  1/50: loss = 1.5706
adversary_0  Epoch 10/50: loss = 0.9721
adversary_0  Epoch 20/50: loss = 0.7313
adversary_0  Epoch 30/50: loss = 0.6237
adversary_0  Epoch 40/50: loss = 0.5498
adversary_0  Epoch 50/50: loss = 0.5015

=== Training BC policy for agent_0 ===
agent_0  Epoch  1/50: loss = 1.5799
agent_0  Epoch 10/50: loss = 1.0096
agent_0  Epoch 20/50: loss = 0.6132
agent_0  Epoch 30/50: loss = 0.4235
agent_0  Epoch 40/50: loss = 0.3311
agent_0  Epoch 50/50: loss = 0.2738


In [46]:
#Save & Verify Behavior Cloning Policy Checkpoints
"""
This cell:
1. Iterates over each trained policy in `policies`:
   a. Constructs a filename `bc_<agent>.pt`.
   b. Saves the policy’s state dictionary to disk.
   c. Records the file path in `model_paths`.
   d. Prints a confirmation message.
2. Reloads each saved policy to ensure integrity:
   a. Retrieves the observation and action dimensions from the corresponding dataset.
   b. Reinstantiates a fresh `BCPolicy` with the same dimensions.
   c. Loads the saved state dictionary into this new model.
   d. Switches the model to evaluation mode.
   e. Prints a success message confirming that the checkpoint loads correctly.
"""

model_paths = {}

for agent, policy in policies.items():
    path = f"bc_{agent}.pt"
    torch.save(policy.state_dict(), path)
    model_paths[agent] = path
    print(f"Saved {agent} policy to {path}")

# Reload to verify
for agent, path in model_paths.items():
    obs_dim = datasets[agent].states.shape[1]
    act_dim = int(datasets[agent].actions.max().item()) + 1
    # recreate the model
    check_policy = BCPolicy(obs_dim, act_dim, hidden_dim).to(device)
    check_policy.load_state_dict(torch.load(path))
    check_policy.eval()
    print(f"Reloaded {agent} policy from {path} successfully.")

Saved adversary_0 policy to bc_adversary_0.pt
Saved agent_0 policy to bc_agent_0.pt
Reloaded adversary_0 policy from bc_adversary_0.pt successfully.
Reloaded agent_0 policy from bc_agent_0.pt successfully.


In [47]:
"""
Cell 7: Environment‐based Evaluation Stub for BC Policies (Guarded)

This cell attempts to perform true environment rollouts of your BC policies 
in the `simple_push` environment. If the required `pygame` dependency is missing, 
it will skip rather than error out, allowing the rest of the notebook to run smoothly.

Steps:
1. Try importing the MPE `simple_push_v3` environment.
2. If the import fails (pygame not installed), set a flag and print a warning.
3. Only if the environment is available, run `evaluate_bc_agents`:
   a. Create the parallel environment.
   b. Loop over episodes, collecting actions from each BC policy.
   c. Step the environment, accumulate per‐agent returns.
   d. After all episodes, print average returns for each agent.
"""
# Attempt to import the environment
try:
    from pettingzoo.mpe import simple_push_v3
    env_available = True
except ImportError:
    env_available = False
    print("⚠️ Skipping environment‐based evaluation: `pygame` (and SDL2) not installed.")

# Only run rollouts if the env is actually available
if env_available:
    def evaluate_bc_agents(policies, num_episodes=20):
        """
        Roll out BC policies in parallel simple_push env and print avg. returns.
        """
        env = simple_push_v3.parallel_env(continuous_actions=True, max_cycles=25)
        returns = {agent: [] for agent in env.agents}

        for ep in range(num_episodes):
            obs, _ = env.reset()
            done = {a: False for a in env.agents}
            ep_rewards = {a: 0.0 for a in env.agents}

            while not all(done.values()):
                actions = {}
                for agent in env.agents:
                    state = torch.from_numpy(obs[agent]).float().to(device)
                    logits = policies[agent](state)
                    actions[agent] = torch.argmax(logits).item()
                obs, rewards, terminations, truncations, _ = env.step(actions)
                for agent in env.agents:
                    ep_rewards[agent] += rewards.get(agent, 0)
                done = {
                    a: terminations.get(a, False) or truncations.get(a, False)
                    for a in env.agents
                }

            for agent in env.agents:
                returns[agent].append(ep_rewards[agent])

        env.close()

        # Print average returns
        for agent, vals in returns.items():
            avg_ret = sum(vals) / len(vals)
            print(f"{agent} BC avg return over {num_episodes} eps: {avg_ret:.2f}")

    # Execute the evaluation
    evaluate_bc_agents(policies, num_episodes=20)

⚠️ Skipping environment‐based evaluation: `pygame` (and SDL2) not installed.


In [48]:
# Held-out Expert Accuracy (no env needed)
"""This cell:
1. Ensures scikit-learn’s `accuracy_score` is available, installing it via pip if missing.
2. Prints a header for the held-out classification accuracy evaluation.
3. Defines `n_test` as the number of final samples to reserve for testing.
4. Iterates over each agent’s dataset:
   a. Converts the last `n_test` states and actions into tensors and arrays.
   b. Runs a forward pass through the trained BC policy to obtain predicted actions.
   c. Computes classification accuracy against the held-out expert actions.
   d. Prints each agent’s accuracy as a percentage, giving a quick numeric measure of BC performance.
"""
# Install scikit-learn if missing
import sys
try:
    from sklearn.metrics import accuracy_score
except ImportError:
    !{sys.executable} -m pip install scikit-learn
    from sklearn.metrics import accuracy_score

print("Held-out expert classification accuracy per agent:")

# Use the last 200 samples as a “test” split
n_test = 200

for agent, ds in datasets.items():
    states  = ds.states.cpu().numpy()
    actions = ds.actions.cpu().numpy()

    test_states  = torch.from_numpy(states[-n_test:]).to(device)
    test_actions = actions[-n_test:]

    with torch.no_grad():
        logits = policies[agent](test_states)
        preds  = torch.argmax(logits, dim=-1).cpu().numpy()

    acc = accuracy_score(test_actions, preds)
    print(f"  {agent}: {acc*100:5.2f}%")

Held-out expert classification accuracy per agent:
  adversary_0: 86.00%
  agent_0: 93.00%
