In [1]:
#Imports & hyperparameters


import pickle
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader

# Training device
device     = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Path to expert trajectories (simple_push)
data_path  = "Expert_data/expert_data_rllib_simple_push.pickle"

# BC hyperparameters
batch_size = 64
lr         = 1e-3
epochs     = 50
hidden_dim = 64

print("Device:", device)
print("Data path:", data_path)

Device: cpu
Data path: Expert_data/expert_data_rllib_simple_push.pickle


In [2]:
# Load & inspect expert_data
# Load the pickled expert trajectories
with open(data_path, "rb") as f:
    expert_data = pickle.load(f)

# Sanity check
print("expert_data type:", type(expert_data))
print("Agents available:", list(expert_data.keys()))

# For each agent, print number of transitions and example shapes
for agent, data in expert_data.items():
    n_states  = len(data["states"])
    n_actions = len(data["actions"])
    # Peek at shapes
    state_shape  = np.array(data["states"][0]).shape
    action_shape = np.array(data["actions"][0]).shape
    print(f"  {agent}: {n_states} transitions, state shape={state_shape}, action shape={action_shape}")

expert_data type: <class 'dict'>
Agents available: ['adversary_0', 'agent_0']
  adversary_0: 1200 transitions, state shape=(8,), action shape=()
  agent_0: 1200 transitions, state shape=(19,), action shape=()


In [3]:
# SingleAgentExpertDataset + DataLoaders
class SingleAgentExpertDataset(Dataset):
    def __init__(self, states, actions):
        # Convert lists of numpy arrays into torch tensors
        self.states  = torch.from_numpy(np.array(states)).float()
        self.actions = torch.from_numpy(np.array(actions)).long()

    def __len__(self):
        return len(self.states)

    def __getitem__(self, idx):
        return self.states[idx], self.actions[idx]

# Instantiate datasets and loaders
datasets = {}
loaders  = {}

for agent, data in expert_data.items():
    ds     = SingleAgentExpertDataset(data["states"], data["actions"])
    loader = DataLoader(ds, batch_size=batch_size, shuffle=True)
    datasets[agent] = ds
    loaders[agent]  = loader

    # Print a batch shape for sanity
    s_batch, a_batch = next(iter(loader))
    print(f"{agent} batch shapes → states: {s_batch.shape}, actions: {a_batch.shape}")

adversary_0 batch shapes → states: torch.Size([64, 8]), actions: torch.Size([64])
agent_0 batch shapes → states: torch.Size([64, 19]), actions: torch.Size([64])


In [4]:
# Behavior Cloning policy network definition
import torch.nn as nn

class BCPolicy(nn.Module):
    def __init__(self, obs_dim, act_dim, hidden_dim=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, act_dim)
        )

    def forward(self, x):
        return self.net(x)

# Instantiate one BCPolicy per agent
policies = {}
for agent, ds in datasets.items():
    obs_dim = ds.states.shape[1]
    # assume actions are 0...act_dim-1
    act_dim = int(ds.actions.max().item()) + 1
    policy = BCPolicy(obs_dim, act_dim, hidden_dim).to(device)
    policies[agent] = policy
    print(f"{agent} policy → obs_dim: {obs_dim}, act_dim: {act_dim}")
    print(policy)

adversary_0 policy → obs_dim: 8, act_dim: 5
BCPolicy(
  (net): Sequential(
    (0): Linear(in_features=8, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=5, bias=True)
  )
)
agent_0 policy → obs_dim: 19, act_dim: 5
BCPolicy(
  (net): Sequential(
    (0): Linear(in_features=19, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=5, bias=True)
  )
)


In [5]:
# train each agent’s BCPolicy via supervised learning
import torch.nn as nn

# Set up optimizers and loss
optimizers = {
    agent: torch.optim.Adam(policy.parameters(), lr=lr)
    for agent, policy in policies.items()
}
loss_fn = nn.CrossEntropyLoss()

# Training loop
for agent in policies:
    print(f"\n=== Training BC policy for {agent} ===")
    policy    = policies[agent]
    optimizer = optimizers[agent]
    loader    = loaders[agent]
    
    for epoch in range(1, epochs+1):
        total_loss = 0.0
        for states, actions in loader:
            states, actions = states.to(device), actions.to(device)
            logits = policy(states)
            loss   = loss_fn(logits, actions)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_loss = total_loss / len(loader)
        # Print every 10 epochs (and first epoch)
        if epoch == 1 or epoch % 10 == 0:
            print(f"{agent}  Epoch {epoch:>2}/{epochs}: loss = {avg_loss:.4f}")


=== Training BC policy for adversary_0 ===
adversary_0  Epoch  1/50: loss = 1.5626
adversary_0  Epoch 10/50: loss = 0.9564
adversary_0  Epoch 20/50: loss = 0.7226
adversary_0  Epoch 30/50: loss = 0.6269
adversary_0  Epoch 40/50: loss = 0.5666
adversary_0  Epoch 50/50: loss = 0.5109

=== Training BC policy for agent_0 ===
agent_0  Epoch  1/50: loss = 1.5674
agent_0  Epoch 10/50: loss = 1.0059
agent_0  Epoch 20/50: loss = 0.5708
agent_0  Epoch 30/50: loss = 0.3979
agent_0  Epoch 40/50: loss = 0.3105
agent_0  Epoch 50/50: loss = 0.2616


In [6]:
# save and reload BC policy checkpoints
model_paths = {}

for agent, policy in policies.items():
    path = f"bc_{agent}.pt"
    torch.save(policy.state_dict(), path)
    model_paths[agent] = path
    print(f"Saved {agent} policy to {path}")

# Reload to verify
for agent, path in model_paths.items():
    obs_dim = datasets[agent].states.shape[1]
    act_dim = int(datasets[agent].actions.max().item()) + 1
    # recreate the model
    check_policy = BCPolicy(obs_dim, act_dim, hidden_dim).to(device)
    check_policy.load_state_dict(torch.load(path))
    check_policy.eval()
    print(f"Reloaded {agent} policy from {path} successfully.")

Saved adversary_0 policy to bc_adversary_0.pt
Saved agent_0 policy to bc_agent_0.pt
Reloaded adversary_0 policy from bc_adversary_0.pt successfully.
Reloaded agent_0 policy from bc_agent_0.pt successfully.


In [7]:
# Evaluation stub for BC policies
try:
    from pettingzoo.mpe import simple_push_v3
except ImportError:
    raise ImportError(
        "PettingZoo (and its pygame dependency) is required for evaluation. "
        "On macOS: `brew install sdl2 sdl2_image sdl2_ttf sdl2_mixer` "
        "then `pip install pygame pettingzoo`."
    )

def evaluate_bc_agents(policies, num_episodes=20):
    """
    Evaluate BC policies for each agent in simple_push.
    """
    env = simple_push_v3.parallel_env(continuous_actions=True, max_cycles=25)
    returns = {agent: [] for agent in env.agents}

    for ep in range(num_episodes):
        obs, _ = env.reset()
        done = {a: False for a in env.agents}
        ep_rewards = {a: 0.0 for a in env.agents}

        while not all(done.values()):
            actions = {}
            for agent in env.agents:
                state = torch.from_numpy(obs[agent]).float().to(device)
                logits = policies[agent](state)
                actions[agent] = torch.argmax(logits).item()
            obs, rewards, terminations, truncations, _ = env.step(actions)
            for agent in env.agents:
                ep_rewards[agent] += rewards.get(agent, 0)
            done = {
                a: terminations.get(a, False) or truncations.get(a, False)
                for a in env.agents
            }

        for agent in env.agents:
            returns[agent].append(ep_rewards[agent])

    env.close()

    # Print average returns
    for agent, vals in returns.items():
        avg_ret = sum(vals) / len(vals)
        print(f"{agent} BC avg return over {num_episodes} eps: {avg_ret:.2f}")

# Run evaluation
evaluate_bc_agents(policies, num_episodes=20)

ImportError: PettingZoo (and its pygame dependency) is required for evaluation. On macOS: `brew install sdl2 sdl2_image sdl2_ttf sdl2_mixer` then `pip install pygame pettingzoo`.

In [8]:
# Held-out Expert Accuracy (no env needed)
# Install scikit-learn if missing
import sys
try:
    from sklearn.metrics import accuracy_score
except ImportError:
    !{sys.executable} -m pip install scikit-learn
    from sklearn.metrics import accuracy_score

print("Held-out expert classification accuracy per agent:")

# Use the last 200 samples as a “test” split
n_test = 200

for agent, ds in datasets.items():
    states  = ds.states.cpu().numpy()
    actions = ds.actions.cpu().numpy()

    test_states  = torch.from_numpy(states[-n_test:]).to(device)
    test_actions = actions[-n_test:]

    with torch.no_grad():
        logits = policies[agent](test_states)
        preds  = torch.argmax(logits, dim=-1).cpu().numpy()

    acc = accuracy_score(test_actions, preds)
    print(f"  {agent}: {acc*100:5.2f}%")

Held-out expert classification accuracy per agent:
  adversary_0: 87.00%
  agent_0: 91.50%
