# 🧠 DDPG Training: 3-Bot Line Formation
We train a DDPG agent to control 3 spinning bots to maintain alignment in a straight line. The custom environment handles physical dynamics and returns rewards based on how collinear the bots remain.

In [1]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt

from stable_baselines3 import DDPG
from stable_baselines3.common.noise import NormalActionNoise
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.env_checker import check_env

from multibot_cluster_env import MultiBotClusterEnv

## ✅ Initialize Environment
We use a task-specific setup: `task='line'` to encourage bots to align.

In [2]:
env_raw = MultiBotClusterEnv(num_bots=3, task='shape')
check_env(env_raw)
env = DummyVecEnv([lambda: env_raw])



## ⚙️ DDPG Agent Setup with Action Noise
To help exploration, we inject Gaussian noise into the actions.

In [3]:
n_actions = env.action_space.shape[-1]
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

model = DDPG(
    policy="MlpPolicy",
    env=env,
    action_noise=action_noise,
    verbose=1,
    tensorboard_log="./ddpg_tensorboard/"
)

Using cpu device


## 🚀 Training the Agent

In [4]:
model.learn(total_timesteps=10000)
model.save("ddpg_line_bot")

Logging to ./ddpg_tensorboard/DDPG_4
---------------------------------
| time/              |          |
|    episodes        | 4        |
|    fps             | 296      |
|    time_elapsed    | 2        |
|    total_timesteps | 800      |
| train/             |          |
|    actor_loss      | -23.7    |
|    critic_loss     | 0.00524  |
|    learning_rate   | 0.001    |
|    n_updates       | 699      |
---------------------------------
---------------------------------
| time/              |          |
|    episodes        | 8        |
|    fps             | 279      |
|    time_elapsed    | 5        |
|    total_timesteps | 1600     |
| train/             |          |
|    actor_loss      | -47.2    |
|    critic_loss     | 0.0481   |
|    learning_rate   | 0.001    |
|    n_updates       | 1499     |
---------------------------------
---------------------------------
| time/              |          |
|    episodes        | 12       |
|    fps             | 275      |
|    time_e

KeyboardInterrupt: 

## 📊 Evaluate Trained Policy

In [None]:
from stable_baselines3 import DDPG

# Load the trained model
model = DDPG.load("ddpg_line_bot")

# Unwrap the VecEnv to access the raw Gymnasium environment
env_eval = env.envs[0]

# Gymnasium reset returns (obs, info)
obs, _ = env_eval.reset()
trajectory = []

for _ in range(2000):
    action, _ = model.predict(obs, deterministic=True)

    # Gymnasium step returns (obs, reward, terminated, truncated, info)
    obs, reward, terminated, truncated, _ = env_eval.step(action)
    
    trajectory.append(env_eval.state.reshape(3, 2))

    if terminated or truncated:
        break

trajectory = np.array(trajectory)

## 📈 Plot Bot Trajectories

In [None]:
fig, ax = plt.subplots(figsize=(6, 6))
for i in range(3):
    ax.plot(trajectory[:, i, 0], trajectory[:, i, 1], label=f"Bot {i+1}")
    ax.plot(trajectory[0, i, 0], trajectory[0, i, 1], 'o', alpha=0.5)
    ax.plot(trajectory[-1, i, 0], trajectory[-1, i, 1], 'x', alpha=0.8)

ax.set_aspect('equal')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title('DDPG-trained line formation')
ax.legend()
ax.grid(True)
plt.show()

## ✅ Next Steps
- Try `task='translate'` or `task='shape'`
- Compare DDPG with SAC or TD3
- Log metrics and evaluate performance curves
- Increase number of bots and experiment with curriculum learning