# Continuous Control — DDPG with 20 Parallel Agents

Train a double-jointed arm to track moving target locations using **Deep Deterministic Policy Gradients (DDPG)**. The 20-agent version shares a single replay buffer and network to accelerate learning.

**Solve condition**: The per-episode score (averaged across all 20 agents), averaged over 100 consecutive episodes, is ≥ 30.

### 1. Import Packages and Start the Environment

In [None]:
from unityagents import UnityEnvironment
import numpy as np
import torch
import matplotlib.pyplot as plt
from collections import deque

%matplotlib inline

In [ ]:
# Launch the Unity environment from the local 'Reacher.app' build.
# NOTE(review): the '.app' suffix suggests a macOS build — adjust file_name on
# other platforms.
env = UnityEnvironment(file_name='Reacher.app')

# Get the default brain (the first one the environment lists); its name is the
# key used to index the results of env.reset() / env.step() in later cells.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

### 2. Examine the State and Action Spaces

In [ ]:
# Start a fresh episode (train_mode=True) and keep this brain's info
env_info = env.reset(train_mode=True)[brain_name]

# Collect the sizes the agent will be built from
num_agents = len(env_info.agents)
action_size = brain.vector_action_space_size
states = env_info.vector_observations
state_size = states.shape[1]

# Report what the environment exposes
print('Number of agents:', num_agents)
print('Size of each action:', action_size)
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

### 3. Take Random Actions in the Environment

In [ ]:
# Roll out one episode with random actions, accumulating each agent's reward
env_info = env.reset(train_mode=True)[brain_name]
states = env_info.vector_observations
scores = np.zeros(num_agents)
episode_finished = False
while not episode_finished:
    # Draw standard-normal actions and clamp every component to [-1, 1]
    actions = np.clip(np.random.randn(num_agents, action_size), -1, 1)
    env_info = env.step(actions)[brain_name]
    next_states = env_info.vector_observations
    rewards = env_info.rewards
    dones = env_info.local_done
    scores += rewards
    states = next_states
    # Stop as soon as any agent reports that its episode is done
    episode_finished = bool(np.any(dones))
summary = 'Total score (averaged over agents) this episode: {}'
print(summary.format(np.mean(scores)))

### 4. Train the Agent with DDPG

In [ ]:
# The DDPG agent implementation comes from the local ddpg_agent module.
from ddpg_agent import Agent

# Build the agent from the sizes read from the environment in the earlier
# cells. random_seed is fixed — presumably for reproducible runs; confirm
# against ddpg_agent.
agent = Agent(state_size=state_size, action_size=action_size,
              num_agents=num_agents, random_seed=42)

In [None]:
def ddpg_train(n_episodes=300, max_t=1000, print_every=10):
    """Train the DDPG agent in the multi-agent Reacher environment.

    Relies on the notebook-level globals ``env``, ``brain_name``, ``agent``
    and ``num_agents`` created in the earlier cells.

    The environment counts as solved the first time the mean per-episode
    score over a *full* 100-episode window reaches 30. At that point the
    local actor and critic weights are written to ``checkpoint_actor.pth``
    and ``checkpoint_critic.pth``; training then continues for the remaining
    episodes. If the threshold is never reached, the weights at the end of
    training are written to the same two files instead.

    Args:
        n_episodes: Number of training episodes to run.
        max_t: Maximum number of environment steps per episode.
        print_every: A permanent progress line is printed every
            ``print_every`` episodes; other episodes overwrite one line.

    Returns:
        all_scores: list of average scores (across all agents) per episode
    """
    all_scores = []
    scores_window = deque(maxlen=100)
    solved = False

    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        agent.reset()
        scores = np.zeros(num_agents)

        for t in range(max_t):
            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            agent.step(states, actions, rewards, next_states, dones, t)
            states = next_states
            scores += rewards
            # End the episode as soon as any agent reports it is done.
            if np.any(dones):
                break

        avg_score = np.mean(scores)
        all_scores.append(avg_score)
        scores_window.append(avg_score)
        rolling_avg = np.mean(scores_window)

        # '\r' with end='' rewrites the same console line on every episode.
        print('\rEpisode {}\tAverage: {:.2f}\tScore: {:.2f}'.format(
            i_episode, rolling_avg, avg_score), end='')

        if i_episode % print_every == 0:
            print('\rEpisode {}\tAverage: {:.2f}'.format(
                i_episode, rolling_avg))

        # The solve criterion is defined over 100 consecutive episodes, so the
        # window must be full before the rolling average is meaningful. This
        # also guarantees that `i_episode - 100` below is never negative.
        window_full = len(scores_window) == scores_window.maxlen
        if window_full and rolling_avg >= 30.0 and not solved:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                i_episode - 100, rolling_avg))
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
            solved = True

    # Never solved: still save the final weights so later cells have a
    # checkpoint to load. (When solved, the solve-time checkpoint is kept.)
    if not solved:
        torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
        torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')

    return all_scores

# Run training with the default settings. `scores` (one agent-averaged score
# per episode) is what the plotting cell reads.
scores = ddpg_train()

### 5. Plot Results

In [None]:
# Single figure: raw episode scores, their rolling mean, and the solve line
fig, ax = plt.subplots(figsize=(10, 6))
n_scores = len(scores)
episode_axis = np.arange(1, n_scores + 1)

# Raw per-episode scores, drawn faintly behind the rolling mean
ax.plot(episode_axis, scores, alpha=0.3, color='steelblue', label='Episode Score')

# Trailing mean over (up to) the last 100 episodes; drawn only when at least
# 100 episodes are available
if n_scores >= 100:
    rolling = []
    for end in range(1, n_scores + 1):
        start = max(0, end - 100)
        rolling.append(np.mean(scores[start:end]))
    ax.plot(episode_axis, rolling, color='darkblue', linewidth=2, label='100-Episode Average')

# Horizontal reference line at the solve threshold
ax.axhline(y=30, color='red', linestyle='--', alpha=0.7, label='Solve Threshold (30)')

ax.set(
    xlabel='Episode',
    ylabel='Average Score (20 Agents)',
    title='DDPG Training — Continuous Control (20 Agents)',
)
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
# Write the figure to disk before showing it
plt.savefig('scores_plot.png', dpi=150, bbox_inches='tight')
plt.show()

### 6. Test the Trained Agent

Load saved weights and run 100 greedy episodes (no exploration noise) to verify performance.

In [None]:
# Build a fresh agent and restore only the saved actor weights
from ddpg_agent import Agent as TestAgent

test_agent = TestAgent(state_size=state_size, action_size=action_size,
                       num_agents=num_agents, random_seed=42)
actor_weights = torch.load('checkpoint_actor.pth', weights_only=True)
test_agent.actor_local.load_state_dict(actor_weights)

# Evaluate over a fixed number of episodes with add_noise=False
n_test = 100
test_scores = []

for i in range(1, n_test + 1):
    env_info = env.reset(train_mode=True)[brain_name]
    states = env_info.vector_observations
    episode_scores = np.zeros(num_agents)

    episode_done = False
    while not episode_done:
        actions = test_agent.act(states, add_noise=False)  # no exploration noise
        env_info = env.step(actions)[brain_name]
        states = env_info.vector_observations
        episode_scores += env_info.rewards
        # Finish the episode once any agent reports done
        episode_done = bool(np.any(env_info.local_done))

    avg = np.mean(episode_scores)
    test_scores.append(avg)
    print('\rTest Episode {}/{}\tScore: {:.2f}'.format(i, n_test, avg), end='')

# Summary statistics over the agent-averaged episode scores
print('\n\nGreedy Test Results ({} episodes):'.format(n_test))
for line_format, statistic in (
    ('  Average: {:.2f}', np.mean),
    ('  Std Dev: {:.2f}', np.std),
    ('  Min:     {:.2f}', np.min),
    ('  Max:     {:.2f}', np.max),
):
    print(line_format.format(statistic(test_scores)))

In [None]:
# Close the Unity environment now that training, plotting and testing are done.
env.close()