# Collaboration and Competition

---

This notebook trains a self-play DDPG agent to solve the Unity Tennis environment, where two agents learn to keep a ball in play over a net.

### 1. Start the Environment

In [None]:
from unityagents import UnityEnvironment
import numpy as np
import torch
import time
import matplotlib.pyplot as plt
from collections import deque
from maddpg_agent import Agent

%matplotlib inline

In [None]:
# Launch the Unity Tennis build and connect to it.
# NOTE: 'Tennis.app' is a hard-coded relative path (the .app suffix suggests the
# macOS build); point file_name at the build for your platform if it differs.
env = UnityEnvironment(file_name='Tennis.app')

In [None]:
# Use the first brain exposed by the environment.
# `brain_name` is the key used to index the results of env.reset()/env.step();
# `brain` is read later for the action-space size.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

### 2. Examine the State and Action Spaces

Two agents control rackets to bounce a ball over a net:
- **Reward**: +0.1 for hitting the ball over the net, -0.01 if the ball hits the ground or goes out of bounds
- **Observation**: 24 dimensions per agent (3 stacked frames × 8 variables: ball/racket position and velocity)
- **Action**: 2 continuous values per agent (movement toward/away from net, jumping), each in [-1, 1]
- **Solve condition**: Average of max(agent1_score, agent2_score) >= 0.5 over 100 consecutive episodes

In [None]:
# Begin a fresh episode in training mode and keep the info for our brain.
env_info = env.reset(train_mode=True)[brain_name]

# Collect the dimensions needed to size the agent.
states = env_info.vector_observations          # one row of observations per agent
num_agents = len(env_info.agents)
action_size = brain.vector_action_space_size
state_size = states.shape[1]

# Report what was found.
print(f'Number of agents: {num_agents}')
print(f'Size of each action: {action_size}')
print(f'There are {num_agents} agents. Each observes a state with length: {state_size}')
print(f'The state for the first agent looks like:\n{states[0]}')

### 3. Train the Agent

We use **self-play DDPG**: both agents share one actor, one critic, and one replay buffer. This works because Tennis is symmetric — an optimal policy for one side is optimal for the other.

Key adaptation from the Continuous Control project: **noise decay** (σ × 0.9995 per episode, with floor 0.01). In sparse-reward environments, the agent needs heavy exploration early but must quiet down once it finds a fragile cooperative equilibrium.

In [None]:
# Build the agent from the sizes measured in the previous section.
# random_seed is fixed for repeatable runs — presumably Agent seeds its RNGs
# from it; confirm in maddpg_agent.
agent = Agent(state_size=state_size, action_size=action_size,
              num_agents=num_agents, random_seed=42)

In [None]:
# --- Training configuration -------------------------------------------------
n_episodes = 5000                  # upper bound on training episodes
max_t = 1000                       # upper bound on environment steps per episode
solve_threshold = 0.5              # rolling average that counts as "solved"
all_scores = []                    # one score per episode (max over agents), kept for plotting
scores_window = deque(maxlen=100)  # the most recent 100 episode scores
solved = False
t_start = time.time()


def save_checkpoint():
    """Write agent.actor_local and agent.critic_local weights to the checkpoint files."""
    torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
    torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')


for i_episode in range(1, n_episodes + 1):
    env_info = env.reset(train_mode=True)[brain_name]
    states = env_info.vector_observations
    agent.reset()
    scores = np.zeros(num_agents)  # running return of each agent this episode

    for t in range(max_t):
        actions = agent.act(states)
        env_info = env.step(actions)[brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done
        agent.step(states, actions, rewards, next_states, dones)
        states = next_states
        scores += rewards
        # The episode ends as soon as any agent reports done.
        if np.any(dones):
            break

    # Tennis scoring: max over the two agents
    episode_score = np.max(scores)
    all_scores.append(episode_score)
    scores_window.append(episode_score)
    rolling_avg = np.mean(scores_window)

    # Decay exploration noise
    agent.decay_noise()

    if i_episode % 100 == 0:
        elapsed = time.time() - t_start
        print(f'Episode {i_episode}\tAvg: {rolling_avg:.3f}\tNoise σ: {agent.noise_sigma:.4f}\t({elapsed:.0f}s)')

    # The solve condition is an average over 100 consecutive episodes, so the
    # window must be full before the average is compared to the threshold.
    # Without this guard a single lucky early episode could end training.
    window_full = len(scores_window) == scores_window.maxlen
    if window_full and rolling_avg >= solve_threshold:
        elapsed = time.time() - t_start
        print(f'\n*** Solved at episode {i_episode}!  100-ep avg: {rolling_avg:.3f} ({elapsed:.0f}s) ***')
        save_checkpoint()
        solved = True
        break  # Stop training once solved

if not solved:
    # Keep the final weights anyway so the evaluation cell has something to load.
    print(f'\nDid not solve. Final 100-ep avg: {np.mean(scores_window):.3f}')
    save_checkpoint()

# Persist the per-episode scores for the plotting cell.
np.save('scores.npy', np.array(all_scores))

### 4. Plot Training Scores

In [None]:
# Reload the per-episode scores written by the training cell.
scores = np.load('scores.npy')
n_scores = len(scores)
episodes = np.arange(1, n_scores + 1)  # 1-based episode numbers for the x-axis

fig, ax = plt.subplots(figsize=(10, 6))

# Raw per-episode score, drawn faintly behind the smoothed curve.
ax.plot(episodes, scores, alpha=0.3, color='steelblue', label='Episode Score')

# Trailing mean over (up to) the last 100 episodes; only drawn once there are
# at least 100 episodes of data.
if n_scores >= 100:
    rolling = []
    for end in range(1, n_scores + 1):
        start = max(0, end - 100)
        rolling.append(np.mean(scores[start:end]))
    ax.plot(episodes, rolling, color='darkblue', linewidth=2, label='100-Episode Average')

ax.axhline(y=0.5, color='red', linestyle='--', alpha=0.7, label='Solve Threshold (0.5)')
ax.set(
    xlabel='Episode',
    ylabel='Score (Max over Agents)',
    title='Self-Play DDPG Training — Tennis',
)
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('scores_plot.png', dpi=150, bbox_inches='tight')
plt.show()
print('Saved scores_plot.png')

### 5. Greedy Evaluation

In [None]:
# Restore the actor weights written by the training cell (the solve-time
# checkpoint, or the final weights if training never reached the threshold).
actor_weights = torch.load('checkpoint_actor.pth', weights_only=True)
agent.actor_local.load_state_dict(actor_weights)

test_scores = []
for episode_idx in range(1, 101):
    env_info = env.reset(train_mode=True)[brain_name]
    states = env_info.vector_observations
    ep_scores = np.zeros(num_agents)

    # Roll out one episode with exploration noise disabled.
    episode_over = False
    while not episode_over:
        actions = agent.act(states, add_noise=False)
        env_info = env.step(actions)[brain_name]
        states = env_info.vector_observations
        ep_scores += env_info.rewards
        episode_over = bool(np.any(env_info.local_done))

    # Same scoring rule as training: the better of the two agents.
    ep_best = np.max(ep_scores)
    test_scores.append(ep_best)
    if episode_idx % 20 == 0:
        print(f'Test {episode_idx}/100  Score: {ep_best:.2f}')

print(f'\nGreedy Test Results (100 episodes):')
print(f'  Average: {np.mean(test_scores):.3f}')
print(f'  Std Dev: {np.std(test_scores):.3f}')
print(f'  Min:     {np.min(test_scores):.3f}')
print(f'  Max:     {np.max(test_scores):.3f}')

In [None]:
# Shut down the Unity environment now that training and evaluation are done.
# Cells that use `env` cannot be re-run after this without re-creating it.
env.close()