# Proximal Policy Optimization (PPO)

In [None]:
import os
import torch
import torch.nn as nn
import math
import numpy as np

from rl_helpers.scalers import *
from torch.distributions import Bernoulli
from delivery_drone.game.socket_client import DroneGameClient, DroneState

In [None]:
# Pick the compute device once for the whole notebook: CUDA when PyTorch can
# see a GPU, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")
# The GPU name is only queried when a CUDA device was actually selected.
if device.type == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# Create the game client on port 5555 and connect it.
# NOTE(review): presumably this talks over a socket to an already-running
# drone game server — confirm the server must be started first.
client = DroneGameClient(port=5555)
client.connect()

In [None]:
from rl_helpers.scalers import *

### State Variables

| Variable | Min | Max | Typical Range | Critical Threshold |
|----------|-----|-----|---------------|-------------------|
| drone_x | 0 | 1 | [0, 1] | - |
| drone_y | 0 | 1 | [0, 1] | - |
| drone_vx | -2.5 | 2.5 | [-1.5, 1.5] | - |
| drone_vy | -2.5 | 3.0 | [-0.5, 2.5] | - |
| drone_angle | -1 | 1 | [-0.5, 0.5] | within ±0.111 for landing |
| drone_angular_vel | -1.5 | 1.5 | [-0.8, 0.8] | - |
| drone_fuel | 0 | 1 | [0, 1] | Episode ends at 0 |
| platform_x | 0 | 1 | [0.1, 0.9] | - |
| platform_y | 0 | 1 | [0.58, 0.92] | Platform spawns at [350, 550]px |
| distance_to_platform | 0 | 1.41 | [0, 1.2] | - |
| dx_to_platform | -1.125 | 1.125 | [-1, 1] | within ±0.0625 for landing |
| dy_to_platform | -1.083 | 1.083 | [-0.5, 0.8] | - |
| speed | 0 | 3.9 | [0, 3.0] | < 0.3 for landing |

In [None]:
def calc_velocity_alignment(state: DroneState):
    """
    Measure how closely the drone's direction of travel points at the platform.

    The result is the cosine of the angle between the velocity vector
    (drone_vx, drone_vy) and the drone-to-platform offset
    (dx_to_platform, dy_to_platform): 1.0 means heading straight at the
    platform, -1.0 means heading directly away from it.

    Special cases:
      * offset shorter than 1e-6 (drone is already at the platform) -> 1.0
      * state.speed below 1e-6 (drone is not moving)                -> 0.0
    """
    to_platform_x = state.dx_to_platform
    to_platform_y = state.dy_to_platform
    offset_length = math.sqrt(to_platform_x**2 + to_platform_y**2)

    # Guard: no meaningful direction to the platform once we are on top of it.
    if offset_length < 1e-6:
        return 1.0

    # Guard: a stationary drone has no heading to compare.
    # state.speed is used as the length of the velocity vector.
    current_speed = state.speed
    if current_speed < 1e-6:
        return 0.0

    # Unit vectors for the ideal direction and the actual heading.
    target_dir = (to_platform_x / offset_length, to_platform_y / offset_length)
    heading = (state.drone_vx / current_speed, state.drone_vy / current_speed)

    # Dot product of two unit vectors = cosine similarity.
    return heading[0] * target_dir[0] + heading[1] * target_dir[1]

In [None]:
# Velocity-magnitude-weighted distance reward
def calc_reward(state: DroneState, prev_state: DroneState = None):
    """
    Compute the shaped per-step reward for one drone state.

    Args:
        state: Current DroneState.
        prev_state: DroneState from the previous step, or None on the first
            step. Only its distance_to_platform is read. When None, the
            'distance' and 'hovering' terms stay at 0.

    Returns:
        dict mapping component name -> value, with keys 'time_penalty',
        'distance', 'hovering', 'angle', 'speed', 'vertical_position',
        'terminal' and 'total'. 'total' is a flat -0.5 step cost plus every
        component except 'time_penalty', which is reported for inspection
        only and is NOT part of the sum.
    """
    rewards = {}

    # Distance-dependent time penalty, scaled between the two bounds below.
    # NOTE: this value is only reported in the returned dict; the total uses
    # the flat step cost further down instead. inverse_quadratic comes from
    # rl_helpers.scalers, so its exact curve is not visible in this file.
    minimum_time_penalty = 0
    maximum_time_penalty = 1
    rewards['time_penalty'] = -inverse_quadratic(
        state.distance_to_platform,
        decay=100,
        scaler=maximum_time_penalty - minimum_time_penalty) - minimum_time_penalty

    # Flat cost charged on every timestep (used in place of 'time_penalty').
    total_reward = -0.5

    dist = state.distance_to_platform

    # Progress terms need a previous state; without one they stay at 0.
    rewards['distance'] = 0
    rewards['hovering'] = 0

    if prev_state is not None:
        # Positive when the drone got closer since the previous step.
        distance_delta = prev_state.distance_to_platform - dist
        speed = state.speed

        # Velocity component along the drone -> platform direction.
        if dist > 1e-6:
            velocity_toward_platform = (
                state.drone_vx * state.dx_to_platform +
                state.drone_vy * state.dy_to_platform
            ) / dist
        else:
            velocity_toward_platform = 0.0

        MIN_MEANINGFUL_SPEED = 0.15  # Require meaningful velocity

        if speed >= MIN_MEANINGFUL_SPEED and velocity_toward_platform > 0.1 and dist > 0.065:
            # Moving at a meaningful speed toward a platform that is not yet
            # reached: scale the progress by speed. The clip still allows a
            # negative value (down to -2) if the distance actually grew.
            speed_multiplier = 1.0 + speed * 2.0
            rewards['distance'] = float(np.clip(distance_delta * 1000 * speed_multiplier, -2, 5))
        elif distance_delta < -0.001:
            # Distance increased: harsh penalty. 'hovering' stays at 0 so the
            # drone is not penalised twice for the same step.
            rewards['distance'] = -2.0 * abs(distance_delta) * 1000
        elif speed < 0.05:
            # Hovering
            rewards['hovering'] = -1.0
        elif speed < MIN_MEANINGFUL_SPEED:
            # Too slow
            rewards['hovering'] = -0.3
        else:
            # Fast enough, but neither approaching nor clearly retreating.
            rewards['distance'] = 0.0

    total_reward += rewards['distance']
    total_reward += rewards['hovering']

    # Angle penalty: the permitted tilt grows linearly with distance, from
    # 0.111 at the platform to max_angle at distance 1. Only the excess is
    # penalised; the best possible value is 0 so stability alone is never
    # rewarded (avoids reward hacking).
    abs_angle = abs(state.drone_angle)
    max_angle = 0.20
    max_permissible_angle = ((max_angle - 0.111) * dist) + 0.111
    excess = abs_angle - max_permissible_angle
    rewards['angle'] = -max(excess, 0)
    total_reward += rewards['angle']

    # Speed penalty: within distance 1 of the platform any speed above 0.1 is
    # penalised at 2 per unit; farther away only speed above max_speed is
    # penalised, at 1 per unit.
    speed = state.speed
    max_speed = 0.6
    if dist < 1:
        rewards['speed'] = -2 * max(speed - 0.1, 0)
    else:
        rewards['speed'] = -1 * max(speed - max_speed, 0)
    total_reward += rewards['speed']

    # Penalise being below the platform. dy_to_platform > 0 means the
    # platform is below the drone (good, no penalty); otherwise
    # dy_to_platform is <= 0 and scaling it by 4 yields the penalty.
    if state.dy_to_platform > 0:
        rewards['vertical_position'] = 0
    else:
        rewards['vertical_position'] = state.dy_to_platform * 4.0
    total_reward += rewards['vertical_position']

    # Terminal outcome: landing bonus grows with remaining fuel; crashing is
    # penalised, with an extra penalty when the crash is far from the target.
    rewards['terminal'] = 0
    if state.landed:
        rewards['terminal'] = 800.0 + state.drone_fuel * 100.0
    elif state.crashed:
        rewards['terminal'] = -200.0
        if state.distance_to_platform > 0.3:
            rewards['terminal'] -= 100.0
    total_reward += rewards['terminal']

    rewards['total'] = total_reward
    return rewards

In [None]:
# Reset the game through the client (presumably starts a fresh episode —
# confirm) and clear the cached state so the next step cell sees no previous
# state.
client.reset()
state=None

In [None]:
# Manual smoke test: remember the last state (None on the first run), send one
# step with every thruster off, then show the new state and its reward terms.
prev_state = state or None
client.step({
    'main_thrust': 0,
    'left_thrust': 0,
    'right_thrust': 0,
})
state = client.get_state()
display(vars(state))
calc_reward(state, prev_state)

In [None]:
def state_to_array(state, device='cpu'):
    """Flatten a drone state into a 1-D float32 torch tensor of 15 features.

    Feature order: drone_x, drone_y, drone_vx, drone_vy, drone_angle,
    drone_angular_vel, drone_fuel, platform_x, platform_y,
    distance_to_platform, dx_to_platform, dy_to_platform, speed, then the
    landed and crashed flags converted to floats.

    Args:
        state: Object exposing the attributes listed above.
        device: Device on which the returned tensor is created.

    Returns:
        torch.Tensor of shape (15,) and dtype float32 on `device`.
    """
    scalar_fields = (
        'drone_x',
        'drone_y',
        'drone_vx',
        'drone_vy',
        'drone_angle',
        'drone_angular_vel',
        'drone_fuel',
        'platform_x',
        'platform_y',
        'distance_to_platform',
        'dx_to_platform',
        'dy_to_platform',
        'speed',
    )
    features = [getattr(state, field) for field in scalar_fields]
    # The two status flags go last, as 0.0 / 1.0.
    features.extend((float(state.landed), float(state.crashed)))

    return torch.tensor(np.array(features), dtype=torch.float32, device=device)

In [None]:
class DroneGamerBoi(nn.Module):
    """Policy (actor) network: maps a drone state to three values in (0, 1).

    Architecture: three Linear -> LayerNorm -> ReLU stages (128, 128, 64
    units) followed by a Linear(64, 3) and a Sigmoid.
    NOTE(review): the three outputs are presumably independent Bernoulli
    probabilities for main/left/right thrust — confirm against the sampling
    code.

    Args:
        state_dim: Number of input features (15 matches state_to_array).
    """

    def __init__(self, state_dim=15):
        super().__init__()

        self.network = nn.Sequential(
            nn.Linear(state_dim, 128),
            nn.LayerNorm(128),  # normalise activations before each ReLU
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.LayerNorm(128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.LayerNorm(64),
            nn.ReLU(),
            nn.Linear(64, 3),
            nn.Sigmoid()  # squash each output into (0, 1)
        )

    def forward(self, state):
        """Run the policy on a tensor (shape [..., state_dim]) or a DroneState.

        Returns a tensor whose last dimension has size 3.
        """
        # Tensors (a single state or a batch) go straight through the network.
        if torch.is_tensor(state):
            return self.network(state)

        if isinstance(state, DroneState):
            # Build the input on this module's own device instead of relying
            # on a notebook-global `device`, so the input always matches the
            # weights even if the model was moved (or never moved).
            state = state_to_array(state, device=next(self.parameters()).device)

        return self.network(state)

In [None]:
class DroneTeacherBoi(nn.Module):
    """Value (critic) network: maps a drone state to one scalar estimate.

    Architecture: three Linear -> LayerNorm -> ReLU stages (128, 128, 64
    units) followed by a Linear(64, 1); the trailing size-1 dimension is
    squeezed away so a batch of B states yields a tensor of shape [B].

    Args:
        state_dim: Number of input features (15 matches state_to_array).
    """

    def __init__(self, state_dim=15):
        super().__init__()

        self.network = nn.Sequential(
            nn.Linear(state_dim, 128),
            nn.LayerNorm(128),  # normalise activations before each ReLU
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.LayerNorm(128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.LayerNorm(64),
            nn.ReLU(),
            nn.Linear(64, 1)  # output is a single scalar per state
        )

    def forward(self, state):
        """Evaluate a tensor (shape [..., state_dim]) or a DroneState.

        Returns the network output with its last (size-1) dimension removed.
        """
        if not torch.is_tensor(state) and isinstance(state, DroneState):
            # Build the input on this module's own device instead of relying
            # on a notebook-global `device`, so the input always matches the
            # weights even if the model was moved (or never moved).
            state = state_to_array(state, device=next(self.parameters()).device)

        value = self.network(state)
        # [B, 1] -> [B] (and [1] -> scalar tensor for a single state).
        return value.squeeze(-1)

## Proximal Policy Optimization (PPO): How It Works

PPO is an advanced Actor-Critic method that solves a critical problem: **how to update the policy safely without accidentally destroying what it has already learned**.

### The Core Problem PPO Solves

**In Basic Actor-Critic:**
- One bad gradient update can collapse the policy (make it deterministic or terrible)
- If we take too large a step, the policy might never recover
- No mechanism to prevent catastrophic updates

**PPO's solution:** **Clip the policy updates** to stay within a "trust region" - only allow small, controlled changes.

---

## Key Innovation: The Clipped Objective

### 1. Policy Ratio

PPO tracks how much the policy has changed by computing a ratio:

$$r_t(\theta) = \frac{\pi_\theta(a_t|s_t)}{\pi_{\theta_{\text{old}}}(a_t|s_t)}$$

**What this ratio means:**
- $r_t = 1.0$: Policy unchanged for this state-action pair
- $r_t > 1.0$: New policy makes action $a_t$ MORE likely
- $r_t < 1.0$: New policy makes action $a_t$ LESS likely

### 2. Clipped Surrogate Objective

Instead of the standard policy gradient loss, PPO uses:

$$L^{\text{CLIP}}(\theta) = \mathbb{E}_t\left[\min\left(r_t(\theta) \cdot A_t, \text{clip}(r_t(\theta), 1-\epsilon, 1+\epsilon) \cdot A_t\right)\right]$$

Where:
- $A_t$ = Advantage (GAE: how good this action was vs average)
- $\epsilon$ = Clipping parameter (typically **0.2**)
- $\text{clip}(r_t, 1-\epsilon, 1+\epsilon)$ = Constrain ratio to $[0.8, 1.2]$

### 3. What the Clipping Does

The `min()` creates a **pessimistic bound** - we take the worst-case of the two objectives:

**Case 1: Positive Advantage ($A_t > 0$)** - Good action, want to increase probability

| Ratio $r_t$ | Effect | Clipping Active? |
|-------------|--------|------------------|
| $r_t < 0.8$ | Increase probability | No (safe) |
| $0.8 \leq r_t \leq 1.2$ | Increase probability | No (safe) |
| $r_t > 1.2$ | **STOP increasing** | **Yes** (already changed +20%) |

**Case 2: Negative Advantage ($A_t < 0$)** - Bad action, want to decrease probability

| Ratio $r_t$ | Effect | Clipping Active? |
|-------------|--------|------------------|
| $r_t < 0.8$ | **STOP decreasing** | **Yes** (already changed -20%) |
| $0.8 \leq r_t \leq 1.2$ | Decrease probability | No (safe) |
| $r_t > 1.2$ | Decrease probability | No (safe) |

**The genius:** Clipping prevents the policy from changing too drastically. If an action probability has already changed by ±20%, we stop pushing it further.

---

## PPO Algorithm Flow

### Phase 1: Rollout Collection (Like REINFORCE)

```
1. Run current policy π_θ_old for N steps across parallel games
2. Store: states, actions, log_probs, rewards, dones, values
3. Compute GAE advantages for variance reduction
```

### Phase 2: Multi-Epoch Updates (Key Difference!)

Unlike Actor-Critic (1 update per transition), PPO reuses data:

```
For epoch in 1..K (typically K=4-10):
    Shuffle collected data into minibatches
    
    For each minibatch:
        # Compute new policy probabilities
        new_log_probs = log π_θ(actions | states)
        
        # Policy ratio
        ratio = exp(new_log_probs - old_log_probs)
        
        # Clipped objective
        surr1 = ratio * advantages
        surr2 = clip(ratio, 1-ε, 1+ε) * advantages
        policy_loss = -min(surr1, surr2).mean()
        
        # Entropy bonus for exploration
        entropy = -sum(probs * log(probs))
        policy_loss -= entropy_coef * entropy
        
        # Update policy
        policy_optimizer.step()
        
        # Value loss (simple MSE)
        value_loss = (values - returns)²
        critic_optimizer.step()
```

### Phase 3: Replace Old Policy

```
θ_old ← θ  # Save current policy for next rollout
```

---

## Generalized Advantage Estimation (GAE)

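PPO typically uses GAE instead of the one-step TD error. GAE blends TD errors over many future steps, accepting a little more variance than pure TD in exchange for much less bias, while staying far less noisy than full Monte Carlo returns: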

### Simple TD Error (Actor-Critic):
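$$A_t = \delta_t = r_t + \gamma V(s_{t+1}) - V(s_t)$$

(In this section $r_t$ is the **reward** received at step $t$, not the policy ratio $r_t(\theta)$ defined earlier.)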

### GAE (PPO):
$$A_t^{\text{GAE}} = \sum_{l=0}^{\infty} (\gamma \lambda)^l \delta_{t+l}$$

Where $\lambda \in [0, 1]$ controls bias-variance tradeoff:
- $\lambda = 0$: Pure TD (low variance, high bias)
- $\lambda = 1$: Monte Carlo (high variance, low bias)  
- $\lambda = 0.95$: **PPO sweet spot** ✓

**Practical computation (backward pass):**
```python
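# values has T+1 entries: values[T] is the bootstrap value of the state
# reached after the last collected step.
# dones[t] is 1 if the episode ended at step t, so nothing is bootstrapped
# or accumulated across an episode boundary.
advantages = []
gae = 0
for t in reversed(range(T)):
    not_done = 1 - dones[t]
    delta = rewards[t] + gamma * values[t+1] * not_done - values[t]
    gae = delta + gamma * lambda_ * not_done * gae
    advantages.insert(0, gae)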

---

## PPO vs Actor-Critic vs REINFORCE

| Feature | REINFORCE | Actor-Critic | **PPO** |
|---------|-----------|--------------|---------|
| **Update frequency** | End of episode | Every step | Every N steps |
| **Data reuse** | No | No | **Yes (K epochs)** ✓ |
| **Variance** | Very high | Medium | **Low (GAE)** ✓ |
| **Stability** | Unstable | Can be unstable | **Very stable (clipping)** ✓ |
| **Sample efficiency** | Poor | Good | **Best** ✓ |
| **Prevents collapse** | No | No | **Yes (trust region)** ✓ |

---

## Why PPO Wins

### 1. **Prevents Policy Collapse**

**Without clipping:**
```python
# One huge gradient can ruin the policy
loss = -log_prob * advantage  # No bounds!
# If advantage is +100, gradient is HUGE → policy becomes deterministic
```

**With clipping:**
```python
ratio = new_prob / old_prob
clipped_ratio = clip(ratio, 0.8, 1.2)
loss = -min(ratio * advantage, clipped_ratio * advantage)
# Even if advantage is +100, change is limited to ±20%
```

### 2. **Data Efficiency**

- **Actor-Critic**: Use each transition once → throw away
- **PPO**: Reuse data 4-10 times → fewer environment interactions

### 3. **Stable Across Tasks**

- Works out-of-the-box on many tasks
- Requires minimal hyperparameter tuning
- Industry standard (OpenAI Five, RLHF fine-tuning of large language models, robotics)

---

## Mathematical Intuition: Why min()?

```python
L = -min(ratio * A, clip(ratio, 1-ε, 1+ε) * A)
     ↑
   Pessimistic: take minimum (worst-case) of two terms
```

**If $A_t > 0$ (good action):**
- Unclipped: $r_t \cdot A$ encourages increasing probability
- Clipped: Caps increase once $r_t > 1.2$
- **min() chooses:** Once ratio exceeds 1.2, gradient becomes zero (stops increasing)

**If $A_t < 0$ (bad action):**
- Unclipped: $r_t \cdot A$ (negative) encourages decreasing probability
- Clipped: Caps decrease once $r_t < 0.8$  
- **min() chooses more negative:** Once ratio drops below 0.8, gradient becomes zero (stops decreasing)

The `min()` is **conservative** - it says: "Don't be overconfident about policy improvements."

---

## PPO Hyperparameters

```python
# Policy
learning_rate_actor = 3e-4       # Standard PPO LR
clip_epsilon = 0.2               # Trust region (±20%)

# Value function
learning_rate_critic = 1e-3      # Often higher than actor
value_loss_coef = 0.5            # Weight of critic loss

# Training
num_epochs = 10                  # Reuse data this many times
minibatch_size = 64              # For SGD updates
rollout_steps = 2048             # Steps to collect before update

# Advantage
gamma = 0.99                     # Discount factor
gae_lambda = 0.95                # GAE parameter

# Regularization
entropy_coef = 0.01              # Exploration bonus
max_grad_norm = 0.5              # Gradient clipping
```

### Key Effects

| Parameter | Too Low | Too High | Sweet Spot |
|-----------|---------|----------|------------|
| `clip_epsilon` | Policy barely changes | Allows catastrophic updates | **0.2** |
| `num_epochs` | Wastes data | Overfits to old data | **4-10** |
| `gae_lambda` | High bias | High variance | **0.95** |
| `entropy_coef` | No exploration | Too random | **0.01** |

---

## When to Use PPO

### ✅ Use PPO when:
- You want **stable, reliable training** (most important!)
- You can collect batches of data before updating
- Working on **continuous control** (robotics, drone landing)
- Want **sample efficiency** (fewer env interactions)
- Need algo that **works out-of-the-box**

### ❌ Use Actor-Critic when:
- Need **immediate online learning** (can't wait for batches)
- Environment is cheap to simulate (sample efficiency less critical)
- Prefer simplicity over performance

---

## Summary

**PPO Formula:**
> Trust regions through clipping - update safely!

**Three Keys:**
1. **Clipped objective:** Bound policy changes to ±20%
2. **Multiple epochs:** Reuse data 4-10 times  
3. **GAE advantages:** Low-variance advantage estimates

**Why it's the industry default:**
- Stable (no policy collapse)
- Sample efficient (reuses data)
- Robust (works across many tasks)

For **drone landing**: PPO will likely outperform basic Actor-Critic due to superior stability and data efficiency!