In [None]:
import os
import torch
import torch.nn as nn
import math
import numpy as np

from torch.distributions import Bernoulli
from delivery_drone.game.socket_client import DroneGameClient, DroneState

In [None]:
# Pick the compute device once for the whole notebook: CUDA when available, CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
if torch.cuda.is_available():
    # Report the name of GPU index 0 so the run log shows which card was used.
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# Create the game client on port 5555 and open its connection.
# Every later cell talks to the game through this `client` object.
client = DroneGameClient(port=5555)
client.connect()

### Going from `state` -> `reward`

In [None]:
def linear_scaler(value, scaler):
    """Return ``value`` multiplied by the constant factor ``scaler``."""
    scaled = value * scaler
    return scaled

def gausian_scaler(value, sigma=0.1, scaler=1):
    """
    Gaussian bump centred on zero: ``scaler * exp(-value^2 / (2 * sigma^2))``.

    Peaks at ``scaler`` when ``value`` is 0 and falls off symmetrically;
    ``sigma`` controls the width of the bump.
    """
    exponent = -value ** 2 / (2 * sigma ** 2)
    bump = math.exp(exponent)
    return scaler * bump

def exponential_decay(value, decay=10, scaler=1):
    """
    Exponential fall-off in the magnitude of ``value``: ``scaler * exp(-decay * |value|)``.

    Equals ``scaler`` at 0 and shrinks toward 0 as ``|value|`` grows.
    """
    magnitude = abs(value)
    return scaler * math.exp(-decay * magnitude)

def inverse_quadratic(value, decay=10, scaler=1):
    """
    Inverse-quadratic fall-off: ``scaler / (1 + decay * value^2)``.

    Equals ``scaler`` at 0 and decreases symmetrically as ``|value|`` grows.
    """
    denominator = 1 + (decay * (value ** 2))
    reciprocal = 1 / denominator
    return scaler * reciprocal

def inverse_linear(value, decay=10, scaler=1):
    """
    Inverse-linear fall-off: ``scaler / (1 + decay * |value|)``.

    Equals ``scaler`` at 0 and decreases symmetrically as ``|value|`` grows.
    """
    denominator = 1 + (decay * abs(value))
    reciprocal = 1 / denominator
    return scaler * reciprocal

def scaled_shifted_negative_sigmoid(value, sigma=10, scaler=1):
    """
    Decreasing sigmoid centred at ``value == 0.5``: ``scaler / (1 + exp(sigma * (value - 0.5)))``.

    Approaches ``scaler`` for small values, is exactly ``scaler / 2`` at 0.5,
    and approaches 0 for large values; ``sigma`` sets how sharp the transition is.
    """
    shifted = value - 0.5
    gate = 1 / (1 + math.exp(sigma * shifted))
    return scaler * gate

In [None]:
# Manual check of the control channel: send the same command ten times to game 0
# (second argument is the game id) with only the left thruster switched on.
for _ in range(10):
    client.step(
        {
            'main_thrust': 0,
            'left_thrust': 1,
            'right_thrust': 0
        }, 0
    )

### State Variables

| Variable | Min | Max | Typical Range | Critical Threshold |
|----------|-----|-----|---------------|-------------------|
| drone_x | 0 | 1 | [0, 1] | - |
| drone_y | 0 | 1 | [0, 1] | - |
| drone_vx | -2.5 | 2.5 | [-1.5, 1.5] | - |
| drone_vy | -2.5 | 3.0 | [-0.5, 2.5] | - |
| drone_angle | -1 | 1 | [-0.5, 0.5] | < 0.111 for landing |
| drone_angular_vel | -1.5 | 1.5 | [-0.8, 0.8] | - |
| drone_fuel | 0 | 1 | [0, 1] | Episode ends at 0 |
| platform_x | 0 | 1 | [0.1, 0.9] | - |
| platform_y | 0 | 1 | [0.58, 0.92] | Platform spawns at [350, 550]px |
| distance_to_platform | 0 | 1.41 | [0, 1.2] | - |
| dx_to_platform | -1.125 | 1.125 | [-1, 1] | < ±0.0625 for landing |
| dy_to_platform | -1.083 | 1.083 | [-0.5, 0.8] | - |
| speed | 0 | 3.9 | [0, 3.0] | < 0.3 for landing |

In [None]:
def calc_velocity_alignment(state: DroneState):
    """
    Cosine of the angle between the drone's velocity and the drone-to-platform vector.

    Reads ``drone_vx``/``drone_vy`` (velocity) and ``dx_to_platform``/``dy_to_platform``
    (offset to the platform) from ``state``.

    Returns:
        1.0 when moving straight at the platform, -1.0 when moving straight away,
        0 when perpendicular. Also returns 0 when either vector has zero length
        (drone not moving, or already at the platform), where the angle is undefined.
    """
    vel_x, vel_y = state.drone_vx, state.drone_vy
    off_x, off_y = state.dx_to_platform, state.dy_to_platform

    vel_norm = math.sqrt(vel_x**2 + vel_y**2)
    off_norm = math.sqrt(off_x**2 + off_y**2)

    # A zero-length vector has no direction, so report "no alignment".
    if vel_norm == 0 or off_norm == 0:
        return 0

    # cos(theta) = (v . d) / (|v| * |d|)
    return (vel_x * off_x + vel_y * off_y) / (vel_norm * off_norm)

In [None]:
def calc_reward(state: DroneState):
    """
    Compute the shaped reward for one drone state, broken down by component.

    Components (each stored under its own key, all summed into ``'total'``):
        time_penalty:        always negative; about -0.3 far from the platform,
                             growing in magnitude to -1.0 at distance 0.
        distance:            bonus for moving toward the platform, proportional to speed.
        velocity_alignment:  flat 0.5 bonus while the velocity points toward the platform.
                             (Both bonuses apply only while farther than 0.065 from the
                             platform and ``dy_to_platform > 0``.)
        angle:               penalty for tilt beyond a distance-dependent allowance; never positive.
        speed:               penalty for speed above a threshold; never positive.
        vertical_position:   penalty when ``dy_to_platform <= 0``; never positive.
        terminal:            landing bonus or crash penalty, 0 otherwise.

    Args:
        state (DroneState): State to score. Reads ``distance_to_platform``,
            ``dy_to_platform``, ``speed``, ``drone_angle``, ``drone_fuel``,
            ``landed`` and ``crashed`` (plus the fields used by
            ``calc_velocity_alignment``).

    Returns:
        dict: Component name -> value, with the sum under ``'total'``.
    """
    rewards = {}
    total_reward = 0

    # Per-step time penalty, shaped by distance.
    # inverse_quadratic() peaks at distance 0, so the penalty is
    # -maximum_time_penalty at the platform and tends to -minimum_time_penalty far away.
    # NOTE(review): an earlier comment described the opposite shape (larger penalty
    # when far) and different bounds (0.5 / 2.0); the formula below is what the
    # code has always computed -- confirm which one is intended.
    minimum_time_penalty = 0.3
    maximum_time_penalty = 1
    rewards['time_penalty'] = -inverse_quadratic(
        state.distance_to_platform,
        decay=50,
        scaler=maximum_time_penalty-minimum_time_penalty) - minimum_time_penalty
    total_reward += rewards['time_penalty']

    velocity_alignment = calc_velocity_alignment(state)
    dist = state.distance_to_platform

    # Approach bonuses: only while not yet at the platform and with dy_to_platform > 0
    # (the drone is above the platform).
    rewards['distance'] = 0
    rewards['velocity_alignment'] = 0

    if dist > 0.065 and state.dy_to_platform > 0:
        # Zero unless the velocity has a component toward the platform.
        rewards['distance'] = int(velocity_alignment > 0) * state.speed * scaled_shifted_negative_sigmoid(dist, scaler=4.5)

        if velocity_alignment > 0:
            rewards['velocity_alignment'] = 0.5

    total_reward += rewards['distance']
    total_reward += rewards['velocity_alignment']

    # Angle penalty: the allowed tilt is 0.111 at the platform and grows linearly
    # with distance (by max_angle - 0.111 per unit of distance).
    abs_angle = abs(state.drone_angle)
    max_angle = 0.20
    max_permissible_angle = ((max_angle-0.111)*dist) + 0.111
    excess = abs_angle - max_permissible_angle  # excess angle
    # Capped at 0 so the policy cannot farm reward just by staying level.
    rewards['angle'] = -max(excess, 0)
    total_reward += rewards['angle']

    # Speed penalty: tighter threshold (0.1) and double weight while dist < 1,
    # looser threshold (max_speed) beyond that.
    speed = state.speed
    max_speed = 0.4
    if dist < 1:
        rewards['speed'] = -2 * max(speed-0.1, 0)
    else:
        rewards['speed'] = -1 * max(speed-max_speed, 0)
    total_reward += rewards['speed']

    # Penalize being below the platform: dy_to_platform is <= 0 there,
    # so the product is the (non-positive) penalty.
    if state.dy_to_platform > 0:
        rewards['vertical_position'] = 0
    else:
        rewards['vertical_position'] = state.dy_to_platform * 4.0
    total_reward += rewards['vertical_position']

    # Terminal outcome
    rewards['terminal'] = 0
    if state.landed:
        # Landing bonus, plus up to 100 more for remaining fuel.
        rewards['terminal'] = 500.0 + state.drone_fuel * 100.0
    elif state.crashed:
        rewards['terminal'] = -200.0
        # Extra penalty for crashing far from target
        if state.distance_to_platform > 0.3:
            rewards['terminal'] -= 100.0
    total_reward += rewards['terminal']

    rewards['total'] = total_reward
    return rewards

In [None]:
# Reset game 0 and keep the state object that reset() returns.
state = client.reset(0)

### Going from `reward` -> `loss`

There are several approaches to convert RL rewards into a loss function for neural networks:
1. Policy Gradient Methods (REINFORCE, PPO, A3C)
Maximize expected reward by minimizing negative log-likelihood weighted by returns:
```py
loss = -log_prob(action) * reward
# Or with advantage:
loss = -log_prob(action) * advantage
```
2. Q-Learning / DQN
Minimize TD (Temporal Difference) error:
```py
# Predict Q-value for action taken
q_predicted = model(state)[action]

# Target Q-value (Bellman equation)
q_target = reward + gamma * max(model(next_state))

# MSE loss
loss = (q_predicted - q_target) ** 2
```
3. Actor-Critic Methods (A2C, SAC)
Two separate losses:
```py
# Actor loss (policy)
actor_loss = -log_prob(action) * advantage

# Critic loss (value function)
critic_loss = (value_predicted - value_target) ** 2
```

In [None]:
def neg_log_prob_loss(action_probs, action_index, reward):
    """
    Policy-gradient loss for a single action: ``-log(p[action]) * reward``.

    Args:
        action_probs (torch.Tensor): Probabilities, one entry per action.
        action_index (int): Position in ``action_probs`` of the action that was taken.
        reward (float): Scalar weight for the log-probability; pass an advantage
            instead of a raw reward for advantage-based methods.

    Returns:
        torch.Tensor: The loss for this action.
    """
    chosen_log_prob = torch.log(action_probs[action_index])
    return -chosen_log_prob * reward

In [None]:
def Q_loss(q_predicted, reward, gamma, q_next_state, done):
    """
    Computes the Q-learning loss as the squared TD error.

    Args:
        q_predicted (torch.Tensor): Predicted Q-value for the taken action
        reward (float): Immediate reward received
        gamma (float): Discount factor for future rewards
        q_next_state (torch.Tensor): Predicted Q-values for next state
        done (bool): Whether the episode has ended

    Returns:
        torch.Tensor: The squared TD error ``(q_predicted - q_target) ** 2``.
    """
    # If done, next state value is 0, otherwise it's the max Q-value of next state
    next_value = 0 if done else torch.max(q_next_state)

    # Compute target Q-value using Bellman equation
    q_target = reward + gamma * next_value

    # The target is treated as a constant: no gradient may flow into it.
    # On a terminal step with a plain-float reward the target is a Python float,
    # which has no .detach(), so only detach when it actually is a tensor.
    if isinstance(q_target, torch.Tensor):
        q_target = q_target.detach()

    # Squared TD error
    loss = (q_predicted - q_target) ** 2

    return loss

In [None]:
# Fetch the current state of game 0 and show its raw fields.
state = client.get_state(0)
display(state.__dict__)

# Reward breakdown for that same state (last expression, so the dict is rendered).
calc_reward(state)

## Let's Create a Policy Network now

In [None]:
def state_to_array(state, device='cpu'):
    """
    Flatten a drone state into a 15-element float32 torch tensor.

    The first 13 entries are the numeric state fields in a fixed order
    (position, velocity, angle, angular velocity, fuel, platform position,
    distance/offsets to the platform, speed); the last two are the ``landed``
    and ``crashed`` flags converted to 0.0 / 1.0.

    Args:
        state: Object exposing the state fields as attributes.
        device: Device on which the tensor is created (default ``'cpu'``).

    Returns:
        torch.Tensor: Tensor of shape ``(15,)`` and dtype ``torch.float32``.
    """
    numeric_fields = (
        'drone_x', 'drone_y',
        'drone_vx', 'drone_vy',
        'drone_angle', 'drone_angular_vel',
        'drone_fuel',
        'platform_x', 'platform_y',
        'distance_to_platform', 'dx_to_platform', 'dy_to_platform',
        'speed',
    )
    features = [getattr(state, field) for field in numeric_fields]
    # Boolean outcome flags become 0.0 / 1.0 features.
    features.append(float(state.landed))
    features.append(float(state.crashed))

    return torch.tensor(np.array(features), dtype=torch.float32, device=device)

In [None]:
class DroneGamerBoi(nn.Module):
    """
    Policy network: maps a state vector to three independent thruster probabilities.

    Architecture: three hidden stages (128, 128, 64 units), each
    Linear -> LayerNorm -> ReLU, followed by a Linear(64, 3) head with a
    Sigmoid, so every output lies in (0, 1).
    """

    def __init__(self, state_dim=15):
        super().__init__()

        # Hidden stages are appended in order so the module indices inside
        # `self.network` (and therefore the state_dict keys) stay fixed.
        stage_widths = (state_dim, 128, 128, 64)
        layers = []
        for in_features, out_features in zip(stage_widths[:-1], stage_widths[1:]):
            layers.append(nn.Linear(in_features, out_features))
            layers.append(nn.LayerNorm(out_features))
            layers.append(nn.ReLU())

        # Output head: one probability per thruster.
        layers.append(nn.Linear(64, 3))
        layers.append(nn.Sigmoid())

        self.network = nn.Sequential(*layers)

    def forward(self, state):
        """Accept either a ready tensor or a ``DroneState`` (converted on the notebook's ``device``)."""
        inputs = state_to_array(state, device=device) if isinstance(state, DroneState) else state
        return self.network(inputs)

## How do I train my policy:

1. simple approach - Online Learning (Naive):
```py
state = Reset_game()
for _ in range(training_steps):
    probs = policy(state)
    action = sample_from(probs)
    state = game_update(action)
    reward = calc_reward(state)
    loss = loss_fn(reward)
    gradient_step(policy, loss)
```
**Problems**:
- Too much variance
- Our policy will learn to do erratic movements

(Not going to implement this BS)

2. Episodes - (Less, but still, naive):
**Core Idea**: _Take one episode, i.e, let the policy sample till the episode ends, which means either the drone crashed or landed._

```py
for _ in range(num_training_episode):
    # Collect full episode
    states, actions, rewards = [], [], []
    state = env.reset()
    while not done:
        probs = policy(state)
        action = sample_from(probs)
        state = game_update(action)

        reward = calc_reward(state)
        
        states.append(state)
        actions.append(action)
        rewards.append(reward)
        
    total_loss = sum([loss_fn(reward) for reward in rewards])
    gradient_step(policy, total_loss)
```

-> This is way better than the previous one, because we will at least optimize for winning (or, negatively, for losing) the game

**Problems**: _Still high variance; the policy may learn how to win, but it may also reinforce erratic behaviour_

---

My point is that there are many ways to do this!

#### Policy Gradient with Baseline

**Core Idea**:

1. Collect multiple episodes
2. Calculate the mean of all the returns (called _Baseline_)
3. Calculate the advantage of each episode (i.e., subtract the baseline from each episode's returns)
4. Compute Loss of the actions weighted against the advantage.
5. Gradient descent

In [None]:
import time

from tqdm.notebook import trange, tqdm

In [None]:
# Fresh policy network on the selected device, trained with AdamW at a 1e-3 learning rate.
policy = DroneGamerBoi().to(device) # initialize our policy
optimizer = torch.optim.AdamW(policy.parameters(), lr=1e-3)

In [None]:
# training configurations
num_iterations = 1000
num_episodes = client.num_games

bellman_gamma = 0.99

In [None]:
def collect_episodes(client: DroneGameClient, policy: DroneGamerBoi, max_steps=300):
    """
    Roll out one episode per game in lock-step, batching policy inference over
    the games that are still running, with early stopping at ``max_steps``.

    Args:
        client: Game client; one episode is collected for each of its
            ``client.num_games`` games.
        policy: Policy network mapping a batch of state vectors to three
            thruster probabilities per game.
        max_steps: Maximum steps per episode (default: 300)

    Returns:
        list: One ``(states, actions, log_probs, rewards)`` tuple per game.
        ``states`` are the CPU state vectors fed to the policy, ``actions`` the
        sampled 0/1 thruster tensors, ``log_probs`` the summed log-probability
        of each sampled action (still attached to the autograd graph), and
        ``rewards`` the per-step total rewards.
    """
    num_games = client.num_games

    # Initialize storage
    all_episodes = [{'states': [], 'actions': [], 'log_probs': [], 'rewards': [], 'done': False}
                    for _ in range(num_games)]

    # Reset all games
    game_states = [client.reset(game_id) for game_id in range(num_games)]
    step_counts = [0] * num_games  # Track steps per game

    while not all(ep['done'] for ep in all_episodes):
        # Batch the games that are still running. The loop condition guarantees
        # at least one such game, so the batch below is never empty.
        active_game_ids = [game_id for game_id in range(num_games)
                           if not all_episodes[game_id]['done']]
        batch_states = [state_to_array(game_states[game_id]) for game_id in active_game_ids]

        # Batched inference: each thruster is an independent Bernoulli variable,
        # so the joint log-probability of an action is the sum over thrusters.
        batch_states_tensor = torch.stack(batch_states).to(device)
        batch_action_probs = policy(batch_states_tensor)
        batch_dist = Bernoulli(probs=batch_action_probs)
        batch_actions = batch_dist.sample()
        batch_log_probs = batch_dist.log_prob(batch_actions).sum(dim=1)

        # Bring the sampled 0/1 flags to the host once per batch instead of
        # converting element by element (each int(tensor[k]) on a GPU tensor
        # forces its own device synchronisation).
        thrust_flags = batch_actions.int().tolist()

        # Execute actions
        for i, game_id in enumerate(active_game_ids):
            main_thrust, left_thrust, right_thrust = thrust_flags[i]

            next_state, _, done, _ = client.step({
                "main_thrust": main_thrust,
                "left_thrust": left_thrust,
                "right_thrust": right_thrust
            }, game_id)

            reward = calc_reward(next_state)

            # Store data
            episode = all_episodes[game_id]
            episode['states'].append(batch_states[i])
            episode['actions'].append(batch_actions[i])
            episode['log_probs'].append(batch_log_probs[i])
            episode['rewards'].append(reward['total'])

            # Update state and step count
            game_states[game_id] = next_state
            step_counts[game_id] += 1

            # Check done conditions
            timed_out = step_counts[game_id] >= max_steps
            if done or timed_out:
                # Apply timeout penalty if hit max steps without landing
                if timed_out and not next_state.landed:
                    episode['rewards'][-1] -= 500  # Timeout penalty

                episode['done'] = True

    # Return episodes
    return [(ep['states'], ep['actions'], ep['log_probs'], ep['rewards'])
            for ep in all_episodes]

In [None]:
def compute_returns(rewards, gamma=0.99):
    """
    Compute discounted returns (G_t) for each timestep based on the Bellman equation

    G_t = r_t + γ*r_{t+1} + γ²*r_{t+2} + ...

    Args:
        rewards: Sequence of per-step rewards, in time order.
        gamma: Discount factor.

    Returns:
        list: ``G_t`` for every timestep, in the same order as ``rewards``
        (empty list for empty input).
    """
    returns = []
    G = 0

    # Walk backwards so each return reuses the next one: G_t = r_t + γ * G_{t+1}.
    # Append and reverse once at the end; inserting at index 0 on every step
    # would shift the whole list each time and make the loop quadratic.
    for r in reversed(rewards):
        G = r + gamma * G
        returns.append(G)

    returns.reverse()
    return returns

In [None]:
def evaluate_policy_simple(client, policy, max_steps=300, game_id=0, temperature=0.5, iteration=0, fig_ax=None):
    """
    Run one evaluation episode and plot its accumulated rewards on a reusable figure.

    Args:
        client: Game client used to reset and step the game.
        policy: Policy network; switched to eval mode for the episode and back
            to train mode afterwards.
        max_steps (int): Maximum number of steps before the episode is cut off.
        game_id (int): Which game to play.
        temperature (float): Sampling temperature. 0 picks each thruster greedily
            (probability > 0.5); values below 1 sharpen the probabilities,
            values above 1 flatten them.
        iteration (int): Training iteration shown in the plot titles.
        fig_ax: ``(fig, (ax1, ax2))`` from a previous call to draw into, or
            ``None`` to create a new figure.

    Returns:
        tuple: ``(stats, (fig, (ax1, ax2)))`` where ``stats`` has the keys
        ``'landed'``, ``'steps'``, ``'total_reward'`` and ``'final_fuel'``.
    """
    import matplotlib.pyplot as plt

    # Components in the order they are drawn on the left panel
    # (the order decides which line color each one gets).
    plotted_components = ('time_penalty', 'distance', 'angle', 'speed',
                          'velocity_alignment', 'vertical_position', 'terminal')
    tracked = plotted_components + ('total',)

    policy.eval()
    state = client.reset(game_id)

    history = {name: [] for name in ('step',) + tracked}
    accumulated = {name: 0 for name in tracked}

    steps = 0
    done = False

    # Run episode
    while not done and steps < max_steps:
        with torch.no_grad():
            action_probs = policy(state)

        if temperature == 0:
            action = (action_probs > 0.5).float()
        else:
            # Temperature scaling of a Bernoulli: p^(1/T) / (p^(1/T) + (1-p)^(1/T))
            adjusted_probs = torch.pow(action_probs, 1.0 / temperature)
            adjusted_probs = adjusted_probs / (adjusted_probs + torch.pow(1 - action_probs, 1.0 / temperature))
            dist = Bernoulli(probs=adjusted_probs)
            action = dist.sample()

        next_state, _, done, _ = client.step({
            "main_thrust": int(action[0]),
            "left_thrust": int(action[1]),
            "right_thrust": int(action[2])
        }, game_id)

        reward = calc_reward(next_state)

        # Accumulate rewards and record the running sums
        history['step'].append(steps)
        for name in tracked:
            accumulated[name] += reward[name]
            history[name].append(accumulated[name])

        state = next_state
        steps += 1

    policy.train()

    # Create or reuse figure
    if fig_ax is None:
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
    else:
        fig, (ax1, ax2) = fig_ax
        ax1.clear()
        ax2.clear()

    # Plot 1: All components
    for name in plotted_components:
        ax1.plot(history['step'], history[name], label=name, linewidth=2)

    ax1.set_xlabel('Time Steps', fontsize=11)
    ax1.set_ylabel('Accumulated Reward', fontsize=11)
    ax1.set_title(f'Accumulated Reward by Component (Iter {iteration})', fontweight='bold', fontsize=12)
    ax1.legend(loc='best', fontsize=9)
    ax1.grid(True, alpha=0.3)

    # Plot 2: Total
    ax2.plot(history['step'], history['total'], color='black', linewidth=3)
    ax2.set_xlabel('Time Steps', fontsize=11)
    ax2.set_ylabel('Accumulated Reward', fontsize=11)
    ax2.set_title(f'Total Accumulated Reward (Iter {iteration})', fontweight='bold', fontsize=12)
    ax2.grid(True, alpha=0.3)

    # Result annotation. An episode can also stop because the step budget ran
    # out, so "not landed" must not be reported as a crash.
    if state.landed:
        status, color = "[LANDED]", 'green'
    elif state.crashed:
        status, color = "[CRASHED]", 'red'
    elif done:
        status, color = "[ENDED]", 'orange'
    else:
        status, color = "[TIMEOUT]", 'orange'
    result_text = f"{status} | Steps: {steps} | Total: {accumulated['total']:.1f} | Fuel: {state.drone_fuel:.1%}"

    # Setting the suptitle again replaces the text from the previous call
    fig.suptitle(result_text, fontsize=13, fontweight='bold', color=color, y=1.02)

    plt.tight_layout()

    # Display the figure using IPython display
    display(fig)

    return {
        'landed': state.landed,
        'steps': steps,
        'total_reward': accumulated['total'],
        'final_fuel': state.drone_fuel
    }, (fig, (ax1, ax2))

In [None]:
# Curriculum for the per-episode step limit: one entry per training iteration,
# rising from `start` at the first iteration to `end` at the last.
# With steepness < 1 the curve x**steepness climbs quickly early on and flattens later.
steepness = 0.65
start = 300
end = 500

x = np.linspace(0, 1, num=num_iterations)  # training progress in [0, 1]
step_schedule = np.round(start + (end - start) * x**steepness).astype(np.int32)

In [None]:
# policy = policy.to(device)

# Evaluation settings
eval_interval = 10        # Evaluate every N iterations
eval_temperature = 0.3    # Sampling temperature for eval

# Shared figure object
eval_fig_ax = None

tqdm_iterations = trange(num_iterations, desc='', total=num_iterations)

# Policy gradient with a mean-return baseline: each iteration collects one
# episode per game, then takes a single optimizer step on all timesteps at once.
for iteration in tqdm_iterations:
    # Step budget for this iteration, taken from the schedule
    max_steps = step_schedule[iteration]
    
    episodes = collect_episodes(client, policy, max_steps=max_steps)
    
    batch_log_probs = []
    batch_returns = []
    total_reward = 0
    episode_lengths = []
    num_successes = 0
    
    # Flatten all episodes into one batch of (log_prob, discounted return) pairs
    for states, actions, log_probs, rewards in episodes:
        returns = compute_returns(rewards, gamma=bellman_gamma)
        batch_log_probs.extend(log_probs)
        batch_returns.extend(returns)
        total_reward += sum(rewards)
        episode_lengths.append(len(rewards))
        # Success heuristic: a positive final-step reward is counted as a landing.
        # NOTE(review): relies on the landing bonus outweighing every per-step penalty -- confirm.
        if rewards[-1] > 0:
            num_successes += 1
    
    # Train
    returns_tensor = torch.tensor(batch_returns, dtype=torch.float32, device=device)
    baseline = returns_tensor.mean()
    advantages = (returns_tensor - baseline)
    # Standardize the advantages; the epsilon guards against a zero std
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    
    log_probs_tensor = torch.stack(batch_log_probs)
    # REINFORCE objective: raise the log-probability of actions with positive advantage
    loss = -(log_probs_tensor * advantages).mean()
    
    optimizer.zero_grad()
    loss.backward()
    # Clip the global gradient norm before the update
    torch.nn.utils.clip_grad_norm_(policy.parameters(), max_norm=0.5)
    optimizer.step()
    
    tqdm_iterations.set_description(
        f'Success: {num_successes}/{len(episodes)} | '
        f'Baseline: {baseline.item():.1f} | '
        f'Reward Std: {returns_tensor.std():.1f} | '
        f'Avg Len: {sum(episode_lengths)/len(episode_lengths):.1f} | '
        f'Loss: {loss.item():.4f} | '
        f'Max Steps: {max_steps}'
    )
    
    # Evaluation (plots stay visible, tqdm continues)
    if (iteration + 1) % eval_interval == 0:
        eval_result, eval_fig_ax = evaluate_policy_simple(
            client, 
            policy, 
            max_steps=500,
            temperature=eval_temperature,
            iteration=iteration + 1,
            game_id=0,
            fig_ax=eval_fig_ax  # Pass existing figure
        )

In [None]:
# torch.save does not create missing directories, so make sure the target exists.
os.makedirs('./models', exist_ok=True)

# Save the model's state dictionary
torch.save(policy.state_dict(), './models/drone_policy_v1.4.pth')

# Optionally, save the entire model (includes architecture)
torch.save(policy, './models/drone_policy_full_v1.4.pt')

# Eval

In [None]:
# Option 1: Load state dict into a new model instance
policy = DroneGamerBoi().to(device)
# map_location remaps the stored tensors onto the current device, so a checkpoint
# written during a GPU run also loads on a CPU-only machine.
policy.load_state_dict(torch.load('./models/drone_policy_v1.4.pth', map_location=device))
policy.eval()  # Set to evaluation mode
# # Option 2: Load complete model
# policy_from_file = torch.load('drone_policy_full.pt')
# policy_from_file.eval()  # Set to evaluation mode

# # Use policy_from_file or policy_from_state as your loaded model
# policy = policy_from_file  # Choose which loaded version to use

In [None]:
def evaluate_policy(client, policy, max_steps=300, game_id=0, temperature=0.5):
    """
    Play a single episode with the policy, without any training, and report what happened.

    Args:
        client (DroneGameClient): Game client instance
        policy (DroneGamerBoi): Policy network (put into eval mode)
        max_steps (int): Step limit for the episode
        game_id (int): Which game to play
        temperature (float): Sampling temperature; 0 selects each thruster
            greedily (probability > 0.5), otherwise actions are sampled from
            the temperature-adjusted probabilities.

    Returns:
        dict: ``'total_reward'``, ``'rewards'`` (one reward-breakdown dict per
        step), ``'steps'``, ``'landed'``, ``'crashed'`` and ``'final_fuel'``.
    """
    policy.eval()

    # Episode bookkeeping
    state = client.reset(game_id)
    total_reward = 0
    reward_log = []
    step_count = 0
    finished = False

    while not (finished or step_count >= max_steps):
        # Forward pass only; no gradients are needed during evaluation
        with torch.no_grad():
            probs = policy(state)

        if temperature == 0:
            # Greedy: fire a thruster exactly when its probability exceeds 0.5
            action = (probs > 0.5).float()
        else:
            # Temperature-adjusted Bernoulli: p^(1/T) / (p^(1/T) + (1-p)^(1/T))
            on_weight = torch.pow(probs, 1.0 / temperature)
            off_weight = torch.pow(1 - probs, 1.0 / temperature)
            action = Bernoulli(probs=on_weight / (on_weight + off_weight)).sample()

        command = {
            "main_thrust": int(action[0]),
            "left_thrust": int(action[1]),
            "right_thrust": int(action[2])
        }
        next_state, _, finished, _ = client.step(command, game_id)

        # Score the state we ended up in
        step_reward = calc_reward(next_state)
        total_reward += step_reward['total']
        reward_log.append(step_reward)

        state = next_state
        step_count += 1

    # `state` is now the last state reached
    return {
        'total_reward': total_reward,
        'rewards': reward_log,
        'steps': step_count,
        'landed': state.landed,
        'crashed': state.crashed,
        'final_fuel': state.drone_fuel
    }


In [None]:
def plot_accumulated_rewards(results, figsize=(14, 8)):
    """
    Plot accumulated rewards for each component over time.

    Draws two stacked panels: the running sum of every reward component, and
    the running sum of the total reward.

    Args:
        results: Output from evaluate_policy() containing 'rewards' list
            (one reward-breakdown dict per step).
        figsize: Figure size tuple (width, height)

    Returns:
        None. If the episode recorded no steps, a short message is printed and
        nothing is drawn.
    """
    from itertools import accumulate

    reward_dicts = results['rewards']

    # An episode with zero steps has no first entry to read the component names from.
    if not reward_dicts:
        print("No rewards to plot: the episode recorded zero steps.")
        return

    # Imported only once there is something to draw.
    import matplotlib.pyplot as plt

    # Get all component keys (exclude 'total')
    components = [key for key in reward_dicts[0].keys() if key != 'total']

    # Running sum per component, plus the running total
    accumulated = {
        name: list(accumulate(reward_dict[name] for reward_dict in reward_dicts))
        for name in components + ['total']
    }

    # Create plot
    fig, axes = plt.subplots(2, 1, figsize=figsize)

    # Plot 1: All components separately
    ax1 = axes[0]
    for comp in components:
        ax1.plot(accumulated[comp], label=comp, linewidth=2)

    ax1.set_title('Accumulated Reward by Component', fontsize=14, fontweight='bold')
    ax1.set_xlabel('Time Steps')
    ax1.set_ylabel('Accumulated Reward')
    ax1.legend(loc='best', framealpha=0.9)
    ax1.grid(True, alpha=0.3)

    # Plot 2: Total accumulated reward
    ax2 = axes[1]
    ax2.plot(accumulated['total'], color='black', linewidth=3, label='Total')
    ax2.set_title('Total Accumulated Reward', fontsize=14, fontweight='bold')
    ax2.set_xlabel('Time Steps')
    ax2.set_ylabel('Accumulated Reward')
    ax2.legend(loc='best', framealpha=0.9)
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

In [None]:
# Start from a freshly reset game 0 before the evaluation cells below.
state = client.reset(0)

In [None]:
# Reward breakdown for the current state of game 0 (run here right after the reset cell).
calc_reward(client.get_state(0))

In [None]:
# Play five evaluation episodes (up to 1000 steps each, temperature 1 leaves the
# policy's probabilities unchanged) and plot the accumulated rewards of each.
for _ in range(5):
    plot_accumulated_rewards(evaluate_policy(client, policy, max_steps=1000, temperature=1))

In [None]:
# Reward breakdown for whatever state game 0 is in now (here: after the evaluation episodes).
calc_reward(client.get_state(0))

In [None]:
def evaluate_policy_live(client, policy, max_steps=300, game_id=0, temperature=0.5, 
                         update_interval=10, figsize=(16, 10)):
    """
    Evaluate policy with live plotting in Jupyter notebook.

    Every ``update_interval`` steps (and on the final step) the cell output is
    cleared and redrawn with the trajectory, the accumulated reward, a state
    readout and the per-step reward components.

    Args:
        client (DroneGameClient): Game client instance
        policy (DroneGamerBoi): Policy network (put into eval mode)
        max_steps (int): Maximum steps per episode
        game_id (int): ID of the game to evaluate
        temperature (float): Sampling temperature (0 = greedy)
        update_interval (int): Update plot every N steps
        figsize (tuple): Figure size

    Returns:
        dict: ``'history'`` (per-step series: step index, one series per reward
        component, accumulated total, drone/platform positions, angle in
        degrees, fuel), ``'total_reward'``, ``'steps'``, ``'landed'``,
        ``'crashed'`` and ``'final_fuel'``.
    """
    import matplotlib.pyplot as plt
    from IPython.display import clear_output

    def _outcome(final_state, finished):
        """Name how the episode ended; running out of steps is not a crash."""
        if final_state.landed:
            return 'LANDED'
        if final_state.crashed:
            return 'CRASHED'
        return 'ENDED' if finished else 'TIMED OUT'

    # Set policy to evaluation mode
    policy.eval()

    # Initialize episode
    state = client.reset(game_id)

    # Storage for plotting. Series for further reward components are added on
    # the first step, once the keys produced by calc_reward() are known.
    history = {
        'step': [],
        'distance': [],
        'angle': [],
        'speed': [],
        'accumulated_total': [],
        'drone_x': [],
        'drone_y': [],
        'platform_x': [],
        'platform_y': [],
        'drone_angle': [],
        'fuel': [],
    }
    component_names = []

    accumulated_total = 0
    steps = 0
    done = False

    # Run episode with live updates
    while not done and steps < max_steps:
        # Get action
        with torch.no_grad():
            action_probs = policy(state)

        if temperature == 0:
            action = (action_probs > 0.5).float()
        else:
            # Temperature scaling of a Bernoulli: p^(1/T) / (p^(1/T) + (1-p)^(1/T))
            adjusted_probs = torch.pow(action_probs, 1.0 / temperature)
            adjusted_probs = adjusted_probs / (adjusted_probs + torch.pow(1 - action_probs, 1.0 / temperature))
            dist = Bernoulli(probs=adjusted_probs)
            action = dist.sample()

        # Take action
        next_state, _, done, _ = client.step({
            "main_thrust": int(action[0]),
            "left_thrust": int(action[1]),
            "right_thrust": int(action[2])
        }, game_id)

        # Calculate reward
        reward = calc_reward(next_state)
        accumulated_total += reward['total']

        # Track exactly the components the reward function reports, rather than
        # a hard-coded list that can drift out of sync with it.
        if not component_names:
            component_names = [key for key in reward if key != 'total']

        # Store history
        history['step'].append(steps)
        for name in component_names:
            history.setdefault(name, []).append(reward[name])
        history['accumulated_total'].append(accumulated_total)
        history['drone_x'].append(next_state.drone_x)
        history['drone_y'].append(next_state.drone_y)
        history['platform_x'].append(next_state.platform_x)
        history['platform_y'].append(next_state.platform_y)
        # The angle is multiplied by 180 for display as degrees
        history['drone_angle'].append(next_state.drone_angle * 180)
        history['fuel'].append(next_state.drone_fuel)

        # Update plots at intervals
        if steps % update_interval == 0 or done or steps == max_steps - 1:
            clear_output(wait=True)

            # Create figure
            fig = plt.figure(figsize=figsize)
            gs = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)

            # Trajectory plot
            ax1 = fig.add_subplot(gs[0:2, 0:2])
            ax1.plot(history['drone_x'], history['drone_y'], 'b-', linewidth=2, alpha=0.6, label='Path')
            ax1.plot(next_state.drone_x, next_state.drone_y, 'ro', markersize=15, label='Drone', zorder=5)

            # Draw platform as rectangle
            platform_width = 0.125  # 100px / 800px
            platform_height = 0.033  # 20px / 600px
            platform_rect = plt.Rectangle(
                (next_state.platform_x - platform_width/2, next_state.platform_y - platform_height/2),
                platform_width, platform_height,
                color='green', alpha=0.7, label='Platform'
            )
            ax1.add_patch(platform_rect)

            # Draw drone orientation arrow
            arrow_length = 0.05
            angle_rad = np.radians(next_state.drone_angle * 180)
            dx_arrow = arrow_length * np.sin(angle_rad)
            dy_arrow = arrow_length * np.cos(angle_rad)
            ax1.arrow(next_state.drone_x, next_state.drone_y, dx_arrow, dy_arrow,
                     head_width=0.02, head_length=0.01, fc='red', ec='red', zorder=6)

            ax1.set_xlim(0, 1)
            ax1.set_ylim(0, 1)
            ax1.invert_yaxis()
            ax1.set_xlabel('X Position', fontsize=10)
            ax1.set_ylabel('Y Position', fontsize=10)
            ax1.set_title('Drone Trajectory', fontweight='bold', fontsize=12)
            ax1.legend(loc='upper right', fontsize=9)
            ax1.grid(True, alpha=0.3)
            ax1.set_aspect('equal')

            # Total reward plot
            ax2 = fig.add_subplot(gs[0, 2])
            ax2.plot(history['step'], history['accumulated_total'], 'k-', linewidth=2)
            ax2.set_xlabel('Steps', fontsize=9)
            ax2.set_ylabel('Reward', fontsize=9)
            ax2.set_title('Total Accumulated Reward', fontweight='bold', fontsize=10)
            ax2.grid(True, alpha=0.3)

            # State info
            ax3 = fig.add_subplot(gs[1, 2])
            ax3.axis('off')
            state_text = (
                f"{'='*25}\n"
                f"STEP: {steps}/{max_steps}\n"
                f"{'='*25}\n"
                f"Distance:    {next_state.distance_to_platform:.3f}\n"
                f"Horizontal:  {abs(next_state.dx_to_platform):.3f}\n"
                f"Angle:       {next_state.drone_angle * 180:.1f}°\n"
                f"Speed:       {next_state.speed:.3f}\n"
                f"Fuel:        {next_state.drone_fuel:.1%}\n"
                f"Reward:      {accumulated_total:.1f}\n"
                f"\n{'='*25}\n"
                f"ACTIONS:\n"
                f"{'='*25}\n"
                f"Main:  {'■' if int(action[0]) else '□'}\n"
                f"Left:  {'■' if int(action[1]) else '□'}\n"
                f"Right: {'■' if int(action[2]) else '□'}\n"
            )
            ax3.text(0.05, 0.95, state_text, transform=ax3.transAxes,
                    verticalalignment='top', fontfamily='monospace', fontsize=9,
                    bbox=dict(boxstyle='round', facecolor='lightgray', alpha=0.5))

            # Reward components (per-step values, one line per component)
            ax4 = fig.add_subplot(gs[2, :])
            for name in component_names:
                ax4.plot(history['step'], history[name], label=name, linewidth=2)
            ax4.set_xlabel('Steps', fontsize=10)
            ax4.set_ylabel('Reward Value', fontsize=10)
            ax4.set_title('Reward Components Over Time', fontweight='bold', fontsize=12)
            ax4.legend(loc='upper left', fontsize=9, ncol=max(len(component_names), 1))
            ax4.grid(True, alpha=0.3)

            # Add status if done
            if done or steps >= max_steps - 1:
                outcome = _outcome(next_state, done)
                if outcome == 'LANDED':
                    status_text, status_color = "✅ LANDED!", 'green'
                elif outcome == 'CRASHED':
                    status_text, status_color = "❌ CRASHED", 'red'
                else:
                    status_text, status_color = f"⏱ {outcome}", 'orange'
                fig.text(0.5, 0.95, status_text, ha='center', fontsize=20, 
                        fontweight='bold', color=status_color,
                        bbox=dict(boxstyle='round', facecolor='white', alpha=0.9))

            plt.tight_layout()
            plt.show()

        # Update state
        state = next_state
        steps += 1

    # Print final statistics
    print("\n" + "="*60)
    print("EPISODE SUMMARY:")
    print("="*60)
    print(f"Result: {_outcome(state, done)}")
    print(f"Steps: {steps}/{max_steps}")
    print(f"Total Reward: {accumulated_total:.2f}")
    print(f"Final Distance: {state.distance_to_platform:.3f}")
    print(f"Final Speed: {state.speed:.3f}")
    print(f"Final Angle: {state.drone_angle * 180:.1f}°")
    print(f"Fuel Remaining: {state.drone_fuel:.1%}")
    print("="*60)

    return {
        'history': history,
        'total_reward': accumulated_total,
        'steps': steps,
        'landed': state.landed,
        'crashed': state.crashed,
        'final_fuel': state.drone_fuel
    }

In [None]:
# Run one episode of up to 500 steps with live plotting, redrawing every 10 steps.
# A temperature of 0.1 sharpens the action probabilities strongly, so sampling is close to greedy.
results = evaluate_policy_live(
    client, 
    policy, 
    max_steps=500, temperature=0.1, 
    update_interval=10)
# plot_accumulated_rewards(results)